In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import plotly.express as px
import statistics
from datetime import datetime
from datetime import date

In [None]:
# Load the property-sales extract from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="extract-3-very-clean.csv")

In [None]:
#See how many records (rows) are included
# Note: df.size is rows x columns (the total number of cells), not the record count,
# so count the rows explicitly.
len(df)

In [None]:
# Convert both date columns to datetime values so they can be compared and grouped later
for date_column in ('Contract date', 'Settlement date'):
    df[date_column] = pd.to_datetime(df[date_column])

In [None]:
#Check types are okay
# Displays the dtype of every column; 'Contract date' and 'Settlement date' should show
# as datetime64 after the pd.to_datetime conversion.
df.dtypes

In [None]:
# Search criteria: localities, zoning codes to exclude, postcode range, minimum land area
# and contract-date window (could obvs filter by whatever, but this is my search area)

property_locations = ['Lawson','Hazelbrook','Woodford','Linden','Faulconbridge','Springwood','Valley Heights','Warrimoo']
exclude_zoning = ['IN1', 'IN2', 'I', 'B', 'B1', 'B2', 'B7']
postcode_min = 2750
postcode_max = 2800
area_min = 500
start_date = '2015-01-01'
end_date = '2100-01-01'

# Combine every criterion into one boolean mask instead of re-filtering five times.
in_search_area = (
    df['Property locality'].isin(property_locations)                          # In location specified
    & (df['Area'] > area_min)                                                 # Strictly more than the minimum area
    & df['Property post code'].between(postcode_min, postcode_max)            # In postcode range (inclusive)
    & ~df['Zoning'].isin(exclude_zoning)                                      # Exclude weird zoning types
    & (df['Contract date'] > start_date) & (df['Contract date'] < end_date)  # Inside the date window (exclusive)
)

# .copy() makes df_myarea an independent frame, so later cells can add or change
# columns without SettingWithCopyWarning and without touching df.
df_myarea = df[in_search_area].copy()

# Optional extra filter, left disabled:
#df_myarea = df_myarea[ df_myarea['Primary purpose'] == 'Vacant land' ]

print(f'{len(df_myarea.index)} records kept')

In [None]:
# List the distinct zoning codes present in the filtered data
# Code descriptions: https://www.valuergeneral.nsw.gov.au/__data/assets/pdf_file/0019/216406/Property_Sales_Data_File_Zone_Codes_and_Descriptions_V2.pdf

pd.unique(df_myarea['Zoning'])

In [None]:
#Fix NaNs

# Count the missing zoning codes before replacing them, so the message reports what was fixed
nans = str(df_myarea['Zoning'].isna().sum())

# Replace missing zoning with the literal label 'None'.
# Calling fillna(inplace=True) on df_myarea['Zoning'] is chained in-place assignment:
# it warns on recent pandas and does nothing under Copy-on-Write, so build the
# column explicitly and rebind the frame instead.
df_myarea = df_myarea.assign(Zoning=df_myarea['Zoning'].fillna('None'))

print('Fixed ' + nans + ' NaNs.') #Print the results

In [None]:
#Remove purchase price outliers

# Keep sales whose price lies within this many standard deviations of the mean
zscore_limit = 2

before = len(df_myarea.index)

# nan_policy='omit' computes the mean and standard deviation from the non-missing prices only.
# With the default policy a single NaN price turns every z-score into NaN and the
# comparison below would silently drop every row. Rows whose own price is NaN still
# get a NaN z-score and are removed.
price_zscores = stats.zscore(df_myarea['Purchase price'].to_numpy(dtype=float), nan_policy='omit')

# .copy() keeps the result independent, so later cells can add columns without warnings
df_myarea = df_myarea[np.abs(price_zscores) < zscore_limit].copy()

after = len(df_myarea.index)
print('Removed ' + str(before - after) + ' outliers (more than ' + str(zscore_limit) + ' standard deviations from the mean).')

In [None]:
#Price histogram in ~$50K bins (entire dataset vs last 12 months)

# Bin count derived from the highest price; both histograms share it so they are comparable.
price_bins = int(df_myarea['Purchase price'].max() / 50000)

fig = px.histogram(df_myarea, x="Purchase price", nbins=price_bins,
    title='Price histogram - entire dataset', width=1000, height=300,
)

# Trailing 12-month window ending at the start of today, computed at run time
# so the "last 12 months" chart does not go stale (previously a fixed start date).
window_end = pd.Timestamp.today().normalize()
window_start = window_end - pd.DateOffset(months=12)

df_myarea_12m = df_myarea[ (df_myarea['Contract date'] > window_start) & (df_myarea['Contract date'] < window_end) ]
fig2 = px.histogram(df_myarea_12m, x="Purchase price", nbins=price_bins,
    title='Price histogram - last 12 months', width=1000, height=300,
)

fig.show()
fig2.show()


In [None]:
#Price by size and contract date

#Scale property size so the dots don't get too small
# Each area's distance from the median is divided by 15, which narrows the spread
# of marker sizes while keeping their ordering.
median = df_myarea['Area'].median()

# Vectorised, and added with assign() so the column is written to a new frame rather
# than onto a filtered slice (which raises SettingWithCopyWarning).
df_myarea = df_myarea.assign(**{'Area - scaled': (df_myarea['Area'] - median) / 15 + median})

# Street address used as the heading of each hover box
hover_labels = df_myarea['Property house number'] + ' ' + df_myarea['Property street name'] + ', ' + df_myarea['Property locality']

fig = px.scatter(
    df_myarea,
    x='Contract date',
    y='Purchase price',
    size='Area - scaled',
    color='Zoning',
    title='Price and size of property by contract date',
    width=1000,
    height=500,
    labels={'x':'Contract date'},
    hover_name=hover_labels,
    hover_data={
        'Area - scaled':False,  # hide the synthetic size column; show the real area instead
        'Zoning':True,
        'Area':True
    }
)

fig.show()

In [None]:
# Sale price over time, with a red rolling trendline

# Street address used as the heading of each hover box
address_label = (
    df_myarea['Property house number']
    + ' '
    + df_myarea['Property street name']
    + ', '
    + df_myarea['Property locality']
)

# Hover details: hide the synthetic size column, show zoning and the real area
hover_fields = {'Area - scaled': False, 'Zoning': True, 'Area': True}

fig = px.scatter(
    df_myarea,
    x='Contract date',
    y='Purchase price',
    hover_name=address_label,
    hover_data=hover_fields,
    labels={'x': 'Contract date'},
    # Rolling trendline over a window of 45 observations
    trendline='rolling',
    trendline_options={'window': 45},
    trendline_color_override="red",
    title='Sale price over time',
    width=1000,
    height=500,
)

fig.show()

In [None]:
# Median purchase price for each contract date

# One row per contract date, holding the median price of that day's sales
df_myarea_agg = df_myarea.groupby(['Contract date'])[['Purchase price']].median()

fig = px.scatter(
    df_myarea_agg,
    x=df_myarea_agg.index.to_numpy(),  # the dates live in the index after grouping
    y='Purchase price',
    labels={'x': 'Contract date'},     # the bare array is otherwise labelled 'x'
    title='Daily median price',
    width=1000,
    height=500,
)

fig.show()

In [None]:
#Monthly median price

# Bucket sales by calendar month of the contract date and take the median price per bucket.
# NOTE(review): newer pandas releases prefer the alias 'ME' over 'M' for month-end;
# 'M' is kept here for compatibility with older versions.
df_myarea_aggM = df_myarea[['Contract date', 'Purchase price']]
df_myarea_aggM = df_myarea_aggM.groupby([pd.Grouper(key='Contract date', freq='M')]).agg('median')

# (A bare df_myarea_aggM.reindex() call used to sit here; its result was discarded,
# so it had no effect and has been removed.)

fig = px.scatter(
    df_myarea_aggM,
    x=df_myarea_aggM.index.values,  # the month-end dates live in the index after grouping
    y='Purchase price',
    title='Monthly median price',
    width=1000,
    height=500,
    labels={'x':'Contract date'},   # the bare array is otherwise labelled 'x'
)

fig.show()

In [None]:
#Monthly median price with a rolling 6-month average

# Bucket sales by calendar month of the contract date and take the median price per bucket
df_myarea_aggM = df_myarea[['Contract date', 'Purchase price']]
df_myarea_aggM = df_myarea_aggM.groupby([pd.Grouper(key='Contract date', freq='M')]).agg('median')

# Roll over the price Series explicitly. Assigning the whole frame's rolling mean to one
# column only works while the frame has exactly one column.
# A 6-row window needs six non-missing months, so the first five rows (and any window
# containing a month without sales) are NaN.
df_myarea_aggM['Rolling 6-month average'] = df_myarea_aggM['Purchase price'].rolling(6).mean()

# Both columns are drawn as separate lines against the month index
fig = px.line(
    df_myarea_aggM,
    title='Monthly median purchase price'
)

fig.show()