In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import plotly.express as px
import statistics
from datetime import datetime
from datetime import date

In [None]:
#Read the data into a data frame
df = pd.read_csv("extract-3-very-clean.csv")

In [218]:
#See how many records are included
df.size

24256512

In [219]:
#Change date fields to datetime type
df['Contract date']= pd.to_datetime(df['Contract date'])
df['Settlement date']= pd.to_datetime(df['Settlement date'])

#Then check types are okay
df.dtypes

Unnamed: 0                        int64
Property ID                     float64
Download date / time             object
Property name                    object
Property unit number             object
Property house number            object
Property street name             object
Property locality                object
Property post code              float64
Area                            float64
Contract date            datetime64[ns]
Settlement date          datetime64[ns]
Purchase price                    int64
Zoning                           object
Primary purpose                  object
Strata lot number               float64
dtype: object

In [241]:
#Filter the dataset to your own search area
#(could obvs filter by whatever, but this is my search area)

property_locations = ['Lawson','Hazelbrook','Woodford','Linden','Faulconbridge','Springwood','Valley Heights','Warrimoo']
exclude_zoning = ['IN1', 'IN2', 'I', 'B', 'B1', 'B2', 'B7']
exclude_primary_purpose = ['Service stations', 'Service stati', 'Service statio', 'Shop', 'Hall']
postcode_min = 2750
postcode_max = 2800
area_min = 500
area_max = None
start_date = '2015-01-01'
end_date = '2100-01-01'

#Go ahead and implement all of the above filters
df_myarea = df
if property_locations: df_myarea = df_myarea[ df_myarea['Property locality'].isin(property_locations) ] #In location specified
if area_min: df_myarea = df_myarea[ df_myarea['Area'] >= area_min ] #More than minimum area size
if area_max: df_myarea = df_myarea[ df_myarea['Area'] <= area_max ] #Less than maximum area size
if postcode_min: df_myarea = df_myarea[ df_myarea['Property post code'] >= postcode_min ] #In postcode range
if postcode_max: df_myarea = df_myarea[ df_myarea['Property post code'] <= postcode_max ] #In postcode range
if exclude_zoning: df_myarea = df_myarea[ ~df_myarea['Zoning'].isin(exclude_zoning) ] #Exclude weird zoning types
if start_date: df_myarea = df_myarea[ df_myarea['Contract date'] >= start_date] #Make sure all values are in the correct date range
if end_date: df_myarea = df_myarea[ df_myarea['Contract date'] <= end_date] #Make sure all values are in the correct date range
if exclude_primary_purpose: df_myarea = df_myarea[ ~df_myarea['Primary purpose'].isin(exclude_primary_purpose) ] #Exclude weird zoning types
#df_myarea = df_myarea[ df_myarea['Primary purpose'] == 'Vacant land' ]

print(str(len(df_myarea.index)) + ' records kept')

3000 records kept


In [244]:
#Show zoning and purpose types in the dataset
#Types: https://www.valuergeneral.nsw.gov.au/__data/assets/pdf_file/0019/216406/Property_Sales_Data_File_Zone_Codes_and_Descriptions_V2.pdf

display(df_myarea['Primary purpose'].unique())
display(df_myarea['Zoning'].unique())

array(['Residence', 'Vacant land', 'Commercial'], dtype=object)

array(['A', nan, 'R', 'R2', 'E4', 'E3', 'R3', 'E2'], dtype=object)

In [238]:
#Fix NaNs

nans = str(df_myarea['Zoning'].isna().sum()) #See how many NaNs
df_myarea['Zoning'].fillna(value='None', inplace=True) #Change NaNs to 'None'
print('Fixed ' + nans + ' NaNs.') #Print the results

Fixed 54 NaNs.


In [239]:
#Remove purchase price outliers

before=len(df_myarea.index)
display(df_myarea[(np.abs(stats.zscore(df_myarea['Purchase price'])) >= 2)])
df_myarea = df_myarea[(np.abs(stats.zscore(df_myarea['Purchase price'])) < 2)]
after=len(df_myarea.index)
print('Removed ' + str(before-after) + ' outliers (more than 2 standard deviations from the mean).')

Unnamed: 0.1,Unnamed: 0,Property ID,Download date / time,Property name,Property unit number,Property house number,Property street name,Property locality,Property post code,Area,Contract date,Settlement date,Purchase price,Zoning,Primary purpose,Strata lot number
358492,358492,2251113.0,20160808 01:07,,,51,Summer Rd,Faulconbridge,2776.0,55100.0,2016-05-02,2016-08-02,2650000,E4,Residence,
477643,477643,3674142.0,20170123 01:07,,,310,Great Western Hwy,Lawson,2783.0,1842.0,2016-07-10,2016-08-08,25375000,R2,Service stati,
730271,730271,2277586.0,20180115 01:26,,,24,Old Bathurst Rd,Woodford,2778.0,26180.0,2017-11-20,2017-12-20,1800000,E3,Residence,
904364,904364,2250299.0,20181119 01:21,,,132,Grose Rd,Faulconbridge,2776.0,103100.0,2018-05-29,2018-08-20,3000000,E4,Commercial,
1262057,1262057,2269294.0,20200921 01:02,,,170,Paterson Rd,Springwood,2777.0,28750.0,2020-07-31,2020-09-17,1700000,E3,Residence,
1387757,1387757,2268280.0,20210329 01:02,,,67,Hawkesbury Rd,Springwood,2777.0,3130.0,2021-02-16,2021-03-25,1700000,E4,Residence,
1443905,1443905,2267532.0,20210614 01:04,,,4,David Rd,Springwood,2777.0,4300.0,2021-02-22,2021-06-09,1900000,E4,Residence,
1496498,1496498,2250299.0,20210830 01:30,,,132,Grose Rd,Faulconbridge,2776.0,103100.0,2021-06-01,2021-08-20,4600000,E4,Commercial,


Removed 8 outliers (more than 2 standard deviations from the mean).


In [None]:
#Price histogram in ~$50K bins (entire dataset vs last 12 months)

fig = px.histogram(df_myarea, x="Purchase price", nbins=int(df_myarea['Purchase price'].max()/50000),
    title='Price histogram - entire dataset', width=1000, height=300,
)

df_myarea_12m = df_myarea[ (df_myarea['Contract date'] > '2020-10-01') & (df_myarea['Contract date'] < date.today().strftime("%Y-%m-%d"))]
fig2 = px.histogram(df_myarea_12m, x="Purchase price", nbins=int(df_myarea['Purchase price'].max()/50000),
    title='Price histogram - last 12 months', width=1000, height=300,
)

fig.show()
fig2.show()


In [None]:
#Price by size and contract date

#Scale property size so the dots don't get too small
median = statistics.median(df_myarea['Area'])
df_myarea['Area - scaled'] = [(x - median) / 15 + median for x in df_myarea['Area']]

fig = px.scatter(
    df_myarea,
    x='Contract date',
    y='Purchase price',    
    size='Area - scaled',
    color='Zoning',
    title='Price and size of property by contract date',
    width=1000,
    height=500,
    labels={'x':'Contract date'},
    hover_name=df_myarea['Property house number'] + ' ' + df_myarea['Property street name'] + ', ' + df_myarea['Property locality'],
    hover_data={
        'Area - scaled':False,
        'Zoning':True,
        'Area':True
    }
)

fig.show()

In [None]:
#Price by contract date

fig = px.scatter(
    df_myarea,
    x='Contract date',
    y='Purchase price',    
    title='Sale price over time',
    trendline='rolling',
    trendline_options=dict(window=45),    
    trendline_color_override="red",
    width=1000,
    height=500,
    labels={'x':'Contract date'},
    hover_name=df_myarea['Property house number'] + ' ' + df_myarea['Property street name'] + ', ' + df_myarea['Property locality'],
    hover_data={
        'Area - scaled':False,
        'Zoning':True,
        'Area':True
    }
)

fig.show()

In [None]:
#Median price by contract date

df_myarea_agg=df_myarea[['Contract date','Purchase price']]
df_myarea_agg=df_myarea_agg.groupby(['Contract date']).median()

fig = px.scatter(
    df_myarea_agg,
    x=df_myarea_agg.index.values,
    y='Purchase price',    
    title='Daily median price',
    width=1000,
    height=500,
    labels={'x':'Contract date'},
)

fig.show()

In [None]:
#Monthly median price

df_myarea_aggM = df_myarea[['Contract date', 'Purchase price']]

df_myarea_aggM = df_myarea_aggM.groupby([pd.Grouper(key='Contract date', freq='M')]).agg('median')
df_myarea_aggM.reindex()

fig = px.scatter(
    df_myarea_aggM,
    x=df_myarea_aggM.index.values,
    y='Purchase price',    
    title='Monthly median price',
    width=1000,
    height=500,
    labels={'x':'Contract date'},
)

fig.show()

In [None]:
#Monthly median price

df_myarea_aggM = df_myarea[['Contract date', 'Purchase price']]
df_myarea_aggM = df_myarea_aggM.groupby([pd.Grouper(key='Contract date', freq='M')]).agg('median')

df_myarea_aggM['Rolling 6-month average'] = df_myarea_aggM.rolling(6).mean()

px.line(
    df_myarea_aggM,
    title='Monthly median purchase price'
)