In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
#import pandas_profiling
#from pandas_profiling import ProfileReport
import plotly.express as px
import statistics

from datetime import datetime

In [None]:
# Load the sales extract from its CSV file into the master data frame `df`.
csv_path = "extract-3-very-clean.csv"
df = pd.read_csv(csv_path)

In [None]:
#See how many records (rows) are included.
# len(df) counts rows; df.size would return rows * columns (the number of cells),
# which overstates the record count by a factor of the column count.
len(df)

In [None]:
# Convert both date columns of df to pandas datetimes, one column at a time.
for date_column in ('Contract date', 'Settlement date'):
    df[date_column] = pd.to_datetime(df[date_column])

In [None]:
#Check types are okay: display the dtype pandas holds for every column of df
df.dtypes

In [None]:
# Search criteria: the localities, postcode range, minimum area and excluded
# zonings that define the search area. Any other column could be filtered on
# the same way; these values are simply the author's own search area.

property_locations = ['Faulconbridge']
#property_locations = ['Lawson','Hazelbrook','Woodford','Linden','Faulconbridge','Springwood','Valley Heights','Warrimoo', 'Blaxland', 'Glenbrook']
#property_locations = ['Linden','Faulconbridge','Springwood']

exclude_zoning = ['IN1', 'IN2', 'I']
postcode_min = 2750
postcode_max = 2800
area_min = 1000

# Combine every criterion into a single boolean mask over df.
in_search_area = (
    df['Property locality'].isin(property_locations)
    & (df['Area'] > area_min)  # strictly greater than area_min
    & df['Property post code'].between(postcode_min, postcode_max)  # both ends inclusive
    & ~df['Zoning'].isin(exclude_zoning)  # records with a missing zoning are kept
)
# Optional extra criterion: add `& (df['Primary purpose'] == 'Vacant land')` to the mask.

# .copy() makes df_myarea an independent frame rather than a slice of df, so the
# later cells that add or overwrite columns do not raise SettingWithCopyWarning.
df_myarea = df[in_search_area].copy()

print(f'{len(df_myarea)} records kept')

In [None]:
# List the distinct zoning codes present in the filtered records.
zoning_column = df_myarea['Zoning']
zoning_column.unique()

In [None]:
#Fix NaNs: give records without a zoning the explicit label 'None'
# so they form their own group when a chart is coloured by 'Zoning'.
# The result is assigned to a new frame because fillna(inplace=True) on a
# selected column works on a temporary object: pandas warns about it and, with
# Copy-on-Write enabled, leaves df_myarea unchanged.
df_myarea = df_myarea.assign(Zoning=df_myarea['Zoning'].fillna('None'))

In [None]:
#Remove purchase price outliers: keep records whose price lies within
# 3 standard deviations of the mean price.
# NOTE: this cell overwrites df_myarea, so re-running it trims again using the
# statistics of the already-trimmed data.
before = len(df_myarea)

purchase_prices = df_myarea['Purchase price']
# Population standard deviation (ddof=0), the same definition scipy.stats.zscore
# uses. pandas skips NaN prices here, so one missing price no longer turns every
# z-score into NaN and silently drops every record.
price_std = purchase_prices.std(ddof=0)

if price_std > 0:
    z_scores = (purchase_prices - purchase_prices.mean()) / price_std
    # A record with a missing price has a NaN z-score, fails the comparison
    # and is dropped.
    within_range = z_scores.abs() < 3
else:
    # No spread (zero or one distinct price, or no data): nothing can be an
    # outlier, so only records with a missing price are dropped.
    within_range = purchase_prices.notna()

df_myarea = df_myarea[within_range].copy()

after = len(df_myarea)
print(f'{before - after} records removed')

In [None]:
#Price histogram in ~$50K bins

price_bin_width = 50000

# The bins span the lowest to the highest price, so the bin count comes from
# that range rather than from the maximum price alone.
price_span = df_myarea['Purchase price'].max() - df_myarea['Purchase price'].min()

# With no records the span is NaN, and with a single distinct price it is 0;
# both fail the `> 0` test and fall back to one bin instead of raising on int(NaN).
price_bin_count = int(np.ceil(price_span / price_bin_width)) if price_span > 0 else 1

fig = px.histogram(
    df_myarea,
    x="Purchase price",
    nbins=price_bin_count,  # plotly may still round to a "nice" bin size, hence "~$50K"
    title='Price histogram',
    width=1000,
    height=400,
)
fig.show()


In [None]:
#Price by size and contract date

#Scale property size so the dots don't get too small:
# each area is pulled towards the median area (its distance from the median is
# divided by 15), which narrows the spread of marker sizes.
area_median = df_myarea['Area'].median()

# Vectorised and assigned via .assign(), which returns a new frame. This avoids
# writing a column onto what may be a slice of df, and it tolerates an empty
# frame (statistics.median raises on empty data).
df_myarea = df_myarea.assign(
    **{'Area - scaled': (df_myarea['Area'] - area_median) / 15 + area_median}
)

# Address shown as the bold title of each hover box.
# NOTE(review): assumes the three address columns hold strings - confirm
# 'Property house number' is not parsed as a numeric column.
address_label = (
    df_myarea['Property house number']
    + ' ' + df_myarea['Property street name']
    + ', ' + df_myarea['Property locality']
)

# labels={'x': ...} is omitted: x is given as a column name, so the axis is
# already titled 'Contract date' and an 'x' key would match no column.
fig = px.scatter(
    df_myarea,
    x='Contract date',
    y='Purchase price',
    size='Area - scaled',
    color='Zoning',
    title='Price and size of property by contract date',
    width=1000,
    height=500,
    hover_name=address_label,
    hover_data={
        'Area - scaled': False,  # hide the helper column; show the real area instead
        'Zoning': True,
        'Area': True,
    },
)

fig.show()

In [None]:
#Price by contract date, with a LOWESS trend line

# Address shown as the bold title of each hover box.
# NOTE(review): assumes the three address columns hold strings - confirm
# 'Property house number' is not parsed as a numeric column.
address_label = (
    df_myarea['Property house number']
    + ' ' + df_myarea['Property street name']
    + ', ' + df_myarea['Property locality']
)

# This chart does not encode property size, so the title no longer mentions it
# and hover_data no longer refers to the 'Area - scaled' helper column. Without
# that key this cell does not depend on the size-scatter cell having run first.
# labels={'x': ...} is omitted: x is given as a column name, so the axis is
# already titled 'Contract date'.
fig = px.scatter(
    df_myarea,
    x='Contract date',
    y='Purchase price',
    title='Price of property by contract date',
    trendline='lowess',
    width=1000,
    height=500,
    hover_name=address_label,
    hover_data={
        'Zoning': True,
        'Area': True,
    },
)

fig.show()

In [None]:
# Median purchase price per contract date, plotted with a LOWESS trend line.

# One row per contract date (the index) holding that day's median purchase price.
df_myarea_agg = df_myarea.groupby('Contract date')[['Purchase price']].median()

# The dates live in the index, so they are passed to plotly as an array; plotly
# names such an array 'x', which the labels mapping renames for the axis title.
contract_days = df_myarea_agg.index.values

fig = px.scatter(
    df_myarea_agg,
    x=contract_days,
    y='Purchase price',
    trendline='lowess',
    title='Median price of properties per day',
    labels={'x': 'Contract date'},
    width=1000,
    height=500,
)

fig.show()