In [None]:
import numpy as np
import pandas as pd
#%matplotlib inline
from scipy import stats
import matplotlib.pyplot as plt
import pandas_profiling
from pandas_profiling import ProfileReport

# Load the extract, using the 'Contract date' column as the row index.
df = pd.read_csv(
    "extract-3-very-clean.csv",
    index_col=['Contract date'],
)

In [None]:
# See how many records (rows) are included.
# Note: `df.size` is rows x columns (the total number of cells), so it
# overstates the record count; the length of the index is the row count.
len(df.index)

In [None]:
# Check types are okay: show the dtype pandas assigned to each column
# (no explicit dtypes were passed when the CSV was read).
df.dtypes

In [None]:
# Search criteria: the localities of interest, an inclusive postcode range and
# a lower bound on area (exclusive). Any other column could be used to filter;
# these simply describe one particular search area.
#
# Wider alternative list of localities:
#property_locations = ['Lawson','Hazelbrook','Woodford','Linden','Faulconbridge','Springwood','Valley Heights','Warrimoo', 'Blaxland', 'Glenbrook']

property_locations = ['Linden','Faulconbridge','Springwood']
postcode_min = 2750
postcode_max = 2800
area_min = 300

# One boolean mask per criterion keeps the final selection readable.
in_locality = df['Property locality'].isin(property_locations)
above_min_area = df['Area'] > area_min
# Series.between is inclusive at both ends, i.e. >= postcode_min and <= postcode_max.
in_postcode_range = df['Property post code'].between(postcode_min, postcode_max)

# To restrict the search to vacant land only, additionally AND the masks with:
#   (df['Primary purpose'] == 'Vacant land')
df_myarea = df[in_locality & above_min_area & in_postcode_range]

print(f'{len(df_myarea.index)} records kept')

In [None]:
# Build a data profile report for the filtered selection and display it
# as the cell output.
profile = ProfileReport(
    df_myarea,
    title="Pandas Profiling Report",
)
profile

In [None]:
# Remove purchase price outliers: rows whose price lies `zscore_limit` or more
# standard deviations away from the mean price of the current selection.
zscore_limit = 3

before = len(df_myarea.index)

# nan_policy='omit' stops a single missing price from turning every z-score
# into NaN (with the default 'propagate' the filter below would then discard
# every record).
price_z = np.abs(stats.zscore(df_myarea['Purchase price'], nan_policy='omit'))

# Flag outliers explicitly instead of keeping "|z| < limit": a row whose
# z-score is undefined (missing price, or zero variance across the selection)
# compares False here and is therefore kept rather than silently dropped.
is_outlier = price_z >= zscore_limit

# .copy() makes the result an independent frame, so later cells can add or
# overwrite columns without pandas raising SettingWithCopyWarning.
df_myarea = df_myarea[~is_outlier].copy()

after = len(df_myarea.index)
print(f'{before - after} records removed')

In [None]:
# Price histogram in $50K bins.
bin_width = 50000
prices = df_myarea['Purchase price']

# Build explicit bin edges on multiples of `bin_width`. Passing only a bin
# count of max/50000 splits the min..max range into that many bins, which makes
# each bin narrower than $50K, and the count becomes 0 (an error) when every
# price is below $50K.
first_edge = int(prices.min() // bin_width) * bin_width
# One edge past the bin containing the maximum, so the maximum is always covered.
last_edge = (int(prices.max() // bin_width) + 1) * bin_width
bin_edges = np.arange(first_edge, last_edge + bin_width, bin_width)

df_myarea.hist(column='Purchase price', bins=bin_edges)

In [None]:
# List the distinct zoning values present in the selection
# (in order of first appearance; a missing value shows up as its own entry).
pd.unique(df_myarea['Zoning'])

In [None]:
#Price by size

import plotly.express as px
import statistics

# Marker sizing: shrink each area's distance from the median area by a factor
# of 10, so the spread of marker sizes is compressed around the median.
median = df_myarea['Area'].median()

# .assign returns a new frame instead of writing columns into a filtered
# selection: this avoids SettingWithCopyWarning, replaces the per-row Python
# loop with a vectorised expression, and gives the same result when re-run.
df_myarea = df_myarea.assign(**{
    'Area - scaled': (df_myarea['Area'] - median) / 10 + median,
    # Replace missing zoning values with the literal label 'None'.
    'Zoning': df_myarea['Zoning'].fillna('None'),
})

# Hover title in the form "<house number> <street name>, <locality>".
# NOTE(review): this concatenation assumes all three columns hold strings - confirm.
hover_title = (
    df_myarea['Property house number']
    + ' '
    + df_myarea['Property street name']
    + ', '
    + df_myarea['Property locality']
)

fig = px.scatter(
    df_myarea,
    # The frame is indexed by contract date, so the index supplies the x axis;
    # `labels` below gives that axis its readable name.
    x=df_myarea.index.values,
    y='Purchase price',
    size='Area - scaled',
    color='Zoning',
    title='Price and size of property by purchase date',
    width=1000,
    height=500,
    labels={'x': 'Contract date'},
    hover_name=hover_title,
    hover_data={
        'Area - scaled': False,  # hide the synthetic sizing column
        'Zoning': True,
        'Area': True,            # show the real area instead
    },
)
fig.show()