In [None]:
import numpy as np
import pandas as pd
#%matplotlib inline
from scipy import stats
import matplotlib.pyplot as plt
import pandas_profiling
from pandas_profiling import ProfileReport

# Load the cleaned sales extract, using the 'Contract date' column as the row index.
# NOTE(review): no parse_dates is requested, so the index keeps whatever dtype
# read_csv infers for that column -- confirm that is what the date plots expect.
df = pd.read_csv(
    "extract-3-very-clean.csv",
    index_col=['Contract date'],
)

In [None]:
# See how many records (rows) are included.
# len(df) counts rows; df.size would instead count every cell (rows x columns).
len(df)

In [None]:
# Inspect the row index (the 'Contract date' column chosen as index_col at load time)
df.index

In [None]:
# Check the dtype pandas assigned to each column is okay
df.dtypes

In [None]:
# Search criteria: localities, post code range and minimum lot area of interest
# (this is my search area -- any other column could be filtered the same way).
property_locations = [
    'Lawson', 'Hazelbrook', 'Woodford', 'Linden', 'Faulconbridge',
    'Springwood', 'Valley Heights', 'Warrimoo', 'Blaxland', 'Glenbrook',
]
postcode_min = 2750
postcode_max = 2800
area_min = 1000

# One boolean mask per criterion; a row is kept only if it passes all of them.
postcodes = df['Property post code']
in_locality = df['Property locality'].isin(property_locations)
big_enough = df['Area'] > area_min  # strictly larger than area_min
in_postcode_range = (postcodes >= postcode_min) & (postcodes <= postcode_max)  # inclusive at both ends
is_vacant_land = df['Primary purpose'] == 'Vacant land'

df_myarea = df[in_locality & big_enough & in_postcode_range & is_vacant_land]
print(f"{len(df_myarea)} records kept")

In [None]:
# Build a profiling report over the filtered records. `profile` is left as the
# cell's last expression so the notebook displays it as the cell output.
profile = ProfileReport(df_myarea, title="Pandas Profiling Report")
profile

In [None]:
# Remove purchase price outliers: keep only rows whose price z-score has an
# absolute value below 3 (i.e. within 3 standard deviations of the mean).
# NOTE: this cell overwrites df_myarea, so re-running it recomputes z-scores on
# the already trimmed data and may remove more rows; re-run the filtering cell
# first to start again from the full selection.
before = len(df_myarea)
price_zscores = np.abs(stats.zscore(df_myarea['Purchase price']))
# .copy() makes the result an independent frame, so later cells can add or
# overwrite columns on df_myarea without triggering SettingWithCopyWarning.
df_myarea = df_myarea[price_zscores < 3].copy()
after = len(df_myarea)

print(f"{before - after} records removed")


In [None]:
# Price histogram in $50K bins.
# Explicit bin edges are used instead of a bin count: a count of max/50000 bins
# spread over [min, max] gives bins narrower than $50K, and becomes 0 (an error)
# when the highest price is below $50K.
bin_width = 50000
prices = df_myarea['Purchase price']
# Edges sit on multiples of bin_width, from the multiple at or below the lowest
# price up to the first multiple strictly above the highest price.
first_edge = np.floor(prices.min() / bin_width) * bin_width
last_edge = (np.floor(prices.max() / bin_width) + 1) * bin_width
# The half-width added to the stop value makes arange include last_edge itself.
bin_edges = np.arange(first_edge, last_edge + bin_width / 2, bin_width)
df_myarea.hist(column='Purchase price', bins=bin_edges)

In [None]:
# Get the distinct values present in the 'Zoning' column of the filtered records
df_myarea['Zoning'].unique()

In [None]:
# Price by size: purchase price plotted against the frame's index, with each
# marker sized from the property's area.

fig, ax = plt.subplots(figsize=(25, 8))

# One marker per record; the marker size argument is the 'Area' value divided by 200.
ax.scatter(
    x=df_myarea.index.values,
    y=df_myarea['Purchase price'],
    s=df_myarea['Area'] / 200,
)

# Axis labels and title
ax.set_xlabel("Date")
ax.set_ylabel("Purchase price ($m)")
ax.set_title("Purchase price by date and size of property")

# Slant the x tick labels
plt.setp(ax.get_xticklabels(), rotation=45)

# Thin out the x labels: hide every label whose position is not a multiple of every_nth
every_nth = 6
for position, tick_label in enumerate(ax.xaxis.get_ticklabels()):
    if position % every_nth:
        tick_label.set_visible(False)

# Horizontal grid lines only
ax.grid(axis='y')

plt.show()

In [None]:
import plotly.express as px
import statistics  # not used by this cell any more; kept in case other cells rely on it

# Interactive scatter of purchase price by date: marker size follows the
# property's area and marker colour follows its zoning.

# Shrink each area's distance from the median to a tenth, so that marker sizes
# vary less than the raw areas do while keeping their ordering.
median = df_myarea['Area'].median()

# assign() returns a new frame rather than writing columns onto a filtered
# slice, which avoids SettingWithCopyWarning and keeps this cell safe to re-run.
# Missing zoning is labelled 'None' so those records get their own colour.
df_myarea = df_myarea.assign(**{
    'Area - scaled': (df_myarea['Area'] - median) / 10 + median,
    'Zoning': df_myarea['Zoning'].fillna('None'),
})

fig = px.scatter(
    df_myarea,
    x=df_myarea.index.values,
    y='Purchase price',
    size='Area - scaled',
    color='Zoning',
    title='Price and size of property by purchase date',
    width=1000,
    height=500,
    labels={'x': 'Contract date'},
    # Hover title: "<house number> <street name>, <locality>"
    hover_name=df_myarea['Property house number'] + ' ' + df_myarea['Property street name'] + ', ' + df_myarea['Property locality'],
    # Show the real area and the zoning on hover, hide the scaled helper column
    hover_data={
        'Area - scaled': False,
        'Zoning': True,
        'Area': True,
    },
)
fig.show()

In [None]:
# Price by zoning: one scatter series per zoning value, sharing a single axes.
# NOTE(review): the x values here come from the 'Settlement date' column, whereas
# the other charts plot against the frame's index -- confirm this is intended.

fig, ax = plt.subplots(figsize=(25, 8))

# Each zoning group becomes its own series so it gets a legend entry
for zoning_name, zoning_rows in df_myarea.groupby('Zoning'):
    ax.scatter(
        x="Settlement date",
        y="Purchase price",
        data=zoning_rows,
        label=zoning_name,
        s=100,
    )

ax.legend(title="Zoning")

# Axis labels and title
ax.set_xlabel("Date")
ax.set_ylabel("Purchase price ($m)")
ax.set_title("Purchase price by zoning")

# Slant the x tick labels
plt.setp(ax.get_xticklabels(), rotation=45)

plt.show()