In [1]:
# Import dependencies

import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

In [2]:
# Read the property sales CSV into a dataframe
path = "Resources/aus-property-sales-sep2018-april2020.csv"
sales_df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows
sales_df.head(5)

Unnamed: 0,date_sold,price,suburb,city_name,state,lat,lon,bedrooms,property_type,loc_pid,lga_pid
0,2018-09-18 00:00:00,,Darling Point,Sydney,NSW,-33.869565,151.241317,3,unit,NSW1221,NSW180
1,2018-09-24 00:00:00,,Darling Point,Sydney,NSW,-33.872179,151.239726,3,unit,NSW1221,NSW180
2,2018-09-26 00:00:00,1730000.0,Darling Point,Sydney,NSW,-33.868386,151.237471,2,unit,NSW1221,NSW180
3,2018-09-26 00:00:00,1928000.0,Darling Point,Sydney,NSW,-33.875465,151.23628,3,unit,NSW1221,NSW180
4,2018-10-02 00:00:00,1475000.0,Darling Point,Sydney,NSW,-33.875734,151.233575,3,unit,NSW1221,NSW180


In [3]:
# Keep only the rows whose state is "VIC", then renumber the index from 0
vic_row_mask = sales_df["state"].eq("VIC")
victorian_sales_df = sales_df[vic_row_mask].reset_index(drop=True)
victorian_sales_df.head(5)

Unnamed: 0,date_sold,price,suburb,city_name,state,lat,lon,bedrooms,property_type,loc_pid,lga_pid
0,2018-09-04 00:00:00,337500.0,Healesville,Melbourne,VIC,-37.661765,145.510034,1,unit,VIC1150,VIC129
1,2018-09-05 00:00:00,370000.0,Healesville,Melbourne,VIC,-37.648961,145.522798,2,unit,VIC1150,VIC129
2,2018-11-02 00:00:00,320000.0,Healesville,Melbourne,VIC,-37.663509,145.509362,2,unit,VIC1150,VIC129
3,2018-11-02 00:00:00,385000.0,Healesville,Melbourne,VIC,-37.637402,145.49733,2,unit,VIC1150,VIC129
4,2018-11-03 00:00:00,440000.0,Healesville,Melbourne,VIC,-37.662562,145.509064,3,unit,VIC1150,VIC129


In [4]:
# Tally the non-null entries in each column to reveal where values are missing
victorian_sales_df.notna().sum()

date_sold        119180
price             98550
suburb           119180
city_name        119180
state            119180
lat              119163
lon              119163
bedrooms         119180
property_type    119180
loc_pid          119180
lga_pid          119180
dtype: int64

In [5]:
# Discard every row that has a missing value in at least one column.
# The frame is modified in place, so the same object carries on to the later cells.
victorian_sales_df.dropna(axis=0, how="any", inplace=True)

In [6]:
# Re-tally the non-null entries per column; every column should now report the same total
victorian_sales_df.notna().sum()

date_sold        98535
price            98535
suburb           98535
city_name        98535
state            98535
lat              98535
lon              98535
bedrooms         98535
property_type    98535
loc_pid          98535
lga_pid          98535
dtype: int64

In [7]:
# Keep just the first 10 characters of date_sold (the YYYY-MM-DD part),
# discarding the trailing time-of-day text
date_only = victorian_sales_df["date_sold"].str.slice(0, 10)
victorian_sales_df["date_sold"] = date_only
victorian_sales_df.head(5)

Unnamed: 0,date_sold,price,suburb,city_name,state,lat,lon,bedrooms,property_type,loc_pid,lga_pid
0,2018-09-04,337500.0,Healesville,Melbourne,VIC,-37.661765,145.510034,1,unit,VIC1150,VIC129
1,2018-09-05,370000.0,Healesville,Melbourne,VIC,-37.648961,145.522798,2,unit,VIC1150,VIC129
2,2018-11-02,320000.0,Healesville,Melbourne,VIC,-37.663509,145.509362,2,unit,VIC1150,VIC129
3,2018-11-02,385000.0,Healesville,Melbourne,VIC,-37.637402,145.49733,2,unit,VIC1150,VIC129
4,2018-11-03,440000.0,Healesville,Melbourne,VIC,-37.662562,145.509064,3,unit,VIC1150,VIC129


In [8]:
# Load the postcode/suburb lookup CSV and lower-case its "Suburb" header so the
# column name matches the "suburb" column of the sales dataframe
path = "output/postcode_suburb_df.csv"
postcode_df = pd.read_csv(filepath_or_buffer=path).rename(columns={"Suburb": "suburb"})

postcode_df.head(5)

Unnamed: 0,Postcode,suburb
0,3971,Alberton
1,3277,Allansford
2,3467,Avoca
3,3664,Avenel
4,3113,Warrandyte


In [9]:
# Attach a postcode to every sale by matching on suburb name.
# The lookup is first reduced to one row per suburb: a suburb listed under several
# postcodes would otherwise duplicate the matching sales rows in a left merge and
# inflate every downstream count. The first postcode listed for a suburb is kept.
suburb_postcodes = postcode_df.drop_duplicates(subset="suburb", keep="first")

# how="left" keeps every sale; sales whose suburb is absent from the lookup get a
# missing Postcode rather than being dropped.
clean_vic_sales_df = pd.merge(
    victorian_sales_df,
    suburb_postcodes,
    how="left",
    on="suburb",
    validate="many_to_one",  # fail loudly if the lookup is ever not unique per suburb
)

# A left merge against a unique key must keep exactly one row per sale
assert len(clean_vic_sales_df) == len(victorian_sales_df), "merge changed the number of sales rows"
clean_vic_sales_df.head()

Unnamed: 0,date_sold,price,suburb,city_name,state,lat,lon,bedrooms,property_type,loc_pid,lga_pid,Postcode
0,2018-09-04,337500.0,Healesville,Melbourne,VIC,-37.661765,145.510034,1,unit,VIC1150,VIC129,3777.0
1,2018-09-05,370000.0,Healesville,Melbourne,VIC,-37.648961,145.522798,2,unit,VIC1150,VIC129,3777.0
2,2018-11-02,320000.0,Healesville,Melbourne,VIC,-37.663509,145.509362,2,unit,VIC1150,VIC129,3777.0
3,2018-11-02,385000.0,Healesville,Melbourne,VIC,-37.637402,145.49733,2,unit,VIC1150,VIC129,3777.0
4,2018-11-03,440000.0,Healesville,Melbourne,VIC,-37.662562,145.509064,3,unit,VIC1150,VIC129,3777.0
