In [1]:
import sklearn as sk
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv("Sales_Data.csv")
df.head()

Unnamed: 0,Disclaimer,Building Name,Street Display,Alternate Street Display,Other,Unit,Number,Street Name,Locality,Postcode,...,Valuation Date,Valuation Amount,LGA,Dealing Number,Government Number,Parent Government Number,PDS ID,Sale ID,Load Date,Property ID
0,,,7 KYBEAN ST,,,,7.0,KYBEAN ST,RIVERHILLS,4074.0,...,01/10/2018,"$305,000",BCC-SHERWOOD,720033091.0,QLD9098611,,335052.0,70937192.0,18/05/2020,
1,,,2/15 BOURRELET ST,,,,15.0,BOURRELET ST,TARRAGINDI,4121.0,...,01/10/2018,"$720,000",BCC-STEPHENS,719983327.0,QLD9065046,,88986.0,70008397.0,03/04/2020,
2,,,479 SIMPSONS RD,,,,479.0,SIMPSONS RD,BARDON,4065.0,...,01/10/2018,"$910,000",BCC-ENOGGERA,720003394.0,QLD1253761,,296544.0,70171661.0,04/05/2020,
3,,,2 JENOLAN LA,,,,2.0,JENOLAN LA,FITZGIBBON,4018.0,...,01/10/2018,"$111,000",BCC-KEDRON,720068666.0,QLD41193919,,22462240.0,71047905.0,03/06/2020,
4,,,5 BENAROON ST,,,,5.0,BENAROON ST,BRACKEN RIDGE,4017.0,...,01/10/2018,"$290,000",BCC-KEDRON,720005923.0,QLD1176447,,180794.0,70170915.0,04/05/2020,


In [3]:
#Check the data available
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4194 entries, 0 to 4193
Data columns (total 61 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Disclaimer                1 non-null      object 
 1   Building Name             484 non-null    object 
 2   Street Display            4191 non-null   object 
 3   Alternate Street Display  36 non-null     object 
 4   Other                     0 non-null      float64
 5   Unit                      1121 non-null   float64
 6   Number                    4191 non-null   float64
 7   Street Name               4191 non-null   object 
 8   Locality                  4191 non-null   object 
 9   Postcode                  4191 non-null   float64
 10  Alt. Street               36 non-null     object 
 11  Alt. Locality             30 non-null     object 
 12  Legal Description         4191 non-null   object 
 13  Volume/Folio              0 non-null      float64
 14  Vendor N

In [4]:
df_subset = df[['Sale Price', 'Locality', 'Days To Sell', 'Area', 'Bedrooms', 'Bathrooms', 'Car Parks', 'Property Type', 'Valuation Amount']]
df_subset = df_subset.dropna() #Let's get rid of rows with empty cells, because they will trip up a lot of the analyses
df_subset

Unnamed: 0,Sale Price,Locality,Days To Sell,Area,Bedrooms,Bathrooms,Car Parks,Property Type,Valuation Amount
0,"$435,000",RIVERHILLS,-58,615,3.0,1.0,1.0,House,"$305,000"
1,"$610,000",TARRAGINDI,25,615,3.0,2.0,2.0,House,"$720,000"
2,"$1,240,000",BARDON,25,1897,4.0,2.0,1.0,House,"$910,000"
3,"$352,000",FITZGIBBON,11,88,2.0,2.0,1.0,House,"$111,000"
4,"$390,000",BRACKEN RIDGE,23,579,3.0,1.0,1.0,House,"$290,000"
...,...,...,...,...,...,...,...,...,...
4183,"$440,000",UPPER MOUNT GRAVATT,97,97,2.0,2.0,1.0,Unit,$0
4185,"$360,000",NEWMARKET,62,134,2.0,2.0,1.0,Unit,$0
4186,"$385,000",CHERMSIDE,43,116,2.0,2.0,1.0,Unit,$0
4187,"$520,000",COORPAROO,82,117,3.0,2.0,1.0,Unit,$0


In [5]:
df['Property Type'].value_counts() #Check to see cleanliness of field before dummy var.

House          2124
Unit           1782
Vacant Land     237
Industrial       23
Commercial       17
Other             8
Name: Property Type, dtype: int64

In [6]:
#Creating dummy variables of qualitative variables
dummies1 = pd.get_dummies(df['Property Type'])
dummies1.head()

Unnamed: 0,Commercial,House,Industrial,Other,Unit,Vacant Land
0,0,1,0,0,0,0
1,0,1,0,0,0,0
2,0,1,0,0,0,0
3,0,1,0,0,0,0
4,0,1,0,0,0,0


In [7]:
dummies2 = pd.get_dummies(df['Locality'])
dummies2.head()

Unnamed: 0,ACACIA RIDGE,ALBION,ALDERLEY,ALGESTER,ANNERLEY,ANSTEAD,ARCHERFIELD,ASCOT,ASHGROVE,ASPLEY,...,WILSTON,WINDSOR,WISHART,WOOLLOONGABBA,WOOLOOWIN,WYNNUM,WYNNUM WEST,YEERONGPILLY,YERONGA,ZILLMERE
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#Adding dummies to the df by concatenating variables to dataframe
df_dummy = pd.concat([df_subset, dummies1, dummies2], axis=1)
df_dummy.head()

Unnamed: 0,Sale Price,Locality,Days To Sell,Area,Bedrooms,Bathrooms,Car Parks,Property Type,Valuation Amount,Commercial,...,WILSTON,WINDSOR,WISHART,WOOLLOONGABBA,WOOLOOWIN,WYNNUM,WYNNUM WEST,YEERONGPILLY,YERONGA,ZILLMERE
0,"$435,000",RIVERHILLS,-58,615,3.0,1.0,1.0,House,"$305,000",0,...,0,0,0,0,0,0,0,0,0,0
1,"$610,000",TARRAGINDI,25,615,3.0,2.0,2.0,House,"$720,000",0,...,0,0,0,0,0,0,0,0,0,0
2,"$1,240,000",BARDON,25,1897,4.0,2.0,1.0,House,"$910,000",0,...,0,0,0,0,0,0,0,0,0,0
3,"$352,000",FITZGIBBON,11,88,2.0,2.0,1.0,House,"$111,000",0,...,0,0,0,0,0,0,0,0,0,0
4,"$390,000",BRACKEN RIDGE,23,579,3.0,1.0,1.0,House,"$290,000",0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
X = df_dummy.loc[:,'Area':'ZILLMERE'] 
y = df_dummy['Sale Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [11]:
rf = RandomForestRegressor(random_state=1, n_estimators=5)
rf = rf.fit(X_train, y_train)

ValueError: could not convert string to float: '1,005'