Cleaning Data for Analysis 

In [18]:
#import statements
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor


Because not all new construction homes were sold, and among those that were sold, not all reported the sale price. Those cases simply list sale price (SLPR) as 0. Because for our analysis these examples are not of interest to us they will be removed. Additionally there are two variables that indicate the sale price of the home: SLPR and FSLPR. FSLPR (Final Sale Price) is only filled in if the SLPR was changed before the final closing date. Because we would like the most accurate information FSLPR and SLPR will be combined into FSLPR filling in the missing information of FSLPR with SLPR.

In [3]:
## Reading and cleaning data
raw_data = pd.read_csv('soc19.csv')

## Remove rows where sale price was not reported or house was not sold 
raw_data = raw_data[(raw_data['SLPR'] != 0) | (raw_data['FSLPR'] != 0)]

## If FSLPR is zero, replace this value with SLPR
raw_data.FSLPR[raw_data.FSLPR == 0] = raw_data.SLPR
 
## clean data export
cleaned_data = raw_data 
cleaned_data.to_csv('cleaned_data.csv', index=False)

#create train and test set
x = cleaned_data.loc[:, cleaned_data.columns != 'FSLPR']
y = cleaned_data.loc[:, cleaned_data.columns == 'FSLPR']
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 1)

Model Implementation

In [4]:
#train random forest model
rForest = RandomForestRegressor()
rForest.fit(xTrain, yTrain.values.ravel())

RandomForestRegressor()

In [5]:
#evaluate model in another cell so don't need to refit model every time
yhat = rForest.predict(xTest)
score = rForest.score(xTest, yhat)
print('Coefficient of determination on test set:', score) 

Coefficient of determination on test set: 1.0


In [6]:
#train decision tree
tree = DecisionTreeRegressor()
tree.fit(xTrain, yTrain)

DecisionTreeRegressor()

In [7]:
#assess tree
yhat = tree.predict(xTest)
score = tree.score(xTest, yTest)
print('coefficient of determination on test set:', np.round(score,2))

coefficient of determination on test set: 0.98


Gradient Boosting Model

In [21]:
#train gradiant boosting model
GBR = GradientBoostingRegressor()
GBR.fit(xTrain, yTrain.values.ravel())


GradientBoostingRegressor()

In [22]:
#assess gradient boosting model
yhat = GBR.predict(xTest)
score = GBR.score(xTest, yTest)
print('coefficient of determination on test set:', np.round(score,2))

coefficient of determination on test set: 0.99
