In [2]:
import pandas as pd

# Read the training CSV into a DataFrame and show which columns it provides.
iowa_file_path = 'data/train.csv'
home_data = pd.read_csv(filepath_or_buffer=iowa_file_path)
home_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

We choose `SalePrice` as the target variable, since it is the quantity we want to predict.

In [8]:
# Target: the sale price column.
y = home_data['SalePrice']

# Predictors: a hand-picked subset of the available columns.
features = [
    'LotArea', 'YearBuilt', 'YearRemodAdd',
    '1stFlrSF', '2ndFlrSF',
    'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'KitchenAbvGr',
    'GarageCars', 'GarageArea',
    'YrSold',
]
X = home_data[features]
X.head()

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd,KitchenAbvGr,GarageCars,GarageArea,YrSold
0,8450,2003,2003,856,854,2,3,8,1,2,548,2008
1,9600,1976,1976,1262,0,2,3,6,1,2,460,2007
2,11250,2001,2002,920,866,2,3,6,1,2,608,2008
3,9550,1915,1970,961,756,1,3,7,1,3,642,2006
4,14260,2000,2000,1145,1053,2,4,9,1,3,836,2008


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Hold out part of the data for validation; the fixed seed keeps the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Baseline: a decision tree with default settings (no limit on the number of leaves).
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(train_X, train_y)

# Score on the held-out rows. scikit-learn's metric signature is (y_true, y_pred);
# MAE is symmetric, so the value is the same either way, but the conventional
# order keeps this call consistent with other metrics.
val_pred = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_pred)
print("Validation MAE: {}".format(val_mae))

Validation MAE: 30521.213698630138


In [28]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y, random_state=0):
    """Return the validation MAE of a decision tree capped at ``max_leaf_nodes`` leaves.

    A ``DecisionTreeRegressor`` is fitted on ``train_X`` / ``train_y`` and then
    scored on ``val_X`` / ``val_y`` with ``mean_absolute_error``.

    Parameters
    ----------
    max_leaf_nodes : passed straight to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
    train_X, train_y : data the tree is fitted on.
    val_X, val_y : held-out data the tree is scored on.
    random_state : seed forwarded to the tree; defaults to 0, the value this
        function previously hard-coded.

    Returns
    -------
    The mean absolute error between ``val_y`` and the tree's predictions for ``val_X``.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=random_state)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    # scikit-learn's metric signature is (y_true, y_pred). MAE is symmetric, so the
    # value is unchanged, but the conventional order avoids surprises if the metric
    # is ever swapped for an asymmetric one.
    return mean_absolute_error(val_y, predictions)

In [29]:
# Leaf limits to compare.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Validation MAE for every candidate limit; the one with the smallest error wins.
scores = {
    n_leaves: get_mae(n_leaves, train_X, val_X, train_y, val_y)
    for n_leaves in candidate_max_leaf_nodes
}
best_tree_size = min(scores.items(), key=lambda item: item[1])[0]
print(best_tree_size)

100


In [30]:
# Refit on every row, keeping the leaf limit that scored best on the validation split.
final_decision_tree_model = DecisionTreeRegressor(
    max_leaf_nodes=best_tree_size,
    random_state=0,
)
final_decision_tree_model.fit(X, y)

In [41]:
# NOTE: the model was fitted on these same rows (X, y), so the MAE below is an
# in-sample (training) error and is not comparable to a validation MAE.
preds_val = final_decision_tree_model.predict(X)
# scikit-learn's metric signature is (y_true, y_pred).
final_decision_mae = mean_absolute_error(y, preds_val)

print("Final Decision Tree MAE: {}".format(final_decision_mae))

# Compare the first few predictions with the actual prices.
# Slicing (rather than indexing 0..4) also works when there are fewer than five rows.
df = pd.DataFrame(preds_val[:5], columns=['SalePrice'])
print(df)
print()
print("Actual price:")
y.head()

Final Decision Tree MAE: 16682.93630822503
       SalePrice
0  214147.629032
1  165310.606061
2  214147.629032
3  130115.384615
4  301457.888889

Actual price:


0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

This was a basic decision tree based on a few selected features. Now we try a Random Forest Regressor.

In [32]:
from sklearn.ensemble import RandomForestRegressor

# Random forest trained on the train/validation split created earlier.
rf_model = RandomForestRegressor(random_state=0)
rf_val_pred = rf_model.fit(train_X, train_y).predict(val_X)

# Error on the held-out validation rows.
rf_val_mae = mean_absolute_error(y_true=val_y, y_pred=rf_val_pred)

print("Validation MAE for Random Forest: {}".format(rf_val_mae))

Validation MAE for Random Forest: 21720.66873972603


In [33]:
# Train a fresh random forest on all rows rather than only the training split.
rf_model_on_full_data = RandomForestRegressor(random_state=0)
rf_model_on_full_data.fit(X=X, y=y)

In [40]:
# NOTE: the forest was fitted on these same rows (X, y), so the MAE below is an
# in-sample (training) error and is not comparable to a validation MAE.
final_pred_val = rf_model_on_full_data.predict(X)
# scikit-learn's metric signature is (y_true, y_pred).
final_mae = mean_absolute_error(y, final_pred_val)

print("Final MAE of random forest: {}".format(final_mae))

# Compare the first few predictions with the actual prices.
# Slicing (rather than indexing 0..4) also works when there are fewer than five rows.
df1 = pd.DataFrame(final_pred_val[:5], columns=['SalePrice'])
print(df1)
print()
print("Actual data:")
y.head()

Final MAE of random forest: 8103.828971211133
   SalePrice
0  210355.00
1  173475.50
2  221016.28
3  136007.00
4  266055.07

Actual data:


0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64