In [33]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
import xgboost as xgb
import pickle
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error
%matplotlib inline

In [18]:
datarawdir = '../data/raw'
dataprocesseddir = '../data/processed'
train = pd.read_csv(os.path.join(dataprocesseddir, 'train_categorical.csv'))
test = pd.read_csv(os.path.join(dataprocesseddir, 'test_categorical.csv'))

In [19]:
train.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,4.110874,9.04204,2.079442,1.791759,7.6029,7.6029,5.283204,6.561031,0.0,5.01728,...,0,0,1,0,0,0,0,1,0,12.247699
1,3.044522,9.169623,1.94591,2.197225,7.589336,7.589336,0.0,6.886532,0.0,5.652489,...,0,0,1,0,0,0,0,1,0,12.109016
2,4.110874,9.328212,2.079442,1.791759,7.601902,7.602401,5.09375,6.188264,0.0,6.075346,...,0,0,1,0,0,0,0,1,0,12.317171
3,4.26268,9.164401,2.079442,1.791759,7.557995,7.586296,0.0,5.379897,0.0,6.293419,...,0,0,1,1,0,0,0,0,0,11.849405
4,4.110874,9.565284,2.197225,1.791759,7.601402,7.601402,5.860786,6.486161,0.0,6.196444,...,0,0,1,0,0,0,0,1,0,12.42922


In [20]:
test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


In [None]:
# Combine train and test data to avoid col exists only in 1 dataframe when dummy-ing
all_data = pd

In [23]:
X_train = train.drop(['SalePrice'], axis=1)
y_train = train['SalePrice']

In [38]:
gbr = GradientBoostingRegressor(random_state=0)
param_grid = {
    'n_estimators': [500],
    'max_features': [10, 15],
    'max_depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.15], 
    'subsample': [0.8]
}
model = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=1, cv=10)
model.fit(X_train, y_train)
print('Gradient boosted tree regression...')
print('Best params:')
print(model.best_params_)
print('Best CV score:')
print(-model.best_score_)

Gradient boosted tree regression...
Best params:
{'learning_rate': 0.05, 'max_depth': 6, 'max_features': 15, 'n_estimators': 500, 'subsample': 0.8}
Best CV score:
-0.9077899405171053


In [39]:
# save model
file = open('../data/processed/rgb_trained_model', 'wb')
s = pickle.dump(model, file)
file.close()

In [40]:
# load model
file = open('../data/processed/rgb_trained_model','rb')
model = pickle.load(file)
file.close()

In [46]:
# fill null in test df
test = test.fillna(test.mean())

In [74]:
# Compare test and train
#both_col = set(test.columns).intersection(set(X_train.columns))
diff_col = X_train.columns.difference(test.columns)
diff_col

Index(['Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn',
       'Electrical_Mix', 'Exterior1st_ImStucc', 'Exterior1st_Stone',
       'Exterior2nd_Other', 'Heating_Floor', 'Heating_OthW',
       'HouseStyle_2.5Fin', 'RoofMatl_ClyTile', 'RoofMatl_Membran',
       'RoofMatl_Metal', 'RoofMatl_Roll', 'Utilities_NoSeWa'],
      dtype='object')

In [77]:
train = train.drop(diff_col, axis=1)

In [78]:
X_train = train.drop(['SalePrice'], axis=1)
y_train = train['SalePrice']

In [80]:
gbr = GradientBoostingRegressor(random_state=0)
param_grid = {
    'n_estimators': [500],
    'max_features': [10, 15],
    'max_depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.15], 
    'subsample': [0.8]
}
model = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=1, cv=10)
model.fit(X_train, y_train)
print('Gradient boosted tree regression...')
print('Best params:')
print(model.best_params_)
print('Best CV score:')
print(-model.best_score_)

Gradient boosted tree regression...
Best params:
{'learning_rate': 0.05, 'max_depth': 6, 'max_features': 15, 'n_estimators': 500, 'subsample': 0.8}
Best CV score:
-0.9073088718496795


In [81]:
# save model
file = open('../data/processed/rgb_trained_model', 'wb')
s = pickle.dump(model, file)
file.close()

In [84]:
test.columns.difference(X_train.columns)

Index(['Alley_Grvl', 'Alley_Pave', 'BsmtCond_Fa', 'BsmtCond_Gd', 'BsmtCond_Po',
       'BsmtCond_TA', 'BsmtExposure_Av', 'BsmtExposure_Gd', 'BsmtExposure_Mn',
       'BsmtExposure_No', 'BsmtFinType1_ALQ', 'BsmtFinType1_BLQ',
       'BsmtFinType1_GLQ', 'BsmtFinType1_LwQ', 'BsmtFinType1_Rec',
       'BsmtFinType1_Unf', 'BsmtFinType2_ALQ', 'BsmtFinType2_BLQ',
       'BsmtFinType2_GLQ', 'BsmtFinType2_LwQ', 'BsmtFinType2_Rec',
       'BsmtFinType2_Unf', 'BsmtQual_Ex', 'BsmtQual_Fa', 'BsmtQual_Gd',
       'BsmtQual_TA', 'Fence_GdPrv', 'Fence_GdWo', 'Fence_MnPrv', 'Fence_MnWw',
       'FireplaceQu_Ex', 'FireplaceQu_Fa', 'FireplaceQu_Gd', 'FireplaceQu_Po',
       'FireplaceQu_TA', 'GarageCond_Ex', 'GarageCond_Fa', 'GarageCond_Gd',
       'GarageCond_Po', 'GarageCond_TA', 'GarageFinish_Fin',
       'GarageFinish_RFn', 'GarageFinish_Unf', 'GarageQual_Fa',
       'GarageQual_Gd', 'GarageQual_Po', 'GarageQual_TA', 'GarageType_2Types',
       'GarageType_Attchd', 'GarageType_Basment', 'GarageType_B