## House Price Prediction ML modeling notebook

### Importing XGBoost and other important libraries

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [2]:
train_df = pd.read_csv("train_wide.csv")
train_df.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,2003,2003,196.0,706.0,0.0,150.0,856.0,856,...,0,0,0,1,0,0,0,0,1,0
1,80.0,9600,1976,1976,0.0,978.0,0.0,284.0,1262.0,1262,...,0,0,0,1,0,0,0,0,1,0
2,68.0,11250,2001,2002,162.0,486.0,0.0,434.0,920.0,920,...,0,0,0,1,0,0,0,0,1,0
3,60.0,9550,1915,1970,0.0,216.0,0.0,540.0,756.0,961,...,0,0,0,1,1,0,0,0,0,0
4,84.0,14260,2000,2000,350.0,655.0,0.0,490.0,1145.0,1145,...,0,0,0,1,0,0,0,0,1,0


In [3]:
test_df = pd.read_csv("test_wide.csv")
test_df.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,80.0,11622,1961,1961,0.0,468.0,144.0,270.0,882.0,896,...,0,0,0,1,0,0,0,0,1,0
1,81.0,14267,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,...,0,0,0,1,0,0,0,0,1,0
2,74.0,13830,1997,1998,0.0,791.0,0.0,137.0,928.0,928,...,0,0,0,1,0,0,0,0,1,0
3,78.0,9978,1998,1998,20.0,602.0,0.0,324.0,926.0,926,...,0,0,0,1,0,0,0,0,1,0
4,43.0,5005,1992,1992,0.0,263.0,0.0,1017.0,1280.0,1280,...,0,0,0,1,0,0,0,0,1,0


#### Dropping unwanted columns from the training set and extracting the SalePrice

In [4]:
y = train_df['SalePrice']
X = train_df.drop(['SalePrice'], axis=1, inplace=False)
X.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,2003,2003,196.0,706.0,0.0,150.0,856.0,856,...,0,0,0,1,0,0,0,0,1,0
1,80.0,9600,1976,1976,0.0,978.0,0.0,284.0,1262.0,1262,...,0,0,0,1,0,0,0,0,1,0
2,68.0,11250,2001,2002,162.0,486.0,0.0,434.0,920.0,920,...,0,0,0,1,0,0,0,0,1,0
3,60.0,9550,1915,1970,0.0,216.0,0.0,540.0,756.0,961,...,0,0,0,1,1,0,0,0,0,0
4,84.0,14260,2000,2000,350.0,655.0,0.0,490.0,1145.0,1145,...,0,0,0,1,0,0,0,0,1,0


#### Hyperparameter tuning using RandomSearchCV

In [None]:
#n_estimators = [100, 500, 900, 1100, 1500]
learning_rate = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30]
max_depth = [3, 4, 5, 6, 8, 10, 12, 15]
min_child_weight = [1,3,5,7]
gamma = [0.0, 0.1, 0.2, 0.3, 0.4]
colsample_bytree = [0.3, 0.4, 0.5, 0.7]
boost = ['gbtree', 'gblinear']


#'n_estimators': n_estimators,
random_grid = {
    'learning_rate': learning_rate,
    'max_depth': max_depth,
    'min_child_weight': min_child_weight,
    'gamma': gamma,
    'colsample_bytree': colsample_bytree,
    'boost': boost
}
xgbr = xgb.XGBRegressor()
xgbr_random = RandomizedSearchCV(estimator = xgbr, param_distributions = random_grid, n_iter = 50, cv = 4, verbose=2, random_state=42, n_jobs = -1)
xgbr_random.fit(X,y)

Fitting 4 folds for each of 50 candidates, totalling 200 fits
[CV] min_child_weight=5, max_depth=4, learning_rate=0.3, gamma=0.2, colsample_bytree=0.7, boost=gblinear 


[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


Parameters: { boost } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [5]:
learning_rate = [0.20]
max_depth = [12]
min_child_weight = [5]
gamma = [0.0]
colsample_bytree = [0.4]
boost = ['gbtree']

random_grid = {
    'learning_rate': learning_rate,
    'max_depth': max_depth,
    'min_child_weight': min_child_weight,
    'gamma': gamma,
    'colsample_bytree': colsample_bytree,
    'boost': boost
}
xgbr = xgb.XGBRegressor()
xgbr_random = RandomizedSearchCV(estimator = xgbr, param_distributions = random_grid, n_iter = 50, cv = 4, verbose=2, random_state=42, n_jobs = -1)
xgbr_random.fit(X,y)

[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV] min_child_weight=5, max_depth=12, learning_rate=0.2, gamma=0.0, colsample_bytree=0.4, boost=gbtree 
Parameters: { boost } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  min_child_weight=5, max_depth=12, learning_rate=0.2, gamma=0.0, colsample_bytree=0.4, boost=gbtree, total=13.9min
[CV] min_child_weight=5, max_depth=12, learning_rate=0.2, gamma=0.0, colsample_bytree=0.4, boost=gbtree 


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 13.9min remaining:    0.0s


Parameters: { boost } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  min_child_weight=5, max_depth=12, learning_rate=0.2, gamma=0.0, colsample_bytree=0.4, boost=gbtree, total=14.3min
[CV] min_child_weight=5, max_depth=12, learning_rate=0.2, gamma=0.0, colsample_bytree=0.4, boost=gbtree 
Parameters: { boost } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  min_child_weight=5, max_depth=12, learning_rate=0.2, gamma=0.0, colsample_bytree=0.4, boost=gbtree, total=14.4min
[CV] min_child_weight=5, max_depth=12, learning_rate=0.2, gamma=0.0, colsample_bytree=0.4, bo

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 55.1min finished


Parameters: { boost } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=4, error_score='raise-deprecating',
          estimator=XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
       colsample_bynode=None, colsample_bytree=None, gamma=None,
       gpu_id=None, importance_type='gain', interaction_constraints=None,
       learning_rate=None, max_delta_step=None, max_depth=None,
       min_child_we..._pos_weight=None, subsample=None,
       tree_method=None, validate_parameters=None, verbosity=None),
          fit_params=None, iid='warn', n_iter=50, n_jobs=-1,
          param_distributions={'learning_rate': [0.2], 'max_depth': [12], 'min_child_weight': [5], 'gamma': [0.0], 'colsample_bytree': [0.4], 'boost': ['gbtree']},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

#### After doing the hyper-parameter tuning for the model here are the best parameters for the model.

In [6]:
xgbr_random.best_params_
## Output is this after random search
#{'n_estimators': 400,
# 'min_samples_split': 2,
# 'min_samples_leaf': 1,
# 'max_features': 'sqrt',
# 'max_depth': None,
# 'bootstrap': False}

{'min_child_weight': 5,
 'max_depth': 12,
 'learning_rate': 0.2,
 'gamma': 0.0,
 'colsample_bytree': 0.4,
 'boost': 'gbtree'}

In [7]:
X_test = test_df
X_test.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,80.0,11622,1961,1961,0.0,468.0,144.0,270.0,882.0,896,...,0,0,0,1,0,0,0,0,1,0
1,81.0,14267,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,...,0,0,0,1,0,0,0,0,1,0
2,74.0,13830,1997,1998,0.0,791.0,0.0,137.0,928.0,928,...,0,0,0,1,0,0,0,0,1,0
3,78.0,9978,1998,1998,20.0,602.0,0.0,324.0,926.0,926,...,0,0,0,1,0,0,0,0,1,0
4,43.0,5005,1992,1992,0.0,263.0,0.0,1017.0,1280.0,1280,...,0,0,0,1,0,0,0,0,1,0


In [9]:
Y_pred = xgbr_random.predict(X_test)
SalePrice = pd.DataFrame(Y_pred, columns=['SalePrice'])
SalePrice.head()

Unnamed: 0,SalePrice
0,119852.445312
1,168187.90625
2,183710.171875
3,179401.734375
4,179628.921875


In [10]:
xgbr_random.score(X,y)

0.9996690925142189

In [None]:
SalePrice.to_csv("Price_prediction.csv")