# Housing Price: Advanced Models

<a id="content1"></a>
## 1 ) Importing the Modules and Loading the Dataset

In [None]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.legend_handler import HandlerBase
import seaborn as sns
import missingno as msno
import statsmodels.api as sm
# configure
# render matplotlib figures inline, directly below the corresponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#import the necessary modelling algos.

#classifiaction.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPRegressor

#regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV,ElasticNet
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

#model selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

#evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification

from scipy import stats
from scipy.stats import norm, skew   # specifically for staistics

import xgboost as xgb
from xgboost import plot_importance

In [None]:
# Read the raw training and test splits from the shared data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

<a id="content2"></a>
## 2 ) Data Preparation

In [None]:
# Work on an independent (deep) copy so the raw `train` frame is left untouched.
df = train.copy(deep=True)
# Preview the first ten rows.
df.iloc[:10]

In [None]:
# (rows, columns) of the working copy of the training data.
df.shape

In [None]:
# Remove the 'Id' column, in place, from the training copy and from the test frame.
for frame in (df, test):
    frame.drop(columns=['Id'], inplace=True)

#### MERGING THE TRAIN & TEST SETS

In [None]:
# Stack the training rows on top of the test rows so that cleaning is applied to both at once.
# `train` still carries 'Id' here (it was only dropped from `df` and `test`).
all_data = pd.concat((train, test), axis=0)

In [None]:
# Report the size of the combined frame, then renumber its rows 0..n-1
# (after concatenation the row labels of the two parts repeat).
combined_shape = all_data.shape
print(combined_shape)
all_data = all_data.reset_index(drop=True)

In [None]:
# After the index reset, the rows that came from the test set start right after
# the last training row, so derive that boundary from `train` instead of
# hard-coding a row label.
first_test_row = len(train)
print(all_data.loc[first_test_row:, 'SalePrice'])  # expected to be NaN for every test row
# The target is re-attached to the training part later, so remove it from the feature frame.
all_data.drop(['SalePrice'], axis=1, inplace=True)


Most of them are in the 'average', 'above average' or 'good' categories.

<a id="content3"></a>
## 3 ) Missing Values Treatment

In [None]:
# Delete the features with a very high number of missing values, together with
# the 'Id' column that `all_data` inherited from `train`.
sparse_or_id_columns = ['PoolQC', 'Alley', 'Fence', 'Id', 'MiscFeature']
all_data.drop(columns=sparse_or_id_columns, inplace=True)


In [None]:
# Apply the same removal, in place, to the stand-alone test frame and the training copy.
for frame in (test, df):
    frame.drop(columns=['PoolQC', 'Alley', 'Fence', 'MiscFeature'], inplace=True)

In [None]:
# FireplaceQu
# Potentially useful, but close to half of its values are missing and imputing
# that many makes little sense, so the column is removed from all three frames.
for frame in (all_data, test, df):
    frame.drop(columns=['FireplaceQu'], inplace=True)


#### The analysis above shows that the lot frontage is related to the sale price, both in the scatter plot and in the correlation value. Therefore, instead of deleting the feature, I will fill its missing values with the mean.

In [None]:
# Impute missing lot frontage with the column mean (Series.mean() skips NaN).
# The result is assigned back to the frame: calling fillna(..., inplace=True) on
# `all_data[col]` only modifies a temporary Series when pandas Copy-on-Write is
# active, leaving the frame unchanged.
all_data['LotFrontage'] = all_data['LotFrontage'].fillna(all_data['LotFrontage'].mean())
# Number of missing values left in the column.
all_data['LotFrontage'].isna().sum()

In [None]:
# Garage-related features.
# Quality, condition, finish and type of the garage look relevant to a buyer, so
# the columns are kept. A missing value here is taken to mean "no garage":
# numeric columns get 0 and categorical columns get a separate 'None' category.
garage_fill_values = {
    'GarageYrBlt': 0,
    'GarageArea': 0,
    'GarageCars': 0,
    'GarageQual': 'None',
    'GarageFinish': 'None',
    'GarageCond': 'None',
    'GarageType': 'None',
}

for col, fill_value in garage_fill_values.items():
    # Assign back instead of fillna(inplace=True) on a column selection, which
    # does not reach the frame when pandas Copy-on-Write is active.
    all_data[col] = all_data[col].fillna(fill_value)
    print(all_data[col].isnull().sum())  # remaining NaNs in this column

In [None]:
# Basement-related features.
# Missing numeric values most likely mean "no basement", so they become 0.
# Results are assigned back: fillna(inplace=True) on a column selection does not
# reach the frame when pandas Copy-on-Write is active.
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)

# Categorical features get a separate 'None' class, as for the garage columns.
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')

print(all_data['TotalBsmtSF'].isnull().sum())

In [None]:
# Masonry veneer: missing area becomes 0 and missing type becomes 'None'.
# Results are assigned back: fillna(inplace=True) on a column selection does not
# reach the frame when pandas Copy-on-Write is active.
all_data['MasVnrArea'] = all_data['MasVnrArea'].fillna(0)
print(all_data['MasVnrArea'].isnull().sum())

all_data['MasVnrType'] = all_data['MasVnrType'].fillna('None')
print(all_data['MasVnrType'].isnull().sum())

In [None]:
# MSZoning.
# Here NaN does not mean "none", so fill with the most common value (the mode).
# The result is assigned back: fillna(inplace=True) on a column selection does
# not reach the frame when pandas Copy-on-Write is active.
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
print(all_data['MSZoning'].isnull().sum())

In [None]:
# Functional: fill with the mode.
# The result is assigned back: fillna(inplace=True) on a column selection does
# not reach the frame when pandas Copy-on-Write is active.
all_data['Functional'] = all_data['Functional'].fillna(all_data['Functional'].mode()[0])
print(all_data['Functional'].isnull().sum())

In [None]:
# The other remaining columns are all categorical (kitchen quality etc.), so
# each one is filled with its mode.
# Results are assigned back: fillna(inplace=True) on a column selection does not
# reach the frame when pandas Copy-on-Write is active.
for col in ['SaleType', 'KitchenQual', 'Exterior2nd', 'Exterior1st', 'Electrical']:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
    print(all_data[col].isnull().sum())  # remaining NaNs in this column

#### Lastly checking if any null value still remains.

In [None]:
# Count the NaNs left in every column and keep only the columns that still have some,
# largest first.
missing_counts = all_data.isnull().sum()
nan_all_data = missing_counts[missing_counts != 0].sort_values(ascending=False)

# Summary table: the absolute count plus the share of rows it represents, so the
# 'Missing Ratio' column really is a ratio (between 0 and 1).
miss_df = pd.DataFrame({
    'Missing Count': nan_all_data,
    'Missing Ratio': nan_all_data / len(all_data),
})
miss_df

#### Finally, no null values remain now ;)

In [None]:
# Features excluded from modelling. They are removed with a single drop() call
# instead of one in-place call per column.
columns_to_drop = [
    'Utilities', '3SsnPorch', 'BedroomAbvGr', 'BldgType',
    'BsmtCond', 'BsmtExposure', 'BsmtFinSF2', 'BsmtFinType1', 'BsmtFinType2', 'BsmtHalfBath',
    'Condition1', 'Condition2', 'Electrical', 'EnclosedPorch',
    'ExterCond', 'Exterior1st', 'Exterior2nd', 'ExterQual',
    'Foundation', 'FullBath', 'Functional',
    'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
    'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual',
    'LandContour', 'LandSlope', 'LotConfig', 'LotShape', 'LowQualFinSF',
    'MasVnrArea', 'MasVnrType', 'MiscVal', 'MSSubClass',
    'PavedDrive', 'PoolArea', 'RoofMatl', 'RoofStyle',
    'SaleType', 'Street', 'TotRmsAbvGrd', 'YrSold',
]
all_data.drop(columns=columns_to_drop, inplace=True)

<a id="content5"></a>
## 5 ) Prepare the Data

In [None]:
# Integer-encode every column stored with the 'object' dtype, using a fresh
# LabelEncoder per column.
object_columns = [col for col in all_data.columns if all_data[col].dtype == 'object']
for col in object_columns:
    le = LabelEncoder()
    encoded = le.fit_transform(all_data[col])
    all_data[col] = encoded

## 6 ) Regression Models

In [None]:
# Split the cleaned frame back into its two parts.
# `all_data` was built as [training rows, test rows] and re-indexed 0..n-1, so
# the first len(df) rows are the training rows and everything after is test data.
# Positional slicing (.iloc, end-exclusive) is used on purpose: label slicing
# with .loc includes the end label, which would move test rows into `train`.
n_train = df.shape[0]
train = all_data.iloc[:n_train].copy()   # .copy(): independent frames, safe to add columns to
test = all_data.iloc[n_train:].copy()

In [None]:
# Re-attach the target. The assignment aligns on the row index, so any row of
# `train` whose label does not exist in `df` receives NaN.
train = train.copy()   # make sure we add the column to an independent frame, not a slice
train['SalePrice'] = df['SalePrice']
# Rows left without a price get the mean price. The result is assigned back:
# fillna(inplace=True) on a column selection does not reach the frame when
# pandas Copy-on-Write is active.
train['SalePrice'] = train['SalePrice'].fillna(train['SalePrice'].mean())
print(train['SalePrice'].isnull().sum())

In [None]:
# Sizes of the two parts after the split, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Shared accumulators: every model cell below appends one R2, one RMSE and one
# model label per evaluation run.
resultR2, resultRMSE, resultModel = [], [], []

#### LINEAR REGRESSION

In [None]:
# Baseline: ordinary least squares, scored on 50 different random 80/20 splits.
features = train.drop(['SalePrice'], axis=1)   # predictors; identical for every run
target = train['SalePrice']
first_run = len(resultR2)   # where this model's scores start in the shared accumulators

for i in range(1, 51):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=i)
    reg_lin = LinearRegression()
    reg_lin.fit(x_train, y_train)
    pred = reg_lin.predict(x_test)
    resultR2.append(r2_score(y_test, pred))
    resultRMSE.append(np.sqrt(mean_squared_error(y_test, pred)))
    resultModel.append("RegLin")

# Average only the scores added by this cell, so the figures stay correct even
# when the accumulators already hold results from other runs.
print("R2 Medio: ", np.mean(resultR2[first_run:]))
print("RMSE Medio: ", np.mean(resultRMSE[first_run:]))

#### LASSO (and tuning with GridSearchCV)

In [None]:
# Lasso: on each of 50 random 80/20 splits, tune alpha with a 10-fold grid search
# on the training part and score the refitted best model on the held-out part.
features = train.drop(['SalePrice'], axis=1)   # predictors; identical for every run
target = train['SalePrice']
params_dict = {'alpha': [0.05, 0.1, 0.5, 1], 'random_state': [7]}
first_run = len(resultR2)   # where this model's scores start in the shared accumulators

for i in range(1, 51):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=i)
    reg_lasso_CV = GridSearchCV(estimator=Lasso(), param_grid=params_dict, scoring='neg_mean_squared_error', cv=10)
    reg_lasso_CV.fit(x_train, y_train)
    pred = reg_lasso_CV.predict(x_test)
    resultR2.append(r2_score(y_test, pred))
    resultRMSE.append(np.sqrt(mean_squared_error(y_test, pred)))
    resultModel.append("LASSO")

# Average all runs added by this cell. A fixed slice such as [50:99] would leave
# out the last run and would point at the wrong entries after any re-run.
print("R2 Medio: ", np.mean(resultR2[first_run:]))
print("RMSE Medio: ", np.mean(resultRMSE[first_run:]))

In [None]:
# Best Lasso found by the grid search. `reg_lasso_CV` is rebound on every loop
# iteration, so this is the search fitted on the last split only.
reg_lasso_CV.best_estimator_

#### RIDGE (and tuning with GridSearchCV)

In [None]:
# Ridge: on each of 50 random 80/20 splits, tune alpha with a 10-fold grid search
# on the training part and score the refitted best model on the held-out part.
features = train.drop(['SalePrice'], axis=1)   # predictors; identical for every run
target = train['SalePrice']
params_dict = {'alpha': [0.05, 0.1, 0.5, 1], 'random_state': [7]}
first_run = len(resultR2)   # where this model's scores start in the shared accumulators

for i in range(1, 51):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=i)
    reg_ridge_CV = GridSearchCV(estimator=Ridge(), param_grid=params_dict, scoring='neg_mean_squared_error', cv=10)
    reg_ridge_CV.fit(x_train, y_train)
    pred = reg_ridge_CV.predict(x_test)
    resultR2.append(r2_score(y_test, pred))
    resultRMSE.append(np.sqrt(mean_squared_error(y_test, pred)))
    resultModel.append("RIDGE")

# Average all runs added by this cell. A fixed slice such as [100:149] would
# leave out the last run and would point at the wrong entries after any re-run.
print("R2 Medio: ", np.mean(resultR2[first_run:]))
print("RMSE Medio: ", np.mean(resultRMSE[first_run:]))

In [None]:
# Best Ridge found by the grid search. `reg_ridge_CV` is rebound on every loop
# iteration, so this is the search fitted on the last split only.
reg_ridge_CV.best_estimator_

#### GRADIENT BOOSTING

In [None]:
# Gradient boosting with a fixed configuration, scored on 50 random 80/20 splits.
features = train.drop(['SalePrice'], axis=1)   # predictors; identical for every run
target = train['SalePrice']
first_run = len(resultR2)   # where this model's scores start in the shared accumulators

for i in range(1, 51):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=i)
    reg_gb = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05, max_depth=3,
                                       min_samples_split=10, max_features='sqrt', subsample=0.75,
                                       random_state=7, loss='huber')
    reg_gb.fit(x_train, y_train)
    pred = reg_gb.predict(x_test)
    resultR2.append(r2_score(y_test, pred))
    resultRMSE.append(np.sqrt(mean_squared_error(y_test, pred)))
    resultModel.append("GradBoost")

# Average all runs added by this cell. A fixed slice such as [150:199] would
# leave out the last run and would point at the wrong entries after any re-run.
print("R2 Medio: ", np.mean(resultR2[first_run:]))
print("RMSE Medio: ", np.mean(resultRMSE[first_run:]))

#### XGBoost

In [None]:
# XGBoost with a fixed configuration, scored on 50 random 80/20 splits.
features = train.drop(['SalePrice'], axis=1)   # predictors; identical for every run
target = train['SalePrice']
first_run = len(resultR2)   # where this model's scores start in the shared accumulators

for i in range(1, 51):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=i)
    model_xgb = xgb.XGBRegressor(colsample_bytree=0.5,
                                 learning_rate=0.05, max_depth=3,
                                 min_child_weight=2, n_estimators=1000,
                                 subsample=0.6,
                                 random_state=7, nthread=-1)
    model_xgb.fit(x_train, y_train)
    pred = model_xgb.predict(x_test)
    resultR2.append(r2_score(y_test, pred))
    resultRMSE.append(np.sqrt(mean_squared_error(y_test, pred)))
    resultModel.append("XGBoost")

# Average all runs added by this cell. A fixed slice such as [200:249] would
# leave out the last run and would point at the wrong entries after any re-run.
print("R2 Medio: ", np.mean(resultR2[first_run:]))
print("RMSE Medio: ", np.mean(resultRMSE[first_run:]))

### Neural Net (Multi Layer Perceptron for Regression)

In [None]:
# Multi-layer perceptron (one hidden layer of 40 units, L-BFGS solver), scored on
# 50 random 80/20 splits.
features = train.drop(['SalePrice'], axis=1)   # predictors; identical for every run
target = train['SalePrice']
first_run = len(resultR2)   # where this model's scores start in the shared accumulators

for i in range(1, 51):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=i)
    model_mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=40, max_iter=150,
                             learning_rate_init=0.001, random_state=7)
    model_mlp.fit(x_train, y_train)
    pred = model_mlp.predict(x_test)
    resultR2.append(r2_score(y_test, pred))
    resultRMSE.append(np.sqrt(mean_squared_error(y_test, pred)))
    resultModel.append("MLP")

# Average all runs added by this cell. A fixed slice such as [250:299] would
# leave out the last run and would point at the wrong entries after any re-run.
print("R2 Medio: ", np.mean(resultR2[first_run:]))
print("RMSE Medio: ", np.mean(resultRMSE[first_run:]))

#### Exporting stats

In [None]:
# Collect every recorded run into one table and save it.
# The ids are numbered 1..N from the number of recorded runs rather than from a
# hard-coded count, so the export still works when the number of runs changes.
test_id = list(range(1, len(resultModel) + 1))
d = {'Id': test_id, 'Model': resultModel, 'R2': resultR2, 'RMSE': resultRMSE}
ans_df = pd.DataFrame(d)
ans_df.to_csv('../docs/Models.csv', index=False)

#### Comparing Models by R2

In [None]:
# Box plot of the R2 scores per model.
# seaborn renamed `factorplot` to `catplot` and its `size` argument to `height`.
sns.catplot(data=ans_df, x='Model', y='R2', kind='box', height=5, aspect=1.5)

#### Comparing Models by RMSE

In [None]:
# Box plot of the RMSE scores per model.
# seaborn renamed `factorplot` to `catplot` and its `size` argument to `height`.
sns.catplot(data=ans_df, x='Model', y='RMSE', kind='box', height=5, aspect=1.5)

## 7 ) Best Models

#### Best Model in the original Notebook

In [None]:
# Fit one XGBoost regressor with a fixed hyper-parameter set and score it.
# NOTE: x_train / x_test / y_train / y_test are not created in this cell; it
# reuses whatever split the most recently executed cell left in the kernel.
original_xgb_params = dict(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    random_state=7,
)
model_xgb = xgb.XGBRegressor(**original_xgb_params)
model_xgb.fit(x_train, y_train)
pred = model_xgb.predict(x_test)

holdout_rmse = np.sqrt(mean_squared_error(y_test, pred))
holdout_r2 = r2_score(y_test, pred)
print("RSME: ", holdout_rmse)
print("R2: ", holdout_r2)

#### Changing some parameters

In [None]:
# Variant without row/column subsampling and with 1000 trees.
# NOTE: reuses the x_train / x_test / y_train / y_test split already present in the kernel.
variant_xgb_params = dict(
    n_jobs=-1,
    learning_rate=.05,
    max_depth=3,
    colsample_bytree=1,
    verbosity=0,
    subsample=1,
    n_estimators=1000,
    random_state=7,
)
modelxgb = xgb.XGBRegressor(**variant_xgb_params)
modelxgb.fit(x_train, y_train)
predxgb = modelxgb.predict(x_test)

holdout_rmse = np.sqrt(mean_squared_error(y_test, predxgb))
holdout_r2 = r2_score(y_test, predxgb)
print("RSME: ", holdout_rmse)
print("R2: ", holdout_r2)

#### Tuning parameters with GridSearchCV 

In [None]:
# Search grid for the XGBoost regressor: every key maps to the list of values to try.
parameters = dict(
    nthread=[-1],                  # when hyper-threading is used, xgboost may become slower
    learning_rate=[0.05, 0.03],    # the so-called `eta` value
    max_depth=[2, 3],
    min_child_weight=[2],
    subsample=[0.9, 1],
    colsample_bytree=[0.9, 1],
    n_estimators=[500, 1000],      # number of trees
    missing=[-999],
    seed=[1234],
)

In [None]:
# 5-fold grid search over `parameters`, starting from `model_xgb`; the best
# combination is refitted on the whole of x_train and scored on x_test.
# NOTE: reuses the x_train / x_test / y_train / y_test split already present in the kernel.
clf = GridSearchCV(
    estimator=model_xgb,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=5,
    refit=True,
    verbose=2,
)
clf.fit(x_train, y_train)
predxgb = clf.predict(x_test)

holdout_rmse = np.sqrt(mean_squared_error(y_test, predxgb))
holdout_r2 = r2_score(y_test, predxgb)
print("RSME: ", holdout_rmse)
print("R2: ", holdout_r2)

In [None]:
# XGBoost regressor refitted with the best parameter combination found by the grid search.
clf.best_estimator_

#### Best XGBoost:

In [None]:
# Final XGBoost configuration: 500 trees, 90% row subsampling, all columns per tree.
# NOTE: reuses the x_train / x_test / y_train / y_test split already present in the kernel.
best_xgb_params = dict(
    n_jobs=-1,
    learning_rate=.05,
    max_depth=3,
    colsample_bytree=1,
    verbosity=0,
    subsample=0.9,
    n_estimators=500,
    random_state=7,
)
modelxgb = xgb.XGBRegressor(**best_xgb_params)
modelxgb.fit(x_train, y_train)
predxgb = modelxgb.predict(x_test)

holdout_rmse = np.sqrt(mean_squared_error(y_test, predxgb))
holdout_r2 = r2_score(y_test, predxgb)
print("RSME: ", holdout_rmse)
print("R2: ", holdout_r2)

Copyright 2022. Elaborado por Luis Cajachahua bajo licencia MIT