# House Prices: Advanced Regression Techniques

In [548]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [617]:
# Read the training CSV into a DataFrame.
train = pd.read_csv('./data/train.csv')

In [618]:
# Read the test CSV into a DataFrame.
test = pd.read_csv('./data/test.csv')

In [619]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [552]:
# (rows, columns) of the training frame.
train.shape

(1460, 81)

In [553]:
# Per-column dtypes; object columns are the ones encoded as categoricals later on.
train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [554]:
# Columns with more than 700 missing values, mapped to their null counts.
null_counts = train.isnull().sum()
mostly_nan = dict(null_counts[null_counts > 700])
mostly_nan

{'Alley': 1369, 'PoolQC': 1453, 'Fence': 1179, 'MiscFeature': 1406}

In [555]:
# Remove the mostly-null columns from the training frame.
train = train.drop(columns=list(mostly_nan))

In [556]:
# Remaining columns that still contain at least one missing value.
remaining_nulls = train.isnull().sum()
have_nan = dict(remaining_nulls[remaining_nulls > 0])
have_nan

{'LotFrontage': 259,
 'MasVnrType': 8,
 'MasVnrArea': 8,
 'BsmtQual': 37,
 'BsmtCond': 37,
 'BsmtExposure': 38,
 'BsmtFinType1': 37,
 'BsmtFinType2': 38,
 'Electrical': 1,
 'FireplaceQu': 690,
 'GarageType': 81,
 'GarageYrBlt': 81,
 'GarageFinish': 81,
 'GarageQual': 81,
 'GarageCond': 81}

In [557]:
# Most frequent value of the Electrical column.
train['Electrical'].value_counts().idxmax()

'SBrkr'

In [558]:
# Rows whose Electrical value is missing.
train.loc[train['Electrical'].isna()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1379,1380,80,RL,73.0,9735,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,5,2008,WD,Normal,167500


In [559]:
# Impute missing values in the training frame:
#   - object (categorical) columns get their most frequent value,
#   - numeric columns get their median.
# NOTE(review): FireplaceQu is missing in 690 of 1460 rows; filling it with the
# mode may hide a meaningful "absent" category - confirm against the data dictionary.
for col in have_nan:
    if train[col].dtype == 'object':
        # value_counts() sorts by frequency, so index[0] is the mode.
        fill_value = train[col].value_counts().index[0]
    else:
        fill_value = train[col].median()
    # fillna targets missing values directly, so no NaN sentinel is needed.
    train[col] = train[col].fillna(fill_value)

In [560]:
# Sanity check: an empty dict means no column has missing values left.
leftover_nulls = train.isnull().sum()
dict(leftover_nulls[leftover_nulls > 0])

{}

In [561]:
# Names of all object-dtype columns; these are the ones that get encoded.
categoricals = train.select_dtypes(include='object').columns.tolist()

In [562]:
from sklearn.preprocessing import LabelBinarizer

# Encode every categorical column with its own LabelBinarizer.
#   - If the binarizer yields several columns, they are named '<col>_<class>'.
#   - If it yields a single column, the original column name is kept.
# The fitted binarizers are stored in `encoders` so the test set can be
# transformed with exactly the same class-to-column mapping.
encoders = {}
to_drop = []  # names of columns that were expanded into several indicator columns
encoded_frames = []
for col in categoricals:
    print(f'{col} = {train[col].unique()}')
    lb = LabelBinarizer()
    encoded = lb.fit_transform(train[col])
    if encoded.shape[1] > 1:
        new_columns = [f'{col}_{clas}' for clas in lb.classes_]
        to_drop.append(col)
    else:
        new_columns = [col]
    # Reuse train's index so the final concat aligns rows even if the index
    # is not a default 0..n-1 RangeIndex.
    encoded_frames.append(pd.DataFrame(encoded, columns=new_columns, index=train.index))
    encoders[col] = lb

# Replace all categorical columns in a single concat instead of one
# drop + concat per column; the resulting column order is unchanged
# (numeric columns first, then the encoded blocks in `categoricals` order).
train = pd.concat([train.drop(columns=categoricals), *encoded_frames], axis=1)

MSZoning = ['RL' 'RM' 'C (all)' 'FV' 'RH']
Street = ['Pave' 'Grvl']
LotShape = ['Reg' 'IR1' 'IR2' 'IR3']
LandContour = ['Lvl' 'Bnk' 'Low' 'HLS']
Utilities = ['AllPub' 'NoSeWa']
LotConfig = ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
LandSlope = ['Gtl' 'Mod' 'Sev']
Neighborhood = ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
Condition1 = ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
Condition2 = ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
BldgType = ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
HouseStyle = ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
RoofStyle = ['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
RoofMatl = ['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']
Exterior1st = ['VinylSd' 'Meta

In [563]:
# Preview the training frame after the categorical columns were encoded.
train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [564]:
# Count columns per dtype after encoding.
train.dtypes.value_counts()

int32      236
int64       35
float64      3
dtype: int64

In [565]:
# Target is SalePrice; the feature matrix is everything except the target and Id.
y = train['SalePrice']
X = train.drop(columns=['SalePrice', 'Id'])

In [566]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore the metrics computed from it, are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [567]:
from sklearn.linear_model import LinearRegression

In [568]:
# Linear regression with default settings.
lr_model = LinearRegression()

In [569]:
# Fit on the training part of the split only.
lr_model.fit(X_train, y_train)

LinearRegression()

In [570]:
# Predict on the held-out part of the split.
pred = lr_model.predict(X_test)

In [571]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [572]:
# Mean squared error on the hold-out split.
# NOTE(review): the recorded value (~2.4e18) is far above the squared scale of
# the sale prices, which suggests unstable coefficients on some hold-out rows
# (possibly collinear indicator columns) - investigate before trusting this model.
mean_squared_error(y_test, pred)

2.3903793472431483e+18

In [573]:
# Mean absolute error on the hold-out split.
mean_absolute_error(y_test, pred)

184092045.33438632

In [574]:
# Remove from the test frame the same mostly-null columns dropped from train.
test = test.drop(columns=list(mostly_nan))

In [575]:
# Test-set columns that contain at least one missing value.
test_null_counts = test.isnull().sum()
test_have_nan = dict(test_null_counts[test_null_counts > 0])
test_have_nan

{'MSZoning': 4,
 'LotFrontage': 227,
 'Utilities': 2,
 'Exterior1st': 1,
 'Exterior2nd': 1,
 'MasVnrType': 16,
 'MasVnrArea': 15,
 'BsmtQual': 44,
 'BsmtCond': 45,
 'BsmtExposure': 44,
 'BsmtFinType1': 42,
 'BsmtFinSF1': 1,
 'BsmtFinType2': 42,
 'BsmtFinSF2': 1,
 'BsmtUnfSF': 1,
 'TotalBsmtSF': 1,
 'BsmtFullBath': 2,
 'BsmtHalfBath': 2,
 'KitchenQual': 1,
 'Functional': 2,
 'FireplaceQu': 730,
 'GarageType': 76,
 'GarageYrBlt': 78,
 'GarageFinish': 78,
 'GarageCars': 1,
 'GarageArea': 1,
 'GarageQual': 78,
 'GarageCond': 78,
 'SaleType': 1}

In [576]:
# Impute missing values in the test frame with the same rule used for the
# training frame: most frequent value for object columns, median for numeric
# columns. The fill values are computed from the test frame itself.
for col in test_have_nan:
    if test[col].dtype == 'object':
        # value_counts() sorts by frequency, so index[0] is the mode.
        fill_value = test[col].value_counts().index[0]
    else:
        fill_value = test[col].median()
    # fillna targets missing values directly, so no NaN sentinel is needed.
    test[col] = test[col].fillna(fill_value)

In [577]:
# Encode the test set's categorical columns with the binarizers fitted on the
# training data, so both frames end up with the same indicator columns.
# Column naming mirrors the training cell: '<col>_<class>' when the binarizer
# yields several columns, the plain column name when it yields one.
encoded_frames = []
for col in categoricals:
    encoded = encoders[col].transform(test[col])
    if encoded.shape[1] > 1:
        new_columns = [f'{col}_{clas}' for clas in encoders[col].classes_]
    else:
        new_columns = [col]
    # Reuse test's index so the final concat aligns rows even if the index
    # is not a default 0..n-1 RangeIndex.
    encoded_frames.append(pd.DataFrame(encoded, columns=new_columns, index=test.index))

# Swap all categorical columns for their encoded versions in one concat;
# the column order matches the one produced by per-column drop + concat.
test = pd.concat([test.drop(columns=categoricals), *encoded_frames], axis=1)

In [578]:
# Preview the test frame after encoding.
test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


In [579]:
# Keep the Ids for the submission file and use the remaining columns as features.
# NOTE(review): this rebinds X_test, which previously held the hold-out part of
# the train/test split; the hold-out features are no longer reachable afterwards.
Id_col = test['Id']
X_test = test.drop(columns=['Id'])

In [580]:
# Predict sale prices for the competition test set with the linear model.
my_pred = lr_model.predict(X_test)

In [581]:
# Inspect the raw prediction array.
my_pred

array([109392.66174316, 152254.56469727, 186848.2947998 , ...,
       180758.60296631, 105995.24700928, 224207.48864746])

In [582]:
# Wrap the predictions in a Series named after the target column.
pred_series = pd.Series(my_pred, name='SalePrice')

In [583]:
# Two-column frame (Id, SalePrice); the columns are aligned on the row index.
output = pd.concat([Id_col, pred_series], axis=1)

In [584]:
# Preview the submission frame.
output.head()

Unnamed: 0,Id,SalePrice
0,1461,109392.661743
1,1462,152254.564697
2,1463,186848.2948
3,1464,192210.252869
4,1465,205223.612793


In [592]:
# Write the linear-regression submission without the index column.
output.to_csv('./submissions/lr_kaggle_submssion.csv', index = False)

In [593]:
# Simple Linear Regression: score 0.85796, position 4989

In [594]:
from sklearn.ensemble import RandomForestRegressor

In [595]:
# Re-create the 80/20 hold-out split with a fixed seed so it is reproducible.
# NOTE(review): the models in the following cells are fit on the full X, y,
# so this split is not used by them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [596]:
# Random forest with default hyper-parameters. The seed is fixed, like the other
# models in this notebook, so the submission is reproducible across runs.
rf_model = RandomForestRegressor(random_state=0)

In [597]:
# Fit on the full training data (no hold-out).
rf_model.fit(X, y)

RandomForestRegressor()

In [598]:
# Ids go into the submission file; every other column is a model feature.
Id_col = test['Id']
X_kaggle_test = test.drop(columns=['Id'])

In [599]:
# Predict with the random forest and write the submission file.
my_pred = rf_model.predict(X_kaggle_test)
pred_series = pd.Series(my_pred, name='SalePrice')
output = pd.concat([Id_col, pred_series], axis=1)
# Written under ./submissions/ like the other models' submission files.
output.to_csv('./submissions/rf_kaggle_submssion.csv', index = False)

In [600]:
# Random Forest Regressor: score 0.14668, position 2965

In [601]:
from sklearn.ensemble import GradientBoostingRegressor

In [602]:
# Gradient boosting with depth-1 trees (stumps).
# The loss is left at its default (squared error): the explicit loss='ls'
# spelling was removed in newer scikit-learn releases, and the recorded repr
# shows it was the default anyway, so omitting it works on old and new versions.
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0)

In [603]:
# Fit on the full training data (no hold-out).
gb_model.fit(X, y)

GradientBoostingRegressor(max_depth=1, random_state=0)

In [604]:
# Gradient-boosting predictions -> (Id, SalePrice) frame -> submission CSV.
my_pred = gb_model.predict(X_kaggle_test)
pred_series = pd.Series(data=my_pred, name='SalePrice')
output = pd.concat(objs=[Id_col, pred_series], axis='columns')
output.to_csv('./submissions/gb_kaggle_submssion.csv', index=False)

In [605]:
# GradientBoosting Regressor: score 0.17547

In [606]:
from sklearn.neural_network import MLPRegressor

In [607]:
# Multi-layer perceptron regressor with a fixed seed and up to 5000 training iterations.
# NOTE(review): no feature scaling is applied anywhere before this model is
# fit - confirm that is intended.
mlp_model = MLPRegressor(random_state=1, max_iter=5000)

In [608]:
# Fit on the full training data (no hold-out).
mlp_model.fit(X, y)

MLPRegressor(max_iter=5000, random_state=1)

In [609]:
# MLP predictions -> (Id, SalePrice) frame -> submission CSV.
my_pred = mlp_model.predict(X_kaggle_test)
pred_series = pd.Series(data=my_pred, name='SalePrice')
output = pd.concat(objs=[Id_col, pred_series], axis='columns')
output.to_csv('./submissions/mlp_kaggle_submssion.csv', index=False)

In [610]:
#!pip3 install xgboost

In [611]:
import xgboost as xgb

In [612]:
# XGBoost regressor with a squared-error objective and a fixed seed.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=11196)

In [613]:
# Fit on the full training data (no hold-out).
xgb_model.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             random_state=11196, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [615]:
# XGBoost predictions -> (Id, SalePrice) frame -> submission CSV.
my_pred = xgb_model.predict(X_kaggle_test)
pred_series = pd.Series(data=my_pred, name='SalePrice')
output = pd.concat(objs=[Id_col, pred_series], axis='columns')
output.to_csv('./submissions/xgb_kaggle_submssion.csv', index=False)

In [399]:
# XGB Regressor: score 0.14274, position 2645