# Housing Modeling

### Data Prep & Plain Vanilla

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
import pingouin as pg
import pickle as pk
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore") 
# Show all columns when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [2]:
# Load the train and test frames from the pickle files in the working directory.
train1, test1 = (pd.read_pickle(pickle_path) for pickle_path in ("train.pkl", "test.pkl"))

##### Reading in all the variables that I will use in my linear regression.

In [3]:
# Build the modelling frame for the training data.

# Each transformed numeric column carries its transformation as a name suffix
# (e.g. `_log`, `_sqrt`), so one suffix regex collects all of them.
trans_columns = list(
    train1.filter(regex='(_log$|_cbrt$|_sqrt$|_square$|_reciprocal$|_lognat$)').columns
)

# Columns treated as categorical. The order is significant: it determines the
# column order of this frame and of the dummy columns created later.
cat_columns = [
    'OverallQual', 'All_House_Baths', 'Neighborhood', 'GarageQual', 'Exterior1st',
    'MSSubClass', 'LotShape', 'Foundation', 'FireplaceQu', 'KitchenQual',
    'ExterQual', 'Fence', 'TotRmsAbvGrd', 'GarageFinish', 'GarageCars',
    'Has_a_2nd_Flr', 'House_Age_bin', 'BsmtExposure', 'HeatingQC', 'RoofStyle',
    'CentralAir', 'Electrical', 'MSZoning', 'LandContour', 'LotConfig',
    'Condition1', 'BldgType', 'HouseStyle', 'Exterior2nd', 'MasVnrType',
    'ExterCond', 'BsmtQual', 'BsmtFinType1', 'BsmtFinType2', 'GarageType',
    'PavedDrive', 'SaleType', 'SaleCondition', 'Garage_Age_bin',
]

# Categorical columns first, transformed numeric columns after them.
combined_columns = [*cat_columns, *trans_columns]

# Independent copy restricted to the selected columns.
train = train1.loc[:, combined_columns].copy()

# Preview the first rows.
train.head()

Unnamed: 0,OverallQual,All_House_Baths,Neighborhood,GarageQual,Exterior1st,MSSubClass,LotShape,Foundation,FireplaceQu,KitchenQual,ExterQual,Fence,TotRmsAbvGrd,GarageFinish,GarageCars,Has_a_2nd_Flr,House_Age_bin,BsmtExposure,HeatingQC,RoofStyle,CentralAir,Electrical,MSZoning,LandContour,LotConfig,Condition1,BldgType,HouseStyle,Exterior2nd,MasVnrType,ExterCond,BsmtQual,BsmtFinType1,BsmtFinType2,GarageType,PavedDrive,SaleType,SaleCondition,Garage_Age_bin,SalePrice_log,GarageArea_square,TotalSqFt_sqrt,Prop_of_SF_on_1st_reciprocal,Last_Remod_cbrt,MasVnrArea_reciprocal,TotalPorchSF_sqrt,LotArea_cbrt,LotFrontage_square,LowQualFinSF_reciprocal,BsmtFinSF1_sqrt,BsmtFinSF2_reciprocal,BsmtUnfSF_sqrt
0,7,3.5,CollgCr,TA,VinylSd,60,Reg,PConc,NoFirePlace,Gd,Gd,No Fence,8,RFn,2,2-Story,"(0, 15]",No,Ex,Gable,Y,SBrkr,RL,Lvl,Inside,Norm,1Fam,2Story,VinylSd,BrkFace,TA,Gd,GLQ,Unf,Attchd,Y,WD,Normal,"(0, 25]",5.319106,300314.9601,50.655799,2.910419,1.913842,0.005102,7.81089,20.368189,4226.3001,100.0,26.570849,100.0,12.247857
1,6,2.5,Veenker,TA,MetalSd,20,Reg,CBlock,TA,TA,TA,No Fence,6,RFn,2,1_Story,"(30, 60]",Gd,Ex,Gable,Y,SBrkr,RL,Lvl,LotConfigOther,Feedr,1Fam,1Story,MetalSd,NoMasVnr,TA,Gd,ALQ,Unf,Attchd,Y,WD,Normal,"(25, 50]",5.258877,211609.2001,50.239526,1.960784,3.239929,100.0,17.262966,21.253179,6401.6001,100.0,31.273151,100.0,16.852596
2,7,3.5,CollgCr,TA,VinylSd,60,IR1,PConc,TA,Gd,Gd,No Fence,6,RFn,2,2-Story,"(0, 15]",Mn,Ex,Gable,Y,SBrkr,RL,Lvl,Inside,Norm,1Fam,2Story,VinylSd,BrkFace,TA,Gd,GLQ,Unf,Attchd,Y,WD,Normal,"(0, 25]",5.349278,369676.1601,52.019323,2.857264,2.000833,0.006172,6.481512,22.40703,4625.3601,100.0,22.045634,100.0,20.832907
3,7,2.0,Crawfor,TA,Wd Sdng,70,IR1,BrkTil,Gd,Gd,TA,No Fence,7,Unf,3,2-Story,"(60, 140]",No,Gd,Gable,Y,SBrkr,RL,Lvl,Corner,Norm,1Fam,2Story,Exterior2ndOthers,NoMasVnr,TA,TA,ALQ,Unf,Detchd,Y,WD,Abnorml,"(0, 25]",5.146128,412176.8401,49.729368,2.508801,3.420237,100.0,17.521701,21.216217,3601.2001,100.0,14.697279,100.0,23.238115
4,8,3.5,NoRidge,TA,VinylSd,60,IR1,PConc,TA,Gd,Gd,No Fence,9,RFn,3,2-Story,"(0, 15]",Av,Ex,Gable,Y,SBrkr,RL,Lvl,LotConfigOther,Norm,1Fam,2Story,VinylSd,BrkFace,TA,Gd,GLQ,Unf,Attchd,Y,WD,Normal,"(0, 25]",5.39794,698912.7201,57.818769,2.836825,2.155153,0.002857,16.613549,24.249713,7057.6801,100.0,25.593163,100.0,22.136169


In [4]:
# Build the modelling frame for the test data, mirroring the train preparation.

# Transformed numeric columns are recognised by their transformation suffix.
trans_columns = list(
    test1.filter(regex='(_log$|_cbrt$|_sqrt$|_square$|_reciprocal$|_lognat$)').columns
)

# Columns treated as categorical. The order is significant: it determines the
# column order of this frame and of the dummy columns created later.
cat_columns = [
    'OverallQual', 'All_House_Baths', 'Neighborhood', 'GarageQual', 'Exterior1st',
    'MSSubClass', 'LotShape', 'Foundation', 'FireplaceQu', 'KitchenQual',
    'ExterQual', 'Fence', 'TotRmsAbvGrd', 'GarageFinish', 'GarageCars',
    'Has_a_2nd_Flr', 'House_Age_bin', 'BsmtExposure', 'HeatingQC', 'RoofStyle',
    'CentralAir', 'Electrical', 'MSZoning', 'LandContour', 'LotConfig',
    'Condition1', 'BldgType', 'HouseStyle', 'Exterior2nd', 'MasVnrType',
    'ExterCond', 'BsmtQual', 'BsmtFinType1', 'BsmtFinType2', 'GarageType',
    'PavedDrive', 'SaleType', 'SaleCondition', 'Garage_Age_bin',
]

# Categorical columns first, transformed numeric columns after them.
combined_columns = [*cat_columns, *trans_columns]

# Select the columns and append an all-zero `SalePrice_log` placeholder column.
test = test1[combined_columns].assign(SalePrice_log=0)

# Preview the first rows.
test.head()

Unnamed: 0,OverallQual,All_House_Baths,Neighborhood,GarageQual,Exterior1st,MSSubClass,LotShape,Foundation,FireplaceQu,KitchenQual,ExterQual,Fence,TotRmsAbvGrd,GarageFinish,GarageCars,Has_a_2nd_Flr,House_Age_bin,BsmtExposure,HeatingQC,RoofStyle,CentralAir,Electrical,MSZoning,LandContour,LotConfig,Condition1,BldgType,HouseStyle,Exterior2nd,MasVnrType,ExterCond,BsmtQual,BsmtFinType1,BsmtFinType2,GarageType,PavedDrive,SaleType,SaleCondition,Garage_Age_bin,GarageArea_square,TotalSqFt_sqrt,Prop_of_SF_on_1st_reciprocal,Last_Remod_cbrt,MasVnrArea_reciprocal,TotalPorchSF_sqrt,LotArea_cbrt,LotFrontage_square,LowQualFinSF_reciprocal,BsmtFinSF1_sqrt,BsmtFinSF2_reciprocal,BsmtUnfSF_sqrt,SalePrice_log
0,5,1.0,NAmes,TA,VinylSd,20,Reg,CBlock,NoFirePlace,TA,TA,MnPrv,5,Unf,1.0,1_Story,"(30, 60]",No,TA,Gable,Y,SBrkr,RM,Lvl,Inside,Feedr,1Fam,1Story,VinylSd,NoMasVnr,TA,TA,Rec,LwQ,Attchd,Y,WD,Normal,"(25, 50]",532914.6001,42.166456,1.945764,3.659555,100.0,16.124826,22.651332,6401.6001,100.0,21.633539,0.006944,16.431981,0
1,6,1.5,NAmes,TA,Wd Sdng,20,IR1,CBlock,NoFirePlace,Gd,TA,No Fence,6,Unf,1.0,1_Story,"(30, 60]",No,TA,Hip,Y,SBrkr,RL,Lvl,Corner,Norm,1Fam,1Story,Wd Sdng,BrkFace,TA,TA,ALQ,Unf,Attchd,Y,WD,Normal,"(50, 75]",97350.2401,51.555892,1.960784,3.73275,0.009258,20.712557,24.253681,6562.6201,100.0,30.38108,100.0,20.14969,0
2,5,2.5,Gilbert,TA,VinylSd,60,IR1,PConc,TA,TA,TA,MnPrv,6,Fin,2.0,2-Story,"(0, 15]",No,Gd,Gable,Y,SBrkr,RL,Lvl,Inside,Norm,1Fam,2Story,VinylSd,NoMasVnr,TA,Gd,GLQ,Unf,Attchd,Y,WD,Normal,"(0, 25]",232333.6401,50.566886,2.681502,2.290064,100.0,15.684706,24.003478,5477.4801,100.0,28.1249,100.0,11.705127,0
3,6,2.5,Gilbert,TA,VinylSd,60,IR1,PConc,Gd,Gd,TA,No Fence,7,Fin,2.0,2-Story,"(0, 15]",No,Ex,Gable,Y,SBrkr,RL,Lvl,Inside,Norm,1Fam,2Story,VinylSd,BrkFace,TA,TA,GLQ,Unf,Attchd,Y,WD,Normal,"(0, 25]",220909.4001,50.299205,2.659519,2.290064,0.049975,19.9,21.528543,6085.5601,100.0,24.535892,100.0,18.000278,0
4,8,2.0,StoneBr,TA,HdBoard,120,IR1,PConc,NoFirePlace,Gd,Gd,No Fence,5,RFn,2.0,1_Story,"(15, 30]",No,Ex,Gable,Y,SBrkr,RL,LandConOther,Inside,Norm,TwnhsE,1Story,HdBoard,NoMasVnr,TA,Gd,ALQ,Unf,Attchd,Y,WD,Normal,"(0, 25]",256046.1201,50.596541,1.960784,2.621227,100.0,15.033629,17.105469,1849.8601,100.0,16.217583,100.0,43.198222,0


In [5]:
# Keep wide frames fully visible, then report the test frame's dimensions.
# (A `test.head()` call used to sit between these two lines; as a non-final
# expression its result was discarded and never displayed, so it was removed.)
pd.set_option('display.max_columns', None)
test.shape

(1459, 52)

In [6]:
# Row and column counts of the train frame, for comparison with the test frame.
train.shape

(1460, 52)

In [7]:
# Column-parity check between train and test. The two frames used to end up with
# different predictors after encoding because some categories had levels present
# in only one of them; this check shows which columns still need fixing in the
# cleaning notebooks. This was one of the bigger challenges of the project.

missing_in_train = set(test.columns).difference(train.columns)
missing_in_test = set(train.columns).difference(test.columns)

print("Missing in train:", missing_in_train)
print("Missing in test:", missing_in_test)

Missing in train: set()
Missing in test: set()


In [8]:
# Store GarageCars as float in train so that its dummy columns get float-style
# names (e.g. `GarageCars_1.0`).
train = train.astype({'GarageCars': float})

##### One-hot encoding all the categorical columns together for train.

In [9]:
# Replace every categorical column with 0/1 indicator columns in a single pass.
# `drop_first=True` omits the first level of each category, which then serves as
# the reference level.
train = pd.get_dummies(
    data=train,
    columns=cat_columns,
    drop_first=True,
    dtype=int,
)
train.head()

Unnamed: 0,SalePrice_log,GarageArea_square,TotalSqFt_sqrt,Prop_of_SF_on_1st_reciprocal,Last_Remod_cbrt,MasVnrArea_reciprocal,TotalPorchSF_sqrt,LotArea_cbrt,LotFrontage_square,LowQualFinSF_reciprocal,BsmtFinSF1_sqrt,BsmtFinSF2_reciprocal,BsmtUnfSF_sqrt,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,All_House_Baths_1.5,All_House_Baths_2.0,All_House_Baths_2.5,All_House_Baths_3.0,All_House_Baths_3.5,All_House_Baths_4 or more,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,GarageQual_Gd,GarageQual_NoGarage,GarageQual_TA,Exterior1st_BrkFace,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,MSSubClass_30,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_MSSubClass_Others,LotShape_IR2,LotShape_IR3,LotShape_Reg,Foundation_CBlock,Foundation_PConc,Foundation_Slab,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NoFirePlace,FireplaceQu_Po,FireplaceQu_TA,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,ExterQual_Gd,ExterQual_TA,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_No Fence,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9,TotRmsAbvGrd_10,TotRmsAbvGrd_11,GarageFinish_No_Garage,GarageFinish_RFn,GarageFinish_Unf,GarageCars_1.0,GarageCars_2.0,GarageCars_3.0,Has_a_2nd_Flr_2-Story,"House_Age_bin_(15, 30]","House_Age_bin_(30, 60]","House_Age_bin_(60, 
140]",BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_No_Basement,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_TA,RoofStyle_Hip,CentralAir_Y,Electrical_FuseF,Electrical_SBrkr,MSZoning_RL,MSZoning_RM,LandContour_Lvl,LotConfig_CulDSac,LotConfig_Inside,LotConfig_LotConfigOther,Condition1_Feedr,Condition1_Norm,BldgType_BldgTypeOther,BldgType_TwnhsE,HouseStyle_1Story,HouseStyle_2Story,HouseStyle_StyleOthers,Exterior2nd_HdBoard,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,MasVnrType_BrkFace,MasVnrType_NoMasVnr,MasVnrType_Stone,ExterCond_Gd,ExterCond_TA,BsmtQual_Gd,BsmtQual_NoGarage,BsmtQual_TA,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_NoBasement,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_NoBasement,BsmtFinType2_Rec,BsmtFinType2_Unf,GarageType_BuiltIn,GarageType_Detchd,GarageType_NoGarage,PavedDrive_Y,SaleType_SaleTypeOthers,SaleType_WD,SaleCondition_Normal,SaleCondition_Partial,"Garage_Age_bin_(25, 50]","Garage_Age_bin_(50, 75]","Garage_Age_bin_(75, 115]"
0,5.319106,300314.9601,50.655799,2.910419,1.913842,0.005102,7.81089,20.368189,4226.3001,100.0,26.570849,100.0,12.247857,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0
1,5.258877,211609.2001,50.239526,1.960784,3.239929,100.0,17.262966,21.253179,6401.6001,100.0,31.273151,100.0,16.852596,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0
2,5.349278,369676.1601,52.019323,2.857264,2.000833,0.006172,6.481512,22.40703,4625.3601,100.0,22.045634,100.0,20.832907,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0
3,5.146128,412176.8401,49.729368,2.508801,3.420237,100.0,17.521701,21.216217,3601.2001,100.0,14.697279,100.0,23.238115,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0
4,5.39794,698912.7201,57.818769,2.836825,2.155153,0.002857,16.613549,24.249713,7057.6801,100.0,25.593163,100.0,22.136169,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0


##### One-hot encoding all the categorical columns together for test.

In [10]:
# Replace every categorical column with 0/1 indicator columns in a single pass,
# using the same options as for the train frame. `drop_first=True` omits the
# first level of each category, which then serves as the reference level.
test = pd.get_dummies(
    data=test,
    columns=cat_columns,
    drop_first=True,
    dtype=int,
)
test.head()

Unnamed: 0,GarageArea_square,TotalSqFt_sqrt,Prop_of_SF_on_1st_reciprocal,Last_Remod_cbrt,MasVnrArea_reciprocal,TotalPorchSF_sqrt,LotArea_cbrt,LotFrontage_square,LowQualFinSF_reciprocal,BsmtFinSF1_sqrt,BsmtFinSF2_reciprocal,BsmtUnfSF_sqrt,SalePrice_log,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,All_House_Baths_1.5,All_House_Baths_2.0,All_House_Baths_2.5,All_House_Baths_3.0,All_House_Baths_3.5,All_House_Baths_4 or more,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,GarageQual_Gd,GarageQual_NoGarage,GarageQual_TA,Exterior1st_BrkFace,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,MSSubClass_30,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_MSSubClass_Others,LotShape_IR2,LotShape_IR3,LotShape_Reg,Foundation_CBlock,Foundation_PConc,Foundation_Slab,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NoFirePlace,FireplaceQu_Po,FireplaceQu_TA,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,ExterQual_Gd,ExterQual_TA,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_No Fence,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9,TotRmsAbvGrd_10,TotRmsAbvGrd_11,GarageFinish_No_Garage,GarageFinish_RFn,GarageFinish_Unf,GarageCars_1.0,GarageCars_2.0,GarageCars_3.0,Has_a_2nd_Flr_2-Story,"House_Age_bin_(15, 30]","House_Age_bin_(30, 60]","House_Age_bin_(60, 
140]",BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_No_Basement,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_TA,RoofStyle_Hip,CentralAir_Y,Electrical_FuseF,Electrical_SBrkr,MSZoning_RL,MSZoning_RM,LandContour_Lvl,LotConfig_CulDSac,LotConfig_Inside,LotConfig_LotConfigOther,Condition1_Feedr,Condition1_Norm,BldgType_BldgTypeOther,BldgType_TwnhsE,HouseStyle_1Story,HouseStyle_2Story,HouseStyle_StyleOthers,Exterior2nd_HdBoard,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,MasVnrType_BrkFace,MasVnrType_NoMasVnr,MasVnrType_Stone,ExterCond_Gd,ExterCond_TA,BsmtQual_Gd,BsmtQual_NoGarage,BsmtQual_TA,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_NoBasement,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_NoBasement,BsmtFinType2_Rec,BsmtFinType2_Unf,GarageType_BuiltIn,GarageType_Detchd,GarageType_NoGarage,PavedDrive_Y,SaleType_SaleTypeOthers,SaleType_WD,SaleCondition_Normal,SaleCondition_Partial,"Garage_Age_bin_(25, 50]","Garage_Age_bin_(50, 75]","Garage_Age_bin_(75, 115]"
0,532914.6001,42.166456,1.945764,3.659555,100.0,16.124826,22.651332,6401.6001,100.0,21.633539,0.006944,16.431981,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0
1,97350.2401,51.555892,1.960784,3.73275,0.009258,20.712557,24.253681,6562.6201,100.0,30.38108,100.0,20.14969,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0
2,232333.6401,50.566886,2.681502,2.290064,100.0,15.684706,24.003478,5477.4801,100.0,28.1249,100.0,11.705127,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0
3,220909.4001,50.299205,2.659519,2.290064,0.049975,19.9,21.528543,6085.5601,100.0,24.535892,100.0,18.000278,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0
4,256046.1201,50.596541,1.960784,2.621227,100.0,15.033629,17.105469,1849.8601,100.0,16.217583,100.0,43.198222,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,1,0,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0


In [11]:
# Summary of the encoded train frame: row count, column count and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 169 entries, SalePrice_log to Garage_Age_bin_(75, 115]
dtypes: float64(13), int64(156)
memory usage: 1.9 MB


In [12]:
# Summary of the encoded test frame: row count, column count and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 169 entries, GarageArea_square to Garage_Age_bin_(75, 115]
dtypes: float64(12), int64(157)
memory usage: 1.9 MB


In [13]:
# Look for remaining missing values: keep every row that has at least one null
# in any column.

null_values1 = test.loc[test.isna().any(axis="columns")]
null_values2 = train.loc[train.isna().any(axis="columns")]

# Show the incomplete rows of the test frame
null_values1

Unnamed: 0,GarageArea_square,TotalSqFt_sqrt,Prop_of_SF_on_1st_reciprocal,Last_Remod_cbrt,MasVnrArea_reciprocal,TotalPorchSF_sqrt,LotArea_cbrt,LotFrontage_square,LowQualFinSF_reciprocal,BsmtFinSF1_sqrt,BsmtFinSF2_reciprocal,BsmtUnfSF_sqrt,SalePrice_log,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,All_House_Baths_1.5,All_House_Baths_2.0,All_House_Baths_2.5,All_House_Baths_3.0,All_House_Baths_3.5,All_House_Baths_4 or more,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,GarageQual_Gd,GarageQual_NoGarage,GarageQual_TA,Exterior1st_BrkFace,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,MSSubClass_30,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_MSSubClass_Others,LotShape_IR2,LotShape_IR3,LotShape_Reg,Foundation_CBlock,Foundation_PConc,Foundation_Slab,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NoFirePlace,FireplaceQu_Po,FireplaceQu_TA,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,ExterQual_Gd,ExterQual_TA,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_No Fence,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9,TotRmsAbvGrd_10,TotRmsAbvGrd_11,GarageFinish_No_Garage,GarageFinish_RFn,GarageFinish_Unf,GarageCars_1.0,GarageCars_2.0,GarageCars_3.0,Has_a_2nd_Flr_2-Story,"House_Age_bin_(15, 30]","House_Age_bin_(30, 60]","House_Age_bin_(60, 
140]",BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_No_Basement,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_TA,RoofStyle_Hip,CentralAir_Y,Electrical_FuseF,Electrical_SBrkr,MSZoning_RL,MSZoning_RM,LandContour_Lvl,LotConfig_CulDSac,LotConfig_Inside,LotConfig_LotConfigOther,Condition1_Feedr,Condition1_Norm,BldgType_BldgTypeOther,BldgType_TwnhsE,HouseStyle_1Story,HouseStyle_2Story,HouseStyle_StyleOthers,Exterior2nd_HdBoard,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,MasVnrType_BrkFace,MasVnrType_NoMasVnr,MasVnrType_Stone,ExterCond_Gd,ExterCond_TA,BsmtQual_Gd,BsmtQual_NoGarage,BsmtQual_TA,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_NoBasement,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_NoBasement,BsmtFinType2_Rec,BsmtFinType2_Unf,GarageType_BuiltIn,GarageType_Detchd,GarageType_NoGarage,PavedDrive_Y,SaleType_SaleTypeOthers,SaleType_WD,SaleCondition_Normal,SaleCondition_Partial,"Garage_Age_bin_(25, 50]","Garage_Age_bin_(50, 75]","Garage_Age_bin_(75, 115]"
660,78405.6001,29.933426,0.990099,3.915085,100.0,0.1,18.110442,9802.9801,100.0,,,,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,1,0
1116,,51.836377,2.773334,2.224654,100.0,19.647137,20.846968,2501.0001,100.0,23.409613,100.0,17.635476,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0


In [14]:
# Rows of the train frame that contain at least one missing value.
null_values2

Unnamed: 0,SalePrice_log,GarageArea_square,TotalSqFt_sqrt,Prop_of_SF_on_1st_reciprocal,Last_Remod_cbrt,MasVnrArea_reciprocal,TotalPorchSF_sqrt,LotArea_cbrt,LotFrontage_square,LowQualFinSF_reciprocal,BsmtFinSF1_sqrt,BsmtFinSF2_reciprocal,BsmtUnfSF_sqrt,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,All_House_Baths_1.5,All_House_Baths_2.0,All_House_Baths_2.5,All_House_Baths_3.0,All_House_Baths_3.5,All_House_Baths_4 or more,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,GarageQual_Gd,GarageQual_NoGarage,GarageQual_TA,Exterior1st_BrkFace,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,MSSubClass_30,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_MSSubClass_Others,LotShape_IR2,LotShape_IR3,LotShape_Reg,Foundation_CBlock,Foundation_PConc,Foundation_Slab,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NoFirePlace,FireplaceQu_Po,FireplaceQu_TA,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,ExterQual_Gd,ExterQual_TA,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_No Fence,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9,TotRmsAbvGrd_10,TotRmsAbvGrd_11,GarageFinish_No_Garage,GarageFinish_RFn,GarageFinish_Unf,GarageCars_1.0,GarageCars_2.0,GarageCars_3.0,Has_a_2nd_Flr_2-Story,"House_Age_bin_(15, 30]","House_Age_bin_(30, 60]","House_Age_bin_(60, 
140]",BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_No_Basement,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_TA,RoofStyle_Hip,CentralAir_Y,Electrical_FuseF,Electrical_SBrkr,MSZoning_RL,MSZoning_RM,LandContour_Lvl,LotConfig_CulDSac,LotConfig_Inside,LotConfig_LotConfigOther,Condition1_Feedr,Condition1_Norm,BldgType_BldgTypeOther,BldgType_TwnhsE,HouseStyle_1Story,HouseStyle_2Story,HouseStyle_StyleOthers,Exterior2nd_HdBoard,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,MasVnrType_BrkFace,MasVnrType_NoMasVnr,MasVnrType_Stone,ExterCond_Gd,ExterCond_TA,BsmtQual_Gd,BsmtQual_NoGarage,BsmtQual_TA,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_NoBasement,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_NoBasement,BsmtFinType2_Rec,BsmtFinType2_Unf,GarageType_BuiltIn,GarageType_Detchd,GarageType_NoGarage,PavedDrive_Y,SaleType_SaleTypeOthers,SaleType_WD,SaleCondition_Normal,SaleCondition_Partial,"Garage_Age_bin_(25, 50]","Garage_Age_bin_(50, 75]","Garage_Age_bin_(75, 115]"


In [15]:
# Addressing the two rows in test that had values missing: each gap is filled
# with the mean of its column (computed on the test frame).
#
# The result is assigned back to `test` instead of calling
# `test[col].fillna(..., inplace=True)`. The in-place call on a column selection
# is chained assignment: it is deprecated in pandas 2.x and, under Copy-on-Write,
# modifies only a temporary object, leaving `test` unchanged.
columns_to_impute = [
    'GarageArea_square',
    'BsmtFinSF1_sqrt',
    'BsmtFinSF2_reciprocal',
    'BsmtUnfSF_sqrt',
]
test[columns_to_impute] = test[columns_to_impute].fillna(test[columns_to_impute].mean())

## Plain Vanilla Model

In [None]:
# Give both frames a fresh 0..n-1 index before modelling.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Plain vanilla model: OLS of `SalePrice_log` on every other column of train,
# with a constant column added to the design matrix.
y_train = train['SalePrice_log']
x_train = sm.add_constant(train.drop(columns=['SalePrice_log']))
model_plain_vanilla = sm.OLS(endog=y_train, exog=x_train).fit()
print(model_plain_vanilla.summary())

In [None]:
# Preview the first rows of the encoded test frame.
test.head()

In [None]:
# Build the submission file for the plain vanilla model.

# Create the test design matrix inside this cell so it runs on a fresh kernel
# (`x_test` is not defined by any earlier cell).
#  - `SalePrice_log` in `test` is only a placeholder column, so it is dropped.
#  - has_constant='add' forces the intercept column; the default ('skip') silently
#    omits it when some test column happens to be constant.
#  - Selecting `x_train.columns` puts the predictors in exactly the order the
#    model was fitted on and raises a KeyError if one is missing.
x_test = sm.add_constant(test.drop(columns='SalePrice_log'), has_constant='add')[x_train.columns]

PlainVanillaSubmission = pd.DataFrame({})
PlainVanillaSubmission['ID'] = range(1461, 1461 + len(x_test))
predictions = model_plain_vanilla.predict(x_test)
# Predictions are on the `SalePrice_log` scale; 10 ** maps them back to prices
# (assumes the target was transformed with a base-10 log).
PlainVanillaSubmission['SalePrice'] = 10 ** predictions
PlainVanillaSubmission.to_csv('PlainVanillaSubmission.csv', index = False)
PlainVanillaSubmission
# Kaggle Score: 3.50524

In [None]:
# Persist the encoded train frame to disk as a pickle.
with open('train2.pkl', mode='wb') as pickle_file:
    pk.dump(train, pickle_file)

In [None]:
# Persist the encoded test frame to disk as a pickle.
with open('test2.pkl', mode='wb') as pickle_file:
    pk.dump(test, pickle_file)

# Assumptions Work

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
import pingouin as pg
import pickle as pk
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Show all columns when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [None]:
# Reload the encoded train and test frames from their pickle files.
train, test = (pd.read_pickle(pickle_path) for pickle_path in ("train2.pkl", "test2.pkl"))

In [None]:
# Show every row when a DataFrame is displayed (no row truncation).
pd.set_option('display.max_rows',None)

In [None]:
# Give both frames a fresh 0..n-1 index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# "Plain vanilla" model: OLS of `SalePrice_log` on every other column of train,
# with a constant column added to the design matrix.
y_train = train['SalePrice_log']
x_train = sm.add_constant(train.drop(columns=['SalePrice_log']))
model_plain_vanilla = sm.OLS(endog=y_train, exog=x_train).fit()
print(model_plain_vanilla.summary())

In [None]:
# Fit the same regression with pingouin; its result is used below for the residuals.
model_assumptions = pg.linear_regression(X=x_train, y=y_train)

##### Now going to check all of the assumptions

1.  There exists a linear relationship between the Outcome (DV) and the Predictor (IV) variable(s).
2.  The error terms are normally distributed with a mean of 0. 
3.  The variance of the error terms is not related to the predicted outcomes (homoskedasticity).  
4.  There is no multicollinearity between predictor variables.  

In [None]:
# Display the residuals of the pingouin fit.
model_assumptions.residuals_            

In [None]:
# Start from an empty frame that will collect the diagnostic columns.
data = dict()
df = pd.DataFrame(data=data)

In [None]:
# Store the model residuals as the `errors` column.
df = df.assign(errors=model_assumptions.residuals_)

In [None]:
# Fitted value = observed target minus residual (same scale as `SalePrice_log`).
df["Fitted_Saleprice_std"] = y_train.sub(df["errors"])

In [None]:
# Density of the residuals: they are mostly normally distributed with a mean of 0.
sns.kdeplot(data=df, x="errors")

The errors appear to be normally distributed with a mean of zero.  

In [None]:
# Residuals against fitted values, to look for heteroskedasticity.
sns.scatterplot(data=df, x="Fitted_Saleprice_std", y="errors")

The error terms do not seem to be related to the size of the fitted dependent values, so the model does not exhibit characteristics of heteroskedasticity.  Next, let's assess the variance inflation factors to ensure that the predictors do not exhibit high multicollinearity.  

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every column of the design matrix, listed next to
# the column name.
design_matrix = x_train.values
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
    'variable': x_train.columns,
})
vif[:]

In [None]:
# List of columns to drop
columns_to_drop1 = ['MasVnrArea_reciprocal', 'GarageQual_NoGarage',
                   'BsmtExposure_No_Basement','BsmtQual_NoGarage','BsmtFinType1_NoBasement',
                  'BsmtFinType2_NoBasement','GarageType_NoGarage','BsmtFinSF2_reciprocal']

# Drop the specified columns
train.drop(columns=columns_to_drop1, axis=1, inplace=True)
test.drop(columns=columns_to_drop1, axis=1, inplace=True)

In [None]:
# Define the prefixes for the columns you want to drop
prefixes = ['Exterior2nd','GarageFinish','FireplaceQu','ExterQual','GarageCars','BldgType','HouseStyle',
           'SaleType','MasVnrType','Has_a_2nd_Flr_']

# Create a list of columns to drop based on the prefixes
columns_to_drop2 = [col for col in train.columns if any(col.startswith(prefix) for prefix in prefixes)]

# Drop the specified columns
train.drop(columns=columns_to_drop2, axis=1, inplace=True)
test.drop(columns=columns_to_drop2, axis=1, inplace=True)

# These were to address my issues with multicolinearity. My process involved removing one variable at a time,
# re-running the code to get the updated VIF scores, and repeat this process until I have gotten acceptable
# VIF scores. Similarly, above I was doing the same with columns I identified with issues. Like BsmtFinSF2 
# was mostly made up of 0 values, so their reciprocal numbers just would not make sense. I also ensured I dropped
# variables that were redundent, like BsmtExposure_No_Basement and BsmtQual_NoGarage to address
# these multicolinearity problems.

I have fixed all issues with VIF earlier in the script. This included binning house age and removing all the variables that indicated no garage, as well as total square footage and the proportion of square footage on the first floor.

In [None]:
# Give both frames a fresh 0..n-1 index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# With the multicollinearity issues addressed, refit the model on the reduced
# set of predictors to see how it performs.
y_train = train['SalePrice_log']
# NOTE(review): `SalePrice_log` in test is presumably the all-zero placeholder
# added during data prep, not a real target - confirm before scoring against it.
y_test = test['SalePrice_log']
x_train = sm.add_constant(train.drop(columns=['SalePrice_log']))
x_test = sm.add_constant(test.drop(columns=['SalePrice_log']))
model_plain_vanilla = sm.OLS(endog=y_train, exog=x_train).fit()
print(model_plain_vanilla.summary())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One variance inflation factor per column of the design matrix x_train,
# listed next to the column name it belongs to.
design_matrix = x_train.values
vif = pd.DataFrame({
    'VIF': [variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])],
    'variable': x_train.columns,
})
vif

In [None]:
# Persist the cleaned training frame; later sections reload it from trainPOST.pkl.
with open('trainPOST.pkl', 'wb') as train_out:
    train_out.write(pk.dumps(train))

In [None]:
# Persist the cleaned test frame; later sections reload it from testPOST.pkl.
with open('testPOST.pkl', 'wb') as test_out:
    test_out.write(pk.dumps(test))

# Feature Selection

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
import pingouin as pg
import pickle as pk
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore") 
pd.set_option('display.max_columns', None)

In [None]:
# Load the train/test frames used for feature selection.
# NOTE(review): these file names differ from the trainPOST.pkl / testPOST.pkl files
# written by the data-prep section — confirm train3.pkl / test3.pkl are the intended inputs.
train, test = (pd.read_pickle(path) for path in ("train3.pkl", "test3.pkl"))

In [None]:
# Show every row when a frame is displayed (no truncation).
pd.options.display.max_rows = None

In [None]:
# Split each frame into the response (SalePrice_log) and the design matrix with a
# statsmodels constant column added.
target_col = 'SalePrice_log'
y_train, y_test = train[target_col], test[target_col]
x_train = sm.add_constant(train.drop(columns=target_col))
x_test = sm.add_constant(test.drop(columns=target_col))

In [None]:
# Number of columns in x_train, minus one for the added constant column.
len(x_train.columns) - 1

### Forward

In [None]:
# Forward selection; the candidate subsets are compared by adjusted R-squared
# in the next cell.
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# 1) Generate the best model of each size from 1 up to 102 predictors, scored by
#    in-sample R-squared (cv=None).
#    NOTE(review): 102 is presumably the predictor count shown by the previous
#    cell (x_train.shape[1]-1) — confirm if the feature set changes.
lr = LinearRegression()
model_forward = sfs(lr,k_features = (1,102), forward=True, scoring='r2',cv= None)
model_forward = model_forward.fit(train.drop(columns = 'SalePrice_log'), y_train)

# 2) Store the selected features of the best model of each size under the key
#    "<k>_predictor". Iterating over the sizes recorded in subsets_ (instead of
#    range(1, 102)) keeps the largest model, which the old loop left out.
forward_predictor_dict = {}
for num_predictors in sorted(model_forward.subsets_):
    specific_predictors = list(model_forward.subsets_[num_predictors]['feature_names'])
    key = f'{num_predictors}_predictor'
    forward_predictor_dict[key] = specific_predictors

print(forward_predictor_dict)

# 3) Extract the keys (ordered by model size)
forward_predictor_dict_keys = list(forward_predictor_dict.keys())

In [None]:
# 4) Refit each candidate subset with statsmodels OLS and record its adjusted R-squared.
forward_adjr2 = []
for i in forward_predictor_dict_keys:
    results_forward = sm.OLS(y_train, sm.add_constant(train[forward_predictor_dict[i]])).fit()
    forward_adjr2.append(results_forward.rsquared_adj)

# 5) Pick the model with the HIGHEST adjusted R-square
print(forward_adjr2)
forward_index_of_highest_adjr2 = forward_adjr2.index(max(forward_adjr2))
best_forward_key = forward_predictor_dict_keys[forward_index_of_highest_adjr2]
print(best_forward_key)

# 6) Use the model selected in step 5. The key is looked up rather than hard-coded,
#    so this cell stays correct if the data or the selection changes (the original
#    run used the 72-predictor model; the variable name is kept for compatibility).
Best_72_predictor_forward_adjr2 = forward_predictor_dict[best_forward_key]
# has_constant='add' guarantees both design matrices get the intercept column; the
# default ('skip') omits it when a frame already holds a constant column, which
# would make the train and test columns disagree at prediction time.
x_train_forward_adjr2 = sm.add_constant(train[Best_72_predictor_forward_adjr2], has_constant='add')
x_test_forward_adjr2 = sm.add_constant(test[Best_72_predictor_forward_adjr2], has_constant='add')
model_forward_adjr2 = sm.OLS(y_train, x_train_forward_adjr2).fit()

In [None]:
# Build the submission file: IDs start at 1461, one per test row, and the
# SalePrice_log predictions are mapped back to a price scale with 10 ** (presumes
# the target was log10-transformed upstream).
predictions = model_forward_adjr2.predict(x_test_forward_adjr2)
forwardsSubmission = pd.DataFrame({'ID': range(1461, 1461 + len(x_test))})
forwardsSubmission['SalePrice'] = 10 ** predictions
forwardsSubmission.to_csv('forwardsSubmission.csv', index = False)
forwardsSubmission
# Kaggle Score: 0.14507

### Backward

In [None]:
# Backward selection; the candidate subsets are compared by adjusted R-squared
# in the next cell.
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# 1) Generate the best model of each size, scored by in-sample R-squared (cv=None).
#    NOTE(review): 102 is presumably the predictor count shown earlier
#    (x_train.shape[1]-1) — confirm if the feature set changes.
lr = LinearRegression()
model_backward = sfs(lr,k_features = (1,102), forward=False, scoring='r2',cv= None)
model_backward = model_backward.fit(train.drop(columns = 'SalePrice_log'), y_train)

# 2) Store the selected features of the best model of each size under the key
#    "<k>_predictor". Iterating over the sizes recorded in subsets_ (instead of
#    range(1, 102)) keeps the largest model, which the old loop left out.
backward_predictor_dict = {}
for num_predictors in sorted(model_backward.subsets_):
    specific_predictors = list(model_backward.subsets_[num_predictors]['feature_names'])
    key = f'{num_predictors}_predictor'
    backward_predictor_dict[key] = specific_predictors

print(backward_predictor_dict)

# 3) Extract the keys (ordered by model size)
backward_predictor_dict_keys = list(backward_predictor_dict.keys())

In [None]:
# 4) Refit each candidate subset with statsmodels OLS and record its adjusted R-squared.
backward_adjr2 = []
for i in backward_predictor_dict_keys:
    results_backward = sm.OLS(y_train, sm.add_constant(train[backward_predictor_dict[i]])).fit()
    backward_adjr2.append(results_backward.rsquared_adj)

# 5) Pick the model with the HIGHEST adjusted R-square
print(backward_adjr2)
backward_index_of_highest_adjr2 = backward_adjr2.index(max(backward_adjr2))
best_backward_key = backward_predictor_dict_keys[backward_index_of_highest_adjr2]
print(best_backward_key)

# 6) Use the model selected in step 5. The key is looked up rather than hard-coded,
#    so the backward model no longer depends on a size copied by hand (the variable
#    name is kept for compatibility).
Best_72_predictor_backward_adjr2 = backward_predictor_dict[best_backward_key]
# has_constant='add' guarantees both design matrices get the intercept column; the
# default ('skip') omits it when a frame already holds a constant column, which
# would make the train and test columns disagree at prediction time.
x_train_backward_adjr2 = sm.add_constant(train[Best_72_predictor_backward_adjr2], has_constant='add')
x_test_backward_adjr2 = sm.add_constant(test[Best_72_predictor_backward_adjr2], has_constant='add')
model_backward_adjr2 = sm.OLS(y_train, x_train_backward_adjr2).fit()

In [None]:
# Build the submission file: IDs start at 1461, one per test row, and the
# SalePrice_log predictions are mapped back to a price scale with 10 ** (presumes
# the target was log10-transformed upstream).
predictions2 = model_backward_adjr2.predict(x_test_backward_adjr2)
BackwardsSubmission = pd.DataFrame({'ID': range(1461, 1461 + len(x_test))})
BackwardsSubmission['SalePrice'] = 10 ** predictions2
BackwardsSubmission.to_csv('BackwardsSubmission.csv', index = False)
BackwardsSubmission
# Kaggle Score: 0.14505

# Lasso Model 

In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
import pingouin as pg
import pickle as pk
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore") 
pd.set_option('display.max_columns', None)

In [9]:
# Load the frames saved by the data-prep section (an earlier version of this cell
# read train3.pkl / test3.pkl instead).
train = pd.read_pickle("trainPOST.pkl")
# Set SalePrice_log to 0 on the test frame so the cells below can select and drop
# that column the same way they do for train.
test = pd.read_pickle("testPOST.pkl").assign(SalePrice_log=0)

In [10]:
# Split each frame into predictors and response. No statsmodels constant column is
# added: the Lasso model below reports its own intercept_, and the previous
# add-then-drop round trip raised a KeyError whenever add_constant skipped the
# column because the frame already contained a constant one.
x_train = train.drop(columns='SalePrice_log')
y_train = train['SalePrice_log']
x_test = test.drop(columns='SalePrice_log')
y_test = test['SalePrice_log']

In [11]:
# Lasso Regression
# Import the Lasso estimator, the coefficient-summary helper, and the
# grid-search / k-fold cross-validation utilities.
from sklearn.linear_model import Lasso
from regressors import stats  
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold

# Determine the best tuning parameter alpha through k-fold cross-validation on the training data.
# 1) Define the Lasso estimator
model_Lasso = Lasso() 

# 2) Define the cross-validation splitting strategy: 10 shuffled folds with a fixed
#    seed, so the folds are the same on every run
kf_Lasso = KFold(n_splits = 10, shuffle = True, random_state = 42) 

# 3) Define the search range for alpha:
#    100 alphas spaced evenly on a logarithmic scale between 10**(-8) and 10**(8).
grid_Lasso = {'alpha' : np.logspace(start = -8, stop = 8, num = 100, base = 10)} 

# 4) Execute cross-validation for each alpha value in the grid
cv_Lasso = GridSearchCV(estimator = model_Lasso,
                        param_grid = grid_Lasso, 
                        scoring = 'neg_mean_squared_error',
                        cv = kf_Lasso) 
cv_Lasso.fit(x_train, y_train)

# 5) Output the best alpha and the corresponding score
print(cv_Lasso.best_params_, cv_Lasso.best_score_) 

# 6) Refit on the full training data with the alpha chosen by the search. The value
#    is read from the search result instead of being copied by hand (the recorded
#    run selected alpha = 3.5938136638046256e-05).
best_alpha_Lasso = cv_Lasso.best_params_['alpha']
model_Lasso_best = Lasso(alpha = best_alpha_Lasso).fit(x_train, y_train)

# 7) Get the coefficients of the predictors and the intercept
print(model_Lasso_best.coef_)
print(model_Lasso_best.intercept_)

{'alpha': 3.5938136638046256e-05} -0.0029752439946564126
[ 4.03115499e-08  8.48475862e-03  1.47434064e-02 -1.68734914e-02
  1.12072815e-03  3.77357807e-03 -2.27925453e-07  2.34516306e-04
  4.66369234e-05 -7.45359962e-04  4.15873527e-02  6.37031196e-02
  8.38024042e-02  1.06935767e-01  1.29792423e-01  1.59697285e-01
  1.24936870e-01  1.62223141e-02  6.17976340e-03  2.02033216e-02
  1.66230783e-02  3.24770884e-02  2.42960747e-02 -3.26414251e-02
  1.08205232e-02  5.88826470e-03 -8.86819476e-03  5.63480821e-02
 -3.53740126e-02  4.58593514e-03 -5.35146082e-02 -8.03787765e-02
 -2.54429685e-02 -1.34315184e-02 -0.00000000e+00  3.64182368e-02
  3.81183244e-02 -3.05437889e-02 -1.05443359e-04 -1.70322011e-02
 -2.80641776e-03  5.24991498e-02  2.21812939e-02  1.69518933e-04
  2.38274853e-02  6.78052953e-02  2.70922683e-02  3.85408081e-02
  5.17280565e-03 -2.39933807e-03  8.41693011e-03  2.47157494e-03
  1.72087586e-02  3.44062288e-04 -6.42445226e-03  6.40004853e-03
 -2.56430631e-02  0.00000000e+00 

In [12]:
# Print the fitted Lasso coefficients labelled with their feature names, together
# with the standard errors, t values and p-values reported by regressors.stats.
lasso_feature_labels = x_train.columns
stats.summary(model_Lasso_best, x_train, y_train, xlabels = lasso_feature_labels)

Residuals:
    Min      1Q  Median     3Q    Max
-0.1983 -0.0275  -0.002 0.0211 0.3152


Coefficients:
                              Estimate  Std. Error   t value   p value
_intercept                    4.442855    0.040205  110.5042  0.000000
GarageArea_square             0.000000    0.000000   10.6080  0.000000
TotalSqFt_sqrt                0.008485    0.000218   38.9653  0.000000
Prop_of_SF_on_1st_reciprocal  0.014743    0.004697    3.1391  0.001729
Last_Remod_cbrt              -0.016873    0.002109   -7.9996  0.000000
TotalPorchSF_sqrt             0.001121    0.000204    5.5021  0.000000
LotArea_cbrt                  0.003774    0.000461    8.1787  0.000000
LotFrontage_square           -0.000000    0.000000   -0.6922  0.488945
LowQualFinSF_reciprocal       0.000235    0.000050    4.7319  0.000002
BsmtFinSF1_sqrt               0.000047    0.000137    0.3414  0.732837
BsmtUnfSF_sqrt               -0.000745    0.000106   -7.0470  0.000000
OverallQual_4                 0.041587    0.0

In [13]:
# Build the submission file: IDs start at 1461, one per test row, and the
# SalePrice_log predictions are mapped back to a price scale with 10 ** (presumes
# the target was log10-transformed upstream).
predictions = model_Lasso_best.predict(x_test)
LassoSubmission = pd.DataFrame({'ID': range(1461, 1461 + len(x_test))})
LassoSubmission['SalePrice'] = 10 ** predictions
LassoSubmission.to_csv('LassoSubmission.csv', index = False)
LassoSubmission
# Kaggle Score: 0.13093

Unnamed: 0,ID,SalePrice
0,1461,117904.766414
1,1462,170771.491370
2,1463,186425.600623
3,1464,193550.420671
4,1465,188245.669701
...,...,...
1454,2915,78320.504150
1455,2916,80709.177189
1456,2917,159752.622224
1457,2918,121086.021879


# Ridge Model 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
import pingouin as pg
import pickle as pk
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore") 
pd.set_option('display.max_columns', None)

In [None]:
# Reload the cleaned frames written by the data-prep section.
train, test = (pd.read_pickle(path) for path in ("trainPOST.pkl", "testPOST.pkl"))

In [None]:
# Split each frame into predictors and response. The statsmodels constant column is
# intentionally not added: only OLS and the feature-selection refits need it, while
# the Ridge model below reports its own intercept_. Leaving it out also avoids
# add_constant silently skipping the column on just one of the two frames.
x_train = train.drop(columns='SalePrice_log')
y_train = train['SalePrice_log']
x_test = test.drop(columns='SalePrice_log')
y_test = test['SalePrice_log']

In [None]:
# Ridge Regression
from sklearn.linear_model import Ridge # Import Ridge algorithm

In [None]:
# 1) Determine the optimal tuning parameter alpha through k-fold cross-validation on training data.

# Import the algorithm that can perform grid search and cross-validation simultaneously.
from sklearn.model_selection import GridSearchCV 
# Import KFold algorithm to specify how data is split for cross-validation.
from sklearn.model_selection import KFold

# NOTE(review): OLS and feature selection require the statsmodels constant column,
# but Ridge reports its own intercept_. If x_train still carries a 'const' column
# here, it is redundant for this model.

# Perform 10-fold cross-validation for 100 alpha values between 10**(-8) and 10**(8)
# 2) Define the estimator
model_ridge = Ridge() 

# 3) Define the cross-validation splitting strategy: 10 shuffled folds with a fixed
#    seed, so the folds are the same on every run
kf_ridge = KFold(n_splits = 10, shuffle = True, random_state = 42) 

# 4) Specify the range of alpha: 100 alphas spaced evenly on a logarithmic scale between  10**(-8) and 10**(8).
grid_ridge = {'alpha' : np.logspace(start = -8, stop = 8, num = 100, base = 10)} 

# 5) Execute cross-validation for each alpha value in the grid
cv_ridge = GridSearchCV(estimator = model_ridge,
                        param_grid = grid_ridge, 
                        scoring = 'neg_mean_squared_error',
                        cv = kf_ridge) 
cv_ridge.fit(x_train, y_train)

# 6) Output the best alpha and the corresponding cross-validation score
print(cv_ridge.best_params_, cv_ridge.best_score_) 

# 7) Run ridge regression on the training data using the alpha chosen by the search.
#    The value is read from the search result instead of being copied by hand (the
#    recorded run selected alpha of ~0.83).
best_alpha_ridge = cv_ridge.best_params_['alpha']
model_ridge_best = Ridge(alpha = best_alpha_ridge).fit(x_train, y_train)

# 8) Output the coefficients of the predictors and the intercept
print(model_ridge_best.coef_)
print(model_ridge_best.intercept_)

In [None]:
# Build the submission file: IDs start at 1461, one per test row, and the
# SalePrice_log predictions are mapped back to a price scale with 10 ** (presumes
# the target was log10-transformed upstream).
predictions = model_ridge_best.predict(x_test)
RidgeSubmission = pd.DataFrame({'ID': range(1461, 1461 + len(x_test))})
RidgeSubmission['SalePrice'] = 10 ** predictions
RidgeSubmission.to_csv('RidgeSubmission.csv', index = False)
RidgeSubmission
# Kaggle Score: 0.13142

# Regression Tree - Bagging, Random Forest, and Boosting

In [None]:
# Imports for the regression-tree section (bagging, random forest, boosting).
# The pandas import had been fused into the comment line, leaving `pd` undefined
# when this section runs on a fresh kernel.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
import pingouin as pg
import pickle as pk
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore") 
pd.set_option('display.max_columns', None)
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Reload the cleaned frames written by the data-prep section.
train, test = (pd.read_pickle(path) for path in ("trainPOST.pkl", "testPOST.pkl"))

In [None]:
# Split each frame into predictors and response. No statsmodels constant column is
# added: a column of ones can never be used for a split, and leaving it out keeps
# x_train's feature set identical to x_sub_train (built from train.drop(...) in the
# random-forest tuning cell), so a tuned max_features value refers to the same
# predictors in the final fit.
x_train = train.drop(columns='SalePrice_log')
y_train = train['SalePrice_log']
x_test = test.drop(columns='SalePrice_log')
y_test = test['SalePrice_log']

In [None]:
# 1) Bagging is set up as a RandomForestRegressor whose max_features equals the
#    number of columns in x_train and whose bootstrap sample size equals the
#    number of rows.
n_rows, n_cols = x_train.shape
bagging_settings = {
    'n_estimators': 500,
    'criterion': 'squared_error',
    'max_features': n_cols,
    'bootstrap': True,
    'max_samples': n_rows,
    'random_state': 42,
}
Tree_Bagging = RandomForestRegressor(**bagging_settings)

In [None]:
# 2) Fit the bagged regression trees on the full training data
Tree_Bagging.fit(x_train, y_train)

# Pair each column of x_train with the model's feature_importances_ value (stored
# under the name RSS_reduction) and list the features from most to least important.
RSS_reduction = Tree_Bagging.feature_importances_
feature_importance_dict = dict(feature_name=x_train.columns, RSS_reduction=RSS_reduction)
feature_importance_df = (
    pd.DataFrame(feature_importance_dict)
    .sort_values(by = 'RSS_reduction', ascending=False)
)
print(feature_importance_df)
# In the recorded run the five most important features were: TotalSqFt_sqrt,
# Last_Remod_cbrt, GarageArea_square, KitchenQual_TA, BsmtQual_TA

In [None]:
# Build the submission file: IDs start at 1461, one per test row, and the
# SalePrice_log predictions are mapped back to a price scale with 10 ** (presumes
# the target was log10-transformed upstream).
predictions1 = Tree_Bagging.predict(x_test)
TreeBaggingSubmission = pd.DataFrame({'ID': range(1461, 1461 + len(x_test))})
TreeBaggingSubmission['SalePrice'] = 10 ** predictions1
TreeBaggingSubmission.to_csv('TreeBaggingSubmission.csv', index = False)
TreeBaggingSubmission
# Kaggle score: 0.15438

## Random Forest

In [None]:
# Random Forest

# 1) Hold out 25% of the training rows (fixed seed) so max_features can be tuned
#    on data the forest was not fitted on.
sub_train, sub_test = train_test_split(train, train_size = 0.75, test_size= 0.25, random_state= 123)

y_sub_train = sub_train['SalePrice_log']
x_sub_train = sub_train.drop(columns='SalePrice_log')

y_sub_test = sub_test['SalePrice_log']
x_sub_test = sub_test.drop(columns='SalePrice_log')

# 2) Fit one forest per candidate max_features value (1 through 11) and record its
#    mean squared error on the held-out rows.
max_features = list(range(1, 12))
subtest_mse_rf = []
for i in max_features:
    Tree_RF = RandomForestRegressor(n_estimators = 500,
                                    criterion = 'squared_error',
                                    max_features = i,
                                    bootstrap = True,
                                    max_samples = x_sub_train.shape[0],
                                    random_state = 42)
    Tree_RF.fit(x_sub_train, y_sub_train)
    held_out_predictions = Tree_RF.predict(x_sub_test)
    subtest_mse_rf.append(mean_squared_error(y_sub_test, held_out_predictions))

# 3) Tabulate the candidates, best (lowest held-out MSE) first.
df = pd.DataFrame({'max_features': max_features, 'subtest_mse_rf': subtest_mse_rf})
df = df.sort_values(by = 'subtest_mse_rf')
print(df)

# The optimal max_features is 11.
# NOTE(review): 11 is also the largest value tried, so the true optimum may lie
# above the searched range — consider widening it.

In [None]:
# 4) Grow the final random forest on the full training data with the max_features
#    value found by the search above (11).
rf_best_settings = {
    'n_estimators': 500,
    'criterion': 'squared_error',
    'max_features': 11,
    'bootstrap': True,
    'max_samples': x_train.shape[0],
    'random_state': 42,
}
Tree_RF_Best = RandomForestRegressor(**rf_best_settings)
Tree_RF_Best.fit(x_train, y_train)

In [None]:
# Build the submission file: IDs start at 1461, one per test row, and the
# SalePrice_log predictions are mapped back to a price scale with 10 ** (presumes
# the target was log10-transformed upstream).
predictions2 = Tree_RF_Best.predict(x_test)
RFSubmission = pd.DataFrame({'ID': range(1461, 1461 + len(x_test))})
RFSubmission['SalePrice'] = 10 ** predictions2
RFSubmission.to_csv('RFSubmission.csv', index = False)
RFSubmission
# Kaggle score: 0.15852

## Boosting

In [None]:
# Boosting

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold 

# 1) Cross-validation splitting strategy: 2 shuffled folds with a fixed seed
kf_boosting = KFold(n_splits = 2, shuffle = True, random_state = 42)

# 2) Candidate values for each tuned parameter: n_estimators from 1000 to 5000 in
#    steps of 100, and max_leaf_nodes from 2 to 7
n_estimators_grid = list(range(1000, 5001, 100))
max_leaf_nodes_grid = [2, 3, 4, 5, 6, 7]
param_grid_boosting = {
    'n_estimators': n_estimators_grid,
    'max_leaf_nodes': max_leaf_nodes_grid
}

# 3) Base estimator: learning rate fixed at 0.01, max_features set to the number of
#    columns in x_train
Tree_Boosting = GradientBoostingRegressor(
    learning_rate = 0.01,
    criterion = 'squared_error',
    max_features = x_train.shape[1],
    random_state = 42,
)

# 4) Cross-validate every parameter combination (on all available cores) to find
#    the combination with the best negative mean squared error
cv_tree_boosting = GridSearchCV(
    estimator = Tree_Boosting,
    param_grid = param_grid_boosting,
    scoring = 'neg_mean_squared_error',
    cv = kf_boosting,
    n_jobs = -1,
)
cv_tree_boosting.fit(x_train, y_train)

In [None]:
# Report the winning parameter combination and its cross-validation score.
for label, value in (("Best Parameters: ", cv_tree_boosting.best_params_),
                     ("Best Score: ", cv_tree_boosting.best_score_)):
    print(label, value)

In [None]:
# 5) Redefine GradientBoostingRegressor with the chosen parameter combination
#    (n_estimators = 5000, max_leaf_nodes = 3).
# NOTE(review): max_features is 12 here, whereas the grid search above used
# max_features = x_train.shape[1] — confirm this difference is intentional.
boosting_best_settings = {
    'learning_rate': 0.01,
    'n_estimators': 5000,
    'criterion': 'squared_error',
    'max_leaf_nodes': 3,
    'max_features': 12,
    'random_state': 42,
}
Tree_Boosting_best = GradientBoostingRegressor(**boosting_best_settings)

In [None]:
# 6) Fit the boosted regression trees on the full training data

Tree_Boosting_best.fit(x_train, y_train)

# Pair each column of x_train with the model's feature_importances_ value (stored
# under the name RSS_reduction) and list the features from most to least important.
RSS_reduction = Tree_Boosting_best.feature_importances_
feature_importance_dict = dict(feature_name=x_train.columns, RSS_reduction=RSS_reduction)
feature_importance_df = (
    pd.DataFrame(feature_importance_dict)
    .sort_values(by = 'RSS_reduction', ascending=False)
)
print(feature_importance_df)
# In the recorded run the five most important features were: TotalSqFt_sqrt,
# GarageArea_square, Last_Remod_cbrt, KitchenQual_TA, and Foundation_PConc

In [None]:
# Build the submission file: IDs start at 1461, one per test row, and the
# SalePrice_log predictions are mapped back to a price scale with 10 ** (presumes
# the target was log10-transformed upstream).
predictions3 = Tree_Boosting_best.predict(x_test)
BoostingRFSubmission = pd.DataFrame({'ID': range(1461, 1461 + len(x_test))})
BoostingRFSubmission['SalePrice'] = 10 ** predictions3
BoostingRFSubmission.to_csv('BoostingRFSubmission.csv', index = False)
BoostingRFSubmission
# Kaggle score: 0.14087