### Summary

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

pd.set_option('display.max_columns', 500)

In [2]:
data_path = "../data"
file = "train"

df = pd.read_csv("{}/{}_wrangled.csv".format(data_path, file))

y = df["SalePrice_log"]

df.drop(labels=["Id", "SalePrice", "SalePrice_log"], axis=1, inplace=True)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,4.189655,9.04204,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,5.283204,Gd,TA,PConc,Gd,TA,No,GLQ,6.561031,Unf,0.0,5.01728,6.753438,GasA,Ex,Y,SBrkr,6.753438,6.751101,0.0,7.444833,1,0.0,2,1,3,0.693147,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0.0,4.127134,0.0,0.0,0.0,0.0,,,,0.0,2,2008,WD,Normal
1,20,RL,4.394449,9.169623,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,6.886532,Unf,0.0,5.652489,7.141245,GasA,Ex,Y,SBrkr,7.141245,0.0,0.0,7.141245,0,0.693147,2,0,3,0.693147,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,5.700444,0.0,0.0,0.0,0.0,0.0,,,,0.0,5,2007,WD,Normal
2,60,RL,4.234107,9.328212,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,5.09375,Gd,TA,PConc,Gd,TA,Mn,GLQ,6.188264,Unf,0.0,6.075346,6.82546,GasA,Ex,Y,SBrkr,6.82546,6.765039,0.0,7.488294,1,0.0,2,1,3,0.693147,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0.0,3.7612,0.0,0.0,0.0,0.0,,,,0.0,9,2008,WD,Normal
3,70,RL,4.110874,9.164401,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,5.379897,Unf,0.0,6.293419,6.629363,GasA,Gd,Y,SBrkr,6.869014,6.629363,0.0,7.448916,1,0.0,1,0,3,0.693147,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0.0,3.583519,5.609472,0.0,0.0,0.0,,,,0.0,2,2006,WD,Abnorml
4,60,RL,4.442651,9.565284,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,5.860786,Gd,TA,PConc,Gd,TA,Av,GLQ,6.486161,Unf,0.0,6.196444,7.044033,GasA,Ex,Y,SBrkr,7.044033,6.960348,0.0,7.695758,1,0.0,2,1,4,0.693147,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,5.26269,4.442651,0.0,0.0,0.0,0.0,,,,0.0,12,2008,WD,Normal


In [3]:
df.loc[:, ['MSSubClass', 'OverallQual', 'OverallCond', 
               'MoSold', 'YrSold']] = df.loc[:,[
    'MSSubClass', 'OverallQual', 
    'OverallCond', 'MoSold', 'YrSold']].astype('object')

In [4]:
df.dtypes.value_counts()

object     48
float64    21
int64      10
dtype: int64

In [5]:
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

In [6]:
df[["TotalSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF"]].head()

Unnamed: 0,TotalSF,TotalBsmtSF,1stFlrSF,2ndFlrSF
0,20.257977,6.753438,6.753438,6.751101
1,14.28249,7.141245,7.141245,0.0
2,20.415959,6.82546,6.82546,6.765039
3,20.127741,6.629363,6.869014,6.629363
4,21.048414,7.044033,7.044033,6.960348


In [7]:
df.dtypes.value_counts()

object     48
float64    22
int64      10
dtype: int64

In [8]:
continuous_df = df.select_dtypes(include = ['int64', 'float64'])
continuous_df.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,TotalSF
0,4.189655,9.04204,2003,2003,5.283204,6.561031,0.0,5.01728,6.753438,6.753438,6.751101,0.0,7.444833,1,0.0,2,1,3,0.693147,8,0,2003.0,2,548,0.0,4.127134,0.0,0.0,0.0,0.0,0.0,20.257977
1,4.394449,9.169623,1976,1976,0.0,6.886532,0.0,5.652489,7.141245,7.141245,0.0,0.0,7.141245,0,0.693147,2,0,3,0.693147,6,1,1976.0,2,460,5.700444,0.0,0.0,0.0,0.0,0.0,0.0,14.28249
2,4.234107,9.328212,2001,2002,5.09375,6.188264,0.0,6.075346,6.82546,6.82546,6.765039,0.0,7.488294,1,0.0,2,1,3,0.693147,6,1,2001.0,2,608,0.0,3.7612,0.0,0.0,0.0,0.0,0.0,20.415959
3,4.110874,9.164401,1915,1970,0.0,5.379897,0.0,6.293419,6.629363,6.869014,6.629363,0.0,7.448916,1,0.0,1,0,3,0.693147,7,1,1998.0,3,642,0.0,3.583519,5.609472,0.0,0.0,0.0,0.0,20.127741
4,4.442651,9.565284,2000,2000,5.860786,6.486161,0.0,6.196444,7.044033,7.044033,6.960348,0.0,7.695758,1,0.0,2,1,4,0.693147,9,1,2000.0,3,836,5.26269,4.442651,0.0,0.0,0.0,0.0,0.0,21.048414


__Robust Scaling__

You can check out the different types of scalers visually [here](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py)

In [9]:
# Checking what RobustScaler does

# robust_scaler = RobustScaler()
# robust_scaler.fit(continuous_df)
# pd.DataFrame(robust_scaler.transform(continuous_df), 
#              columns=continuous_df.columns)

df_cont_rob_scl = pd.DataFrame(RobustScaler().fit_transform(continuous_df), 
                                columns=continuous_df.columns, 
                               index=continuous_df.index
                              )
df_cont_rob_scl.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,TotalSF
0,-0.320894,-0.26766,0.652174,0.243243,1.03441,0.09271,0.0,-0.898157,-0.300181,-0.524119,1.024186,0.0,0.342891,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.0,0.589744,0.0,0.281573,0.0,0.205247,0.0,0.0,0.0,0.0,0.0,0.894401
1,0.434379,0.029682,0.065217,-0.486486,0.0,0.142255,0.0,-0.403505,0.492878,0.327547,0.0,0.0,-0.327743,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,-0.102564,0.0,-0.082816,1.11122,-0.769489,0.0,0.0,0.0,0.0,0.0,-0.089952
2,-0.156958,0.399288,0.608696,0.216216,0.997317,0.035971,0.0,-0.074216,-0.152897,-0.36595,1.026301,0.0,0.438896,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.538462,0.0,0.530021,0.0,0.118822,0.0,0.0,0.0,0.0,0.0,0.920426
3,-0.611435,0.017513,-1.26087,-0.648649,0.0,-0.087072,0.0,0.095602,-0.553912,-0.2703,1.005718,0.0,0.35191,1.0,0.0,-1.0,0.0,0.0,0.0,0.5,0.0,0.461538,1.0,0.670807,0.0,0.076857,5.609472,0.0,0.0,0.0,0.0,0.872947
4,0.612146,0.951802,0.586957,0.162162,1.147496,0.081314,0.0,0.020085,0.294081,0.114059,1.05593,0.0,0.89719,1.0,0.0,0.0,1.0,1.0,0.0,1.5,0.0,0.512821,1.0,1.47412,1.025886,0.279765,0.0,0.0,0.0,0.0,0.0,1.024611


In [10]:
df[continuous_df.columns] = df_cont_rob_scl
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,TotalSF
0,60,RL,-0.320894,-0.26766,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,0.652174,0.243243,Gable,CompShg,VinylSd,VinylSd,BrkFace,1.03441,Gd,TA,PConc,Gd,TA,No,GLQ,0.09271,Unf,0.0,-0.898157,-0.300181,GasA,Ex,Y,SBrkr,-0.524119,1.024186,0.0,0.342891,1.0,0.0,0.0,1.0,0.0,0.0,Gd,1.0,Typ,-1.0,,Attchd,0.589744,RFn,0.0,0.281573,TA,TA,Y,0.0,0.205247,0.0,0.0,0.0,0.0,,,,0.0,2,2008,WD,Normal,0.894401
1,20,RL,0.434379,0.029682,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,0.065217,-0.486486,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,0.142255,Unf,0.0,-0.403505,0.492878,GasA,Ex,Y,SBrkr,0.327547,0.0,0.0,-0.327743,0.0,0.693147,0.0,0.0,0.0,0.0,TA,0.0,Typ,0.0,TA,Attchd,-0.102564,RFn,0.0,-0.082816,TA,TA,Y,1.11122,-0.769489,0.0,0.0,0.0,0.0,,,,0.0,5,2007,WD,Normal,-0.089952
2,60,RL,-0.156958,0.399288,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,0.608696,0.216216,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.997317,Gd,TA,PConc,Gd,TA,Mn,GLQ,0.035971,Unf,0.0,-0.074216,-0.152897,GasA,Ex,Y,SBrkr,-0.36595,1.026301,0.0,0.438896,1.0,0.0,0.0,1.0,0.0,0.0,Gd,0.0,Typ,0.0,TA,Attchd,0.538462,RFn,0.0,0.530021,TA,TA,Y,0.0,0.118822,0.0,0.0,0.0,0.0,,,,0.0,9,2008,WD,Normal,0.920426
3,70,RL,-0.611435,0.017513,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,-1.26087,-0.648649,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,-0.087072,Unf,0.0,0.095602,-0.553912,GasA,Gd,Y,SBrkr,-0.2703,1.005718,0.0,0.35191,1.0,0.0,-1.0,0.0,0.0,0.0,Gd,0.5,Typ,0.0,Gd,Detchd,0.461538,Unf,1.0,0.670807,TA,TA,Y,0.0,0.076857,5.609472,0.0,0.0,0.0,,,,0.0,2,2006,WD,Abnorml,0.872947
4,60,RL,0.612146,0.951802,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,0.586957,0.162162,Gable,CompShg,VinylSd,VinylSd,BrkFace,1.147496,Gd,TA,PConc,Gd,TA,Av,GLQ,0.081314,Unf,0.0,0.020085,0.294081,GasA,Ex,Y,SBrkr,0.114059,1.05593,0.0,0.89719,1.0,0.0,0.0,1.0,1.0,0.0,Gd,1.5,Typ,0.0,TA,Attchd,0.512821,RFn,1.0,1.47412,TA,TA,Y,1.025886,0.279765,0.0,0.0,0.0,0.0,,,,0.0,12,2008,WD,Normal,1.024611


__Nominal and ordinal encoding of variables__

We handle ordinal variables manually, and subset the remaining nominal variables to be one-hot encoded

In [11]:
df.dtypes.value_counts()

object     48
float64    32
dtype: int64

In [12]:
df.LotShape.replace(to_replace = ['IR3', 'IR2', 'IR1', 'Reg'], value = [0, 1, 2, 3], inplace = True)
df.LandContour.replace(to_replace = ['Low', 'Bnk', 'HLS', 'Lvl'], value = [0, 1, 2, 3], inplace = True)
df.Utilities.replace(to_replace = ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'], value = [0, 1, 2, 3], inplace = True)
df.LandSlope.replace(to_replace = ['Sev', 'Mod', 'Gtl'], value = [0, 1, 2], inplace = True)

df.ExterQual.replace(to_replace = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], value = [0, 1, 2, 3, 4], inplace = True)
df.ExterCond.replace(to_replace = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], value = [0, 1, 2, 3, 4], inplace = True)

df.BsmtQual.replace(to_replace = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], value = [0, 1, 2, 3, 4, 5], inplace = True)
df.BsmtExposure.replace(to_replace = ['None', 'No', 'Mn', 'Av', 'Gd'], value = [0, 1, 2, 3, 4], inplace = True)
df.BsmtFinType1.replace(to_replace = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], value = [0, 1, 2, 3, 4, 5, 6], inplace = True)
df.BsmtFinType2.replace(to_replace = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], value = [0, 1, 2, 3, 4, 5, 6], inplace = True)
df.BsmtCond.replace(to_replace = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], value = [0, 1, 2, 3, 4, 5], inplace = True)

df.HeatingQC.replace(to_replace = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], value = [0, 1, 2, 3, 4], inplace = True)
df.Electrical.replace(to_replace = ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'], value = [0, 1, 2, 3, 4], inplace = True)
df.KitchenQual.replace(to_replace = ['Po', 'Fa', 'TA', 'Gd', 'Ex'], value = [0, 1, 2, 3, 4], inplace = True)
df.Functional.replace(to_replace = ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'], value = [0, 1, 2, 3, 4, 5, 6, 7], inplace = True)
df.FireplaceQu.replace(to_replace =  ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], value = [0, 1, 2, 3, 4, 5], inplace = True)

df.GarageFinish.replace(to_replace =  ['None', 'Unf', 'RFn', 'Fin'], value = [0, 1, 2, 3], inplace = True)
df.GarageQual.replace(to_replace =  ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], value = [0, 1, 2, 3, 4, 5], inplace = True)
df.GarageCond.replace(to_replace =  ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], value = [0, 1, 2, 3, 4, 5], inplace = True)
df.PavedDrive.replace(to_replace =  ['N', 'P', 'Y'], value = [0, 1, 2], inplace = True)

df.PoolQC.replace(to_replace =  ['None', 'Fa', 'TA', 'Gd', 'Ex'], value = [0, 1, 2, 3, 4], inplace = True)
df.Fence.replace(to_replace =  ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'], value = [0, 1, 2, 3, 4], inplace = True)

In [13]:
df.BsmtCond.unique()

array([3, 4, 0, 2, 1], dtype=object)

This is a weird bug... for some reason BsmtCond is the only one left behind as an object after the encoding.

In [14]:
df["BsmtCond"] = df["BsmtCond"].astype('int64')

In [15]:
df.dtypes.value_counts()

float64    32
object     26
int64      22
dtype: int64

We also make sure that any nominal variables are converted back to integers before we one-hot encode.

In [16]:
df["OverallQual"] = df["OverallQual"].astype('int64')
df["OverallCond"] = df["OverallCond"].astype('int64')

In [17]:
one_hot_var = df.select_dtypes(include="object").columns
print(len(one_hot_var))
one_hot_var

24


Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotConfig',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'Foundation', 'Heating', 'CentralAir', 'GarageType', 'MiscFeature',
       'MoSold', 'YrSold', 'SaleType', 'SaleCondition'],
      dtype='object')

In [18]:
one_hot_df = pd.get_dummies(df.select_dtypes(include="object"))
print(one_hot_df.shape)
one_hot_df.head()

(1460, 192)


Unnamed: 0,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,Alley_Pave,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,CentralAir_N,CentralAir_Y,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,MiscFeature_Gar2,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,MoSold_1,MoSold_2,MoSold_3,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,MoSold_10,MoSold_11,MoSold_12,YrSold_2006,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [19]:
df_processed = pd.concat([df, one_hot_df], axis=1)
print(df_processed.shape)
df_processed.drop(labels=one_hot_var, axis=1, inplace=True)
print(df_processed.shape) # Should be dropping 25 columns
df_processed.head()

(1460, 272)
(1460, 248)


Unnamed: 0,LotFrontage,LotArea,LotShape,LandContour,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscVal,TotalSF,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,Alley_Pave,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,CentralAir_N,CentralAir_Y,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,MiscFeature_Gar2,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,MoSold_1,MoSold_2,MoSold_3,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,MoSold_10,MoSold_11,MoSold_12,YrSold_2006,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.320894,-0.26766,3,3,3,2,7,5,0.652174,0.243243,1.03441,3,2,4,3,1,6,0.09271,1,0.0,-0.898157,-0.300181,4,4,-0.524119,1.024186,0.0,0.342891,1.0,0.0,0.0,1.0,0.0,0.0,3,1.0,7,-1.0,0,0.589744,2,0.0,0.281573,3,3,2,0.0,0.205247,0.0,0.0,0.0,0.0,0,0,0.0,0.894401,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,0.434379,0.029682,3,3,3,2,6,8,0.065217,-0.486486,0.0,2,2,4,3,4,5,0.142255,1,0.0,-0.403505,0.492878,4,4,0.327547,0.0,0.0,-0.327743,0.0,0.693147,0.0,0.0,0.0,0.0,2,0.0,7,0.0,3,-0.102564,2,0.0,-0.082816,3,3,2,1.11122,-0.769489,0.0,0.0,0.0,0.0,0,0,0.0,-0.089952,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,-0.156958,0.399288,2,3,3,2,7,5,0.608696,0.216216,0.997317,3,2,4,3,2,6,0.035971,1,0.0,-0.074216,-0.152897,4,4,-0.36595,1.026301,0.0,0.438896,1.0,0.0,0.0,1.0,0.0,0.0,3,0.0,7,0.0,3,0.538462,2,0.0,0.530021,3,3,2,0.0,0.118822,0.0,0.0,0.0,0.0,0,0,0.0,0.920426,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,-0.611435,0.017513,2,3,3,2,7,5,-1.26087,-0.648649,0.0,2,2,3,4,1,5,-0.087072,1,0.0,0.095602,-0.553912,3,4,-0.2703,1.005718,0.0,0.35191,1.0,0.0,-1.0,0.0,0.0,0.0,3,0.5,7,0.0,4,0.461538,1,1.0,0.670807,3,3,2,0.0,0.076857,5.609472,0.0,0.0,0.0,0,0,0.0,0.872947,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,0.612146,0.951802,2,3,3,2,8,5,0.586957,0.162162,1.147496,3,2,4,3,3,6,0.081314,1,0.0,0.020085,0.294081,4,4,0.114059,1.05593,0.0,0.89719,1.0,0.0,0.0,1.0,1.0,0.0,3,1.5,7,0.0,3,0.512821,2,1.0,1.47412,3,3,2,1.025886,0.279765,0.0,0.0,0.0,0.0,0,0,0.0,1.024611,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [20]:
X = df_processed
print(X.shape)

(1460, 248)


In [24]:
df_processed.dtypes.value_counts()

uint8      192
float64     32
int64       24
dtype: int64

### Save the Dataset 

In [28]:
df_processed.to_csv("{}/{}_final.csv".format(data_path, file), index=False)