In order to prepare the data for a machine learning model, all of the null values need to be dealt with, categorical values need to be encoded, and the skew of the target variable has to be addressed. 

In [1]:
# import packages to be used for data wrangling

import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

In [2]:
# read in data file

df = pd.read_csv('data/train.csv')

In [3]:
# examine whiich columns have null values and the percentage of nulls

df_nulls = (df.isnull().sum() / len(df)) * 100
df_nulls = df_nulls.drop(df_nulls[df_nulls == 0].index).sort_values(ascending=False)
pd.DataFrame(df_nulls)

Unnamed: 0,0
PoolQC,99.520548
MiscFeature,96.30137
Alley,93.767123
Fence,80.753425
FireplaceQu,47.260274
LotFrontage,17.739726
GarageYrBlt,5.547945
GarageType,5.547945
GarageFinish,5.547945
GarageQual,5.547945


The feature for PoolQC has the highest number of nulls. There aren't too many null values in this dataset, so we should be able to impute most of them.

In [4]:
df_imputed = df.copy()

In [5]:
# according to the data description, null values mean that these features are not present. We will fill them with "None"

df_imputed[['PoolQC','MiscFeature','Alley','Fence','FireplaceQu']] = df_imputed[['PoolQC','MiscFeature','Alley','Fence','FireplaceQu']].fillna("None")

In [6]:
# for the lot frontage, the mean value is inserted into the null entries

df_imputed['LotFrontage'] = df_imputed['LotFrontage'].fillna(value=df_imputed.LotFrontage.mean())

In [7]:
# The categorical features for garages were left blank if the house did not have a garage, so they will be filled with "None"

df_imputed[['GarageType','GarageFinish','GarageQual','GarageCond']] = df_imputed[['GarageType','GarageFinish','GarageQual','GarageCond']].fillna("None")

In [8]:
# The numerical features for garages were left blank if the house did not have a garage, so they will be filled with zeroes

df_imputed['GarageYrBlt'] = df_imputed['GarageYrBlt'].fillna(0)

In [9]:
df_imputed[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']] = df_imputed[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']].fillna('None')

In [10]:
df_imputed["MasVnrType"] = df_imputed["MasVnrType"].fillna("None")
df_imputed["MasVnrArea"] = df_imputed["MasVnrArea"].fillna(0)

In [11]:
df_imputed['Electrical'] = df_imputed['Electrical'].fillna(df_imputed['Electrical'].mode()[0])

In [12]:
df_imputed.isnull().sum().sum()

0

Now that there are no null values remaining in the DataFrame, the categorical variables need to be encoded. As some of the categorical variables have ordinality (such as ratings of Good, Fair and Poor), they will be mapped with numerical values that are in order. 

In [13]:
# create dictionary of values for ordinal categorical variables

cond_nums = {'LotShape': {'IR3':0, 'IR2':1, 'IR1':2, 'Reg':3},
              'LandSlope': {'Gtl':0, 'Mod':1, 'Sev':2},
              'ExterQual': {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}, 
              'ExterCond': {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}, 
              'BsmtQual': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}, 
              'BsmtCond': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}, 
              'BsmtExposure': {'None':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4}, 
              'BsmtFinType1': {'None':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6},
              'BsmtFinType2': {'None':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6},
              'HeatingQC': {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}, 
              'CentralAir': {'N':0, 'Y':1},
              'KitchenQual': {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
              'Functional': {'Sal':0, 'Sev':1, 'Maj2':2, 'Maj1':3, 'Mod':4, 'Min2':5, 'Min1':6, 'Typ':7}, 
              'FireplaceQu': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}, 
              'GarageFinish': {'None':0, 'Unf':1, 'RFn':2, 'Fin':3},
              'GarageQual': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}, 
              'GarageCond': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
              'PavedDrive': {'N':0, 'P':1, 'Y':2}, 
              'PoolQC': {'None':0, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}}
                  

In [14]:
# replace ordinal categorical variables using above dictionary

df_imputed.replace(cond_nums, inplace=True)

In [15]:
# check that ordinal values have been converted to integers.

df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null int64
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null int64
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [16]:
# use pd.get_dummies to encode the remaining categorical variables.

df_imputed_cat = pd.get_dummies(df_imputed.select_dtypes(include=[object]))

In [17]:
# examine encoded categorical variables.

pd.options.display.max_columns = None
df_imputed_cat.head()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,Alley_Pave,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_AllPub,Utilities_NoSeWa,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_None,MiscFeature_Gar2,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [18]:
# scale the numerical variables using RobustScaler

df_num = df_imputed.select_dtypes(include=[np.number])

scaler = RobustScaler()

for column in df_num:
    if column != 'SalePrice':
        df_num[column] = scaler.fit_transform(df_num[column].values.reshape(-1, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [19]:
df_num.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MiscVal,MoSold,YrSold,SalePrice
0,-1.0,0.2,-0.265787,-0.254076,0.0,0.0,0.5,0.0,0.652174,0.243243,1.193303,1.0,0.0,0.0,0.0,0.0,0.4,0.45279,0.0,0.0,-0.559829,-0.269652,0.0,0.0,-0.453608,1.173077,0.0,0.38007,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-1.0,-0.5,0.604651,0.0,0.0,0.281573,0.0,0.0,0.0,0.0,0.529412,0.0,0.0,0.0,0.0,0.0,0.0,-1.333333,0.0,208500
1,-0.998629,-0.6,0.523686,0.030015,0.0,0.0,0.0,3.0,0.065217,-0.486486,0.0,0.0,0.0,0.0,0.0,3.0,0.2,0.834679,0.0,0.0,-0.330769,0.538308,0.0,0.0,0.343643,0.0,0.0,-0.31209,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,-0.023256,0.0,0.0,-0.082816,0.0,0.0,0.0,1.77381,-0.367647,0.0,0.0,0.0,0.0,0.0,0.0,-0.333333,-0.5,181500
2,-0.997258,0.2,-0.107893,0.437624,-1.0,0.0,0.5,0.0,0.608696,0.216216,0.986301,1.0,0.0,0.0,0.0,1.0,0.4,0.14391,0.0,0.0,-0.074359,-0.142289,0.0,0.0,-0.327933,1.18956,0.0,0.497489,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.25,0.55814,0.0,0.0,0.530021,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,223500
3,-0.995888,0.4,-0.528945,0.017663,-1.0,0.0,0.5,0.0,-1.26087,-0.648649,0.0,0.0,0.0,-1.0,1.0,0.0,0.2,-0.23517,0.0,0.0,0.106838,-0.468657,-0.5,0.0,-0.247423,1.038462,0.0,0.390885,1.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.5,0.488372,-1.0,1.0,0.670807,0.0,0.0,0.0,0.0,0.147059,272.0,0.0,0.0,0.0,0.0,0.0,-1.333333,-1.0,140000
4,-0.994517,0.2,0.734213,1.181201,-1.0,0.0,1.0,0.0,0.586957,0.162162,2.130898,1.0,0.0,0.0,0.0,2.0,0.4,0.381186,0.0,0.0,0.021368,0.305473,0.0,0.0,0.113893,1.446429,0.0,1.134029,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.5,0.0,0.0,0.25,0.534884,0.0,1.0,1.47412,0.0,0.0,0.0,1.142857,0.867647,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,250000


In [20]:
# join the numerical values and encoded categorical values back together into the same DataFrame

df_imputed = df_num.join(df_imputed_cat, how='left')

In [21]:
df_imputed.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MiscVal,MoSold,YrSold,SalePrice,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,Alley_Pave,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_AllPub,Utilities_NoSeWa,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_None,MiscFeature_Gar2,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-1.0,0.2,-0.265787,-0.254076,0.0,0.0,0.5,0.0,0.652174,0.243243,1.193303,1.0,0.0,0.0,0.0,0.0,0.4,0.45279,0.0,0.0,-0.559829,-0.269652,0.0,0.0,-0.453608,1.173077,0.0,0.38007,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-1.0,-0.5,0.604651,0.0,0.0,0.281573,0.0,0.0,0.0,0.0,0.529412,0.0,0.0,0.0,0.0,0.0,0.0,-1.333333,0.0,208500,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,-0.998629,-0.6,0.523686,0.030015,0.0,0.0,0.0,3.0,0.065217,-0.486486,0.0,0.0,0.0,0.0,0.0,3.0,0.2,0.834679,0.0,0.0,-0.330769,0.538308,0.0,0.0,0.343643,0.0,0.0,-0.31209,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,-0.023256,0.0,0.0,-0.082816,0.0,0.0,0.0,1.77381,-0.367647,0.0,0.0,0.0,0.0,0.0,0.0,-0.333333,-0.5,181500,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,-0.997258,0.2,-0.107893,0.437624,-1.0,0.0,0.5,0.0,0.608696,0.216216,0.986301,1.0,0.0,0.0,0.0,1.0,0.4,0.14391,0.0,0.0,-0.074359,-0.142289,0.0,0.0,-0.327933,1.18956,0.0,0.497489,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.25,0.55814,0.0,0.0,0.530021,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,223500,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,-0.995888,0.4,-0.528945,0.017663,-1.0,0.0,0.5,0.0,-1.26087,-0.648649,0.0,0.0,0.0,-1.0,1.0,0.0,0.2,-0.23517,0.0,0.0,0.106838,-0.468657,-0.5,0.0,-0.247423,1.038462,0.0,0.390885,1.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.5,0.488372,-1.0,1.0,0.670807,0.0,0.0,0.0,0.0,0.147059,272.0,0.0,0.0,0.0,0.0,0.0,-1.333333,-1.0,140000,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,-0.994517,0.2,0.734213,1.181201,-1.0,0.0,1.0,0.0,0.586957,0.162162,2.130898,1.0,0.0,0.0,0.0,2.0,0.4,0.381186,0.0,0.0,0.021368,0.305473,0.0,0.0,0.113893,1.446429,0.0,1.134029,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.5,0.0,0.0,0.25,0.534884,0.0,1.0,1.47412,0.0,0.0,0.0,1.142857,0.867647,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,250000,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [22]:
# finally, because we noticed the right-skew of the SalePrice column, we will log the values in that column

df_imputed['SalePrice'] = np.log(df_imputed['SalePrice'])

In [23]:
# outliers are dropped from the DataFrame

df_imputed = df_imputed.drop([523, 1298])
df_imputed = df_imputed.drop('Id', axis=1)

In [24]:
# examine the final DataFrame

df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1458 entries, 0 to 1459
Columns: 230 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(56), uint8(174)
memory usage: 897.0 KB


In [None]:
# write the final DataFrame to a csv to be used in the machine learning model

df_imputed.to_csv('Data/train_imputed.csv', index=False)