In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Dummifying Columns

In [136]:
# reading in train dataset from a CSV in the notebook's working directory
# NOTE(review): the frame displayed at the end of this section carries
# 'Unnamed: 0' and 'Unnamed: 0.1' columns -- presumably row indexes written
# out by earlier to_csv calls; confirm whether they should be dropped here.
train = pd.read_csv('train_set.csv')

In [137]:
# listing the distinct BldgType categories before creating dummies
train.BldgType.unique()

array(['1Fam', 'Duplex', 'Twnhs', 'TwnhsE', '2fmCon'], dtype=object)

In [138]:
# how many rows fall into each building type
train.BldgType.value_counts()

1Fam      1657
TwnhsE     148
Twnhs       70
Duplex      68
2fmCon      41
Name: BldgType, dtype: int64

In [170]:
# one indicator column per building type, kept in its own frame for now
dum_bldgtype = pd.get_dummies(data=train['BldgType'], prefix='BldgType')
dum_bldgtype

Unnamed: 0,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE
0,1,0,0,0,0
1,0,0,1,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
...,...,...,...,...,...
1979,1,0,0,0,0
1980,0,0,0,0,1
1981,1,0,0,0,0
1982,1,0,0,0,0


In [171]:
# column totals of the dummy frame, to compare against the BldgType value counts
dum_bldgtype.sum(axis=0)

BldgType_1Fam      1657
BldgType_2fmCon      41
BldgType_Duplex      68
BldgType_Twnhs       70
BldgType_TwnhsE     148
dtype: int64

In [141]:
# Drop the dummy for the most common building type (BldgType_1Fam) so that it
# acts as the baseline category.
# The result is reassigned and errors='ignore' is passed so the cell can be
# re-run safely: once the column is gone a second run is a no-op rather than
# a KeyError (the original in-place drop failed on re-execution).
baseline_bldgtype = 'BldgType_' + str(train['BldgType'].mode()[0])
dum_bldgtype = dum_bldgtype.drop(columns=baseline_bldgtype, errors='ignore')

In [142]:
# dummy frame after the most common building type column was dropped
dum_bldgtype

Unnamed: 0,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE
0,0,0,0,0
1,0,1,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
1979,0,0,0,0
1980,0,0,0,1
1981,0,0,0,0
1982,0,0,0,0


In [143]:
# Append the BldgType dummy columns to train.
# Dummy columns left in train by an earlier run of this cell are removed
# first, so re-running the cell does not create duplicate column names.
train = pd.concat(
    [train.drop(columns=dum_bldgtype.columns, errors='ignore'), dum_bldgtype],
    axis=1,
)

In [145]:
# HouseStyle is dummified next
# start by looking at how many rows each house style has
train.HouseStyle.value_counts()

1Story    1007
2Story     587
1.5Fin     215
SLvl        88
SFoyer      53
1.5Unf      17
2.5Unf      14
2.5Fin       3
Name: HouseStyle, dtype: int64

In [147]:
# merging 2.5Fin (only 3 rows) into 2Story
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on train['HouseStyle']: that chained in-place
# call acts on a temporary Series, raises a FutureWarning on pandas 2.x and
# leaves train unchanged when Copy-on-Write is enabled.
train['HouseStyle'] = train['HouseStyle'].replace('2.5Fin', '2Story')

In [151]:
# merging 2.5Unf values (14 rows) into 1Story
# NOTE(review): 2.5Fin is folded into 2Story elsewhere in this notebook while
# 2.5Unf goes to 1Story -- confirm this target is intended.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on train['HouseStyle']: that chained in-place
# call acts on a temporary Series, raises a FutureWarning on pandas 2.x and
# leaves train unchanged when Copy-on-Write is enabled.
train['HouseStyle'] = train['HouseStyle'].replace('2.5Unf', '1Story')

In [154]:
# merging 1.5Unf values (17 rows) into 1.5Fin
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on train['HouseStyle']: that chained in-place
# call acts on a temporary Series, raises a FutureWarning on pandas 2.x and
# leaves train unchanged when Copy-on-Write is enabled.
train['HouseStyle'] = train['HouseStyle'].replace('1.5Unf', '1.5Fin')

In [173]:
# house style frequencies after the rare categories were merged
train.HouseStyle.value_counts()

1Story    1021
2Story     590
1.5Fin     232
SLvl        88
SFoyer      53
Name: HouseStyle, dtype: int64

In [159]:
# one indicator column per (binned) house style, kept in its own frame
dum_housestyle = pd.get_dummies(data=train['HouseStyle'], prefix='HouseStyle')
dum_housestyle

Unnamed: 0,HouseStyle_1.5Fin,HouseStyle_1Story,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,0,1,0,0,0
1,1,0,0,0,0
2,0,1,0,0,0
3,1,0,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
1979,1,0,0,0,0
1980,0,0,1,0,0
1981,0,0,1,0,0
1982,0,0,1,0,0


In [160]:
# column totals of the dummy frame, to compare against the binned HouseStyle counts
dum_housestyle.sum(axis=0)

HouseStyle_1.5Fin     232
HouseStyle_1Story    1021
HouseStyle_2Story     590
HouseStyle_SFoyer      53
HouseStyle_SLvl        88
dtype: int64

In [164]:
# Drop the dummy for the most frequent house style (HouseStyle_1Story) so that
# it acts as the baseline category.
# The result is reassigned and errors='ignore' is passed so the cell can be
# re-run safely: once the column is gone a second run is a no-op rather than
# a KeyError (the original in-place drop failed on re-execution).
baseline_housestyle = 'HouseStyle_' + str(train['HouseStyle'].mode()[0])
dum_housestyle = dum_housestyle.drop(columns=baseline_housestyle, errors='ignore')

In [165]:
# dummy frame after the most frequent house style column was dropped
dum_housestyle

Unnamed: 0,HouseStyle_1.5Fin,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,0,0,0,0
1,1,0,0,0
2,0,0,0,0
3,1,0,0,0
4,0,0,0,0
...,...,...,...,...
1979,1,0,0,0
1980,0,1,0,0
1981,0,1,0,0
1982,0,1,0,0


In [168]:
# Append the HouseStyle dummy columns to train.
# Dummy columns left in train by an earlier run of this cell are removed
# first, so re-running the cell does not create duplicate column names.
train = pd.concat(
    [train.drop(columns=dum_housestyle.columns, errors='ignore'), dum_housestyle],
    axis=1,
)

In [174]:
# train dataset including dummy columns for BldgType & HouseStyle
# (the most frequent category of each -- 1Fam and 1Story -- has no dummy column)
train

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,...,tract,income,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,1971,1971,535180070,1236,134450,20,RL,72.0,10011,Pave,...,1.916900e+10,52375.0,0,0,0,0,0,0,0,0
1,1357,1357,908102040,2200,145900,90,RL,67.0,8777,Pave,...,1.916900e+10,33311.0,0,1,0,0,1,0,0,0
2,357,357,906380170,1220,194000,20,RL,64.0,7406,Pave,...,1.916900e+10,60833.0,0,0,0,0,0,0,0,0
3,65,65,909250060,2447,169000,50,RL,50.0,9638,Pave,...,1.916900e+10,20878.0,0,0,0,0,1,0,0,0
4,1094,1094,534276180,1074,124000,20,RL,74.0,7450,Pave,...,1.916900e+10,61991.0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1979,1288,1288,903204030,1435,128250,50,RM,55.0,8674,Pave,...,1.916900e+10,37028.0,0,0,0,0,1,0,0,0
1980,2380,2380,527455080,1456,138000,160,RL,24.0,2364,Pave,...,1.916900e+10,60500.0,0,0,0,1,0,1,0,0
1981,2315,2315,907130110,1499,187000,60,RL,65.0,12438,Pave,...,1.916900e+10,33311.0,0,0,0,0,0,1,0,0
1982,425,425,528480060,2078,339750,60,FV,85.0,10800,Pave,...,1.916900e+10,82985.0,0,0,0,0,0,1,0,0
