In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import PolynomialFeatures, StandardScaler

import seaborn as sns

In [2]:
# Read the cleaned training and test CSVs, using each file's `id` column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../../data/cleaned/ames_clean.csv',
        '../../data/cleaned/ames_clean_test.csv',
    )
)

In [3]:
# Display the column labels of the test frame to see which features were loaded.
test.columns

Index(['ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area', 'street',
       'alley', 'lot_shape', 'land_contour', 'utilities', 'lot_config',
       'land_slope', 'neighborhood', 'condition_1', 'condition_2', 'bldg_type',
       'house_style', 'overall_qual', 'overall_cond', 'year_built',
       'year_remod/add', 'roof_style', 'roof_matl', 'exterior_1st',
       'exterior_2nd', 'mas_vnr_type', 'mas_vnr_area', 'exter_qual',
       'exter_cond', 'foundation', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure',
       'bsmtfin_type_1', 'bsmtfin_sf_1', 'bsmtfin_type_2', 'bsmtfin_sf_2',
       'bsmt_unf_sf', 'total_bsmt_sf', 'heating', 'heating_qc', 'central_air',
       'electrical', '1st_flr_sf', '2nd_flr_sf', 'low_qual_fin_sf',
       'gr_liv_area', 'bsmt_full_bath', 'bsmt_half_bath', 'full_bath',
       'half_bath', 'bedroom_abvgr', 'kitchen_abvgr', 'kitchen_qual',
       'totrms_abvgrd', 'functional', 'fireplaces', 'fireplace_qu',
       'garage_type', 'garage_finish', 'garage_cars', 'garage

In [4]:
# Store ms_subclass as object dtype in both frames so that the later
# object / non-object dtype split routes it to the categorical side.
train = train.astype({'ms_subclass': 'object'})
test = test.astype({'ms_subclass': 'object'})

In [5]:
# Column labels ordered by the absolute value of their correlation with
# saleprice, strongest first (saleprice itself comes first with |r| = 1).
# Only numeric columns are correlated: pandas >= 2.0 no longer silently skips
# object-dtype columns in DataFrame.corr() and raises on them instead.
sort_index_abs = (
    train.select_dtypes(include='number')
    .corr()
    .abs()
    .saleprice
    .sort_values(ascending=False)
    .index
)

In [6]:
# Signed correlation of every numeric column with saleprice, listed in the
# |r|-descending order held in `sort_index_abs`.
# The frame is restricted to numeric columns first because pandas >= 2.0
# raises when DataFrame.corr() meets object-dtype columns.
train.select_dtypes(include='number').corr().saleprice.loc[sort_index_abs]

saleprice          1.000000
overall_qual       0.800429
gr_liv_area        0.699867
garage_area        0.648242
garage_cars        0.646390
total_bsmt_sf      0.633092
1st_flr_sf         0.623714
year_built         0.571449
year_remod/add     0.548528
full_bath          0.538474
mas_vnr_area       0.512396
totrms_abvgrd      0.506497
fireplaces         0.470803
bsmtfin_sf_1       0.424383
wood_deck_sf       0.329595
open_porch_sf      0.326517
lot_frontage       0.323365
lot_area           0.295914
bsmt_full_bath     0.284993
half_bath          0.280894
2nd_flr_sf         0.251503
bsmt_unf_sf        0.189162
screen_porch       0.139539
enclosed_porch    -0.138406
bedroom_abvgr      0.137738
kitchen_abvgr     -0.127242
overall_cond      -0.093804
3ssn_porch         0.049951
bsmt_half_bath    -0.043833
low_qual_fin_sf   -0.040681
mo_sold            0.025572
pool_area          0.023794
bsmtfin_sf_2       0.017599
yr_sold           -0.013066
misc_val          -0.008001
Name: saleprice, dtype: float64

In [7]:
# The 20 features most strongly correlated with saleprice. Position 0 is
# saleprice itself, hence the positional slice 1:21.
# Numeric columns only: pandas >= 2.0 raises when DataFrame.corr() meets
# object-dtype columns.
(
    train.select_dtypes(include='number')
    .corr()
    .saleprice
    .loc[sort_index_abs]
    .iloc[1:21]
)

overall_qual      0.800429
gr_liv_area       0.699867
garage_area       0.648242
garage_cars       0.646390
total_bsmt_sf     0.633092
1st_flr_sf        0.623714
year_built        0.571449
year_remod/add    0.548528
full_bath         0.538474
mas_vnr_area      0.512396
totrms_abvgrd     0.506497
fireplaces        0.470803
bsmtfin_sf_1      0.424383
wood_deck_sf      0.329595
open_porch_sf     0.326517
lot_frontage      0.323365
lot_area          0.295914
bsmt_full_bath    0.284993
half_bath         0.280894
2nd_flr_sf        0.251503
Name: saleprice, dtype: float64

In [8]:
# Set the target aside before it is removed from the feature frame.
train_saleprice = train.saleprice

# Labels of the 20 features ranked highest by |correlation| with saleprice.
# `sort_index_abs` already holds that ranking (position 0 is saleprice itself),
# so the correlation matrix does not need to be recomputed.
top_corr_features = sort_index_abs[1:21]

# NOTE(review): this *removes* the 20 features most correlated with the target
# and keeps the rest — confirm that is intended rather than the opposite.
# `columns=` replaces the positional `axis` argument, which pandas 2.0 removed.
train = train.drop(columns=[*top_corr_features, 'saleprice'])

# Give the test frame exactly the training feature columns, in the same order.
# saleprice was dropped above, so no extra filter on it is needed.
test = test[train.columns]

In [9]:
# Partition each frame by dtype: object columns go to the *_cat frames,
# everything else to the *_num frames.
train_cat, test_cat = (
    frame.select_dtypes(include=['object']) for frame in (train, test)
)
train_num, test_num = (
    frame.select_dtypes(exclude=['object']) for frame in (train, test)
)
train_cat.shape, test_cat.shape, train_num.shape, test_num.shape

((2016, 43), (879, 43), (2016, 14), (879, 14))

In [10]:
# Polynomial feature expander with scikit-learn's default settings written out.
pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)


In [11]:
# Expand the numeric features with the `pf` transformer. The result columns
# are unlabeled (0..n-1) because only the raw value arrays are passed in.
# That makes column *position* the only link between the two sets, so both
# frames must list their columns in the same order.
assert list(train_num.columns) == list(test_num.columns), \
    "train/test numeric columns differ in content or order"

train_num = pd.DataFrame(pf.fit_transform(train_num.values), index=train_num.index)
# Apply the transformer fitted on the training data; never refit on the test set.
test_num = pd.DataFrame(pf.transform(test_num.values), index=test_num.index)
train_num.shape, test_num.shape

((2016, 120), (879, 120))

In [12]:
# One-hot encode the categorical frames. Each set is encoded on its own, so
# the resulting dummy columns can differ between train and test.
train_cat, test_cat = (
    pd.get_dummies(frame) for frame in (train_cat, test_cat)
)
train_cat.shape, test_cat.shape

((2016, 281), (879, 264))

In [13]:
# Remove dummy columns whose category appears only in the test set, since
# the training frame has no matching column.
# A single drop(columns=...) replaces the loop of in-place drops that passed
# `axis` positionally, a call form pandas 2.0 removed.
test_only_cols = [col for col in test_cat.columns if col not in train_cat.columns]
test_cat = test_cat.drop(columns=test_only_cols)
train_cat.shape, test_cat.shape

((2016, 281), (879, 255))

In [14]:
# Give test_cat exactly the training dummy columns, in the training ORDER.
# Categories missing from the test set become all-zero columns.
# Appending the missing columns one by one would put them at the end of
# test_cat, leaving its column order different from train_cat. Later steps
# discard the labels and rely on position, so that misaligns the features.
test_cat = test_cat.reindex(columns=train_cat.columns, fill_value=0)
train_cat.shape, test_cat.shape

((2016, 281), (879, 281))

In [15]:
# Recombine the numeric and dummy-encoded frames of each set, matching rows
# on the index (left join: every row of the numeric frame is kept).
train, test = (
    numeric.join(dummies, how='left')
    for numeric, dummies in ((train_num, train_cat), (test_num, test_cat))
)

In [16]:
ss = StandardScaler()

# The scaled frames are rebuilt without column labels, so position is all
# that ties a test column to its training counterpart. Enforce the training
# column order first; this raises KeyError if a column is missing from test.
test = test[train.columns]

# Fit the scaler on the training data only and reuse those statistics for the
# test set. Refitting on test would standardise it with its own mean and
# variance, putting the two sets on different scales.
# Plain float arrays are passed because the joined frames carry a mix of
# integer and string column labels, which newer scikit-learn releases reject
# during feature-name validation.
train = pd.DataFrame(ss.fit_transform(train.values.astype(float)), index=train.index)
test = pd.DataFrame(ss.transform(test.values.astype(float)), index=test.index)

In [17]:
# Re-attach the unscaled target, saved before it was dropped, as a
# `SalePrice` column; rows are matched on the index.
train = train.assign(SalePrice=train_saleprice)

In [18]:
# Sanity check: train should have exactly one more column than test (the SalePrice target).
train.shape, test.shape

((2016, 402), (879, 401))

In [19]:
# Write the engineered sets to disk; the row index is saved as the first CSV column.
train.to_csv(path_or_buf='../../data/engineered/training.csv', index=True)
test.to_csv(path_or_buf='../../data/engineered/test.csv', index=True)