# W207-Spring 2017: Final Project :: Feature Eng. (Revised)
### Danish Iqbal, KC Tobin, Jason Vantomme

___________

As noted in the previous notebook, we realized that what we thought was useful feature engineering work was actually making performance worse, so we hit the reset button on feature engineering.  In effect, the process was: "comment out everything we've done, add it back piece by piece, question our assumptions, adjust where needed, then measure against our models."  

Through this rebuilding process, we not only dramatically improved model performance, but we also decreased the size and complexity of the feature engineering code by at least 50%.

In [284]:
# ENVIRONMENT SETUP

# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import datetime as dt
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

# Silence every warning for the rest of the session.  Note that this also
# hides numerical warnings (e.g. invalid values passed to np.log1p).
import warnings
warnings.simplefilter('ignore')

# pandas display settings: 6 significant decimals, no cap on the number of
# rows/columns shown, and wide cells (up to 1000 characters) left untruncated.
pd.options.display.precision = 6
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 1000

In [285]:
# Load the raw train/test splits (comma-delimited, header on the first row).
train = pd.read_csv('data/train.csv',delimiter=',',header=0)
test = pd.read_csv('data/test.csv',delimiter=',',header=0)

# Impute missing numeric values in BOTH splits with the per-column medians of
# the training data, so the two frames are filled by the same rule and test
# rows are never imputed from test-set statistics.  (Previously train used its
# own median while test used its own mean.)  Labels present in the fill Series
# but absent from a frame (e.g. SalePrice for test) are simply ignored.
numericFillValues = train.median(numeric_only=True)
train = train.fillna(numericFillValues)
test = test.fillna(numericFillValues)

# One-hot encode the remaining non-numeric columns.  Each split is encoded
# independently, so the resulting dummy columns can differ between them.
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# log1p-transform every column of both frames (this includes Id, the dummy
# columns and, for train, SalePrice).  Everything downstream therefore works
# on the log1p scale.
train = np.log1p(train)
test = np.log1p(test)

# Single-argument parenthesised prints emit the same text under Python 2 and 3.
print("TRAIN:  %s" % (train.shape,))
print("TEST:  %s" % (test.shape,))

TRAIN:  (1460, 290)
TEST:  (1459, 271)


In [286]:
# Summary statistics for every column of the transformed training frame.
print(train.describe(include='all'))

                Id   MSSubClass  LotFrontage      LotArea  OverallQual  \
count  1460.000000  1460.000000  1460.000000  1460.000000  1460.000000   
mean      6.294307     3.818794     4.214451     9.110966     1.940309   
std       0.975566     0.688354     0.314395     0.517369     0.201983   
min       0.693147     3.044522     3.091042     7.170888     0.693147   
25%       5.904680     3.044522     4.110874     8.929898     1.791759   
50%       6.595097     3.931826     4.248495     9.156887     1.945910   
75%       6.999650     4.262680     4.382027     9.358976     2.079442   
max       7.286876     5.252273     5.749393    12.279537     2.397895   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  \
count  1460.000000  1460.000000   1460.000000  1460.000000  1460.000000   
mean      1.869266     7.586821      7.593756     2.120264     4.229731   
std       0.168400     0.015389      0.010424     2.628759     2.992052   
min       0.693147     7.535297  

In [287]:
##
## FEATURE CREATION
##

# NOTE: `train` and `test` reach this cell already log1p-transformed column by
# column.  Raw-unit arithmetic therefore has to undo that transform first
# (np.expm1), and raw-unit thresholds have to be moved onto the log1p scale
# before they are compared against the stored values.  Previously the sums and
# differences were taken directly on log-scale values, and the thresholds
# (2 and 800) were compared against log-scale values, which made
# HasManyFireplaces / HasLargeGarage False for every row.

manyFireplaces = 2      # raw Fireplaces count; flag rows strictly above this
largeGarageSize = 800   # raw GarageArea value; flag rows strictly above this

porchColumns = ["WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch"]


def add_engineered_features(frame):
    """Add the aggregate and binary feature columns to `frame` in place.

    `frame` must hold log1p-transformed values.  Aggregate columns are
    computed in raw units and stored back on the log1p scale; binary columns
    are stored as booleans.

    Returns the same `frame` object for convenience.
    """
    def raw(column):
        # Undo the earlier log1p so values can be added/subtracted in raw units.
        return np.expm1(frame[column])

    #
    #create aggregate columns
    otherRooms = raw("TotRmsAbvGrd") - (raw("BedroomAbvGr") + raw("KitchenAbvGr"))
    # Clip at zero so floating-point round-trip error (or an inconsistent row)
    # can never hand log1p a negative room count.
    frame["OtherRmsAbvGr"] = np.log1p(otherRooms.clip(lower=0))
    #
    frame["BsmtFinSF"] = np.log1p(raw("BsmtFinSF1") + raw("BsmtFinSF2"))
    #
    frame["TotalPorchSqFt"] = np.log1p(np.expm1(frame[porchColumns]).sum(axis=1))

    #create binary columns
    # log1p(x) > 0 exactly when x > 0, so these can be tested on the stored values.
    for flagName, sourceColumn in [("HasBsmt", "TotalBsmtSF"),
                                   ("Has2ndFlr", "2ndFlrSF"),
                                   ("HasPool", "PoolArea"),
                                   ("HasMiscVal", "MiscVal"),
                                   ("HasLowQualFinSF", "LowQualFinSF")]:
        frame[flagName] = frame[sourceColumn] > 0

    # Compare on the log1p scale: log1p is monotonic, and a value exactly at the
    # threshold maps to the identical float, so ">" keeps its strict meaning.
    frame["HasManyFireplaces"] = frame["Fireplaces"] > np.log1p(manyFireplaces)
    frame["HasLargeGarage"] = frame["GarageArea"] > np.log1p(largeGarageSize)
    return frame


train = add_engineered_features(train)
test = add_engineered_features(test)

#--
print("TRAIN:  %s" % (train.shape,))
print("TEST:  %s" % (test.shape,))

TRAIN:  (1460, 300)
TEST:  (1459, 281)


In [289]:
##
## CLEAN UP CATEGORICALS
##
# Dummy columns were generated separately for train and test, so train can
# contain columns that test lacks.  Those columns are removed from train so
# both frames expose the same features; SalePrice (the label) is kept.

print(">> Columns existing in train and not test")
missingCols = train.columns.difference(test.columns)
missingCols = missingCols.drop("SalePrice")
print(missingCols)
print("")

# The message now describes what the code does: the columns are dropped from
# train, not created (zero-filled) in test.
print(">> Dropping columns from train that do not exist in test")
train = train.drop(missingCols, axis=1)
print("")

print(">> Columns now existing in train and not test ")
print(train.columns.difference(test.columns))
print("")

print(">> Complete")
print("TRAIN:  %s" % (train.shape,))
print("TEST:  %s" % (test.shape,))
print("")


>> Columns existing in train and not test
Index([u'Condition2_RRAe', u'Condition2_RRAn', u'Condition2_RRNn',
       u'Electrical_Mix', u'Exterior1st_ImStucc', u'Exterior1st_Stone',
       u'Exterior2nd_Other', u'GarageQual_Ex', u'Heating_Floor',
       u'Heating_OthW', u'HouseStyle_2.5Fin', u'MiscFeature_TenC',
       u'PoolQC_Fa', u'RoofMatl_ClyTile', u'RoofMatl_Membran',
       u'RoofMatl_Metal', u'RoofMatl_Roll', u'Utilities_NoSeWa'],
      dtype='object')

>> Creating missing columns in test; initializing with zeros

>> Columns now existing in train and not test 
Index([u'SalePrice'], dtype='object')

>> Complete
TRAIN:  (1460, 282)
TEST:  (1459, 281)



In [290]:
# Persist both engineered frames as CSV (row index included, pandas default).
for engineeredFrame, csvPath in ((train, 'data/train_fe.csv'),
                                 (test, 'data/test_fe.csv')):
    engineeredFrame.to_csv(csvPath)

# Also record the training column names, rendered by numpy as a
# comma-separated array string.
with open('data/features_fe.txt', "w") as featureListing:
    renderedColumns = np.array2string(train.columns, separator=",")
    featureListing.write(renderedColumns)

In [291]:
##
## Quick QA test to ensure values are generally correct enough to run a simple model.
## If no errors, then great.
##
# Draw the ~50/50 split from a privately seeded generator so the smoke test is
# reproducible on every run and the global numpy random state is left untouched.
qaSeed = 0
mask = np.random.RandomState(qaSeed).rand(len(train)) < 0.5
after_train = train.copy()
after_dev = after_train[~mask]
after_train = after_train[mask]

# Separate the label (SalePrice) from the features in each half.
train_labels = after_train["SalePrice"].copy()
after_train = after_train.drop( ["SalePrice"], axis=1 )

# The dev half is prepared the same way but is not scored in this cell.
dev_labels = after_dev["SalePrice"].copy()
after_dev = after_dev.drop( ["SalePrice"], axis=1 )

# Fitting will raise if the engineered frame still contains NaN/inf or
# non-numeric values, which is the whole point of this check.
reg = LinearRegression()
reg.fit(after_train, train_labels)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)