W207: Final Project
===========

Our team (Danish Iqbal, KC Tobin, Jason Vantomme) is working through the Kaggle "House Prices: Advanced Regression Techniques" competition @ https://www.kaggle.com/c/house-prices-advanced-regression-techniques.

Our first step is to scrub the data and generate effective features from what is provided.  We have divided this process into three parts, one for each team member.

In [1]:
# ENVIRONMENT SETUP

# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import datetime as dt
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn import metrics

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.precision',6)
pd.set_option('display.max_columns',None) 
pd.set_option('display.max_rows',None) 
pd.set_option('display.max_colwidth',1000)



In [2]:
# make sure you have the latest version of pandas (>=0.19.0) otherwise the 'category' type will throw an error

colTypesDict = {"MSSubClass":'category',"MSZoning":'category',"Street":'category',"Alley":'category',
                "LotShape":'category',"LandContour":'category',"Utilities":'category',
                "LotConfig":'category',"LandSlope":'category',"Neighborhood":'category',"Condition1":'category',
                "Condition2":'category',"BldgType":'category',"HouseStyle":'category',"RoofStyle":'category',
                'OverallQual':'category', 'OverallCond':'category',
                "RoofMatl":'category',"Exterior1st":'category',"Exterior2nd":'category',"MasVnrType":'category',
                "ExterQual":'category',"ExterCond":'category',"Foundation":'category',
                "BsmtQual":'category',"BsmtCond":'category',"BsmtExposure":'category',"BsmtFinType1":'category',
                "BsmtFinType2":'category',"Heating":'category',"HeatingQC":'category',"CentralAir":'category',
                "Electrical":'category',"KitchenQual":'category',"Functional":'category',"FireplaceQu":'category',
                "GarageType":'category',"GarageFinish":'category',
                "GarageQual":'category',"GarageCond":'category',"PavedDrive":'category',
                "PoolQC":'category',"Fence":'category',"MiscFeature":'category',
                "SaleType":'category',"SaleCondition":'category',"YrSold":'category'}

# LOAD THE PROVIDED TRAINING DATA
train = pd.read_csv('data/train.csv',delimiter=',',header=0,dtype=colTypesDict,na_values=["NA"])
test = pd.read_csv('data/test.csv',delimiter=',',header=0)

print "TRAIN: ", train.shape
print "TEST: ", test.shape

TRAIN:  (1460, 81)
TEST:  (1459, 80)


In [3]:
print train.describe(include='all')

                 Id MSSubClass MSZoning  LotFrontage        LotArea Street  \
count   1460.000000       1460     1460  1201.000000    1460.000000   1460   
unique          NaN         15        5          NaN            NaN      2   
top             NaN         20       RL          NaN            NaN   Pave   
freq            NaN        536     1151          NaN            NaN   1454   
mean     730.500000        NaN      NaN    70.049958   10516.828082    NaN   
std      421.610009        NaN      NaN    24.284752    9981.264932    NaN   
min        1.000000        NaN      NaN    21.000000    1300.000000    NaN   
25%      365.750000        NaN      NaN    59.000000    7553.500000    NaN   
50%      730.500000        NaN      NaN    69.000000    9478.500000    NaN   
75%     1095.250000        NaN      NaN    80.000000   11601.500000    NaN   
max     1460.000000        NaN      NaN   313.000000  215245.000000    NaN   

       Alley LotShape LandContour Utilities LotConfig LandSlope

In [4]:
###
### ADDRESS NA's & ZERO'S
###

# TRAIN handle NAs
# >numbers
train['MasVnrArea'].fillna(0, inplace=True)
train["BsmtFinSF1"].fillna(0, inplace=True)
train["BsmtFinSF2"].fillna(0, inplace=True)
train["TotalBsmtSF"].fillna(0, inplace=True)
train["BsmtUnfSF"].fillna(0, inplace=True)
train["BsmtHalfBath"].fillna(0, inplace=True)
train["BsmtFullBath"].fillna(0, inplace=True)
train['GarageYrBlt'].fillna(0, inplace=True)
train['GarageCars'].fillna(0, inplace=True)
train['MiscVal'].fillna(0, inplace=True)
# >categoricals
train["KitchenQual"].fillna(method="ffill",inplace=True)
train["BsmtCond"].fillna(method="ffill",inplace=True)
train["BsmtQual"].fillna(method="ffill",inplace=True)
train["BsmtExposure"].fillna(method="ffill",inplace=True)
train["BsmtFinType2"].fillna(method="ffill",inplace=True)

# TEST handle NAs
# >numbers
test['MasVnrArea'].fillna(0, inplace=True)
test["BsmtFinSF1"].fillna(0, inplace=True)
test["BsmtFinSF2"].fillna(0, inplace=True)
test["TotalBsmtSF"].fillna(0, inplace=True)
test["BsmtUnfSF"].fillna(0, inplace=True)
test["BsmtHalfBath"].fillna(0, inplace=True)
test["BsmtFullBath"].fillna(0, inplace=True)
test['GarageYrBlt'].fillna(0, inplace=True)
test['GarageCars'].fillna(0, inplace=True)
test['MiscVal'].fillna(0, inplace=True)
# >categoricals
test["KitchenQual"].fillna(method="ffill",inplace=True)
test["BsmtCond"].fillna(method="ffill",inplace=True)
test["BsmtQual"].fillna(method="ffill",inplace=True)
test["BsmtFinType2"].fillna(method="ffill",inplace=True)
test["BsmtExposure"].fillna(method="ffill",inplace=True)

#--
print "TRAIN: ", train.shape
print "TEST: ", test.shape

TRAIN:  (1460, 81)
TEST:  (1459, 80)


In [5]:
###
### FEATURE CREATION
###

##
##
train['MasVnrArea_0'] = train['MasVnrArea']==0
train['MasVnrArea_50'] = (train['MasVnrArea']>0) & (train['MasVnrArea']<50)
train['MasVnrArea_100'] = (train['MasVnrArea']>50) & (train['MasVnrArea']<100)
train['MasVnrArea_150'] = (train['MasVnrArea']>100) & (train['MasVnrArea']<150)
train['MasVnrArea_200'] = (train['MasVnrArea']>150) & (train['MasVnrArea']<200)
train['MasVnrArea_250'] = (train['MasVnrArea']>200) & (train['MasVnrArea']<250)
train['MasVnrArea_300'] = (train['MasVnrArea']>250) & (train['MasVnrArea']<300)
train['MasVnrArea_350'] = (train['MasVnrArea']>300) & (train['MasVnrArea']<350)
train['MasVnrArea_400'] = (train['MasVnrArea']>350) & (train['MasVnrArea']<400)
train['MasVnrArea_other'] = (train['MasVnrArea']>400)

test['MasVnrArea_0'] = test['MasVnrArea']==0
test['MasVnrArea_50'] = (test['MasVnrArea']>0) & (test['MasVnrArea']<50)
test['MasVnrArea_100'] = (test['MasVnrArea']>50) & (test['MasVnrArea']<100)
test['MasVnrArea_150'] = (test['MasVnrArea']>100) & (test['MasVnrArea']<150)
test['MasVnrArea_200'] = (test['MasVnrArea']>150) & (test['MasVnrArea']<200)
test['MasVnrArea_250'] = (test['MasVnrArea']>200) & (test['MasVnrArea']<250)
test['MasVnrArea_300'] = (test['MasVnrArea']>250) & (test['MasVnrArea']<300)
test['MasVnrArea_350'] = (test['MasVnrArea']>300) & (test['MasVnrArea']<350)
test['MasVnrArea_400'] = (test['MasVnrArea']>350) & (test['MasVnrArea']<400)
test['MasVnrArea_other'] = (test['MasVnrArea']>400)

##
##
train['MSSubClass_60'] = train['MSSubClass']==60
train['MSSubClass_50'] = train['MSSubClass']==50
train['MSSubClass_120'] = train['MSSubClass']==120
train['MSSubClass_80'] = train['MSSubClass']==80
train['MSSubClass_160'] = train['MSSubClass']==160
train['MSSubClass_90'] = train['MSSubClass']==90
train['MSSubClass_30'] = train['MSSubClass']==30
train['MSSubClass_70'] = train['MSSubClass']==70
train['MSSubClass_other'] = (train['MSSubClass']==85) | (train['MSSubClass']==190) | (train['MSSubClass']==75)  | (train['MSSubClass']==180) | (train['MSSubClass']==45) | (train['MSSubClass']==40)

test['MSSubClass_60'] = test['MSSubClass']==60
test['MSSubClass_50'] = test['MSSubClass']==50
test['MSSubClass_120'] = test['MSSubClass']==120
test['MSSubClass_80'] = test['MSSubClass']==80
test['MSSubClass_160'] = test['MSSubClass']==160
test['MSSubClass_90'] = test['MSSubClass']==90
test['MSSubClass_30'] = test['MSSubClass']==30
test['MSSubClass_70'] = test['MSSubClass']==70
test['MSSubClass_other'] = (test['MSSubClass']==85) | (test['MSSubClass']==190) | (test['MSSubClass']==75)  | (test['MSSubClass']==180) | (test['MSSubClass']==45) | (test['MSSubClass']==40)

##
##
train['LotFrontage_0'] = train['LotFrontage']==0
train['LotFrontage_50'] = (train['LotFrontage']>0) & (train['LotFrontage']<50)
train['LotFrontage_60'] = (train['LotFrontage']>50) & (train['LotFrontage']<60)
train['LotFrontage_70'] = (train['LotFrontage']>60) & (train['LotFrontage']<70)
train['LotFrontage_80'] = (train['LotFrontage']>70) & (train['LotFrontage']<80)
train['LotFrontage_90'] = (train['LotFrontage']>80) & (train['LotFrontage']<90)
train['LotFrontage_other'] = train['LotFrontage']>90

test['LotFrontage_0'] = test['LotFrontage']==0
test['LotFrontage_50'] = (test['LotFrontage']>0) & (test['LotFrontage']<50)
test['LotFrontage_60'] = (test['LotFrontage']>50) & (test['LotFrontage']<60)
test['LotFrontage_70'] = (test['LotFrontage']>60) & (test['LotFrontage']<70)
test['LotFrontage_80'] = (test['LotFrontage']>70) & (test['LotFrontage']<80)
test['LotFrontage_90'] = (test['LotFrontage']>80) & (test['LotFrontage']<90)
test['LotFrontage_other'] = test['LotFrontage']>90

##
##
train['LotArea_5'] = train['LotArea']<5000
train['LotArea_6'] = (train['LotArea']>5000) & (train['LotArea']<6000)
train['LotArea_7'] = (train['LotArea']>6000) & (train['LotArea']<7000)
train['LotArea_8'] = (train['LotArea']>7000) & (train['LotArea']<8000)
train['LotArea_9'] = (train['LotArea']>8000) & (train['LotArea']<9000)
train['LotArea_10'] = (train['LotArea']>9000) & (train['LotArea']<10000)
train['LotArea_11'] = (train['LotArea']>10000) & (train['LotArea']<11000)
train['LotArea_12'] = (train['LotArea']>11000) & (train['LotArea']<12000)
train['LotArea_13'] = (train['LotArea']>12000) & (train['LotArea']<13000)
train['LotArea_14'] = (train['LotArea']>13000) & (train['LotArea']<14000)
train['LotArea_other'] = (train['LotArea']>14000) 

test['LotArea_5'] = test['LotArea']<5000
test['LotArea_6'] = (test['LotArea']>5000) & (test['LotArea']<6000)
test['LotArea_7'] = (test['LotArea']>6000) & (test['LotArea']<7000)
test['LotArea_8'] = (test['LotArea']>7000) & (test['LotArea']<8000)
test['LotArea_9'] = (test['LotArea']>8000) & (test['LotArea']<9000)
test['LotArea_10'] = (test['LotArea']>9000) & (test['LotArea']<10000)
test['LotArea_11'] = (test['LotArea']>10000) & (test['LotArea']<11000)
test['LotArea_12'] = (test['LotArea']>11000) & (test['LotArea']<12000)
test['LotArea_13'] = (test['LotArea']>12000) & (test['LotArea']<13000)
test['LotArea_14'] = (test['LotArea']>13000) & (test['LotArea']<14000)
test['LotArea_other'] = (test['LotArea']>14000)

##
##
train['YearBuilt_30'] = train['YearBuilt']<1930
train['YearBuilt_40'] = (train['YearBuilt']>1930) & (train['YearBuilt']<1940)
train['YearBuilt_50'] = (train['YearBuilt']>1940) & (train['YearBuilt']<1950)
train['YearBuilt_60'] = (train['YearBuilt']>1950) & (train['YearBuilt']<1960)
train['YearBuilt_70'] = (train['YearBuilt']>1960) & (train['YearBuilt']<1970)
train['YearBuilt_80'] = (train['YearBuilt']>1970) & (train['YearBuilt']<1980)
train['YearBuilt_90'] = (train['YearBuilt']>1980) & (train['YearBuilt']<1990)
train['YearBuilt_00'] = (train['YearBuilt']>1990) & (train['YearBuilt']<2005)

test['YearBuilt_30'] = test['YearBuilt']<1930
test['YearBuilt_40'] = (test['YearBuilt']>1930) & (test['YearBuilt']<1940)
test['YearBuilt_50'] = (test['YearBuilt']>1940) & (test['YearBuilt']<1950)
test['YearBuilt_60'] = (test['YearBuilt']>1950) & (test['YearBuilt']<1960)
test['YearBuilt_70'] = (test['YearBuilt']>1960) & (test['YearBuilt']<1970)
test['YearBuilt_80'] = (test['YearBuilt']>1970) & (test['YearBuilt']<1980)
test['YearBuilt_90'] = (test['YearBuilt']>1980) & (test['YearBuilt']<1990)
test['YearBuilt_00'] = (test['YearBuilt']>1990) & (test['YearBuilt']<2005)

##
##
train['YearRemodAdd_50'] = (train['YearRemodAdd']>1950) & (train['YearRemodAdd']<1960)
train['YearRemodAdd_60'] = (train['YearRemodAdd']>1960) & (train['YearRemodAdd']<1970)
train['YearRemodAdd_70'] = (train['YearRemodAdd']>1970) & (train['YearRemodAdd']<1980)
train['YearRemodAdd_80'] = (train['YearRemodAdd']>1980) & (train['YearRemodAdd']<1990)
train['YearRemodAdd_90'] = (train['YearRemodAdd']>1990) & (train['YearRemodAdd']<2000)
train['YearRemodAdd_00'] = (train['YearRemodAdd']>2000)

test['YearRemodAdd_50'] = (test['YearRemodAdd']>1950) & (test['YearRemodAdd']<1960)
test['YearRemodAdd_60'] = (test['YearRemodAdd']>1960) & (test['YearRemodAdd']<1970)
test['YearRemodAdd_70'] = (test['YearRemodAdd']>1970) & (test['YearRemodAdd']<1980)
test['YearRemodAdd_80'] = (test['YearRemodAdd']>1980) & (test['YearRemodAdd']<1990)
test['YearRemodAdd_90'] = (test['YearRemodAdd']>1990) & (test['YearRemodAdd']<2000)
test['YearRemodAdd_00'] = (test['YearRemodAdd']>2000)

##
##create other rooms columns
train["OtherRmsAbvGr"] = train["TotRmsAbvGrd"] - (train["BedroomAbvGr"]+train["KitchenAbvGr"])
test["OtherRmsAbvGr"] = test["TotRmsAbvGrd"] - (test["BedroomAbvGr"]+test["KitchenAbvGr"])

#create binary columns
train["HasBsmt"] = train["TotalBsmtSF"] > 0
train["Has2ndFlr"] = train["2ndFlrSF"] > 0
train["HasPool"] = train["PoolArea"] > 0
train["HasMiscVal"] = train["MiscVal"] > 0
train["HasLowQualFinSF"] = train["LowQualFinSF"] > 0

#
test["HasBsmt"] = test["TotalBsmtSF"] > 0
test["Has2ndFlr"] = test["2ndFlrSF"] > 0
test["HasPool"] = test["PoolArea"] > 0
test["HasMiscVal"] = test["MiscVal"] > 0
test["HasLowQualFinSF"] = test["LowQualFinSF"] > 0

##
train['BsmtFinSF'] = train['BsmtFinSF1'] + train['BsmtFinSF2']
test['BsmtFinSF'] = test['BsmtFinSF1'] + test['BsmtFinSF2']

##
##  PORCH/DECK COLLAPSE/REMOVALS INTO ONE SQ FT MEASUREMENT (SUM OF ALL)
##
porches = train.loc[:,["WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch"]]
reduced_porches = porches.sum(axis=1)
train["TotalPorchSqFt"] = reduced_porches
#
porches = test.loc[:,["WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch"]]
reduced_porches = porches.sum(axis=1)
test["TotalPorchSqFt"] = reduced_porches

## ----------------------------
##  FIREPLACE COLLAPSE/REMOVALS
## ----------------------------
manyFireplaces = 2
train["HasManyFireplaces"] = train["Fireplaces"] > manyFireplaces
test["HasManyFireplaces"] = test["Fireplaces"] > manyFireplaces

## ----------------------------
##  GARAGE COLLAPSE/REMOVALS
## ----------------------------
largeGarageSize = 800
train["HasLargeGarage"] = train["GarageArea"] > largeGarageSize
test["HasLargeGarage"] = test["GarageArea"] > largeGarageSize

##
##
train["GarageYrBlt_0"] = train['GarageYrBlt']==0
train["GarageYrBlt_50"] = (train['GarageYrBlt']>0) & (train['GarageYrBlt']<1950)
train["GarageYrBlt_60"] = (train['GarageYrBlt']>1950) & (train['GarageYrBlt']<1960)
train["GarageYrBlt_70"] = (train['GarageYrBlt']>1960) & (train['GarageYrBlt']<1970)
train["GarageYrBlt_80"] = (train['GarageYrBlt']>1970) & (train['GarageYrBlt']<1980)
train["GarageYrBlt_90"] = (train['GarageYrBlt']>1980) & (train['GarageYrBlt']<1990)
train["GarageYrBlt_00"] = (train['GarageYrBlt']>1990) & (train['GarageYrBlt']<2000)
train["GarageYrBlt_10"] = train['GarageYrBlt']>2000

test["GarageYrBlt_0"] = test['GarageYrBlt']==0
test["GarageYrBlt_50"] = (test['GarageYrBlt']>0) & (test['GarageYrBlt']<1950)
test["GarageYrBlt_60"] = (test['GarageYrBlt']>1950) & (test['GarageYrBlt']<1960)
test["GarageYrBlt_70"] = (test['GarageYrBlt']>1960) & (test['GarageYrBlt']<1970)
test["GarageYrBlt_80"] = (test['GarageYrBlt']>1970) & (test['GarageYrBlt']<1980)
test["GarageYrBlt_90"] = (test['GarageYrBlt']>1980) & (test['GarageYrBlt']<1990)
test["GarageYrBlt_00"] = (test['GarageYrBlt']>1990) & (test['GarageYrBlt']<2000)
test["GarageYrBlt_10"] = test['GarageYrBlt']>2000


#--
print "TRAIN: ", train.shape
print "TEST: ", test.shape

TRAIN:  (1460, 150)
TEST:  (1459, 149)


In [6]:
###
### FEATURE DROPS
###

# TRAIN DROPS
train = train.drop(['Id'], axis = 1)
train = train.drop(['MasVnrArea','MSSubClass','LotFrontage','LotArea','YearBuilt',
                    'YearRemodAdd','TotalBsmtSF','TotRmsAbvGrd',
                    'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch',
                    'GarageArea','PoolArea','GarageYrBlt','2ndFlrSF','BsmtFinSF1',
                    'MiscVal','BsmtFinSF2','LowQualFinSF'], axis = 1)

# TEST DROPS
test = test.drop(['Id'], axis = 1)
test = test.drop(['MasVnrArea','MSSubClass','LotFrontage','LotArea','YearBuilt',
                  'YearRemodAdd','TotalBsmtSF','TotRmsAbvGrd',
                  'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch',
                  'GarageArea','PoolArea','GarageYrBlt','2ndFlrSF','BsmtFinSF1',
                  'MiscVal','BsmtFinSF2','LowQualFinSF'], axis = 1)

print "TRAIN: ", train.shape
print "TEST: ", test.shape

TRAIN:  (1460, 128)
TEST:  (1459, 127)


In [7]:
print train.select_dtypes(include=[np.number]).columns

Index([u'BsmtUnfSF', u'1stFlrSF', u'GrLivArea', u'BsmtFullBath',
       u'BsmtHalfBath', u'FullBath', u'HalfBath', u'BedroomAbvGr',
       u'KitchenAbvGr', u'Fireplaces', u'GarageCars', u'MoSold', u'SalePrice',
       u'OtherRmsAbvGr', u'BsmtFinSF', u'TotalPorchSqFt'],
      dtype='object')


In [8]:
###
### FEATURE NORMALIZATION & TRANSFORMS
###
# from sklearn.preprocessing import RobustScaler

# # scale all non-categorical, numerical values that have large ranges
# #  : this ignores scaling small range values such as room counts
# scaleCols = train.ix[:, ['GrLivArea','BsmtFinSF', 'BsmtUnfSF', '1stFlrSF', 'TotalPorchSqFt']]

# scaler = RobustScaler()
# for col in scaleCols:
#     train[col] = scaler.fit_transform(train[col])

# for col in scaleCols:
#     test[col] = scaler.fit_transform(test[col])

# # log transform SalePrice
# #train["SalePrice"] = train["SalePrice"].apply(np.log)

# print "TRAIN: ", train.shape
# print "TEST: ", test.shape

In [9]:
print train.describe(include='all')

       MSZoning Street Alley LotShape LandContour Utilities LotConfig  \
count      1460   1460    91     1460        1460      1460      1460   
unique        5      2     2        4           4         2         5   
top          RL   Pave  Grvl      Reg         Lvl    AllPub    Inside   
freq       1151   1454    50      925        1311      1459      1052   
mean        NaN    NaN   NaN      NaN         NaN       NaN       NaN   
std         NaN    NaN   NaN      NaN         NaN       NaN       NaN   
min         NaN    NaN   NaN      NaN         NaN       NaN       NaN   
25%         NaN    NaN   NaN      NaN         NaN       NaN       NaN   
50%         NaN    NaN   NaN      NaN         NaN       NaN       NaN   
75%         NaN    NaN   NaN      NaN         NaN       NaN       NaN   
max         NaN    NaN   NaN      NaN         NaN       NaN       NaN   

       LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle  \
count       1460         1460       1460       1

In [10]:
###
### DUMMIFY CATEGORICALS
###

print ">> Before dummy creation"
print "TRAIN: ", train.shape
print "TEST: ", test.shape
print

## flatten categoricals and drop original features
encodeCols = train.select_dtypes(include=['category'])
print ">> Columns to dummify (" + str(len(encodeCols.columns)) + ")"
print encodeCols.columns
print

train = pd.concat([train,pd.get_dummies(encodeCols,prefix=encodeCols.columns)],axis=1)
test = pd.concat([test,pd.get_dummies(encodeCols,prefix=encodeCols.columns)],axis=1)
print ">> After dummy creation"
print "TRAIN: ", train.shape
print "TEST: ", test.shape
print

train = train.drop(encodeCols.columns.tolist(),axis=1)
test = test.drop(encodeCols.columns.tolist(),axis=1)
print ">> After original categorical removals"
print "TRAIN: ", train.shape
print "TEST: ", test.shape


>> Before dummy creation
TRAIN:  (1460, 128)
TEST:  (1459, 127)

>> Columns to dummify (46)
Index([u'MSZoning', u'Street', u'Alley', u'LotShape', u'LandContour',
       u'Utilities', u'LotConfig', u'LandSlope', u'Neighborhood',
       u'Condition1', u'Condition2', u'BldgType', u'HouseStyle',
       u'OverallQual', u'OverallCond', u'RoofStyle', u'RoofMatl',
       u'Exterior1st', u'Exterior2nd', u'MasVnrType', u'ExterQual',
       u'ExterCond', u'Foundation', u'BsmtQual', u'BsmtCond', u'BsmtExposure',
       u'BsmtFinType1', u'BsmtFinType2', u'Heating', u'HeatingQC',
       u'CentralAir', u'Electrical', u'KitchenQual', u'Functional',
       u'FireplaceQu', u'GarageType', u'GarageFinish', u'GarageQual',
       u'GarageCond', u'PavedDrive', u'PoolQC', u'Fence', u'MiscFeature',
       u'YrSold', u'SaleType', u'SaleCondition'],
      dtype='object')

>> After dummy creation
TRAIN:  (1460, 404)
TEST:  (1460, 403)

>> After original categorical removals
TRAIN:  (1460, 358)
TEST:  (1460, 357)


In [11]:
train.to_csv('data/train_fe.csv',index=False)
test.to_csv('data/test_fe.csv',index=False)
with open('data/features_fe.txt', "w") as feature_file:
    feature_file.write(np.array2string(train.columns, separator=","))

In [89]:
##
## Simple test to ensure values are generally
## correct enough to run a simple model.
##
mask = np.random.rand(len(train)) < 0.5
after_train = train.copy()
after_dev = after_train[~mask]
after_train = after_train[mask]

train_labels = after_train["SalePrice"].copy()
after_train = after_train.drop( ["SalePrice"], axis=1 )

dev_labels = after_dev["SalePrice"].copy()
after_dev = after_dev.drop( ["SalePrice"], axis=1 ) 

reg = LinearRegression()
reg.fit(after_train, train_labels)
print("R^2 value: " + str(reg.score(after_dev, dev_labels)))

R^2 value: 0.797023629943


In [90]:
print after_dev.describe()

         BsmtUnfSF     1stFlrSF    GrLivArea  BsmtFullBath  BsmtHalfBath  \
count   752.000000   752.000000   752.000000    752.000000    752.000000   
mean    569.156915  1165.910904  1508.412234      0.432181      0.049202   
std     452.657352   379.690103   514.369069      0.521880      0.216434   
min       0.000000   334.000000   334.000000      0.000000      0.000000   
25%     203.750000   894.000000  1127.500000      0.000000      0.000000   
50%     482.500000  1095.000000  1472.000000      0.000000      0.000000   
75%     814.500000  1392.000000  1776.000000      1.000000      0.000000   
max    2336.000000  3228.000000  4476.000000      2.000000      1.000000   

         FullBath    HalfBath  BedroomAbvGr  KitchenAbvGr  Fireplaces  \
count  752.000000  752.000000    752.000000    752.000000  752.000000   
mean     1.566489    0.368351      2.875000      1.050532    0.627660   
std      0.546963    0.498956      0.840273      0.219186    0.638057   
min      0.000000    0.