# Project: House Prices - Advanced Regression Techniques

## Feature Engineering

In [36]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

# Pretty display for notebooks
%matplotlib inline

In [37]:
# Read the Ames housing train and test CSV files into DataFrames
train_path = 'data/house_prices/train.csv'
test_path = 'data/house_prices/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [22]:
# Size of the dataset
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Ames housing train dataset has {} data points with {} variables each.".format(*train.shape))
print("Ames housing test dataset has {} data points with {} variables each.".format(*test.shape))

Ames housing train dataset has 1460 data points with 81 variables each.
Ames housing test dataset has 1459 data points with 80 variables each.


### First approach: without feature engineering

In [94]:
# Replace every missing entry with 0.0 in both the train and the test set
train_fillna, test_fillna = (frame.fillna(value = 0.0) for frame in (train, test))

#### 1) Top 2 most correlated features

In [95]:
# Target of the train set: raw sale price and its log1p transform
prices = train_fillna['SalePrice']
log_prices = np.log1p(prices)

# Predictors of the train set
features = train_fillna.loc[:, ['OverallQual', 'GrLivArea']]

# Predictors of the test set
public_features = test_fillna.loc[:, ['OverallQual', 'GrLivArea']]

In [96]:
# Sanity check: row/column counts of the selected train and test predictors.
# Parenthesised so the cell runs on both Python 2 and Python 3.
print(features.shape)
print(public_features.shape)

(1456, 2)
(1459, 2)


In [23]:
# Persist the top-2 selection: each frame is pickled to its own .pkl file
for frame, pickle_path in (
        (features, 'features_top2.pkl'),
        (log_prices, 'log_prices_top2.pkl'),
        (public_features, 'public_features_top2.pkl')):
    frame.to_pickle(pickle_path)

#### 2) Top ten most correlated features

In [6]:
# Target of the train set: raw sale price and its log1p transform
prices = train_fillna['SalePrice']
log_prices = np.log1p(prices)

# Predictors of the train set (ten selected columns)
features = train_fillna.loc[:, [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
]]

# Predictors of the test set (same ten columns)
public_features = test_fillna.loc[:, [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
]]

In [7]:
# Persist the top-10 selection: each frame is pickled to its own .pkl file
for frame, pickle_path in (
        (features, 'features_top10.pkl'),
        (log_prices, 'log_prices_top10.pkl'),
        (public_features, 'public_features_top10.pkl')):
    frame.to_pickle(pickle_path)

### 3) Working with all the features

* All missing values will be replaced either by the column mean in the case of the numeric features or by the most frequent value in the case of the categorical features

* LabelEncoder will be applied to the Ordinal features.

* One-Hot-Encoding will be applied to the Nominal features.

In [38]:
# There are 4 houses with more than 4000 sq ft living area that are
# outliers, so we drop them from the training data.
# Rebinding (instead of inplace=True) leaves the originally loaded frame
# object untouched; re-running the cell is still a no-op after the first run.
train = train.drop(train[train['GrLivArea'] > 4000].index)
# print(...) with one argument works on both Python 2 and Python 3.
print('Ames housing train dataset has {} data points with {} variables each.'.format(*train.shape))

Ames housing train dataset has 1456 data points with 81 variables each.


In [39]:
import pandas as pd
import numpy as np

from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Impute missing values column by column.

    Columns of dtype object are imputed with the most frequent value
    in the column.

    Columns of other types are imputed with the mean of the column.

    After ``fit`` the learned replacements are stored in ``self.fill``,
    a Series indexed by column name.
    """

    def __init__(self):
        # No hyper-parameters: the fill values are learned in fit().
        pass

    @staticmethod
    def _fill_value(column):
        """Return the replacement for the missing entries of one column."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() skips NaN, so an all-missing column has no
            # most frequent value; fall back to NaN (column left as is)
            # instead of raising IndexError on counts.index[0].
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def fit(self, X, y=None):
        """Learn one fill value per column of X; y is ignored. Returns self."""
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return a copy of X with missing entries replaced by the fitted values."""
        return X.fillna(self.fill)

In [40]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

def label_encoding(X):
    ''' Converts every column of X into integer labels between 0 and n_classes-1.

    A fresh LabelEncoder is fitted on each column independently, so the
    integer assigned to a category depends only on the values present in
    that column of X.

    Example: 'BsmtCond' => 'Fa' = 1, 'Gd' = 2, 'Po' = 3, 'TA' = 4

    Returns a DataFrame with the same index and columns as X.
    '''
    # fit_transform already yields the encoded column. The former
    # inverse_transform pass (result discarded) and the second transform
    # pass (identical output) were redundant work and have been removed.
    return X.apply(lambda column: LabelEncoder().fit_transform(column))

In [51]:
# debugging: first rows whose basement condition is 'TA'
x = train[['BsmtCond','BsmtExposure']]
# Parenthesised so the cell runs on both Python 2 and Python 3.
print(x[x['BsmtCond'] == 'TA'].head())

  BsmtCond BsmtExposure
0       TA           No
1       TA           Gd
2       TA           Mn
4       TA           Av
5       TA           No


In [76]:
# debugging: run the label-encoding steps one by one on two basement columns
a = train[['BsmtCond','BsmtExposure']]
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
# One LabelEncoder per column name, created lazily on first access
d = defaultdict(LabelEncoder)

# Encoding the variable
fit = a.apply(lambda x: d[x.name].fit_transform(x))

# Inverse the encoded (result is not stored; only checks the round trip runs)
fit.apply(lambda x: d[x.name].inverse_transform(x))

# Using the dictionary to label future data
at = a.apply(lambda x: d[x.name].transform(x))
# Parenthesised so the cell runs on both Python 2 and Python 3.
print(at.loc[375])

BsmtCond        3
BsmtExposure    2
Name: 375, dtype: int64


In [58]:
# debugging: label-encode two columns and inspect the result
a = train[['BsmtExposure','Alley']]
at = label_encoding(a)
# print(...) with one argument works on both Python 2 and Python 3.
print("Processed feature columns ({} total features):\n{}".format(len(at.columns), list(at.columns)))

# Show the feature information by printing the first five rows
print(at.head())

Processed feature columns (2 total features):
['BsmtExposure', 'Alley']
   BsmtExposure  Alley
0             4      0
1             2      0
2             3      0
3             4      0
4             1      0


In [41]:
def one_hot(X):
    ''' Converts every column of X into dummy (indicator) variables.

    Each column is expanded with pd.get_dummies using the column name as
    prefix. Example: 'Alley' => 'Alley_Grvl' and 'Alley_Pave'.
    Missing values get no indicator column, so such rows are all zeros
    for that feature.

    Returns a DataFrame indexed like X holding all indicator columns, in
    the column order of X.
    '''
    # Iterating the frame yields its column labels. This avoids
    # DataFrame.iteritems(), which was removed in pandas 2.0.
    dummy_frames = [pd.get_dummies(X[col], prefix = col) for col in X]

    # pd.concat rejects an empty list: keep the original "empty frame with
    # the same index" result when X has no columns.
    if not dummy_frames:
        return pd.DataFrame(index = X.index)

    # All pieces share X's index, so one concat along the columns replaces
    # the repeated (copying) join calls of a column-by-column loop.
    return pd.concat(dummy_frames, axis = 1)

In [59]:
# debugging: raw values of the two columns used to try out one_hot
b = train[['BsmtExposure','Alley']]
# Parenthesised so the cell runs on both Python 2 and Python 3.
print(b.head())

  BsmtExposure Alley
0           No   NaN
1           Gd   NaN
2           Mn   NaN
3           No   NaN
4           Av   NaN


In [62]:
# debugging: indicator columns produced by one_hot for the sample columns
bt = one_hot(b)
# Parenthesised so the cell runs on both Python 2 and Python 3.
print(bt.head())

   BsmtExposure_Av  BsmtExposure_Gd  BsmtExposure_Mn  BsmtExposure_No  \
0              0.0              0.0              0.0              1.0   
1              0.0              1.0              0.0              0.0   
2              0.0              0.0              1.0              0.0   
3              0.0              0.0              0.0              1.0   
4              1.0              0.0              0.0              0.0   

   Alley_Grvl  Alley_Pave  
0         0.0         0.0  
1         0.0         0.0  
2         0.0         0.0  
3         0.0         0.0  
4         0.0         0.0  


In [62]:
def transform_skewed(X, threshold=0.75):
    ''' Transform the skewed numeric features.

    Computes the sample skewness of every column of X (missing values are
    ignored, values are cast to float) and applies log1p to the columns
    whose skewness is strictly greater than `threshold`.

    X         -- DataFrame of numeric columns
    threshold -- skewness above which a column is transformed (default 0.75)

    Returns a DataFrame with only the selected columns, log1p-transformed
    and indexed like X.
    '''
    from scipy.stats import skew

    skewness = X.apply(lambda column: skew(column.dropna().astype(float)))
    # A NaN skewness (e.g. constant column) compares False and is left out.
    skewed_columns = skewness[skewness > threshold].index
    return np.log1p(X[skewed_columns])

In [73]:
from sklearn.preprocessing import StandardScaler

def standard_escaling(X):
    ''' Standardises every column of X with sklearn's StandardScaler.

    The scaler is fitted on X itself, column by column, and the scaled
    values are returned as a DataFrame with the same index and columns
    as X.
    '''
    # StandardScaler needs at least one feature; nothing to scale otherwise.
    if X.shape[1] == 0:
        return X.copy()

    # Passing the whole 2-D frame scales each column independently. Feeding
    # the scaler one 1-D Series at a time is rejected by current
    # scikit-learn releases, which expect 2-D input.
    scaled = StandardScaler().fit_transform(X)
    return pd.DataFrame(scaled, index = X.index, columns = X.columns)

In [42]:
def feature_engineering(dataset):
    ''' Transforms all the features and output a new data frame of engineered features.

    Steps:
      1. missing values are imputed with DataFrameImputer;
      2. numeric features are copied unchanged;
      3. nominal features are one-hot encoded (one_hot);
      4. ordinal features are label encoded (label_encoding).

    Returns a DataFrame indexed like `dataset`.

    NOTE(review): the encoders are fitted on `dataset` alone, so calling this
    separately on train and test can yield different dummy columns and
    different integer codes for the same category -- verify downstream.
    '''

    output = pd.DataFrame(index = dataset.index)

    # Removal of null values
    dataset_filled = DataFrameImputer().fit_transform(dataset)

    # Numeric features
    numeric_features = dataset_filled[[
        '1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'EnclosedPorch', 'GarageArea', 'GrLivArea', 'LotArea',
        'LotFrontage', 'LowQualFinSF', 'MasVnrArea', 'MiscVal', 'OpenPorchSF',
        'PoolArea', 'ScreenPorch', 'TotalBsmtSF', 'WoodDeckSF', 'BedroomAbvGr',
        'BsmtFullBath', 'BsmtHalfBath', 'Fireplaces', 'FullBath', 'GarageCars',
        'GarageYrBlt', 'HalfBath', 'KitchenAbvGr', 'MoSold', 'TotRmsAbvGrd',
        'YearBuilt', 'YearRemodAdd', 'YrSold']]
    output = output.join(numeric_features)

    ## Transform Nominal features
    # All nominal features
    nominal_features = dataset_filled[[
        'Alley', 'BldgType', 'CentralAir', 'Condition1', 'Condition2',
        'Exterior1st', 'Exterior2nd', 'Foundation', 'GarageType', 'Heating',
        'HouseStyle', 'LandContour', 'LotConfig', 'MasVnrType', 'MiscFeature',
        'MSSubClass', 'MSZoning', 'Neighborhood', 'RoofMatl', 'RoofStyle',
        'SaleCondition', 'SaleType', 'Street']]
    # Collect the revised columns.
    # Nominal categories have no order, so they get one-hot encoding. The
    # two encoder calls used to be swapped, giving unordered categories
    # arbitrary integer codes.
    engineered_nominal_features = one_hot(nominal_features)
    output = output.join(engineered_nominal_features)

    ## Transform Ordinal features
    # All ordinal features
    ordinal_features = dataset_filled[[
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual',
        'Electrical', 'ExterCond', 'ExterQual', 'Fence', 'FireplaceQu',
        'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'HeatingQC',
        'KitchenQual', 'LandSlope', 'LotShape', 'OverallCond', 'OverallQual',
        'PavedDrive', 'PoolQC', 'Utilities']]

    # Collect the revised columns: ordinal features become integer labels.
    engineered_ordinal_features = label_encoding(ordinal_features)
    output = output.join(engineered_ordinal_features)

    ## Skewed (this actually lower the score!)
    # skewed_features = transform_skewed(numeric_features)
    # output = output.join(skewed_features)

    ## Scale the data (this also lower the score!)
    # scaled_features = standard_escaling(numeric_features)
    # output = output.join(scaled_features)

    return output

In [84]:
# Run the feature-engineering pipeline on the train set, then on the test set
train_engineered, test_engineered = map(feature_engineering, (train, test))

#### Drop unique columns if the train and test set have different shape

In [86]:
# Drop every engineered train column that does not exist in the test set.
# The clean-up used to run only when train had MORE columns than test, so
# train-only columns survived whenever test had at least as many columns.
# The membership test is now applied unconditionally.
features_to_drop = [feature for feature in train_engineered.columns
                    if feature not in test_engineered.columns]
train_engineered = train_engineered.drop(features_to_drop, axis=1)
# Parenthesised so the cell runs on both Python 2 and Python 3.
print('Done')

Done


In [62]:
# Drop every engineered test column that does not exist in the train set.
# The clean-up used to run only when test had MORE columns than train, so
# test-only columns survived whenever train had at least as many columns.
# The membership test is now applied unconditionally.
features_to_drop = [feature for feature in test_engineered.columns
                    if feature not in train_engineered.columns]
test_engineered = test_engineered.drop(features_to_drop, axis=1)
# Parenthesised so the cell runs on both Python 2 and Python 3.
print('Done')

In [87]:
# Log-transformed sale price as a one-column frame aligned on the engineered train index
# NOTE(review): this uses np.log while the top-2 / top-10 cells use np.log1p -- confirm which inverse downstream code applies
log_sale_price = np.log(train["SalePrice"])
log_prices = log_sale_price.reindex(train_engineered.index).to_frame("SalePrice")

#### Save the datasets using pickle

In [89]:
# Persist the fully engineered data: each frame is pickled to its own .pkl file
for frame, pickle_path in (
        (train_engineered, 'features_all.pkl'),
        (log_prices, 'log_prices_all.pkl'),
        (test_engineered, 'public_features_all.pkl')):
    frame.to_pickle(pickle_path)

### Exporting correlation of engineered features

In [47]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd

# Attach the target to a COPY of the engineered features. Writing it into
# train_engineered itself would leak SalePrice into 'features_all.pkl' if the
# pickling cell were re-run after this one.
train_with_target = train_engineered.copy()
train_with_target['SalePrice'] = log_prices

# Correlation of every engineered feature with the (log) sale price,
# strongest positive correlation first.
corr = train_with_target.corr()['SalePrice'].sort_values(ascending = False)

corr.to_csv('correlation.csv', header=True, index_label='Id') # add 1 to csv name every time