# Feature Engineering

Let's try to manufacture some new features and see how they perform.

# Imports

In [1]:
import numpy as np
import os
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [2]:
# Load the cleaned training data from disk
filepath = 'datasets/train_clean.csv'
df = pd.read_csv(filepath_or_buffer=filepath)

In [3]:
# Let pandas print up to 999 rows before truncating displayed frames
pd.set_option('display.max_rows', 999)

# Polynomial features

In [4]:
# Grab and sort the ten variables most highly correlated with SalePrice.
# Only numeric/boolean columns are kept before calling corr(): recent pandas
# releases raise if any column holds strings, whereas older releases silently
# dropped such columns. Selecting explicitly behaves the same on both.
corrs = (
    df.select_dtypes(include=['number', 'bool'])
      .corr()[['SalePrice']]
      .drop('SalePrice')
)
corrs = corrs.sort_values(by='SalePrice', ascending=False).head(10)

In [5]:
# Collect the names of those ten variables and select them as predictors
features = corrs.index.tolist()
X = df[features]

In [6]:
# Create poly instance (no constant/bias column in the output)
poly = PolynomialFeatures(include_bias=False)

# Create polynomial features
X_poly = poly.fit_transform(X)

# Column names for the generated features. `get_feature_names` no longer
# exists in newer scikit-learn releases (replaced by `get_feature_names_out`);
# fall back to the old method so the cell also runs on older installs.
if hasattr(poly, 'get_feature_names_out'):
    poly_names = poly.get_feature_names_out(features)
else:
    poly_names = poly.get_feature_names(features)

# Create new dataframe with SalePrice and our polynomial features.
# Reuse X's index so the SalePrice assignment below aligns row-for-row.
corrs_df = pd.DataFrame(X_poly, columns=poly_names, index=X.index)
corrs_df['SalePrice'] = df['SalePrice']

In [7]:
# Rank the polynomial features by their correlation with SalePrice (top 30)
(
    corrs_df.corr()[['SalePrice']]
    .drop('SalePrice')
    .sort_values(by='SalePrice', ascending=False)
    .head(30)
)

Unnamed: 0,SalePrice
Overall Qual Gr Liv Area,0.873705
Overall Qual 1st Flr SF,0.842434
Overall Qual Total Bsmt SF,0.832307
Overall Qual^2,0.829961
Overall Qual Garage Area,0.826098
Gr Liv Area Total Bsmt SF,0.823888
Overall Qual Garage Cars,0.823318
Gr Liv Area Garage Area,0.809826
Overall Qual Year Built,0.809632
Gr Liv Area Garage Cars,0.808582


# Building a model

We can see that the feature most highly correlated with our target is the interaction term `Overall Qual Gr Liv Area`; the squared term `Overall Qual^2` also ranks in the top four. So let's build a model using these interaction terms!

In [8]:
# Model inputs: four original predictors plus one interaction term
# and one squared term taken from the polynomial frame
features = [
    'Garage Cars',
    'Total Bsmt SF',
    'Overall Qual Gr Liv Area',
    'Overall Qual',
    'Gr Liv Area',
    'Overall Qual^2',
]

y = corrs_df['SalePrice']
X = corrs_df[features]

# Hold out a test split (default size) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [9]:
# Fit a linear regression on the training split
lr = LinearRegression().fit(X_train, y_train)

# Score on the training split
print(f'train: {lr.score(X_train, y_train)}')

# Score on the held-out split
print(f'test: {lr.score(X_test, y_test)}')

# Mean of the 5-fold cross-validation scores on the training split
print(f'cross val score: {cross_val_score(lr, X_train, y_train, cv=5).mean()}')

train: 0.8583114269993146
test: 0.8589191189525888
cross val score: 0.8549616952711011


This is easily the best model we've tried so far!

# Log transforming our target

This MLR with interaction terms looks to be pretty solid, but can we make it even better by combining it with a log transformation of `SalePrice`?

In [10]:
# Natural log of the target; the predictors are unchanged
y_log = np.log(y)

# Re-split with the same seed; this replaces the earlier train/test variables
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    random_state=42,
)

In [11]:
# Fit a linear regression against the log-transformed target
log_lr = LinearRegression().fit(X_train, y_train)

# Score on the training split
print(f'train: {log_lr.score(X_train, y_train)}')

# Score on the held-out split
print(f'test: {log_lr.score(X_test, y_test)}')

# Mean of the 5-fold cross-validation scores on the training split
print(f'cross val score: {cross_val_score(log_lr, X_train, y_train, cv=5).mean()}')

train: 0.8319281453489725
test: 0.8145141504093693
cross val score: 0.8285093648283937


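So the model scores slightly lower with a log transformation of the target. Note that these scores are computed on the log scale, so they are not strictly comparable to the scores above, which are computed on the original `SalePrice` scale.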

# Bring in `test_clean.csv`

In [12]:
# Load the cleaned test data from disk
test = pd.read_csv(filepath_or_buffer='datasets/test_clean.csv')

We need to create our interaction terms in the test dataset.

In [13]:
# Rebuild the list of the ten variables selected from the correlation table
features = list(corrs.index)

# Keep a handle on the frame as loaded, so the Id column can be restored
# below without reading the CSV a second time.
test2 = test

# Apply the already-fitted transformer rather than re-fitting it on test data
X = test2[features]
X_poly = poly.transform(X)

# `get_feature_names` no longer exists in newer scikit-learn releases
# (replaced by `get_feature_names_out`); fall back for older installs.
if hasattr(poly, 'get_feature_names_out'):
    poly_names = poly.get_feature_names_out(features)
else:
    poly_names = poly.get_feature_names(features)

# Reuse X's index so the Id assignment below aligns row-for-row
test = pd.DataFrame(X_poly, columns=poly_names, index=X.index)

# Need to add back in Id column from original test data
test['Id'] = test2['Id']

# Making predictions

In [14]:
# Make submissions directory if it doesn't already exist.
# exist_ok=True makes this a no-op when the directory is already there, but
# still raises if 'submissions' exists as a regular file, so the problem
# surfaces here instead of when the CSV is written.
os.makedirs('submissions', exist_ok=True)

In [15]:
# `features` was reassigned while building the test frame, so restore the
# six columns the model was trained on
features = [
    'Garage Cars',
    'Total Bsmt SF',
    'Overall Qual Gr Liv Area',
    'Overall Qual',
    'Gr Liv Area',
    'Overall Qual^2',
]

In [16]:
# The model predicts log(SalePrice); exponentiate to undo the log transform
test_preds = np.exp(log_lr.predict(test[features]))

# Pair each Id with its prediction
test_preds_df = test[['Id']].assign(SalePrice=test_preds)

test_preds_df.to_csv(
    'submissions/interaction_and_log_transform.csv',
    index=False,
)

# Exporting datasets with polynomial features

In [17]:
# Write the training frame (polynomial features + SalePrice) without the index
corrs_df.to_csv(
    path_or_buf='datasets/train_with_polynomials.csv',
    index=False,
)

In [18]:
# Write the test frame (polynomial features + Id) without the index
test.to_csv(
    path_or_buf='datasets/test_with_polynomials.csv',
    index=False,
)