## Feature Engineering and Model Improvement

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV

In [2]:
def rmse(model, X, y):
    model_preds = model.predict(X)
    mse = mean_squared_error(y, model_preds)
    return mse ** 0.5

In [7]:
df = pd.read_csv('./datasets/cleaned_housing_data.csv', keep_default_na=False, na_values=[''])
test = pd.read_csv('./datasets/test.csv', keep_default_na=False, na_values=[''])
pd.options.display.max_columns = 99
df = df.drop(columns=['Unnamed: 0'])

In [133]:
features_A=[
    'Overall Qual',
    'Overall Cond',
    'Year Built',
    'Year Remod/Add',
    'Total Bsmt SF',
    '1st Flr SF',
    'Gr Liv Area',
    'TotRms AbvGrd',
    'Fireplaces',
    'Garage Yr Blt',
    'Garage Area'
]

In [134]:
num_features = features_A + ['Mas Vnr Area', 'Full Bath', 'BsmtFin SF 1', 'Garage Cars']

### Best Working Model

In [20]:
#Define predictor and target variable(s):
X=df[num_features]
y=df['SalePrice']

In [21]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [22]:
#Fit and Score Pipeline
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('pf', PolynomialFeatures(interaction_only=False, include_bias=False)),
    ('lr', LassoCV(verbose=True))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished
  model = cd_fast.enet_coordinate_descent(


(0.9313352958324199, 0.9188377542426759)

In [23]:
#Calculate rmse for train test splits
rmse(pipe, X_train, y_train), rmse(pipe, X_test, y_test)

(21610.809185260536, 23154.910813046903)

-----

In [30]:
ohe_features = [
    'MS Zoning', 
    'Land Contour', 
    'Neighborhood', 
    'Condition 1', 
    'House Style', 
    'Roof Style', 
    'Exterior 1st', 
    'Mas Vnr Type', 
    'Exter Qual', 
    'Foundation', 
    'Bsmt Qual', 
    'Kitchen Qual', 
    'Functional', 
    'Garage Finish', 
    'Garage Qual', 
    'Pool QC'
]

In [31]:
features = num_features + ohe_features

In [32]:
#Define predictor and target variable(s):
X=df[features]
y=df['SalePrice']

In [33]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [36]:
ohe = OneHotEncoder(sparse=False,handle_unknown='ignore')
ohe.fit(X_train[ohe_features])

#convert training categorical columns
X_train_ohe = pd.DataFrame(
    ohe.transform(X_train[ohe_features]),
    columns=ohe.get_feature_names(ohe_features)
)

#convert testing categorical columns
X_test_ohe = pd.DataFrame(
    ohe.transform(X_test[ohe_features]),
    columns=ohe.get_feature_names(ohe_features)
)

In [37]:
#Adding one hot encoded columns to train test split X's
X_train_full = pd.concat([X_train.reset_index(drop=True), X_train_ohe], axis=1).drop(columns=ohe_features)
X_test_full = pd.concat([X_test.reset_index(drop=True), X_test_ohe], axis=1).drop(columns=ohe_features)

In [38]:
#Fit and Score Pipeline
pipe1=Pipeline([
    ('ss', StandardScaler()),
    ('pf', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('lr', LassoCV(verbose=True))
])

pipe1.fit(X_train_full, y_train)
pipe1.score(X_train_full, y_train), pipe1.score(X_test_full, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
......[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.8min finished


(0.8658333818751494, 0.8508943192834514)

In [39]:
rmse(pipe1, X_train_full, y_train), rmse(pipe1, X_test_full, y_test)

(30208.31410982516, 31384.35298311812)

------

##### Now I will iterate on ohe features to include in the pipe:

In [41]:
pipe1_coefs = pipe1.named_steps['lr'].coef_

In [43]:
pipe1_coefs = pd.DataFrame(pipe1.named_steps['lr'].coef_, index = pipe1.named_steps['pf'].get_feature_names(X_train_full.columns), columns=['coef val'])

In [91]:
pipe1_coefs.sort_values('coef val', ascending=False).head(25)

Unnamed: 0,coef val
Overall Qual,23066.443275
Gr Liv Area,19469.54037
Total Bsmt SF,6169.331571
Garage Area,3988.745027
Neighborhood_NridgHt Bsmt Qual_Ex,2775.821637
Neighborhood_StoneBr Bsmt Qual_Ex,2408.069258
Garage Cars,2359.575505
Year Built,2178.831555
BsmtFin SF 1,1970.077746
Year Remod/Add,1896.752426


In [92]:
pipe1_coefs.sort_values('coef val', ascending=False).tail(25)

Unnamed: 0,coef val
Pool QC_NA Pool QC_TA,-0.0
Land Contour_HLS Roof Style_Gambrel,-0.0
Land Contour_HLS House Style_SLvl,-0.0
Land Contour_HLS House Style_1.5Unf,-0.0
Land Contour_HLS House Style_1Story,0.0
Land Contour_HLS House Style_2.5Fin,-0.0
Land Contour_HLS House Style_2.5Unf,-0.0
Land Contour_HLS House Style_SFoyer,-0.0
Land Contour_HLS House Style_2Story,-0.0
Land Contour_HLS Roof Style_Flat,-0.0


For having 8778 columns, Lasso definitely is a powerful tool for minimizing overfitting. That being said I can probably shave down the number of ohe features.

In [58]:
ohe_features2 = [
    'MS Zoning', 
    'Neighborhood', 
    'Condition 1',
    'Condition 2'
]

In [59]:
features = num_features + ohe_features2

In [60]:
#Define predictor and target variable(s):
X=df[features]
y=df['SalePrice']

In [61]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [62]:
ohe2 = OneHotEncoder(sparse=False,handle_unknown='ignore')
ohe2.fit(X_train[ohe_features2])

#convert training categorical columns
X_train_ohe = pd.DataFrame(
    ohe2.transform(X_train[ohe_features2]),
    columns=ohe2.get_feature_names(ohe_features2)
)

#convert testing categorical columns
X_test_ohe = pd.DataFrame(
    ohe2.transform(X_test[ohe_features2]),
    columns=ohe2.get_feature_names(ohe_features2)
)

In [63]:
#Adding one hot encoded columns to train test split X's
X_train_full = pd.concat([X_train.reset_index(drop=True), X_train_ohe], axis=1).drop(columns=ohe_features2)
X_test_full = pd.concat([X_test.reset_index(drop=True), X_test_ohe], axis=1).drop(columns=ohe_features2)

In [66]:
#Fit and Score Pipeline
pipe2=Pipeline([
    ('ss', StandardScaler()),
    ('pf', PolynomialFeatures(interaction_only=False, include_bias=False)),
    ('lr', LassoCV(verbose=True))
])

pipe2.fit(X_train_full, y_train)
pipe2.score(X_train_full, y_train), pipe2.score(X_test_full, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   46.6s finished


(0.9606334062024935, 0.9228026490607858)

In [67]:
rmse(pipe2, X_train_full, y_train), rmse(pipe2, X_test_full, y_test)

(16363.197473958438, 22582.25377488116)

-----

The model got worse though not by much. I still want to narrow down ohe features included to find the best ones. Instead of populating notebooks full of ohe1,2,3,4,5,n and pipe1,2,3,4,5,n I am going to edit the ohe features2 list and rerun the same model until I arrrive at a desired model to display.

After a few iterations I arrrived at the model displayed above. It is my best model performance yet but shows signs of overfitting. I'm going to use the model to predict test set sales prices for kaggle submission before iterating further.

-----

In [70]:
test = pd.read_csv('./datasets/test.csv', keep_default_na=False, na_values=[''])
test=test.fillna(0)

In [72]:
test_ohe = pd.DataFrame(
    ohe2.transform(test[ohe_features2]),
    columns=ohe2.get_feature_names(ohe_features2)
)

In [73]:
test_subset = pd.concat([test[features].drop(columns=ohe_features2), test_ohe], axis=1).fillna(0)

In [75]:
test['SalePrice'] = pipe2.predict(test_subset)

In [76]:
submission2 = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': test['SalePrice']
})

In [79]:
submission2 = submission2.set_index('Id')

In [80]:
submission2.to_csv('GatesPrediction2.csv')

-----

In [177]:
ohe_features = [
    'MS Zoning', 
    'Neighborhood', 
    'Condition 1',
    'Condition 2',
    #adding the following:
    'Land Contour',
    'Kitchen Qual'
]

features = num_features + ohe_features

In [178]:
#Define predictor and target variable(s):
X=df[features]
y=df['SalePrice']

In [179]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [180]:
ohe = OneHotEncoder(sparse=False,handle_unknown='ignore')
ohe.fit(X_train[ohe_features])

#convert training categorical columns
X_train_ohe = pd.DataFrame(
    ohe.transform(X_train[ohe_features]),
    columns=ohe.get_feature_names(ohe_features)
)

#convert testing categorical columns
X_test_ohe = pd.DataFrame(
    ohe.transform(X_test[ohe_features]),
    columns=ohe.get_feature_names(ohe_features)
)

In [181]:
#Adding one hot encoded columns to train test split X's
X_train_full = pd.concat([X_train.reset_index(drop=True), X_train_ohe], axis=1).drop(columns=ohe_features)
X_test_full = pd.concat([X_test.reset_index(drop=True), X_test_ohe], axis=1).drop(columns=ohe_features)

In [155]:
#Fit and Score Pipeline
pipe3=Pipeline([
    ('ss', StandardScaler()),
    ('pf', PolynomialFeatures(interaction_only=False, include_bias=False)),
    ('lr', LassoCV(verbose=True))
])

pipe3.fit(X_train_full, y_train)
pipe3.score(X_train_full, y_train), pipe3.score(X_test_full, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished
  model = cd_fast.enet_coordinate_descent(


(0.9706598363598034, 0.9256822201252949)

In [156]:
rmse(pipe3, X_train_full, y_train), rmse(pipe3, X_test_full, y_test)

(14126.530709645056, 22157.07605736612)

Exporting model preds to kaggle:

In [161]:
test = pd.read_csv('./datasets/test.csv', keep_default_na=False, na_values=[''])
test=test.fillna(0)

In [162]:
test_ohe = pd.DataFrame(
    ohe.transform(test[ohe_features]),
    columns=ohe.get_feature_names(ohe_features)
)

In [163]:
test_subset = pd.concat([test[features].drop(columns=ohe_features), test_ohe], axis=1).fillna(0)

In [164]:
test['SalePrice'] = pipe3.predict(test_subset)

In [165]:
submission3 = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': test['SalePrice']
})

In [166]:
submission3 = submission3.set_index('Id')

In [167]:
submission3.to_csv('./kaggle_subs/gates_pred3.csv')

-----

In [182]:
#Fit and Score Pipeline (Same as above except interaction_only=True)
pipe4=Pipeline([
    ('ss', StandardScaler()),
    ('pf', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('lr', LassoCV(verbose=True))
])

pipe4.fit(X_train_full, y_train)
pipe4.score(X_train_full, y_train), pipe4.score(X_test_full, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  model = cd_fast.enet_coordinate_descent(
...........................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.6min finished
  model = cd_fast.enet_coordinate_descent(


(0.9767290842570977, 0.9094053482731476)

In [183]:
rmse(pipe4, X_train_full, y_train), rmse(pipe4, X_test_full, y_test)

(12580.879101181486, 24463.430879587053)

Exporting model predictions to kaggle:

In [184]:
test = pd.read_csv('./datasets/test.csv', keep_default_na=False, na_values=[''])
test=test.fillna(0)

In [185]:
test_ohe = pd.DataFrame(
    ohe.transform(test[ohe_features]),
    columns=ohe.get_feature_names(ohe_features)
)

In [186]:
test_subset = pd.concat([test[features].drop(columns=ohe_features), test_ohe], axis=1).fillna(0)

In [187]:
preds4 = pipe4.predict(test_subset)

In [188]:
submission4 = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': preds4
})

In [189]:
submission4 = submission4.set_index('Id')

In [190]:
submission4.to_csv('./kaggle_subs/gates_pred4.csv')

-----

I want to try adding just a couple more categorical values.

In [140]:
ohe_features = [
    'MS Zoning', 
    'Neighborhood', 
    'Condition 1',
    'Condition 2',
    'Land Contour',
    'Kitchen Qual',
    #adding the following:
    'Bsmt Qual',
    'Functional',
    'Garage Qual'
]

features = num_features + ohe_features

In [141]:
#Define predictor and target variable(s):
X=df[features]
y=df['SalePrice']

In [142]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [143]:
ohe = OneHotEncoder(sparse=False,handle_unknown='ignore')
ohe.fit(X_train[ohe_features])

#convert training categorical columns
X_train_ohe = pd.DataFrame(
    ohe.transform(X_train[ohe_features]),
    columns=ohe.get_feature_names(ohe_features)
)

#convert testing categorical columns
X_test_ohe = pd.DataFrame(
    ohe.transform(X_test[ohe_features]),
    columns=ohe.get_feature_names(ohe_features)
)

In [144]:
#Adding one hot encoded columns to train test split X's
X_train_full = pd.concat([X_train.reset_index(drop=True), X_train_ohe], axis=1).drop(columns=ohe_features)
X_test_full = pd.concat([X_test.reset_index(drop=True), X_test_ohe], axis=1).drop(columns=ohe_features)

In [148]:
#Fit and Score Pipeline
pipe5=Pipeline([
    ('ss', StandardScaler()),
    ('pf', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('lr', LassoCV(verbose=True))
])

pipe5.fit(X_train_full, y_train)
pipe5.score(X_train_full, y_train), pipe5.score(X_test_full, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished


(0.8577380327714458, 0.8395253977695564)

In [149]:
rmse(pipe5, X_train_full, y_train), rmse(pipe5, X_test_full, y_test)

(31106.32154848639, 32558.863631670796)

------

I will make an attempt to use RidgeCV instead of Lasso CV on my best fitting model to show the difference in methodology. 

------

In [191]:
ohe_features = [
    'MS Zoning', 
    'Neighborhood', 
    'Condition 1',
    'Condition 2',
    'Land Contour',
    'Kitchen Qual'
]

features = num_features + ohe_features

In [192]:
#Define predictor and target variable(s):
X=df[features]
y=df['SalePrice']

In [193]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [194]:
ohe = OneHotEncoder(sparse=False,handle_unknown='ignore')
ohe.fit(X_train[ohe_features])

#convert training categorical columns
X_train_ohe = pd.DataFrame(
    ohe.transform(X_train[ohe_features]),
    columns=ohe.get_feature_names(ohe_features)
)

#convert testing categorical columns
X_test_ohe = pd.DataFrame(
    ohe.transform(X_test[ohe_features]),
    columns=ohe.get_feature_names(ohe_features)
)

In [195]:
#Adding one hot encoded columns to train test split X's
X_train_full = pd.concat([X_train.reset_index(drop=True), X_train_ohe], axis=1).drop(columns=ohe_features)
X_test_full = pd.concat([X_test.reset_index(drop=True), X_test_ohe], axis=1).drop(columns=ohe_features)

In [196]:
#Fit and Score Pipeline
pipe_r=Pipeline([
    ('ss', StandardScaler()),
    ('pf', PolynomialFeatures(interaction_only=False, include_bias=False)),
    ('rg', RidgeCV())
])

pipe_r.fit(X_train_full, y_train)
pipe_r.score(X_train_full, y_train), pipe_r.score(X_test_full, y_test)

(0.9826281708886859, 0.8288420810437107)

In [197]:
rmse(pipe_r, X_train_full, y_train), rmse(pipe_r, X_test_full, y_test)

(10869.936982638008, 33625.177342471194)

-----

Running RidgeCV on the best model inputs for LassoCV produces the highest R2 train score I've seen yet but is quite overfit as a model. Lasso penalizes more strongly certain coefficients taking them all the way to 0 which can explain why the lasso model is less susceptible to overfitting.

--------