In [1]:
# Import useful libraries
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

from collections import defaultdict

import matplotlib as mpl
import seaborn as sns
sns.set(style ='white',font_scale=1.25)
%matplotlib inline

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error

# Set the warnings filter to 'ignore' so that warnings are not printed in the
# notebook output.
# NOTE(review): this silences every warning category, including deprecation
# warnings raised by the imported libraries — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Unpack the four objects stored in the pickle: the train frame, the target,
# the test frame and the FEATURES lookup (indexed by feature-group name below).
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load files that were produced by a trusted step of this project.
with open('data/processed_data.pkl','rb') as file:
    train,y,test,FEATURES = pickle.load(file)

# Cast the two quality columns to float in both frames.
# The `np.float` alias was deprecated in NumPy 1.20 and removed in 1.24, where
# it raises AttributeError; the builtin `float` yields the same float64 dtype.
for feat in ['ExterQual','KitchenQual']:
    train[feat] = train[feat].astype(float)
    test[feat] = test[feat].astype(float)
# NOTE(review): an earlier, disabled line replaced 'Missing' KitchenQual values
# in test with the train median. The float cast above will raise if such string
# values are still present — presumably handled upstream; confirm.

In [3]:
# Feature groups whose columns get normalized, concatenated in this order.
norm_groups = ('num', 'aug_num', 'eng_num', 'ord_num', 'interactions_num')

# Left-to-right concatenation of the per-group name collections.
feat_to_norm = FEATURES[norm_groups[0]]
for group in norm_groups[1:]:
    feat_to_norm = feat_to_norm + FEATURES[group]

print('Total %i features to normalize' % len(feat_to_norm))

Total 246 features to normalize


In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only; the same fitted statistics are
# then applied to both train and test so no test information leaks into it.
ss = StandardScaler()
ss.fit(train[feat_to_norm])

# Train: scaled feat_to_norm columns first, then every remaining column untouched.
X_num_train_norm = pd.DataFrame(ss.transform(train[feat_to_norm]),columns=feat_to_norm,index=train.index)
X_cat_train = train[[col for col in train.columns if col not in feat_to_norm]]
X_train_preproc = pd.concat([X_num_train_norm,X_cat_train],axis=1)

# Test: same construction, using the scaler fitted on train.
X_num_test_norm = pd.DataFrame(ss.transform(test[feat_to_norm]),columns=feat_to_norm,index=test.index)
X_cat_test = test[[col for col in test.columns if col not in feat_to_norm]]
X_test_preproc = pd.concat([X_num_test_norm,X_cat_test],axis=1)

# The remaining columns are taken in each frame's own column order, so nothing
# guarantees that test matches train. Models fitted on X_train_preproc consume
# columns positionally, so force the training order here. This is a no-op when
# the orders already agree and raises KeyError if test lacks a training column.
if list(X_test_preproc.columns) != list(X_train_preproc.columns):
    X_test_preproc = X_test_preproc[X_train_preproc.columns]

print("X_train_preproc : " + str(X_train_preproc.shape))
print("X_test_preproc : " + str(X_test_preproc.shape))

X_train_preproc : (1454, 466)
X_test_preproc : (1459, 466)


## Ridge

In [5]:
# Fit the ridge model on the preprocessed training data and score it on that
# same data (these are in-sample metrics).
ridge = Ridge(alpha=30).fit(X_train_preproc, y)
y_pred = ridge.predict(X_train_preproc)

# Undo the log1p-style transform with expm1 to score on the original scale too.
y_exp, y_pred_exp = np.expm1(y), np.expm1(y_pred)

# One report per scale: (labels, actual values, fitted values).
reports = (
    (('R-sq:', 'MSE:', 'RMSE:'), y, y_pred),
    (('R-sq orig price:', 'MSE orig price:', 'RMSE orig price:'), y_exp, y_pred_exp),
)

print('Train accuracies:')
for (r2_label, mse_label, rmse_label), actual, fitted in reports:
    mse = mean_squared_error(actual, fitted)
    print('-'*50)
    print(r2_label, r2_score(actual, fitted))
    print(mse_label, mse)
    print(rmse_label, np.sqrt(mse))

Train accuracies:
--------------------------------------------------
R-sq: 0.9374962646711976
MSE: 0.009936699767063854
RMSE: 0.09968299637884012
--------------------------------------------------
R-sq orig price: 0.9388818146952416
MSE orig price: 383926189.8165709
RMSE orig price: 19594.03454668208


In [6]:
# Predict on the preprocessed test set and map predictions back with expm1.
y_test_pred = ridge.predict(X_test_preproc)
y_test_pred_exp = np.expm1(y_test_pred)

# Single-column frame keyed by the test index, written out as CSV.
test_predictions = pd.DataFrame(
    {'SalePrice': y_test_pred_exp},
    index=X_test_preproc.index,
)
test_predictions.to_csv('data/Limbachia_ridge_predictions.csv')

## Lasso

In [7]:
# Fit the lasso model on the preprocessed training data and score it on that
# same data (these are in-sample metrics).
lasso = Lasso(alpha=0.0003).fit(X_train_preproc, y)
y_pred = lasso.predict(X_train_preproc)

# Undo the log1p-style transform with expm1 to score on the original scale too.
y_exp, y_pred_exp = np.expm1(y), np.expm1(y_pred)

# One report per scale: (labels, actual values, fitted values).
reports = (
    (('R-sq:', 'MSE:', 'RMSE:'), y, y_pred),
    (('R-sq orig price:', 'MSE orig price:', 'RMSE orig price:'), y_exp, y_pred_exp),
)

print('Train accuracies:')
for (r2_label, mse_label, rmse_label), actual, fitted in reports:
    mse = mean_squared_error(actual, fitted)
    print('-'*50)
    print(r2_label, r2_score(actual, fitted))
    print(mse_label, mse)
    print(rmse_label, np.sqrt(mse))

Train accuracies:
--------------------------------------------------
R-sq: 0.9410137045535535
MSE: 0.009377505282513417
RMSE: 0.09683752001426625
--------------------------------------------------
R-sq orig price: 0.9446761515810551
MSE orig price: 347527895.72470134
RMSE orig price: 18642.100088903648


In [8]:
# Predict on the preprocessed test set and map predictions back with expm1.
y_test_pred = lasso.predict(X_test_preproc)
y_test_pred_exp = np.expm1(y_test_pred)

# Single-column frame keyed by the test index, written out as CSV.
test_predictions = pd.DataFrame(
    {'SalePrice': y_test_pred_exp},
    index=X_test_preproc.index,
)
test_predictions.to_csv('data/Limbachia_lasso_predictions.csv')

## ElasticNet(l1_ratio=0.9,alpha=0.0006)

In [9]:
# Fit the elastic-net model on the preprocessed training data and score it on
# that same data (these are in-sample metrics).
enet = ElasticNet(l1_ratio=0.9,alpha=0.0006).fit(X_train_preproc, y)
y_pred = enet.predict(X_train_preproc)

# Undo the log1p-style transform with expm1 to score on the original scale too.
y_exp, y_pred_exp = np.expm1(y), np.expm1(y_pred)

# One report per scale: (labels, actual values, fitted values).
reports = (
    (('R-sq:', 'MSE:', 'RMSE:'), y, y_pred),
    (('R-sq orig price:', 'MSE orig price:', 'RMSE orig price:'), y_exp, y_pred_exp),
)

print('Train accuracies:')
for (r2_label, mse_label, rmse_label), actual, fitted in reports:
    mse = mean_squared_error(actual, fitted)
    print('-'*50)
    print(r2_label, r2_score(actual, fitted))
    print(mse_label, mse)
    print(rmse_label, np.sqrt(mse))

Train accuracies:
--------------------------------------------------
R-sq: 0.93423068574252
MSE: 0.010455853977755752
RMSE: 0.10225387023362857
--------------------------------------------------
R-sq orig price: 0.936444670871035
MSE orig price: 399235599.57402647
RMSE orig price: 19980.880850804013


In [10]:
# Predict on the preprocessed test set and map predictions back with expm1.
y_test_pred = enet.predict(X_test_preproc)
y_test_pred_exp = np.expm1(y_test_pred)

# Single-column frame keyed by the test index, written out as CSV.
test_predictions = pd.DataFrame(
    {'SalePrice': y_test_pred_exp},
    index=X_test_preproc.index,
)
test_predictions.to_csv('data/Limbachia_enet_predictions.csv')