In [1]:
# Import useful libraries
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

from collections import defaultdict

import matplotlib as mpl
import seaborn as sns

from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error

sns.set(style ='white',font_scale=1.25)

%matplotlib inline

In [2]:
with open('data/processed_data.pkl','rb') as file:
    train,y,test,FEATURES = pickle.load(file)

for feat in ['ExterQual','KitchenQual']:
    train[feat] = train[feat].astype(np.float)
    test[feat] = test[feat].astype(np.float)
#test.loc[test['KitchenQual']=='Missing','KitchenQual'] = np.median(train["KitchenQual"])

In [3]:
feat_to_norm = FEATURES['num'] + FEATURES['aug_num'] + FEATURES['eng_num'] + FEATURES['ord_num'] + FEATURES['interactions_num']
print('Total %i features to normalize' %(len(feat_to_norm)))

Total 217 features to normalize


In [4]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(train[feat_to_norm])

X_num_train_norm = pd.DataFrame(ss.transform(train[feat_to_norm]),columns=feat_to_norm,index=train.index)
X_cat_train = train[[col for col in train.columns if col not in feat_to_norm]]
X_train_preproc = pd.concat([X_num_train_norm,X_cat_train],axis=1)

X_num_test_norm = pd.DataFrame(ss.transform(test[feat_to_norm]),columns=feat_to_norm,index=test.index)
X_cat_test = test[[col for col in test.columns if col not in feat_to_norm]]
X_test_preproc = pd.concat([X_num_test_norm,X_cat_test],axis=1)

print("X_train_preproc : " + str(X_train_preproc.shape))
print("X_test_preproc : " + str(X_test_preproc.shape))

X_train_preproc : (1454, 437)
X_test_preproc : (1459, 437)


In [5]:
ridge = Ridge(alpha=30)
ridge.fit(X_train_preproc, y)
y_pred = ridge.predict(X_train_preproc)

print('Train accuracies:')
print('-'*50)
print('R-sq:', r2_score(y, y_pred))
print('MSE:', mean_squared_error(y, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y, y_pred)))

y_exp = np.expm1(y)
y_pred_exp = np.expm1(y_pred)
print('-'*50)
print('R-sq orig price:', r2_score(y_exp, y_pred_exp))
print('MSE orig price:', mean_squared_error(y_exp, y_pred_exp))
print('RMSE orig price:', np.sqrt(mean_squared_error(y_exp, y_pred_exp)))

Train accuracies:
--------------------------------------------------
R-sq: 0.9370209293268082
MSE: 0.010012267484433445
RMSE: 0.10006131862230001
--------------------------------------------------
R-sq orig price: 0.9380860271165936
MSE orig price: 388925089.7585476
RMSE orig price: 19721.183781876472


In [9]:
y_test_pred = ridge.predict(X_test_preproc)
y_test_pred_exp = np.expm1(y_test_pred)

test_predictions = pd.Series(y_test_pred_exp,index=X_test_preproc.index,name='SalePrice').to_frame()
test_predictions.to_csv('data/Limbachia_ridge_predictions.csv')

In [6]:
enet = ElasticNet(l1_ratio=1,alpha=0.0003)
enet.fit(X_train_preproc, y)
y_pred = enet.predict(X_train_preproc)

print('Train accuracies:')
print('-'*50)
print('R-sq:', r2_score(y, y_pred))
print('MSE:', mean_squared_error(y, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y, y_pred)))

y_exp = np.expm1(y)
y_pred_exp = np.expm1(y_pred)
print('-'*50)
print('R-sq orig price:', r2_score(y_exp, y_pred_exp))
print('MSE orig price:', mean_squared_error(y_exp, y_pred_exp))
print('RMSE orig price:', np.sqrt(mean_squared_error(y_exp, y_pred_exp)))

Train accuracies:
--------------------------------------------------
R-sq: 0.9406961689034393
MSE: 0.009427986368226544
RMSE: 0.09709781855544719
--------------------------------------------------
R-sq orig price: 0.9439719127827979
MSE orig price: 351951713.5652946
RMSE orig price: 18760.376157350755


  positive)


In [11]:
y_test_pred = enet.predict(X_test_preproc)
y_test_pred_exp = np.expm1(y_test_pred)

test_predictions = pd.Series(y_test_pred_exp,index=X_test_preproc.index,name='SalePrice').to_frame()
test_predictions.to_csv('data/Limbachia_enet_predictions.csv')