In [58]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy  as np
import pandas as pd
from scipy import stats
from statsmodels.graphics.gofplots import qqplot
from sklearn import linear_model
plt.style.use('ggplot')
%matplotlib inline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold


In [59]:
def plot_cv_testscores_r(model):
    scores = model.cv_results_['mean_test_score']
    scores_std = model.cv_results_['std_test_score']
    plt.figure().set_size_inches(8, 6)
    plt.semilogx(alphas_ridge, scores)
    
    std_error = scores_std / np.sqrt(n_folds)
    
    plt.semilogx(alphas_ridge, scores + std_error, 'b--')
    plt.semilogx(alphas_ridge, scores - std_error, 'b--')
    
    # alpha=0.2 controls the translucency of the fill color
    plt.fill_between(alphas_ridge, scores + std_error, scores - std_error, alpha=0.2)
    
    plt.ylabel('CV score +/- std error')
    plt.xlabel('alpha')
    plt.axhline(np.max(scores), linestyle='--', color='.5')
    plt.xlim([alphas_ridge[0], alphas_ridge[-1]])
    plt.ylim(0.8,1)
    


In [60]:
def plot_cv_traintestscores_r(model):
    testscores = model.cv_results_['mean_test_score']
    trainscores = model.cv_results_['mean_train_score']
    plt.figure().set_size_inches(8, 6)
    plt.semilogx(alphas_ridge, testscores)
    plt.semilogx(alphas_ridge, trainscores, 'b--')
    plt.ylim(0.8,1)
    


In [61]:
raw = pd.read_csv("./s2_clean_dummified.csv")

In [62]:
raw.shape

(1458, 145)

In [63]:
sale_price = raw['LogSalePrice']
raw = raw.drop(['LogSalePrice','Id'],axis=1)

In [64]:
X = raw.copy()
Y = sale_price.copy()


In [65]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
ridge = Ridge()
lasso = Lasso()
net   = ElasticNet()

In [66]:
#running ridge with alpha 0 (MLR)
ridge.set_params(alpha = 0, normalize = True)
ridge.fit(X, Y)
ridge.score(X, Y)

0.9302064346986071

In [67]:
alphas_ridge = np.linspace(0,10,50)
tuned_parameters_r = [{'alpha': alphas_ridge}]
n_folds = 5
cv = KFold(n_splits=n_folds, shuffle=True)


In [68]:
tune_ridge = GridSearchCV(ridge, tuned_parameters_r, 
                          cv=cv, refit=True, return_train_score = True, 
                          scoring = 'neg_mean_squared_error')


In [69]:
tune_ridge.fit(X,Y)



GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=Ridge(alpha=0, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
   random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'alpha': array([ 0.     ,  0.20408,  0.40816,  0.61224,  0.81633,  1.02041,
        1.22449,  1.42857,  1.63265,  1.83673,  2.04082,  2.2449 ,
        2.44898,  2.65306,  2.85714,  3.06122,  3.26531,  3.46939,
        3.67347,  3.87755,  4.08163,  4.28571,  4.4898 ,  4.69388,
        4....35,
        8.57143,  8.77551,  8.97959,  9.18367,  9.38776,  9.59184,
        9.79592, 10.     ])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [70]:
print(tune_ridge.best_params_)
print(np.max(tune_ridge.cv_results_['mean_test_score']))
print(np.min(tune_ridge.cv_results_['mean_test_score']))

{'alpha': 0.20408163265306123}
-0.015791971153047462
-1.4278823684828534e+25


In [71]:
#running ridge with CV (0.204) alpha 
ridge.set_params(alpha = 0.204081632, normalize = True)
ridge.fit(X, Y)
ridge.score(X, Y)

0.919612006241331

In [72]:
#fitting lasso with alpha 0 (MLR)
lasso.set_params(alpha = 0, normalize = True, max_iter = 10000)
lasso.fit(X, Y)
lasso.score(X, Y)

  This is separate from the ipykernel package so we can avoid doing imports until
  positive)


0.9302137096222629

In [73]:
alphas_lasso = np.logspace(-7, -2, 50)
tuned_parameters_l = [{'alpha': alphas_lasso}]
n_folds = 5
cv = KFold(n_splits=n_folds, shuffle=True)


In [74]:
tune_lasso = GridSearchCV(lasso, tuned_parameters_l, 
                          cv=cv, refit=True, return_train_score = True, 
                          scoring = 'neg_mean_squared_error')

In [75]:
tune_lasso.fit(X,Y)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=Lasso(alpha=0, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'alpha': array([1.00000e-07, 1.26486e-07, 1.59986e-07, 2.02359e-07, 2.55955e-07,
       3.23746e-07, 4.09492e-07, 5.17947e-07, 6.55129e-07, 8.28643e-07,
       1.04811e-06, 1.32571e-06, 1.67683e-06, 2.12095e-06, 2.68270e-06,
       3.39322e-06, 4.29193e-06, 5.42868e-06, 6.86649e-06, 8.6...2.44205e-03, 3.08884e-03,
       3.90694e-03, 4.94171e-03, 6.25055e-03, 7.90604e-03, 1.00000e-02])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [78]:
print(tune_lasso.best_params_)
#print(np.max(-tune_lasso.cv_results_['mean_test_score']))
print(np.min(-tune_lasso.cv_results_['mean_test_score']))


{'alpha': 2.811768697974231e-05}
0.01486016259485676


In [77]:
#fitting lasso with CV alpha (2.811e-05)
lasso.set_params(alpha = 2.811e-05, normalize = True, max_iter = 10000)
lasso.fit(X, Y)
lasso.score(X, Y)


0.9269161023662832

In [82]:
raw_test = pd.read_csv("./s2_clean_dummified_test.csv")

In [83]:
raw_test['Exterior1st_ImStucc'] = 0
raw_test['Exterior1st_Stone'] = 0
raw_test['HouseStyle_2.5Fin'] = 0
test_IDs = raw_test['Id']
raw_test.drop(['Id', 'Exterior1st_Other'], axis = 1, inplace = True)

In [85]:
## running lasso to predict housing values 

predict_lasso = lasso.predict(raw_test)

In [86]:
predict_lasso = np.exp(predict_lasso)

In [87]:
predict_lasso

array([ 88409.22941702, 118177.24698481, 117842.98600651, ...,
       113217.31366552,  89259.07480763, 168872.67869602])

In [93]:
predict_lasso1 = pd.DataFrame(predict_lasso)

In [98]:
test_IDs = pd.DataFrame(test_IDs)

In [100]:
full_pred = pd.concat([predict_lasso1, test_IDs], axis = 0)

In [101]:
full_pred

Unnamed: 0,0,Id
0,88409.229417,
1,118177.246985,
2,117842.986007,
3,146310.486046,
4,153547.935402,
5,119132.960112,
6,129897.500190,
7,115080.426904,
8,120698.099494,
9,87402.990947,
