In [18]:
import pandas as pd
import numpy as np

import datetime

import pickle

from itertools import chain

from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.neighbors import KNeighborsRegressor

import pickle

In [3]:
train = pd.read_csv('../../data/engineered/training.csv', index_col='id')
submission_input = pd.read_csv('../../data/engineered/test.csv', index_col='id')

target = 'SalePrice'
y = train[target]
X = train[[col for col in train.columns if col != target]]
y_log = np.log1p(y)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=.3, random_state=13)

In [5]:
ss = StandardScaler()
pf = PolynomialFeatures()
pca = PCA(n_components=20)
sp = SelectFromModel(Lasso())
lr = LinearRegression()


pipe_lin_reg = Pipeline([
    ('ss', ss),
#     ('pf', pf),
    ('pca', pca),
    ('sp', sp),
    ('lin_reg', lr)
])

params = {
}
gs_lin_rg = GridSearchCV(pipe_lin_reg, param_grid=params)

In [160]:
gs_lin_rg.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=20, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('sp', SelectFromModel(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
 ...d=None)), ('lin_reg', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))]),
       fit_params={}, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [161]:
gs_lin_rg.score(X_train, y_train)

0.7156157539917796

In [162]:
gs_lin_rg.score(X_test, y_test)

0.6979061016979401

In [163]:
mean_squared_error(y_test, gs_lin_rg.predict(X_test))**.5

0.22382723778730435

In [164]:
mean_squared_error(np.exp(y_test) - 1, np.exp(gs_lin_rg.predict(X_test)) - 1)**.5

47633.706374421185

In [165]:
mean_squared_error(np.exp(y_train) - 1, np.exp(gs_lin_rg.predict(X_train)) - 1)**.5

45638.18703963579

In [167]:
ss = StandardScaler()
pca = PCA(n_components=40)
sp = SelectFromModel(Lasso())
lass = Lasso()

pipe_las_reg = Pipeline([
    ('ss', ss),
    ('pca', pca),
    ('sp', sp),
    ('lasso_reg', lass)
])

params = {
    'lasso_reg__alpha':np.random.exponential(1,100),
    'lasso_reg__max_iter':[10000]
}
gs_las_rg = GridSearchCV(pipe_las_reg, param_grid=params)

In [168]:
gs_las_rg.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=40, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('sp', SelectFromModel(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
 ...e=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'lasso_reg__alpha': array([0.4143 , 2.23966, ..., 3.47567, 0.0748 ]), 'lasso_reg__max_iter': [10000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [169]:
gs_las_rg.best_params_

{'lasso_reg__alpha': 0.012952667379493148, 'lasso_reg__max_iter': 10000}

In [170]:
gs_las_rg.score(X_train, y_train)

0.7155530176873929

In [171]:
gs_las_rg.score(X_test, y_test)

0.6977681583781946

In [172]:
mean_squared_error(y_test, gs_las_rg.predict(X_test))**.5

0.22387833439764226

In [173]:
mean_squared_error(np.exp(y_test) - 1, np.exp(gs_las_rg.predict(X_test)) - 1)**.5

47712.448231513

In [174]:
mean_squared_error(np.exp(y_train) - 1, np.exp(gs_las_rg.predict(X_train)) - 1)**.5

45712.15081715503

In [19]:
# with open('lasso.pickle', "wb") as pickle_model:
#     pickle.dump(gs_las_rg, pickle_model)


# # with open('lasso.pickle', "rb") as pickle_model:
# #     load_model = pickle.load(pickle_model)

In [20]:
# lasso_pred = gs_las_rg.predict(submission_input)
# lasso_pred = np.exp(lasso_pred) - 1
# submission = pd.DataFrame(data=
#                          {
#                              'SalePrice':lasso_pred
#                          },
#                          index=submission_input.index)
# submission.sort_index().to_csv('lasso_submission.csv')


In [21]:
ss = StandardScaler()
ridge = Ridge()

pipe_rid_rg = Pipeline([
    ('ss', ss),
    ('ridge_reg',ridge)
])

params = {
    'ridge_reg__alpha':np.linspace(1, 10, 100),
    'ridge_reg__max_iter':[10000]
}
gs_rid_rg = GridSearchCV(pipe_rid_rg, param_grid=params)

In [22]:
gs_rid_rg.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ridge_reg', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ridge_reg__alpha': array([  1.     ,   1.09091, ...,   9.90909,  10.     ]), 'ridge_reg__max_iter': [10000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [23]:
gs_rid_rg.score(X_train, y_train)

0.91094353590147281

In [24]:
gs_rid_rg.score(X_test, y_test)

0.87409149022896093

In [25]:
mean_squared_error(y_test, gs_rid_rg.predict(X_test))**.5

0.14450055006903961

In [26]:
mean_squared_error(np.exp(y_test) - 1, np.exp(gs_rid_rg.predict(X_test)) - 1)**.5

39225.996832202865

In [27]:
mean_squared_error(np.exp(y_train) - 1, np.exp(gs_rid_rg.predict(X_train)) - 1)**.5

23751.549415132809

In [28]:
with open('ridge.pickle', "wb") as pickle_model:
    pickle.dump(gs_rid_rg, pickle_model)


# with open('ridge.pickle', "rb") as pickle_model:
#     load_model = pickle.load(pickle_model)

In [29]:
ridge_pred = gs_rid_rg.predict(submission_input)
ridge_pred = np.exp(ridge_pred) - 1
ridge_pred = pd.DataFrame(data=
                         {
                             'SalePrice':ridge_pred
                         },
                         index=submission_input.index)
submission.sort_index().to_csv('ridge_submission.csv')

In [30]:
ss = StandardScaler()
elast = ElasticNet()

pipe_ela_rg = Pipeline([
    ('ss', ss),
    ('ela_reg',elast)
])

params = {
    'ela_reg__alpha':np.random.exponential(1, 1000),
    'ela_reg__max_iter':[10000]
}
gs_ela_rg = GridSearchCV(pipe_ela_rg, param_grid=params)



In [31]:
gs_ela_rg.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ela_reg', ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ela_reg__alpha': array([ 3.31211,  0.02122, ...,  2.18987,  2.72493]), 'ela_reg__max_iter': [10000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [32]:
gs_ela_rg.score(X_train, y_train)

0.90107750796049135

In [33]:
gs_ela_rg.score(X_test, y_test)

0.88202142263467043

In [34]:
mean_squared_error(y_test, gs_ela_rg.predict(X_test))**.5

0.13987610677898943

In [35]:
mean_squared_error(np.exp(y_test) - 1, np.exp(gs_ela_rg.predict(X_test)) - 1)**.5

35883.592959387097

In [36]:
mean_squared_error(np.exp(y_train) - 1, np.exp(gs_ela_rg.predict(X_train)) - 1)**.5

26436.170031200807

In [37]:
with open('elastic_net.pickle', "wb") as pickle_model:
    pickle.dump(gs_ela_rg, pickle_model)


# with open('elastic_net.pickle', "rb") as pickle_model:
#     load_model = pickle.load(pickle_model)

In [38]:
elast_pred = gs_ela_rg.predict(submission_input)
elast_pred = np.exp(elast_pred) - 1
elast_pred = pd.DataFrame(data=
                         {
                             'SalePrice':elast_pred
                         },
                         index=submission_input.index)
submission.sort_index().to_csv('elast_submission.csv')

In [196]:
ss = StandardScaler()

pca = PCA(n_components=50)
sp = SelectFromModel(Lasso())

elast = ElasticNet()

pipe_ela_rg2 = Pipeline([
    ('ss', ss),
    ('pca', pca),
    ('sp', sp),
    ('ela_reg',elast)
])

params = {
    'ela_reg__alpha':np.random.exponential(1, 100),
    'ela_reg__max_iter':[10000]
}
gs_ela_rg2 = GridSearchCV(pipe_ela_rg2, param_grid=params)



In [197]:
gs_ela_rg2.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('sp', SelectFromModel(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
 ...alse, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ela_reg__alpha': array([2.47136, 0.19964, ..., 0.05093, 0.25578]), 'ela_reg__max_iter': [10000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [198]:
gs_ela_rg2.score(X_train, y_train)

0.7156014970799178

In [199]:
gs_ela_rg2.score(X_test, y_test)

0.6978554157805125

In [200]:
mean_squared_error(y_test, gs_ela_rg2.predict(X_test))**.5

0.22384601409018434

In [201]:
mean_squared_error(np.exp(y_test) - 1, np.exp(gs_ela_rg2.predict(X_test)) - 1)**.5

47670.26199861496

In [202]:
mean_squared_error(np.exp(y_train) - 1, np.exp(gs_ela_rg2.predict(X_train)) - 1)**.5

45672.28036147853

In [34]:
# with open('elastic_net_2.pickle', "wb") as pickle_model:
#     pickle.dump(gs_ela_rg, pickle_model)


# # with open('elastic_net.pickle', "rb") as pickle_model:
# #     load_model = pickle.load(pickle_model)

In [38]:
# elast_pred2 = gs_ela_rg.predict(submission_input)
# elast_pred2 = np.exp(elast_pred2) - 1
# elast_pred2 = pd.DataFrame(data=
#                          {
#                              'SalePrice':elast_pred2
#                          },
#                          index=submission_input.index)
# elast_pred2.sort_index().to_csv('elast_submission_2.csv')