In [1]:
import pandas as pd
import numpy as np

import pickle

from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures




In [2]:
# Name of the column the models are trained to predict.
target = 'saleprice'

# Engineered training data and the competition set, both keyed by `id`.
train = pd.read_csv('../data/engineered/train.csv', index_col='id')
submission_input = pd.read_csv('../data/engineered/test.csv', index_col='id')

# Features are every column except the target.
feature_columns = [name for name in train.columns if name != target]
X = train[feature_columns]
y = train[target]

# Models are fit on log1p(price); predictions are mapped back with exp(...) - 1.
y_log = np.log1p(y)

In [3]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.3,
    random_state=13,
)

In [4]:
# Baseline model: standardise the features, then fit ordinary least squares.
ss = StandardScaler()
lr = LinearRegression()
pipe_lin_reg = Pipeline(steps=[('ss', ss), ('lin_reg', lr)])

# There is nothing to tune for plain linear regression, so the grid is empty.
params = dict()
gs_lin_rg = GridSearchCV(estimator=pipe_lin_reg, param_grid=params)

In [5]:
# Fit the scaler + linear-regression pipeline on the training split (log1p target).
gs_lin_rg.fit(X=X_train, y=y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lin_reg', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))]),
       fit_params={}, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [6]:
# Score of the linear baseline on the training split (log1p target).
gs_lin_rg.score(X=X_train, y=y_train)

0.88834527445338651

In [7]:
# Score of the linear baseline on the held-out split (log1p target).
gs_lin_rg.score(X=X_test, y=y_test)

0.86081944142064148

In [8]:
# Hold-out RMSE of the linear baseline, measured on the log1p scale.
mean_squared_error(
    y_true=y_test,
    y_pred=gs_lin_rg.predict(X_test),
) ** 0.5

0.15192569999243774

In [9]:
# Hold-out RMSE of the linear baseline after undoing the log1p transform
# (exp(...) - 1), i.e. in the original units of the target.
mean_squared_error(
    y_true=np.exp(y_test) - 1,
    y_pred=np.exp(gs_lin_rg.predict(X_test)) - 1,
) ** 0.5

41766.580016300912

In [10]:
# Training-split RMSE of the linear baseline in the original units of the
# target, for comparison against the hold-out figure.
mean_squared_error(
    y_true=np.exp(y_train) - 1,
    y_pred=np.exp(gs_lin_rg.predict(X_train)) - 1,
) ** 0.5

29608.928762916341

In [11]:
# Lasso model: standardise the features, then fit an L1-penalised regression.
ss = StandardScaler()
lass = Lasso(random_state=41)
pipe_las_reg = Pipeline(steps=[('ss', ss), ('lasso_reg', lass)])

# Search 100 evenly spaced alphas in [0.001, 0.005]; max_iter is raised to
# 10000 for every candidate rather than tuned.
params = dict(
    lasso_reg__alpha=np.linspace(0.001, 0.005, 100),
    lasso_reg__max_iter=[10000],
)
gs_las_rg = GridSearchCV(estimator=pipe_las_reg, param_grid=params, cv=5)

In [12]:
# Run the 5-fold alpha search for the lasso pipeline on the training split.
gs_las_rg.fit(X=X_train, y=y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso_reg', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=41,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'lasso_reg__alpha': array([ 0.001  ,  0.00104, ...,  0.00496,  0.005  ]), 'lasso_reg__max_iter': [10000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [13]:
# Parameter combination selected by the 5-fold grid search over the lasso pipeline.
gs_las_rg.best_params_

{'lasso_reg__alpha': 0.0015656565656565658, 'lasso_reg__max_iter': 10000}

In [14]:
# Score of the tuned lasso pipeline on the training split (log1p target).
gs_las_rg.score(X=X_train, y=y_train)

0.88527542783057989

In [15]:
# Score of the tuned lasso pipeline on the held-out split (log1p target).
gs_las_rg.score(X=X_test, y=y_test)

0.86342682721580266

In [16]:
# Hold-out RMSE of the tuned lasso pipeline, measured on the log1p scale.
mean_squared_error(
    y_true=y_test,
    y_pred=gs_las_rg.predict(X_test),
) ** 0.5

0.1504958963119728

In [17]:
# Hold-out RMSE of the tuned lasso pipeline after undoing the log1p transform
# (exp(...) - 1), i.e. in the original units of the target.
mean_squared_error(
    y_true=np.exp(y_test) - 1,
    y_pred=np.exp(gs_las_rg.predict(X_test)) - 1,
) ** 0.5

42080.992344546481

In [18]:
# Training-split RMSE of the tuned lasso pipeline in the original units of the
# target, for comparison against the hold-out figure.
mean_squared_error(
    y_true=np.exp(y_train) - 1,
    y_pred=np.exp(gs_las_rg.predict(X_train)) - 1,
) ** 0.5

30325.628353951841

In [19]:
# Persist the fitted lasso grid search so it can be reused without refitting.
# To load it back (only unpickle files you created yourself):
#     with open('./models/lasso.pickle', 'rb') as model_file:
#         load_model = pickle.load(model_file)
with open('./models/lasso.pickle', "wb") as model_file:
    pickle.dump(gs_las_rg, model_file)

In [20]:
# Predict the competition set with the tuned lasso pipeline and undo the
# log1p transform so the submitted values are on the original target scale.
lasso_pred = np.exp(gs_las_rg.predict(submission_input)) - 1

# One `SalePrice` column, indexed by the competition ids.
submission = pd.DataFrame(
    data={'SalePrice': lasso_pred},
    index=submission_input.index,
)
submission.sort_index().to_csv('./submissions/lasso_submission.csv')


In [21]:
# Ridge model: standardise the features, then fit an L2-penalised regression.
ss = StandardScaler()
ridge = Ridge(random_state=41)
pipe_rid_rg = Pipeline(steps=[('ss', ss), ('ridge_reg', ridge)])

# Search 100 evenly spaced alphas in [1, 10]; max_iter is set to 10000 for
# every candidate rather than tuned. `cv` is left at the library default.
params = dict(
    ridge_reg__alpha=np.linspace(1, 10, 100),
    ridge_reg__max_iter=[10000],
)
gs_rid_rg = GridSearchCV(estimator=pipe_rid_rg, param_grid=params)

In [22]:
# Run the alpha search for the ridge pipeline on the training split.
gs_rid_rg.fit(X=X_train, y=y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ridge_reg', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=41, solver='auto', tol=0.001))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ridge_reg__alpha': array([  1.     ,   1.09091, ...,   9.90909,  10.     ]), 'ridge_reg__max_iter': [10000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [23]:
# Score of the tuned ridge pipeline on the training split (log1p target).
gs_rid_rg.score(X=X_train, y=y_train)

0.88816761062708127

In [24]:
# Score of the tuned ridge pipeline on the held-out split (log1p target).
gs_rid_rg.score(X=X_test, y=y_test)

0.86112156678270402

In [25]:
# Hold-out RMSE of the tuned ridge pipeline, measured on the log1p scale.
mean_squared_error(
    y_true=y_test,
    y_pred=gs_rid_rg.predict(X_test),
) ** 0.5

0.15176071450784359

In [26]:
# Hold-out RMSE of the tuned ridge pipeline after undoing the log1p transform
# (exp(...) - 1), i.e. in the original units of the target.
mean_squared_error(
    y_true=np.exp(y_test) - 1,
    y_pred=np.exp(gs_rid_rg.predict(X_test)) - 1,
) ** 0.5

42411.818726188285

In [27]:
# Training-split RMSE of the tuned ridge pipeline in the original units of the
# target, for comparison against the hold-out figure.
mean_squared_error(
    y_true=np.exp(y_train) - 1,
    y_pred=np.exp(gs_rid_rg.predict(X_train)) - 1,
) ** 0.5

29727.718881048928

In [28]:
# Persist the fitted ridge grid search so it can be reused without refitting.
# To load it back (only unpickle files you created yourself):
#     with open('./models/ridge.pickle', 'rb') as model_file:
#         load_model = pickle.load(model_file)
with open('./models/ridge.pickle', "wb") as model_file:
    pickle.dump(gs_rid_rg, model_file)

In [29]:
# Predict the competition set with the tuned ridge pipeline. The model was fit
# on log1p(saleprice), so exp(...) - 1 maps predictions back to prices.
ridge_pred = gs_rid_rg.predict(submission_input)
ridge_pred = np.exp(ridge_pred) - 1

# One `SalePrice` column, indexed by the competition ids.
ridge_pred = pd.DataFrame(data=
                         {
                             'SalePrice':ridge_pred
                         },
                         index=submission_input.index)

# Write the ridge frame itself. `submission` holds the lasso predictions, so
# saving it here would make ridge_submission.csv a copy of the lasso file.
ridge_pred.sort_index().to_csv('./submissions/ridge_submission.csv')

In [30]:
# Elastic-net model: polynomial feature expansion (PolynomialFeatures default
# degree, no bias column), then standardisation, then the penalised regression.
ss = StandardScaler()
elast = ElasticNet(random_state=41)
pf = PolynomialFeatures(include_bias=False)

pipe_ela_rg = Pipeline([
    ('pf', pf),
    ('ss', ss),
    ('ela_reg',elast)
])

# Candidate alphas are 500 draws from an exponential distribution with scale 1.
# They come from a dedicated, seeded generator so the grid, the selected model
# and the pickled artefact are identical on every Restart & Run All; the global
# np.random state would give a different grid on each run.
alpha_rng = np.random.RandomState(41)
params = {
    'ela_reg__alpha':alpha_rng.exponential(1, 500),
    'ela_reg__max_iter':[10000]
}
gs_ela_rg = GridSearchCV(pipe_ela_rg, param_grid=params)



In [31]:
# Run the alpha search for the polynomial elastic-net pipeline on the training split.
gs_ela_rg.fit(X=X_train, y=y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pf', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ela_reg', ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=41, selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ela_reg__alpha': array([ 1.78318,  3.37548, ...,  0.02842,  2.9765 ]), 'ela_reg__max_iter': [10000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [32]:
# Score of the tuned elastic-net pipeline on the training split (log1p target).
gs_ela_rg.score(X=X_train, y=y_train)

0.89855767260989805

In [33]:
# Score of the tuned elastic-net pipeline on the held-out split (log1p target).
gs_ela_rg.score(X=X_test, y=y_test)

0.881776359748219

In [34]:
# Hold-out RMSE of the tuned elastic-net pipeline, measured on the log1p scale.
mean_squared_error(
    y_true=y_test,
    y_pred=gs_ela_rg.predict(X_test),
) ** 0.5

0.14002130542889368

In [35]:
# Hold-out RMSE of the tuned elastic-net pipeline after undoing the log1p
# transform (exp(...) - 1), i.e. in the original units of the target.
mean_squared_error(
    y_true=np.exp(y_test) - 1,
    y_pred=np.exp(gs_ela_rg.predict(X_test)) - 1,
) ** 0.5

29025.763911250549

In [36]:
# Training-split RMSE of the tuned elastic-net pipeline in the original units
# of the target, for comparison against the hold-out figure.
mean_squared_error(
    y_true=np.exp(y_train) - 1,
    y_pred=np.exp(gs_ela_rg.predict(X_train)) - 1,
) ** 0.5

24976.062589207744

In [37]:
# Persist the fitted elastic-net grid search so it can be reused without refitting.
# To load it back (only unpickle files you created yourself):
#     with open('./models/elastic_net_2.pickle', 'rb') as model_file:
#         load_model = pickle.load(model_file)
with open('./models/elastic_net_2.pickle', "wb") as model_file:
    pickle.dump(gs_ela_rg, model_file)

In [38]:
# Predict the competition set with the tuned elastic-net pipeline, undo the
# log1p transform (exp(...) - 1), and wrap the prices in a one-column frame
# indexed by the competition ids.
elast_pred2 = pd.DataFrame(
    data={'SalePrice': np.exp(gs_ela_rg.predict(submission_input)) - 1},
    index=submission_input.index,
)
elast_pred2.sort_index().to_csv('./submissions/elast_submission.csv')