# Ames Gridsearch
***


- Dummy varaibles are picked when corr > .50
- Polynomial Features were picked when corr > 0.50
- `train_features_for_gridsearch` is already scaled 

**Total of 253 features (250 numeric and 3 dummy columns)**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import ElasticNet, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
train = pd.read_csv('./datasets/train_features_for_gridsearch')
train.head()

Unnamed: 0.1,Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,...,Garage Area^2,Garage Area Yr Sold,Garage Area baths,Yr Sold homeage,Yr Sold baths,Yr Sold garageage,baths^2,Foundation_PConc,Bsmt Qual_Ex,Kitchen Qual_Ex
0,0,-0.078644,0.142227,0.989479,1.092329,-0.741232,-1.108838,-0.040634,0.769779,-0.279441,...,-0.199323,0.008581,0.496281,-0.06721,1.160521,0.203566,1.138397,0,0,0
1,1,0.622656,0.805126,0.60909,0.191491,-0.322705,-0.63451,1.244529,0.769779,1.002738,...,0.182637,0.396922,0.848706,-0.763445,1.158568,-0.68613,1.138397,1,0,0
2,2,-0.779944,-0.620106,1.084576,-0.565901,-0.00213,-0.271195,-0.884084,-1.051232,-0.920531,...,-0.925409,-1.053629,-1.083758,0.6951,-1.194594,1.133449,-1.046041,0,0,0
3,3,-0.779944,1.136575,1.084576,-0.565901,-1.50037,-1.0609,-0.110588,0.769779,0.361648,...,-0.487916,-0.339305,0.181616,-1.061526,1.160521,-1.049756,1.138397,1,0,0
4,4,-0.078644,-2.376787,0.418896,-0.565901,-0.850317,-0.841397,-0.108589,0.769779,-0.279441,...,-0.161367,0.050327,0.127913,2.451726,0.375483,0.971731,0.202209,1,0,0


In [3]:
price = pd.read_csv('./datasets/clean_train.csv')

In [4]:
price['SalePrice'].shape

(2051,)

In [5]:
train.shape

(2051, 168)

In [6]:
#lets drop Unnamed: 0 to make train have only the 23 features we want 
train.drop(labels = 'Unnamed: 0', axis = 1, inplace = True)

In [7]:
train.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,Garage Yr Blt,...,Garage Area^2,Garage Area Yr Sold,Garage Area baths,Yr Sold homeage,Yr Sold baths,Yr Sold garageage,baths^2,Foundation_PConc,Bsmt Qual_Ex,Kitchen Qual_Ex
0,-0.078644,0.142227,0.989479,1.092329,-0.741232,-1.108838,-0.040634,0.769779,-0.279441,-0.112447,...,-0.199323,0.008581,0.496281,-0.06721,1.160521,0.203566,1.138397,0,0,0
1,0.622656,0.805126,0.60909,0.191491,-0.322705,-0.63451,1.244529,0.769779,1.002738,0.73709,...,0.182637,0.396922,0.848706,-0.763445,1.158568,-0.68613,1.138397,1,0,0
2,-0.779944,-0.620106,1.084576,-0.565901,-0.00213,-0.271195,-0.884084,-1.051232,-0.920531,-1.042893,...,-0.925409,-1.053629,-1.083758,0.6951,-1.194594,1.133449,-1.046041,0,0,0
3,-0.779944,1.136575,1.084576,-0.565901,-1.50037,-1.0609,-0.110588,0.769779,0.361648,1.141632,...,-0.487916,-0.339305,0.181616,-1.061526,1.160521,-1.049756,1.138397,1,0,0
4,-0.078644,-2.376787,0.418896,-0.565901,-0.850317,-0.841397,-0.108589,0.769779,-0.279441,-0.881076,...,-0.161367,0.050327,0.127913,2.451726,0.375483,0.971731,0.202209,1,0,0


In [8]:
train.shape

(2051, 167)

In [9]:
#create X and y
X= train
y = price['SalePrice']

# Train Test Split 

In [10]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [11]:
lasso = Lasso()
lasso.fit(X_train, y_train)



Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

# Calculate R-Squared and RMSE Scores

In [12]:
lasso.score(X_train, y_train)

0.9280369942602362

In [13]:
lasso.score(X_holdout, y_holdout)

0.8982563315885601

# Creating Predictions

In [14]:
y_train_pred = lasso.predict(X_train)
y_holdout_pred = lasso.predict(X_holdout)

In [15]:
def rmse(y, yhat):
    mse = mean_squared_error(y, yhat)
    return np.sqrt(mse)

In [16]:
#calc rmse
rmse(y_train, y_train_pred)

21471.45065277749

In [17]:
rmse(y_holdout, y_holdout_pred)

24661.008680354156

# Grid Search Time

check sklearn parameters guide to alter whatever you want on gridsearch

In [18]:
#params
lasso_params = {
    'alpha': np.linspace(0, 1, 20) , #start at .2 end at 1 and give me 8 steps between that 
    
}

In [19]:
#grid search
lasso_gridsearch = GridSearchCV(
    Lasso(),
    lasso_params,
    cv = 8,
    verbose = 1,
    n_jobs =2,
    return_train_score = False
)

In [20]:
#fit to train
lasso_gridsearch.fit(X_train, y_train)

Fitting 8 folds for each of 20 candidates, totalling 160 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.4s
[Parallel(n_jobs=2)]: Done 160 out of 160 | elapsed:   11.0s finished


GridSearchCV(cv=8, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'alpha': array([0.     , 0.05263, 0.10526, 0.15789, 0.21053, 0.26316, 0.31579,
       0.36842, 0.42105, 0.47368, 0.52632, 0.57895, 0.63158, 0.68421,
       0.73684, 0.78947, 0.84211, 0.89474, 0.94737, 1.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=1)

In [21]:
#best params
lasso_gridsearch.best_params_

{'alpha': 0.05263157894736842}

In [22]:
#best score
lasso_gridsearch.best_score_

0.8700855199389407

In [23]:
#save the best model it came up with 
best_lasso = lasso_gridsearch.best_estimator_

In [24]:
best_lasso.coef_

array([ 55466.78798494,  17078.61368862,   4774.03832283,  26492.5774325 ,
         2259.19024164,   7956.43747977,  21294.38381491, -10646.89872196,
        17150.09739484, -16536.19417464, -16481.14653922,   7448.26039813,
         5085.5413657 ,  -5419.90074485,  -3995.11587392,  -7934.65609929,
        -1822.36335573,   3258.25078938, -19136.94321892,  31717.34526027,
       -24754.13684015,  19100.82079442,  19994.73207497, -11277.8224447 ,
       -19427.68979385,   7538.71911108,  -3507.83502343,  -5780.76355547,
         1621.48109206,  15673.83703106,  38695.44330773,  22072.96587038,
        35468.96853722, -28900.8299033 , -13243.88789844,  -6250.90697642,
        -6259.98484635,  12344.92572197, -41182.29120521,  -5460.09291429,
         4415.00352934, -13213.26043038,  50508.56852881,  -5654.37071211,
        -1924.92664402,  18205.00470132,  28732.20541425, -24484.92809727,
       -11067.89540716,  -2528.98514847,   -214.92329371, -30766.26130783,
       -29581.3485315 ,  

In [25]:
#best estimator score 
best_lasso.score(X_train, y_train)

0.9280642982005263

In [26]:
#best estimator score on holdut 
best_lasso.score(X_holdout, y_holdout)

0.8978091279521075

# RMSE

In [27]:
#predict 
y_train_lasso = best_lasso.predict(X_train)
y_holdout_lasso = best_lasso.predict(X_holdout)

#calc rmse
print(rmse(y_train, y_train_lasso))
print(rmse(y_holdout, y_holdout_lasso))

21467.376956718046
24715.14669624479


# Conclusion 

By executing the `cross_val_score`, proceeding with the `Ridge` was the best model for us. Here are the following regression metrics for Ridge:

|Metric|Score|
|---|---|
|Train Score|.9283|
|Test Score|.9031|
|RMSE Train|21,427|
|RMSE Test|24,064|

By executing `GridSearchCV`, we used `ElasticNet` as the model since it involves both Lasso and Ridge. GridSearch will tells us the best ratio between Lasso and Ridge. After looking at the recommended parameters, the best parameters told us to have a L1 ratio of 1. We decided to perform a GridSearch for `Lasso`. We computed Lasso regression metrics. Here are the following metrics for Lasso. 

|Metric|Score|
|---|---|
|Train Score|.9280|
|Test Score|.8978|
|RMSE Train|21,467|
|RMSE Test|24,715|

**Final:** For the features we used in our regression model, it is best to proceed with the Ridge model since the overall test score and RMSE score have a better overall fit with the true sale price of homes compared to the Lasso GridSearch model. 