In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import pickle

For the Regularization models, I first loaded the feature selected dataframe and my y target dataframe from my Modelling notebook. I then performed the train/test/split function and fit_transformed the X_train split and transformed the X_validation split.

In [2]:
# Load necessary dataframes

X_train_df = pd.read_csv('../data/X_train_df.csv')
y = pd.read_csv('../data/y.csv')

In [3]:
# Train/test/split, scale, and fit_transform/transform the data

X_train, X_validation, y_train, y_validation = train_test_split(X_train_df, y, random_state=42)
sc = StandardScaler()            

scalar_columns = list(X_train.select_dtypes(exclude='object'))

Z_train = sc.fit_transform(X_train[scalar_columns])
Z_validation = sc.transform(X_validation[scalar_columns]) 

I created my alphas, instantiated and cross-validated the RidgeCV model, and fit my RidgeCV model. I checked the RidgeCV alpha and printed the test scores. I then repeated the process for my LassoCV model and printed the RidgeCV and LassoCV coefficients.

In [4]:
# Create alphas, instantiate and cross-validate, and fit

r_alphas = np.logspace(0, 5, 100)   # Get 100 values from 10^0 to 10^5
ridge_cv = RidgeCV(alphas=r_alphas, scoring='r2', cv=5)   # 5-fold CV
ridge_cv.fit(Z_train, y_train);

In [5]:
# Find the optimal alpha value

ridge_cv.alpha_
# ridge_cv.best_score_ 

1.0

In [6]:
# Print RidgeCV scores

print('Training RidgeCV score: ', ridge_cv.score(Z_train, y_train))
print('Test RidgeCV score: ', ridge_cv.score(Z_validation, y_validation))

Training RidgeCV score:  0.923570616132344
Test RidgeCV score:  0.9236460642441144


In [7]:
# Create alphas, instantiate and cross-validate, and fit

l_alphas = np.logspace(-3, 0, 100)   # 10^-3 = .001 up to 10^0 = 1
lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=50000)
lasso_cv.fit(Z_train, y_train);

  return f(**kwargs)


In [8]:
# Find the optimal alpha value

lasso_cv.alpha_

1.0

In [9]:
# Print LassoCV scores

print('Training LassoCV score: ', lasso_cv.score(Z_train, y_train))
print('Test LassoCV score: ', lasso_cv.score(Z_validation, y_validation))

Training LassoCV score:  0.9238530105960564
Test LassoCV score:  0.9229294247079624


In [10]:
ridge_cv.coef_

array([[ 1.49858386e+03, -6.56884617e+03,  6.94965581e+03,
         9.77808268e+03,  2.59902253e+03,  1.66217192e+04,
         2.63306210e+03,  2.03536579e+04,  3.05404631e+03,
        -2.58070624e+04,  5.25452755e+04, -2.88217664e+04,
         7.93678927e+03,  1.48179009e+03,  1.07448409e+04,
        -3.43860530e+03, -3.85268932e+04,  8.16123908e+03,
         6.11282956e+04,  1.74728925e+03, -9.59141459e+03,
         5.30209019e+02,  2.31509446e+03,  2.00979604e+03,
         1.28214309e+03,  2.62998766e+02,  3.66550958e+03,
         9.95033683e+01,  2.81393373e+02,  2.00781228e+02,
        -2.04991679e+02,  3.77113573e+02, -2.06710340e+02,
         2.25091257e+03,  5.32681992e+03,  5.94204864e+03,
        -1.24495594e+02,  9.69404613e+02,  7.39002925e+02,
         9.43300964e+02,  3.56607250e+03,  5.40009843e+03,
         6.57966774e+02, -3.10750634e+02,  1.04168206e+03,
         1.51089289e+03, -1.01053082e+03, -2.18614621e+02,
        -1.51499135e+03, -4.43254310e+03, -2.27233979e+0

In [11]:
lasso_cv.coef_

array([ 1.49438284e+03, -6.03878889e+03,  6.95632941e+03,  9.87751978e+03,
        2.54137294e+03,  2.54145015e+04,  2.37061065e+03,  2.37779823e+04,
        3.06057301e+03, -2.78662116e+04,  5.60948765e+04, -4.89847155e+04,
        8.10957175e+03,  1.49366081e+03,  1.05876832e+04, -2.91056536e+03,
       -4.74532271e+04,  7.94157427e+03,  8.03061614e+04,  1.58679157e+03,
       -1.21602524e+04,  5.31319335e+02,  2.33169781e+03,  2.00744582e+03,
        1.28325727e+03,  3.33113558e+02,  3.69228400e+03,  6.46053677e+01,
        4.53340959e+02,  1.16910003e+02, -3.11635460e+02,  4.25233658e+02,
       -1.66739278e+02,  2.21088982e+03,  5.17840751e+03,  5.96018333e+03,
       -1.23478821e+02,  9.09097156e+02,  7.37824006e+02,  9.47515901e+02,
        3.52830778e+03,  5.22192246e+03,  7.19653593e+02, -3.71057528e+02,
        1.06682383e+03,  1.53742680e+03, -1.00135398e+03, -2.05325470e+02,
       -1.47645511e+03, -4.45928549e+03, -2.30820017e+03, -2.91818246e+03,
       -2.34393203e+02, -