In [None]:
import pandas as pd
import numpy as np
import util

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Load the training set, using the `id` column as the row index.
data = pd.read_csv('data/train.csv', index_col='id')

# Feature column names: cat0..cat9 (categorical) and cont0..cont13 (continuous).
cat_colz = [f'cat{i}' for i in range(10)]
cont_colz = [f'cont{i}' for i in range(14)]

### Generic Pipeline Stuff

In [None]:
# Random seed (not referenced by the cells visible in this notebook section).
seed = 42
# Scoring string handed to every GridSearchCV below; scores are negated MSE,
# so values closer to zero are better.
metric = 'neg_mean_squared_error'
# Number of parallel workers used by each grid search.
n_jobs = 5

# Shared preprocessing steps, wrapped in a ColumnTransformer by each pipeline:
#   * categorical columns are one-hot encoded into a dense array;
#   * continuous columns are standardised.
# handle_unknown='ignore' encodes a category that was absent from the training
# fold as an all-zero block instead of raising during cross-validation, where
# a rare level may appear only in the held-out fold.
# NOTE: `sparse` was renamed `sparse_output` in scikit-learn 1.2; switch the
# keyword if running on a newer release.
transformers = [('one_hot', OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_colz),
                ('scaler', StandardScaler(), cont_colz)
               ]

### Lasso

Below I test lasso with `alpha` running between 1e-3 and 1. `alpha = 1e-3` scores the best, but going lower doesn't give significantly better performance, the fits take a long time, and we start to run into convergence issues.

Best mean CV score is MSE = 0.746 (RMSE = 0.864)

In [None]:
%%time
# Takes about 40s

pipeline = Pipeline([
    ('prep', ColumnTransformer(transformers)),
    ('lasso', Lasso())
])

params = {'lasso__alpha': [10**(x/2.) for x in range(-6, 1)]}

lasso_search = GridSearchCV(pipeline, params, scoring=metric, n_jobs=n_jobs)
_ = lasso_search.fit(data[cat_colz+cont_colz], data.target)

In [None]:
# Tabulate the lasso grid search: one row per alpha with its mean CV score.
# 'rank_test_score' is included so the table matches the other result tables
# in this notebook and the best alpha can be read off directly.
display_cols = ['param_lasso__alpha', 'mean_test_score', 'rank_test_score']
pd.DataFrame(lasso_search.cv_results_)[display_cols]

### Lasso - Continuous Only
As a comparison, I fit lasso with only the continuous features. With only 14 features, the model is very robust against overfitting, and the regularization has little effect. Varying alpha over 14 orders of magnitude, from 1e-10 to 1e4 (15 grid values), only changes the mean CV MSE from 0.7794 to 0.7873 (RMSE from 0.8829 to 0.8873). The categorical features reduce the RMSE by ~1.9%.

In [None]:
%%time
# Takes ~20s

pipeline = Pipeline([('lasso', Lasso(normalize=True))])

params = {'lasso__alpha': [10**x for x in range(-10, 5)]}

lasso_cont_search = GridSearchCV(pipeline, params, scoring=metric, n_jobs=n_jobs)
_ = lasso_cont_search.fit(data[cont_colz], data.target)

In [None]:
# Tabulate the continuous-only lasso search: alpha, mean CV score and rank.
display_cols = [
    'param_lasso__alpha',
    'mean_test_score',
    'rank_test_score',
]
pd.DataFrame(lasso_cont_search.cv_results_).loc[:, display_cols]

### Ridge
The Ridge model does ~4e-4 better than the lasso model, a negligible improvement, although its performance is more stable over a wide range of `alpha`.

Best mean CV score is MSE = 0.746 (RMSE = 0.864)

In [None]:
%%time
# Takes 30s

pipeline = Pipeline([
    ('prep', ColumnTransformer(transformers)),
    ('ridge', Ridge())
])

params = {'ridge__alpha': [10**(x/2.) for x in range(-6, 7)]}

ridge_search = GridSearchCV(pipeline, params, scoring=metric, n_jobs=n_jobs)
_ = ridge_search.fit(data[cat_colz+cont_colz], data.target)

In [None]:
# Tabulate the ridge grid search: alpha, mean CV score and rank.
display_cols = [
    'param_ridge__alpha',
    'mean_test_score',
    'rank_test_score',
]
pd.DataFrame(ridge_search.cv_results_).loc[:, display_cols]

### ElasticNet

ElasticNet performs in between the Ridge and Lasso fits, which is unsurprising. The best fit is for `l1_ratio = 0.01` (almost all Ridge) and `alpha = 0.01`.

Best mean CV score is MSE = 0.746 (RMSE = 0.864)

In [None]:
%%time
# Takes ~8 min

pipeline = Pipeline([
    ('prep', ColumnTransformer(transformers)),
    ('net', ElasticNet())
])

params = {'net__alpha': [10**(x/2.) for x in range(-4, 4)],
         'net__l1_ratio': [.01, .25, .5, 0.75, 0.99]}

elastic_search = GridSearchCV(pipeline, params, scoring=metric, n_jobs=n_jobs)
_ = elastic_search.fit(data[cat_colz+cont_colz], data.target)

In [None]:
# Tabulate the elastic net search: alpha, l1_ratio, mean CV score and rank.
display_cols = [
    'param_net__alpha',
    'param_net__l1_ratio',
    'mean_test_score',
    'rank_test_score',
]
pd.DataFrame(elastic_search.cv_results_).loc[:, display_cols]