In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
data = pd.read_csv("data/csv/features.csv")
len(data), len(data.columns)

(50, 39)

In [3]:
X, y = data.drop(columns = ['n']), data['n']

In [4]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0, random_state=69)

ValueError: test_size=0 should be either positive and smaller than the number of samples 50 or a float in the (0, 1) range

In [5]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Lasso())
])

In [6]:
search = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.1, 3, 0.1)},
                      cv = 5,
                      scoring = 'neg_mean_squared_error',
                      verbose = 3)

In [7]:
search.fit(X, y)

Fitting 5 folds for each of 29 candidates, totalling 145 fits
[CV 1/5] END .............model__alpha=0.1;, score=-16024.734 total time=   0.0s
[CV 2/5] END ..............model__alpha=0.1;, score=-8290.126 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.1;, score=-6755.527 total time=   0.0s
[CV 4/5] END ..............model__alpha=0.1;, score=-9759.530 total time=   0.0s
[CV 5/5] END .............model__alpha=0.1;, score=-48851.942 total time=   0.0s
[CV 1/5] END .............model__alpha=0.2;, score=-13521.804 total time=   0.0s
[CV 2/5] END ..............model__alpha=0.2;, score=-6185.056 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.2;, score=-5184.841 total time=   0.0s
[CV 4/5] END ..............model__alpha=0.2;, score=-6274.322 total time=   0.0s
[CV 5/5] END .............model__alpha=0.2;, score=-33641.291 total time=   0.0s
[CV 1/5] END model__alpha=0.30000000000000004;, score=-13393.740 total time=   0.0s
[CV 2/5] END model__alpha=0.3000000000000000

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


[CV 4/5] END model__alpha=0.30000000000000004;, score=-5130.901 total time=   0.0s
[CV 5/5] END model__alpha=0.30000000000000004;, score=-16018.705 total time=   0.0s
[CV 1/5] END .............model__alpha=0.4;, score=-12914.326 total time=   0.0s
[CV 2/5] END ..............model__alpha=0.4;, score=-4657.107 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.4;, score=-4353.306 total time=   0.0s
[CV 4/5] END ..............model__alpha=0.4;, score=-4590.406 total time=   0.0s
[CV 5/5] END ..............model__alpha=0.4;, score=-7797.566 total time=   0.0s
[CV 1/5] END .............model__alpha=0.5;, score=-11382.001 total time=   0.0s
[CV 2/5] END ..............model__alpha=0.5;, score=-4110.762 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.5;, score=-4310.199 total time=   0.0s
[CV 4/5] END ..............model__alpha=0.5;, score=-4220.817 total time=   0.0s
[CV 5/5] END ..............model__alpha=0.5;, score=-6693.752 total time=   0.0s
[CV 1/5] END ..........

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


[CV 5/5] END ..............model__alpha=0.6;, score=-5856.686 total time=   0.0s
[CV 1/5] END model__alpha=0.7000000000000001;, score=-8428.377 total time=   0.0s
[CV 2/5] END model__alpha=0.7000000000000001;, score=-3000.334 total time=   0.0s
[CV 3/5] END model__alpha=0.7000000000000001;, score=-4024.018 total time=   0.0s
[CV 4/5] END model__alpha=0.7000000000000001;, score=-3687.251 total time=   0.0s
[CV 5/5] END model__alpha=0.7000000000000001;, score=-5072.754 total time=   0.0s
[CV 1/5] END ..............model__alpha=0.8;, score=-7645.732 total time=   0.0s
[CV 2/5] END ..............model__alpha=0.8;, score=-2476.087 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.8;, score=-3912.416 total time=   0.0s
[CV 4/5] END ..............model__alpha=0.8;, score=-3542.100 total time=   0.0s
[CV 5/5] END ..............model__alpha=0.8;, score=-4865.568 total time=   0.0s
[CV 1/5] END ..............model__alpha=0.9;, score=-7021.044 total time=   0.0s
[CV 2/5] END ..........

[CV 1/5] END model__alpha=2.9000000000000004;, score=-3979.163 total time=   0.0s
[CV 2/5] END model__alpha=2.9000000000000004;, score=-1011.019 total time=   0.0s
[CV 3/5] END model__alpha=2.9000000000000004;, score=-2794.097 total time=   0.0s
[CV 4/5] END model__alpha=2.9000000000000004;, score=-3025.427 total time=   0.0s
[CV 5/5] END model__alpha=2.9000000000000004;, score=-3286.095 total time=   0.0s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', Lasso())]),
             param_grid={'model__alpha': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9])},
             scoring='neg_mean_squared_error', verbose=3)

In [8]:
search.best_params_

{'model__alpha': 2.9000000000000004}

In [9]:
coef = search.best_estimator_[1].coef_

In [11]:
out = X.iloc[:, coef != 0]
out

Unnamed: 0,oil_lag6,oil_lag12,ngf2_lag6,sfrm_lag12,durable_lag12,altsales_lag3
0,50.51,44.83,22,2.86,224061,17.818
1,47.01,56.83,21,2.99,218966,17.863
2,43.58,51.06,62,2.92,227573,17.081
3,43.66,62.21,64,2.85,224339,17.61
4,38.93,60.47,27,2.9,217332,17.64
5,31.27,58.79,103,2.98,232766,16.826
6,29.11,50.51,56,2.95,230189,17.247
7,30.13,47.01,37,2.9,222647,17.288
8,34.33,43.58,57,2.91,218738,17.334
9,42.7,43.66,82,2.89,226032,17.744


In [13]:
out.insert(0, 'n', y, True)

In [14]:
out.to_csv("data/out/lasso.csv")