#### Lab : Regularization and Linear models

In [1]:
import numpy as np
from matplotlib.pyplot import subplots
import pandas as pd

# The modelling libraries
from statsmodels.api import OLS
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler

# The ISLP libraries
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from functools import partial

# Some new imports
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from ISLP.models import (Stepwise, sklearn_selected, sklearn_selection_path)
from l0bnb import fit_path

#### Subset selection methods
- Forward Selection 
- Backward Selection 
- Mixed

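Best subset selection has to fit all $2^p$ candidate models, which becomes computationally infeasible for large $p$ (roughly $p > 40$), so we rely on the stepwise methods listed above instead.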

### Forward Selection

_Dataset Description_

* `AtBat`: Number of times at bat in 1986
* `Hits`: Number of hits in 1986
* `HmRun`: Number of home runs in 1986
* `Runs`: Number of runs in 1986
* `RBI`: Number of runs batted in in 1986
* `Walks`: Number of walks in 1986
* `Years`: Number of years in the major leagues
* `CAtBat`: Number of times at bat during his career
* `CHits`: Number of hits during his career
* `CHmRun`: Number of home runs during his career
* `CRuns`: Number of runs during his career
* `CRBI`: Number of runs batted in during his career
* `CWalks`: Number of walks during his career
* `League`: A factor with levels A and N indicating player's league at the end of 1986
* `Division`: A factor with levels E and W indicating player's division at the end of 1986
* `PutOuts`: Number of put outs in 1986
* `Assists`: Number of assists in 1986
* `Errors`: Number of errors in 1986
* `Salary`: 1987 annual salary on opening day in thousands of dollars
* `NewLeague`: A factor with levels A and N indicating player's league at the beginning of 1987

In this lab we study how a player's `Salary` is influenced by the other variables.

In [3]:
# Load the "Hitters" data set through ISLP's load_data helper and preview its first rows.
data = load_data("Hitters")
data.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [5]:
# Keep only the rows without missing values; the result is bound to a new name
# so the raw frame `data` remains available.
data_neu = data.dropna()
data_neu.shape # (rows, columns): 263 rows, 20 columns in the recorded output

(263, 20)

In [7]:
# C_p statistic: C_p = (RSS + 2 * p * sigma^2) / n, where sigma^2 is an estimate of the residual variance

def nCp(sigma2, estimator, X, Y):
    """Return the negative C_p statistic of a fitted estimator.

    C_p = (RSS + 2 * p * sigma2) / n, with (n, p) taken from ``X.shape``.
    The value is negated so that a search which maximizes its score ends up
    preferring the model with the smallest C_p.

    Parameters
    ----------
    sigma2 : float
        Estimate of the residual variance used in the penalty term.
    estimator : object
        Fitted model exposing ``predict(X)``.
    X : array-like with a two-element ``shape``
        Design matrix the estimator is evaluated on (n rows, p columns).
    Y : array-like
        Observed responses matching ``estimator.predict(X)``.

    Returns
    -------
    float
        ``-(RSS + 2 * p * sigma2) / n``.
    """
    n, p = X.shape
    # Use the Y that was passed in; the previous version read the global `y`,
    # which ignored the argument and failed when no such global existed.
    residuals = Y - estimator.predict(X)
    rss = np.sum(residuals ** 2)
    return -((rss + 2 * p * sigma2) / n)

# Residual variance (sigma^2) comes from the least-squares fit of Salary on a
# design matrix built from every column except Salary.
y = np.array(data_neu["Salary"])
design = MS(data_neu.columns.drop("Salary")).fit(data_neu)
x = design.transform(data_neu)
sigma2 = OLS(y, x).fit().scale

# Bind sigma2 as nCp's first argument; the resulting callable takes the
# remaining (estimator, X, Y) arguments.
neg_cp = partial(nCp, sigma2)

In [None]:
# Define the search strategy: a forward stepwise search over the terms of
# `design`, allowed to add up to all of them (max_terms=len(design.terms)).
# NOTE(review): `first_peak` presumably stops at the first step where the score
# no longer improves; confirm against the ISLP documentation.
strategy = Stepwise.first_peak(design, direction = "forward", max_terms=len(design.terms))

# NOTE(review): despite the `_mse` suffix, this selector is scored with neg_cp
# (the negative C_p statistic), not with MSE.
data_mse = sklearn_selected(OLS, strategy, scoring = neg_cp)
data_mse.fit(data_neu, y)
# Display the terms kept by the selection.
data_mse.selected_state_

('Assists',
 'AtBat',
 'CAtBat',
 'CRBI',
 'CRuns',
 'CWalks',
 'Division',
 'Hits',
 'PutOuts',
 'Walks')