# Feature Selection Using Lasso Regularisation

https://towardsdatascience.com/feature-selection-using-regularisation-a3678b71e499


## Import Libraries

In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel

## Load dataset


Dataset Source: https://www.kaggle.com/competitions/bnp-paribas-cardif-claims-management/data?select=train.csv.zip

In [49]:
df = pd.read_csv('datasets/train.csv')
df.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,,,C,,9.191265,,,2.30163,...,,,0.598896,AF,,,1.957825,0,,
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,,,C,,,,,,...,,,,Z,,,,0,,


## EDA

### Selecting Numerical Columns

In [50]:
numerical_columns = df.select_dtypes(include=['int32', 'int64', 'float64', 'float32'])
df_copy = numerical_columns
df_copy.head()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,...,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,3.921026,7.915266,2.599278,3.176895,0.012941,9.999999,...,0.803572,8.0,1.98978,0.035754,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,,,,9.191265,,,2.30163,,...,,,,0.598896,,,1.957825,0,,
2,5,1,0.943877,5.310079,4.410969,5.326159,3.979592,3.928571,0.019645,12.666667,...,2.238806,9.333333,2.477596,0.013452,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,4.22593,11.627438,2.0977,1.987549,0.171947,8.965516,...,1.956521,7.018256,1.812795,0.002267,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,,,,,,,,,...,,,,,,,,0,,


In [51]:
numerical_columns.head()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,...,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,3.921026,7.915266,2.599278,3.176895,0.012941,9.999999,...,0.803572,8.0,1.98978,0.035754,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,,,,9.191265,,,2.30163,,...,,,,0.598896,,,1.957825,0,,
2,5,1,0.943877,5.310079,4.410969,5.326159,3.979592,3.928571,0.019645,12.666667,...,2.238806,9.333333,2.477596,0.013452,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,4.22593,11.627438,2.0977,1.987549,0.171947,8.965516,...,1.956521,7.018256,1.812795,0.002267,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,,,,,,,,,...,,,,,,,,0,,


### Dependent and Independent features

In [52]:
X=df_copy.drop(['target', 'ID'], axis=1) # independent features
X.head()

Unnamed: 0,v1,v2,v4,v5,v6,v7,v8,v9,v10,v11,...,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131
0,1.335739,8.727474,3.921026,7.915266,2.599278,3.176895,0.012941,9.999999,0.503281,16.434108,...,0.803572,8.0,1.98978,0.035754,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,,,,9.191265,,,2.30163,,1.31291,,...,,,,0.598896,,,1.957825,0,,
2,0.943877,5.310079,4.410969,5.326159,3.979592,3.928571,0.019645,12.666667,0.765864,14.756098,...,2.238806,9.333333,2.477596,0.013452,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,0.797415,8.304757,4.22593,11.627438,2.0977,1.987549,0.171947,8.965516,6.542669,16.347483,...,1.956521,7.018256,1.812795,0.002267,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,,,,,,,,,1.050328,,...,,,,,,,,0,,


In [53]:
y=df_copy[['target']] # dependent feature
y.head()

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1


### Split data into train and test set


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [55]:
X_train.head()

Unnamed: 0,v1,v2,v4,v5,v6,v7,v8,v9,v10,v11,...,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131
104731,,,,,,,,,1.816192,,...,,,,,,,,0,,
12845,4.736842,9.459733,6.246583,11.143242,3.26221,2.532005,7.219355,6.804733,1.291028,14.299706,...,2.553192,5.621302,2.296689,19.999999,1.627792,4.818635,9.988227,0,1.737828,4.655172
13359,,,,,,,,,1.050327,,...,,,,,,,,0,,
97685,,,,,,,,,1.291029,,...,,,,,,,,0,,
86571,1.996627,3.966235,2.383398,8.950573,3.055649,1.305228,3.803387,7.854077,2.341357,15.922267,...,4.049557,5.772532,8.035969,2.46267,1.447307,0.866358,1.808886,0,5.105942,1.295546


### Fix Null Values

In [56]:
X_train.isnull().sum()

v1      39970
v2      39941
v4      39941
v5      39030
v6      39970
        ...  
v127    39970
v128    39030
v129        0
v130    39978
v131    40021
Length: 112, dtype: int64

In [57]:
X_train = X_train.fillna(0)

In [58]:
X_train.isnull().sum()

v1      0
v2      0
v4      0
v5      0
v6      0
       ..
v127    0
v128    0
v129    0
v130    0
v131    0
Length: 112, dtype: int64

### Scaling the data using Standard Scalar

In [59]:
from sklearn.preprocessing import StandardScaler

In [60]:
scalar = StandardScaler()
scalar

StandardScaler()

In [61]:
scalar.fit(X_train)

StandardScaler()

## Applying Feature Selection using Lasso regularisation

- First Specify the lasso regression model, and select a 
suitable alpha.
- Bigger the alpha, lesser the features will be selected.
- Use *SelectFromModel* object to select the features whose coefficients are none-zero.

In [62]:
from sklearn.feature_selection import SelectFromModel

In [69]:
sel_ = SelectFromModel(Lasso(alpha=0.005, random_state=0))
sel_.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

### Visualising features that were kept by the lasso regularisation

In [70]:
sel_.get_support()

array([False, False, False, False, False, False, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False,  True, False,  True, False,  True, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False,  True, False, False])

**Inference:**

Here True basically indicates that the feature is important and should be selected. False indicates that the feature is not important and can be skipped.

### Make a list with the selected features.

In [71]:
selected_features = X_train.columns[(sel_.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_features)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 112
selected features: 14
features with coefficients shrank to zero: 98


###  Identifying the removed features

In [74]:
removed_features = X_train.columns[(sel_.estimator_.coef_==0).ravel().tolist()]
removed_features

Index(['v1', 'v2', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9', 'v11', 'v12', 'v13',
       'v15', 'v16', 'v17', 'v18', 'v19', 'v20', 'v21', 'v25', 'v26', 'v27',
       'v28', 'v29', 'v32', 'v33', 'v34', 'v35', 'v37', 'v39', 'v41', 'v42',
       'v43', 'v44', 'v45', 'v46', 'v48', 'v49', 'v51', 'v53', 'v54', 'v55',
       'v57', 'v58', 'v59', 'v60', 'v61', 'v63', 'v64', 'v65', 'v67', 'v68',
       'v69', 'v70', 'v73', 'v76', 'v77', 'v78', 'v80', 'v81', 'v82', 'v83',
       'v84', 'v85', 'v86', 'v87', 'v88', 'v89', 'v90', 'v92', 'v93', 'v94',
       'v95', 'v96', 'v97', 'v99', 'v101', 'v102', 'v103', 'v104', 'v105',
       'v106', 'v108', 'v111', 'v114', 'v115', 'v116', 'v117', 'v118', 'v120',
       'v121', 'v122', 'v123', 'v124', 'v126', 'v127', 'v128', 'v130', 'v131'],
      dtype='object')

In [75]:
len(removed_features)

98

**Inference:**

- 98 features removed.
- 14 features selected.

### Get records of selected features

In [77]:
X_train[selected_features].head()

Unnamed: 0,v10,v14,v23,v36,v38,v40,v50,v62,v72,v98,v100,v109,v119,v129
104731,1.816192,13.669634,0.0,0.0,0,9.393677,1.385079,1,1,0.0,0.0,0.0,0.0,0
12845,1.291028,11.354501,6.576856e-07,6.032123,0,6.171768,0.955176,1,1,7.03879,1.37787,20.0,1.641135,0
13359,1.050327,12.603405,0.0,0.0,0,8.762029,1.285305,1,1,0.0,0.0,0.0,0.0,0
97685,1.291029,11.636385,0.0,0.0,0,10.79155,2.042403,1,1,0.0,0.0,0.0,0.0,0
86571,2.341357,13.105813,0.003900022,14.996236,0,10.969539,2.174456,1,1,4.348129,0.098478,3.706858,12.905181,0


### Update X_train with selected features

In [78]:
X_train = X_train[selected_features]

In [79]:
X_train.head()

Unnamed: 0,v10,v14,v23,v36,v38,v40,v50,v62,v72,v98,v100,v109,v119,v129
104731,1.816192,13.669634,0.0,0.0,0,9.393677,1.385079,1,1,0.0,0.0,0.0,0.0,0
12845,1.291028,11.354501,6.576856e-07,6.032123,0,6.171768,0.955176,1,1,7.03879,1.37787,20.0,1.641135,0
13359,1.050327,12.603405,0.0,0.0,0,8.762029,1.285305,1,1,0.0,0.0,0.0,0.0,0
97685,1.291029,11.636385,0.0,0.0,0,10.79155,2.042403,1,1,0.0,0.0,0.0,0.0,0
86571,2.341357,13.105813,0.003900022,14.996236,0,10.969539,2.174456,1,1,4.348129,0.098478,3.706858,12.905181,0
