<h1> Feature Selection using Lasso </h1>

https://www.kaggle.com/code/bextuychiev/lasso-regression-with-pipelines-tutorial

<h2> Importing the Necessary Libraries </h2>

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer #for missing values
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVR

In [2]:
# Column layout of the data files: two identifier columns, three operating
# settings, then 21 sensor channels named s_1 ... s_21.
index_names = ['unit_number', 'time_cycles']
setting_names = [f'setting_{k}' for k in (1, 2, 3)]
sensor_names = [f's_{k}' for k in range(1, 22)]
col_names = [*index_names, *setting_names, *sensor_names]

In [46]:
# All three files are read from one directory constant so the path casing cannot
# drift between reads (the original mixed 'CMAPSSData' and 'CMAPSSDATA', which
# only resolves on case-insensitive filesystems).
# NOTE(review): 'CMAPSSData' is taken from the train/test paths — confirm it
# matches the directory name on disk.
DATA_DIR = '../backend/CMAPSSData'

# The separator is a regex; it must be a raw string, otherwise '\s' is an invalid
# Python escape sequence and triggers a warning on recent interpreters.
read_opts = dict(sep=r'\s+', header=None, index_col=False)

train = pd.read_csv(f'{DATA_DIR}/train_FD001.txt', names=col_names, **read_opts)
test = pd.read_csv(f'{DATA_DIR}/test_FD001.txt', names=col_names, **read_opts)
rul = pd.read_csv(f'{DATA_DIR}/RUL_FD001.txt', names=['RUL'], **read_opts)

In [4]:
def add_RUL_column(df):
    """Return a copy of ``df`` with a ``RUL`` column appended.

    For every row, ``RUL`` is the largest ``time_cycles`` value observed for that
    row's ``unit_number`` minus the row's own ``time_cycles``. The input frame
    is not modified.
    """
    # Highest cycle count reached by each unit, indexed by unit_number.
    last_cycle = (
        df.groupby('unit_number')['time_cycles']
        .max()
        .to_frame(name='max_time_cycle')
    )

    # Attach each unit's maximum to all of that unit's rows.
    with_last = df.merge(last_cycle, left_on='unit_number', right_index=True)

    # Remaining useful life = cycles left until the unit's final recorded cycle.
    with_last['RUL'] = with_last['max_time_cycle'] - with_last['time_cycles']

    # The helper column was only needed for the subtraction.
    return with_last.drop(columns='max_time_cycle')
# Rebind `train` to the frame returned by add_RUL_column, which carries the RUL target.
train = add_RUL_column(train)

In [5]:
# Target is the RUL column; every other column stays a feature, including the
# identifier columns unit_number and time_cycles.
# NOTE(review): the split below is row-wise and random, so cycles from the same
# unit can land in both the train and test parts — confirm this is intended,
# since it can make the hold-out scores optimistic.
y = train['RUL']
X = train.drop(columns='RUL')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [6]:
# Preprocessing reused by the models below: fill missing values with the column
# mean, then rescale each column with MinMaxScaler.
pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler()),
])

In [7]:
# Fit the imputer and scaler on the training features and display the transformed array.
pipeline.fit_transform(X_train)

array([[0.64646465, 0.27146814, 0.16091954, ..., 0.        , 0.41085271,
        0.67144061],
       [0.50505051, 0.07479224, 0.45977011, ..., 0.        , 0.72868217,
        0.66610503],
       [0.60606061, 0.2299169 , 0.7183908 , ..., 0.        , 0.63565891,
        0.57329402],
       ...,
       [0.26262626, 0.22437673, 0.59770115, ..., 0.        , 0.47286822,
        0.64855378],
       [0.04040404, 0.03601108, 0.5       , ..., 0.        , 0.72093023,
        0.6727043 ],
       [0.78787879, 0.11357341, 0.45402299, ..., 0.        , 0.79069767,
        0.71889919]])

In [8]:
# Baseline model: the shared preprocessing followed by a Lasso with a very small
# penalty (alpha = 1e-5).
lasso = Lasso(alpha=1e-5)
lasso_pipeline = Pipeline([
    ('preprocess', pipeline),
    ('model', lasso),
])

In [9]:
# Fit preprocessing + Lasso on the training split; binding the result to `_`
# keeps the cell from echoing the fitted pipeline.
_=lasso_pipeline.fit(X_train, y_train)

In [10]:
# Mean absolute error of the baseline Lasso pipeline on the held-out split.
preds = lasso_pipeline.predict(X_test)
mean_absolute_error(y_test, preds)

30.283605042554235

In [11]:
# Score of the fitted baseline pipeline on the held-out split.
lasso_pipeline.score(X_test, y_test)

0.6674849131377599

In [12]:
# Candidate penalties: 500 evenly spaced alphas between 1e-5 and 10.
# np.linspace takes the number of points as its third argument. The previous
# np.arange(0.00001, 10, 500) treated 500 as the step size and therefore
# produced the single value 1e-5, so the "search" only ever tried one alpha.
params = {'model__alpha': np.linspace(0.00001, 10, 500)}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 'model__alpha' addresses the alpha parameter of the pipeline step named 'model'.
lasso_cv = GridSearchCV(lasso_pipeline, param_grid=params, cv=kf)
_ = lasso_cv.fit(X_train, y_train)

In [13]:
# Report the best cross-validated score as-is. No `scoring` was passed to
# GridSearchCV, so this is the estimator's own score (R² for a regressor), which
# can legitimately be negative; wrapping it in abs() would hide that.
print('Best score:', lasso_cv.best_score_)

Best score: 0.6692788363419618


In [14]:
# Parameter setting that achieved the best cross-validated score in the 5-fold search.
print('Best alpha:', lasso_cv.best_params_)

Best alpha: {'model__alpha': 1e-05}


In [15]:
# Second search: 20 evenly spaced alphas between 1e-5 and 5, scored with 10 folds.
# np.linspace is used because the third argument is meant as a count; the previous
# np.arange(0.00001, 5, 20) used 20 as the step and yielded only the value 1e-5.
params = {'model__alpha': np.linspace(0.00001, 5, 20)}
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Rebinds `lasso_cv`; the print cells that follow report on this 10-fold search.
lasso_cv = GridSearchCV(lasso_pipeline, param_grid=params, cv=kf)
_ = lasso_cv.fit(X_train, y_train)

In [16]:
# Report the best cross-validated score with its sign intact. No `scoring` was
# passed to GridSearchCV, so this is the estimator's own score (R² for a
# regressor); abs() would turn a negative, worse-than-constant score into a
# positive number.
print('Best score:', lasso_cv.best_score_)

Best score: 0.6693449209686653


In [17]:
# Parameter setting that achieved the best cross-validated score in the 10-fold search.
print('Best alpha:', lasso_cv.best_params_)

Best alpha: {'model__alpha': 1e-05}


In [18]:
class LassoFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns a Lasso fit gives a non-zero weight.

    Parameters
    ----------
    alpha : float, default=0.1
        Penalty strength handed to ``Lasso(alpha=...)`` each time ``fit`` runs.

    Attributes
    ----------
    lasso : Lasso
        The Lasso model built and fitted by the most recent ``fit`` call.
    selected_features : ndarray of int or None
        Positional indices of the columns whose coefficient is non-zero;
        ``None`` until ``fit`` has been called.
    """

    def __init__(self, alpha=0.1):
        # Only the hyper-parameter is stored here. The Lasso is created in fit so
        # that a later set_params(alpha=...) (e.g. from a grid search) is honoured
        # instead of being ignored by a model built at construction time.
        self.alpha = alpha
        self.selected_features = None

    def fit(self, X, y=None):
        """Fit a Lasso on ``(X, y)`` and record which columns it kept.

        Returns ``self`` so the call can be chained.
        """
        self.lasso = Lasso(alpha=self.alpha)
        self.lasso.fit(X, y)
        self.selected_features = np.flatnonzero(self.lasso.coef_)
        return self

    def transform(self, X):
        """Return ``X`` restricted to the selected columns.

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """
        if self.selected_features is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "LassoFeatureSelector must be fitted before calling transform."
            )
        # DataFrames need positional .iloc indexing; arrays take the tuple index.
        if hasattr(X, "iloc"):
            return X.iloc[:, self.selected_features]
        return X[:, self.selected_features]

In [19]:
# Feature-selection pipeline: the shared preprocessing, then the Lasso-based
# column selector with alpha = 0.1.
fs_lasso = Pipeline(steps=[
    ('preprocess', pipeline),
    ('lasso_selector', LassoFeatureSelector(alpha=0.1)),
])

In [20]:
# Fit preprocessing and the Lasso selector on the training split.
fs_lasso.fit(X_train,y_train)

In [21]:
# Preprocessed training features restricted to the columns the selector kept.
X_selected = fs_lasso.transform(X_train)

In [22]:
# Indices are positions in the column order of X, i.e. col_names: position 0 is
# unit_number and position 1 is time_cycles.
# NOTE(review): both identifier columns appear in the recorded selection — confirm
# that using them as predictors is intended.
print("Selected feature indices:", fs_lasso.named_steps['lasso_selector'].selected_features)

Selected feature indices: [ 0  1  3  6  7  8 10 11 12 13 15 16 19 21 24 25]


In [23]:
# (rows, kept columns) of the selected training matrix.
print("Shape of X_selected:", X_selected.shape)

Shape of X_selected: (14441, 16)


In [24]:
# Display the selected training matrix.
X_selected

array([[0.64646465, 0.27146814, 0.75      , ..., 0.36363636, 0.41085271,
        0.67144061],
       [0.50505051, 0.07479224, 0.16666667, ..., 0.27272727, 0.72868217,
        0.66610503],
       [0.60606061, 0.2299169 , 0.25      , ..., 0.18181818, 0.63565891,
        0.57329402],
       ...,
       [0.26262626, 0.22437673, 0.5       , ..., 0.27272727, 0.47286822,
        0.64855378],
       [0.04040404, 0.03601108, 0.83333333, ..., 0.36363636, 0.72093023,
        0.6727043 ],
       [0.78787879, 0.11357341, 0.66666667, ..., 0.18181818, 0.79069767,
        0.71889919]])

In [25]:
class Linear_Regression() :
    """Linear regression fitted by full-batch gradient descent on the mean squared error.

    Parameters
    ----------
    lr : float, default=0.01
        Step size applied to every gradient update.
    iterations : int, default=150
        Number of gradient updates performed by ``fit``.

    After ``fit`` the instance exposes ``W`` (one weight per column), ``b``
    (intercept), ``l`` / ``p`` (number of rows / columns) and the training
    arrays ``X`` and ``Y``.
    """
    def __init__( self, lr=0.01, iterations=150 ) :
        self.lr = lr
        self.iterations = iterations

    def fit( self, X, Y ) :
        """Learn ``W`` and ``b`` from ``X`` (2-D array-like) and ``Y`` (one target per row).

        Raises ``ValueError`` if ``X`` is not a non-empty 2-D array or if ``Y``
        does not hold exactly one value per row of ``X``.
        """
        # Coerce to float arrays so lists and pandas objects are handled positionally.
        X = np.asarray( X, dtype=float )
        # Flatten the target: an (n, 1) column would otherwise broadcast against the
        # (n,) predictions into an (n, n) residual and silently corrupt the weights.
        Y = np.asarray( Y, dtype=float ).ravel()
        if X.ndim != 2 :
            raise ValueError( "X must be 2-dimensional, got {} dimension(s)".format( X.ndim ) )
        if X.shape[0] == 0 :
            raise ValueError( "X must contain at least one row" )
        if Y.shape[0] != X.shape[0] :
            raise ValueError(
                "X has {} rows but Y has {} values".format( X.shape[0], Y.shape[0] )
            )

        self.l, self.p = X.shape
        # weights and intercept start at zero
        self.W = np.zeros( self.p )
        self.b = 0
        self.X = X
        self.Y = Y
        # gradient descent
        for _ in range( self.iterations ) :
            self.weight_updater()
        return self

    def weight_updater( self ) :
        """Apply one gradient step to ``W`` and ``b`` using the stored training data."""
        residual = self.Y - self.predict( self.X )
        # gradients of the mean squared error with respect to W and b
        dW = - ( 2 * ( self.X.T ).dot( residual ) ) / self.l
        db = - 2 * np.sum( residual ) / self.l
        # step against the gradient
        self.b = self.b - self.lr * db
        self.W = self.W - self.lr * dW
        return self

    def predict( self, X ) :
        """Return ``X.W + b`` for the rows of ``X`` (2-D array-like)."""
        return np.asarray( X, dtype=float ).dot( self.W ) + self.b

In [26]:
# Apply the fitted preprocessing and column selection to the held-out features.
X_test_selected = fs_lasso.transform(X_test)
print(X_test_selected.shape) 

(6190, 16)


In [29]:
LR_LASSO = Pipeline([
    ('lasso', fs_lasso),
    ('linear_regression', Linear_Regression()),
])

In [30]:
LR_LASSO.fit(X_selected, y_train)

In [31]:
def evaluate(y_true, y_hat, label='test'):
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_hat)
    variance = r2_score(y_true, y_hat)
    print('{} set RMSE:{}, R2:{}, MAE:{}'.format(label, rmse, variance, mae))

In [32]:
pred_train = LR_LASSO.predict(X_selected)
evaluate(y_train, pred_train, label ='train')

pred_test = LR_LASSO.predict(X_test_selected)
evaluate(y_test, pred_test)

train set RMSE:54.02852368194552, R2:0.3936269568461468, MAE:43.79993586908403
test set RMSE:52.84040037896107, R2:0.39041933202171253, MAE:42.736155764384385


In [35]:
# Selection followed by an RBF-kernel support vector regressor.
# fs_lasso is passed by reference, so fitting this pipeline refits that same
# selector object in place.
SVR_LASSO = Pipeline([
    ('lasso', fs_lasso),
    ('SVR', SVR(kernel = 'rbf')),
])


In [36]:
# Fit selection + SVR on the raw training features.
SVR_LASSO.fit(X_train, y_train)

In [38]:
# Report the SVR pipeline's metrics on the training and held-out splits.
pred_train = SVR_LASSO.predict(X_train)
evaluate(y_train, pred_train, label ='train')

pred_test = SVR_LASSO.predict(X_test)
evaluate(y_test, pred_test)

train set RMSE:37.915833467520244, R2:0.7013689928617147, MAE:26.479725810311884
test set RMSE:36.96674898824343, R2:0.7016531180913639, MAE:26.16304275524355


In [40]:
# Selection followed by a random forest; random_state fixes the forest's seed.
# RandomForestRegressor is already imported in the notebook's import cell, so the
# duplicate import that used to sit here was dropped.
# fs_lasso is passed by reference, so fitting this pipeline refits that same
# selector object in place.
RF_LASSO = Pipeline([
    ('lasso', fs_lasso),
    ('random_forest', RandomForestRegressor(random_state=0)),
])


In [41]:
# Fit selection + random forest on the raw training features.
RF_LASSO.fit(X_train, y_train)

In [42]:
# Report the random-forest pipeline's metrics on the training and held-out splits.
pred_train = RF_LASSO.predict(X_train)
evaluate(y_train, pred_train, label ='train')

pred_test = RF_LASSO.predict(X_test)
evaluate(y_test, pred_test)

train set RMSE:6.582093620544312, R2:0.9910004292581837, MAE:4.5103496987743235
test set RMSE:17.438801328562114, R2:0.9336054783696113, MAE:11.916547657512115


In [51]:
# One row per test unit holding the last value of every column for that unit;
# reset_index turns unit_number back into a regular column.
X_valid = test.groupby('unit_number').last().reset_index()

In [48]:
# Identifier and operating-setting column names.
# NOTE(review): drop_labels is not referenced by any other visible cell — confirm
# whether these columns were meant to be dropped from the features.
drop_labels = index_names+setting_names

In [53]:
# Score the linear-regression pipeline on the per-unit test rows against the
# values loaded from RUL_FD001.txt.
# NOTE(review): assumes row i of `rul` belongs to the i-th unit in X_valid — confirm.
new_pred_LR = LR_LASSO.predict(X_valid)
evaluate(rul, new_pred_LR, label = 'valid')

valid set RMSE:37.470499003870856, R2:0.18694596534730412, MAE:30.657103697225082


In [55]:
# Score the SVR pipeline on the per-unit test rows against the loaded RUL values.
new_pred_SVR = SVR_LASSO.predict(X_valid)
evaluate(rul, new_pred_SVR, label = 'valid')

valid set RMSE:26.03541309896346, R2:0.6074731205913415, MAE:19.835788508675343


In [56]:
# Score the random-forest pipeline on the per-unit test rows against the loaded RUL values.
new_pred_RF = RF_LASSO.predict(X_valid)
evaluate(rul, new_pred_RF, label = 'valid')

valid set RMSE:34.51441038175214, R2:0.3101711188847146, MAE:23.966400000000004
