# Machine Learning Pipeline - Feature Selection

In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to build the models
from sklearn.linear_model import Lasso, LinearRegression   ## reducir caracteristicas
from sklearn.feature_selection import SelectFromModel

from lightgbm import LGBMRegressor

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [2]:
X_train = pd.read_csv('./Output/xtrain_coches.csv')
X_test = pd.read_csv('./Output/xtest_coches.csv')

X_test.head()

Unnamed: 0,model,year,motor_type,running,wheel,color,type,status,motor_volume
0,-0.667066,-0.657434,-2.808668,-0.509656,0.0,0.327037,-3.35376,-0.458448,-0.11775
1,0.069299,0.6584,-2.808668,0.771572,0.0,0.327037,-3.35376,0.961108,-0.11775
2,0.805664,-0.074489,-2.808668,0.161087,0.0,-0.294228,-2.24485,-0.458448,1.78184
3,1.542029,-0.657434,-2.808668,-0.106913,0.0,-0.915493,-3.35376,-0.458448,-0.11775
4,0.805664,-0.074489,-2.808668,0.000308,0.0,-1.070809,-5.571582,-0.458448,1.78184


In [3]:
y_train = pd.read_csv('./Output/ytrain_coches.csv')
y_test = pd.read_csv('./Output/ytest_coches.csv')

y_train.head()

Unnamed: 0,price
0,9.093807
1,9.729134
2,8.455318
3,9.769956
4,9.740969


### Feature Selection

In [4]:
sel_lasso = SelectFromModel(Lasso(alpha=0.001, random_state=0))


sel_lasso.fit(X_train, y_train)

In [5]:
selected_feats_lasso = X_train.columns[(sel_lasso.get_support())]

print('Número de Features en Total: {}'.format((X_train.shape[1])))
print('Número de Features Seleccionados: {}'.format(len(selected_feats_lasso)))
print('Características con coeficientes reducidos a cero: {}'.format(
    np.sum(sel_lasso.estimator_.coef_ == 0)))

selected_feats_lasso

Número de Features en Total: 9
Número de Features Seleccionados: 8
Características con coeficientes reducidos a cero: 1


Index(['model', 'year', 'motor_type', 'running', 'color', 'type', 'status',
       'motor_volume'],
      dtype='object')

In [9]:
pd.Series(selected_feats_lasso).to_csv('./Output/selected_features_coches.csv', index=False)

In [6]:
sel_LGBMRegressor = SelectFromModel(LGBMRegressor())

sel_LGBMRegressor.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 341
[LightGBM] [Info] Number of data points in the train set: 1477, number of used features: 8
[LightGBM] [Info] Start training from score 9.572343


In [None]:
selected_feats_LGBMRegressor = X_train.columns[(sel_LGBMRegressor.get_support())]

print('Número de Features en Total: {}'.format((X_train.shape[1])))
print('Número de Features Seleccionados: {}'.format(len(selected_feats_LGBMRegressor)))

selected_feats_LGBMRegressor

Número de Features en Total: 9
Número de Features Seleccionados: 2


Index(['year', 'running'], dtype='object')

In [None]:
sel_lregression = SelectFromModel(LinearRegression())

sel_lregression.fit(X_train, y_train)

In [13]:
sel_lasso.get_support().sum()

8

In [14]:
sel_lregression.get_support().sum()

2

In [6]:
sel_LGBMRegressor.get_support().sum()

2

Index(['model', 'year'], dtype='object')