# Feature Selection II - Selecting for Model Accuracy

In [1]:
import pandas as pd
ansur_female = pd.read_csv('datasets/ANSUR_II_FEMALE.csv')
ansur_male = pd.read_csv('datasets/ANSUR_II_MALE.csv')

ansur = pd.concat([ansur_female,ansur_male])
ansur.shape

(6068, 99)

In [2]:
ansur_filtered = ansur[['Gender','chestdepth','handlength',
                        'neckcircumference','shoulderlength',
                       'earlength']]
ansur_filtered.head()

Unnamed: 0,Gender,chestdepth,handlength,neckcircumference,shoulderlength,earlength
0,Female,245,184,335,148,65
1,Female,206,189,302,142,60
2,Female,223,195,325,164,65
3,Female,285,186,357,157,62
4,Female,290,187,340,156,65


In [3]:
# Features
X = ansur_filtered.drop('Gender',axis=1)

# Target
y = ansur_filtered['Gender']

In [4]:
# Pre-processing the data

# Split into train-test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Standardize the data, mean=0,variance=1
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

In [5]:
# Creating a logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create Logistic regression model and fit using standardized data
lr = LogisticRegression()
lr.fit(X_train_std, y_train)

# Calculate accuarcy of the model 
X_test_std = scaler.transform(X_test) #<-- first standardized

y_pred = lr.predict(X_test_std)
print(accuracy_score(y_test,y_pred))

0.9906644700713894


In [6]:
# Inspecting the feature coefficients
print(lr.coef_)

[[-3.08469117  0.02544785  7.66460597  1.33982551  0.67397921]]


**Note:** These coefficients will be multiplied with the feature values when the model makes a prediction, features with coefficients close to zero will contribute little to the end result.

In [7]:
print(dict(zip(X.columns, abs(lr.coef_[0]))))

{'chestdepth': 3.0846911680204783, 'handlength': 0.02544785454550054, 'neckcircumference': 7.664605966495018, 'shoulderlength': 1.3398255121268206, 'earlength': 0.6739792057328612}


In [8]:
# Dropping feature that contribute little to the model
X.drop('handlength',axis=1, inplace=True)

# Recalculating the accuarcy after dropping a feature
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

lr = LogisticRegression()
lr.fit(scaler.fit_transform(X_train), y_train)

print(accuracy_score(y_test, lr.predict(scaler.transform(X_test))))

0.9879187259747392


Thus, increased accuracy and decreased model complexity. To repeat this step recursively we have RFE (Recursive Feature Elimination).

## Recursive Feature Elimination
- Feature selection algorithm that can be wrapped around any model that produces feature coefficients or feature importance values.
- We can pass it the model we want to use and the number of features we want to select. 
-  While fitting to our data it will repeat a process where it first fits the internal model and then drops the feature with the weakest coefficient.
-  It will keep doing this until the desired number of features is reached.

In [9]:
# Features
X = ansur_filtered.drop('Gender',axis=1)

# Target
y = ansur_filtered['Gender']

In [10]:
# Pre-processing the data

# Split into train-test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Standardize the data, mean=0,variance=1
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

In [11]:
# RFE
from sklearn.feature_selection import RFE

# Instantiate RFE
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=2, verbose=1)

# Fit the model
rfe.fit(X_train_std, y_train)

Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.


In [12]:
# Inspecting the RFE results
X.columns[rfe.support_]

Index(['chestdepth', 'neckcircumference'], dtype='object')

In [13]:
# See in which iteration a feature was dropped
dict(zip(X.columns, rfe.ranking_))

{'chestdepth': 1,
 'handlength': 4,
 'neckcircumference': 1,
 'shoulderlength': 2,
 'earlength': 3}

In [14]:
# Check accuracy from the two remaining feature
X_test_std = scaler.transform(X_test)

accuracy_score(y_test, rfe.predict(X_test_std))

0.9824272377814388

# Tree-based feature selection

![image-3](image-3.png)

Random Forest is one of such models that performs feature selection by design to avoid overfitting. 
- It pass different, random, subsets of features to a number of decision trees.

In [15]:
import pandas as pd
ansur_female = pd.read_csv('datasets/ANSUR_II_FEMALE.csv')
ansur_male = pd.read_csv('datasets/ANSUR_II_MALE.csv')

ansur = pd.concat([ansur_female,ansur_male])
ansur.shape

(6068, 99)

In [16]:
ansur.drop(['Branch','Component','BMI_class','Height_class'],
          axis=1,
          inplace=True)

In [17]:
# Features
X = ansur.drop('Gender',axis=1)

# Target
y = ansur['Gender']

In [18]:
# Split into train-test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [19]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier()

rf.fit(X_train, y_train)

print(accuracy_score(y_test, rf.predict(X_test)))

0.9906644700713894


Being able to get such high accuracy means it managed to escape the curse of dimensionality and didn't overfit on the many features in the training set.

In [20]:
# Feature importance values
rf.feature_importances_

array([0.00158209, 0.00078468, 0.00060322, 0.0007228 , 0.00077227,
       0.01361131, 0.01930108, 0.08249253, 0.00316029, 0.0073802 ,
       0.01569823, 0.02589116, 0.00061053, 0.00968738, 0.00149896,
       0.01546283, 0.00245461, 0.00064102, 0.00224862, 0.00174543,
       0.00137302, 0.02060868, 0.0007312 , 0.00114741, 0.00752685,
       0.03379426, 0.0004317 , 0.00570454, 0.00074169, 0.00092596,
       0.00115428, 0.00244007, 0.00043242, 0.00320249, 0.00134764,
       0.00765255, 0.00069702, 0.05595191, 0.00744972, 0.00075592,
       0.00072762, 0.03614581, 0.08194956, 0.00069755, 0.00129565,
       0.00099696, 0.00070611, 0.01995327, 0.00045674, 0.0131407 ,
       0.02666373, 0.00090233, 0.00061568, 0.00307083, 0.00414214,
       0.00059269, 0.00033574, 0.00088505, 0.00549695, 0.00599081,
       0.00124363, 0.10899823, 0.05932448, 0.00517493, 0.00028992,
       0.00551621, 0.00509178, 0.09318428, 0.00797017, 0.00087411,
       0.00565232, 0.02054258, 0.0006657 , 0.00143432, 0.00907

An advantage of these feature importance values over coefficients is that they are comparable between features by default, since they always sum up to one.

In [21]:
# Feature importance as a feature selector
mask = rf.feature_importances_ > 0.1
print(mask)

X_reduced = X.loc[:,mask]
print(X_reduced.columns)

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False]
Index(['neckcircumference'], dtype='object')


In [22]:
# RFE with random forests
from sklearn.feature_selection import RFE

# Model instantiate
rfe = RFE(estimator=RandomForestClassifier(),
         n_features_to_select = 6,
         step = 10, #<-- at each step 10 least imp. features are dropped
         verbose = 1)

# Fit the model
rfe.fit(X_train, y_train)

# Remaining features
print(X.columns[rfe.support_])

Fitting estimator with 94 features.
Fitting estimator with 84 features.
Fitting estimator with 74 features.
Fitting estimator with 64 features.
Fitting estimator with 54 features.
Fitting estimator with 44 features.
Fitting estimator with 34 features.
Fitting estimator with 24 features.
Fitting estimator with 14 features.
Index(['biacromialbreadth', 'handcircumference', 'neckcircumference',
       'neckcircumferencebase', 'shouldercircumference', 'wristcircumference'],
      dtype='object')


In [23]:
# accuracy calculation
accuracy_score(y_test, rfe.predict(X_test))

0.9654036243822076

# Regularized linear regression
![image-4](image-4.png)


In [24]:
ansur_male.drop(['Branch','Component','BMI_class','Height_class','Gender'],
          axis=1,
          inplace=True)

In [25]:
ansur_male.shape

(4082, 94)

In [26]:
X = ansur_male.drop('BMI',axis=1)
y = ansur_male['BMI']

In [27]:
from sklearn.linear_model import Lasso

# Set the test size to 30% to get a 70-30% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the scaler on the training features and transform these in one go
X_train_std = scaler.fit_transform(X_train)

# Create the Lasso model
la = Lasso()

# Fit it to the standardized training data
la.fit(X_train_std, y_train)

In [28]:
# Transform the test set with the pre-fitted scaler
X_test_std = scaler.transform(X_test)

# Calculate the coefficient of determination (R squared) on X_test_std
r_squared = la.score(X_test_std, y_test)
print(f"The model can predict {r_squared:.1%} of the variance in the test set.")

The model can predict 84.7% of the variance in the test set.


In [29]:
# Feature coeff. (higher the value, higher the importance of the feature)
la.coef_

array([ 0.09198472, -0.        , -0.        ,  0.        , -0.        ,
        0.        , -0.        ,  0.        ,  0.26349677,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.30062148, -0.        , -0.        , -0.        ,
        0.06019966, -0.        ,  0.        ,  0.84028524,  0.        ,
       -0.        , -0.        ,  0.        , -0.        ,  0.        ,
        0.        , -0.        ,  0.        , -0.        ,  0.        ,
       -0.        , -0.        ,  0.        ,  0.        , -0.        ,
       -0.        ,  0.        ,  0.        , -0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.        ,  0.        ,  0.        ,  0.        ,
       -0.        , -0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.01276255,  0.        , -0.        , -0.        ,
       -0.        , -0.        ,  0.        , -0.        , -0.  

In [30]:
# Create a list that has True values when coefficients equal 0
zero_coef = la.coef_ == 0 
print(zero_coef)

# Calculate how many features have a zero coefficient
n_ignored = sum(zero_coef)
print(f"The model has ignored {n_ignored} out of {len(la.coef_)} features.")

[False  True  True  True  True  True  True  True False  True  True  True
  True  True  True  True False  True  True  True False  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True False False  True  True  True  True  True  True]
The model has ignored 84 out of 93 features.


In [31]:
# Lasso Regression with alpha parameter, which shrinks least important features but doen't peanlize the important feature much
la = Lasso(alpha=0.1, random_state=0)

# Fits the model and calculates performance stats
la.fit(X_train_std, y_train)
r_squared = la.score(X_test_std, y_test)
n_ignored_features = sum(la.coef_ == 0)

# Print peformance stats 
print(f"The model can predict {r_squared:.1%} of the variance in the test set.")
print(f"{n_ignored_features} out of {len(la.coef_)} features were ignored.")

The model can predict 99.2% of the variance in the test set.
75 out of 93 features were ignored.


# Creating a LassoCV regressor
- `LassoCV()` regressor automatically tunes the regularization strength (alpha value) using Cross-Validation.

In [32]:
from sklearn.linear_model import LassoCV

# Create and fit the LassoCV model on the training set
lcv = LassoCV()
lcv.fit(X_train, y_train)
print(f'Optimal alpha = {lcv.alpha_:.3f}')

# Calculate R squared on the test set
r_squared = lcv.score(X_test, y_test)
print(f'The model explains {r_squared:.1%} of the test set variance')

# Create a mask for coefficients not equal to zero
lcv_mask = (lcv.coef_ != 0)
print(f'{sum(lcv_mask)} features out of {len(lcv_mask)} selected')

Optimal alpha = 0.404
The model explains 99.1% of the test set variance
38 features out of 93 selected


# Combining feature selectors

### Using GradientBoostingRegressor

In [33]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor

# Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step
rfe_gb = RFE(estimator=GradientBoostingRegressor(), 
             n_features_to_select=10, step=3, verbose=1)
rfe_gb.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test,y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set')

Fitting estimator with 93 features.
Fitting estimator with 90 features.
Fitting estimator with 87 features.
Fitting estimator with 84 features.
Fitting estimator with 81 features.
Fitting estimator with 78 features.
Fitting estimator with 75 features.
Fitting estimator with 72 features.
Fitting estimator with 69 features.
Fitting estimator with 66 features.
Fitting estimator with 63 features.
Fitting estimator with 60 features.
Fitting estimator with 57 features.
Fitting estimator with 54 features.
Fitting estimator with 51 features.
Fitting estimator with 48 features.
Fitting estimator with 45 features.
Fitting estimator with 42 features.
Fitting estimator with 39 features.
Fitting estimator with 36 features.
Fitting estimator with 33 features.
Fitting estimator with 30 features.
Fitting estimator with 27 features.
Fitting estimator with 24 features.
Fitting estimator with 21 features.
Fitting estimator with 18 features.
Fitting estimator with 15 features.
Fitting estimator with 12 fe

In [37]:
# Mask --> True:The corresponding feature is selected, otherwise not selected
gb_mask = rfe_gb.support_

In [38]:
# Inspecting RFE result in GradientBoostingRegressor 
X.columns[rfe_gb.support_]

Index(['bicepscircumferenceflexed', 'buttockcircumference', 'buttockdepth',
       'calfcircumference', 'chestcircumference', 'neckcircumference',
       'poplitealheight', 'thighcircumference', 'waistcircumference',
       'stature_m'],
      dtype='object')

### Using RandomForestRegressor

In [39]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

# Select 10 features with RFE on a RandomForestRegressor, drop 3 features on each step
rfe_rf = RFE(estimator=RandomForestRegressor(), 
             n_features_to_select=10, step=3, verbose=1)
rfe_rf.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_rf.score(X_test, y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set')

# Assign the support array to rf_mask
rf_mask = rfe_rf.support_

In [40]:
# Inspecting RFE result in RandomForestRegressor
X.columns[rfe_rf.support_]

Index(['axillaheight', 'bicepscircumferenceflexed', 'buttockdepth',
       'chestcircumference', 'poplitealheight', 'thighcircumference',
       'waistcircumference', 'waistheightomphalion', 'weight_kg', 'stature_m'],
      dtype='object')

## Combining 3 feature selectors
- Combining the votes of the 3 models built earlier, to decide which features are important into a meta mask. We'll then use this mask to reduce dimensionality and see how a simple linear regressor performs on the reduced dataset.

In [45]:
import numpy as np

# Sum the votes of the three models
votes = np.sum([lcv_mask, rf_mask, gb_mask],axis=0)
votes

array([1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 2, 3, 1, 0, 0, 2, 1,
       0, 3, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 3,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 3, 1, 1, 1, 0, 0, 1, 0, 0, 3, 1, 0,
       2, 0, 0, 2, 2])

In [46]:
# Create a mask for features selected by all 3 models
meta_mask = votes == 3
meta_mask

array([False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False])

In [48]:
# Apply the dimensionality reduction on X
X_reduced = X.iloc[:,meta_mask]
print(X_reduced.columns)

Index(['bicepscircumferenceflexed', 'buttockdepth', 'chestcircumference',
       'poplitealheight', 'thighcircumference', 'waistcircumference'],
      dtype='object')


In [50]:
from sklearn.linear_model import LinearRegression

# Instantiate the model
lm = LinearRegression()

# Plug the reduced dataset into a linear regression pipeline
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=0)
lm.fit(scaler.fit_transform(X_train), y_train)
r_squared = lm.score(scaler.transform(X_test), y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set using {len(lm.coef_)} features.')

The model can explain 95.7% of the variance in the test set using 6 features.
