# Feature Selection Examples

This notebook provides a few examples of how to do feature selection with sklearn.

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import pprint
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
from data_util import *

cars = pd.read_csv('./data/mtcars.csv')
data_x = cars[list(cars)[1:]]
data_y = cars[list(cars)[0]]

### 1. Split into Training and Test Sets

In [6]:
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, 
                                                   random_state=4)

### 2. Build Base Model

In [7]:
base_model = linear_model.LinearRegression()
base_model.fit(x_train, y_train)
preds = base_model.predict(x_test)
print_regression_error_report(y_test, preds)

MSE, MAE, R^2, EVS: [24.51493293197121, 4.3870307737591006, 0.634381669142229, 0.7656962491586179]


### 3. Use Percentile-Based Feature Selection

In [12]:
# Get top 25% features by F-score
selector_f = SelectPercentile(f_regression, percentile=25)
selector_f.fit(x_train, y_train)

# Print the f-score and p-value info
for name, score, pv in zip(list(cars), selector_f.scores_, selector_f.pvalues_):
    print('F-score, p-value (' + name + '): ' + str(score) + ', ' + str(pv))
    


F-score, p-value (mpg): 72.60897277172427, 1.4285485646903708e-08
F-score, p-value (cyl): 52.37926936274743, 2.288405556460013e-07
F-score, p-value (disp): 28.728860103230335, 1.918656593805583e-05
F-score, p-value (hp): 14.982418395530182, 0.0007751208908700202
F-score, p-value (drat): 95.9791939506271, 1.1242991837306483e-09
F-score, p-value (wt): 3.0574333594941234, 0.09370523006136659
F-score, p-value (qsec): 12.843088079770348, 0.0015714445870867975
F-score, p-value (vs): 8.266762931256759, 0.008549929449842912
F-score, p-value (am): 6.894741767136821, 0.01511038230276851
F-score, p-value (gear): 5.8645189056937435, 0.023739272527108935


In [15]:
# Get the columns of the best 25% features
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)

# Create a model that uses these 25% best features only
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)

# Make predictions and look at results
preds = model.predict(xt_test)
print_regression_error_report(preds, y_test)

MSE, MAE, R^2, EVS: [19.988618207227002, 4.1125419122460265, 0.6673504425122332, 0.7324619718569146]


### 4. Use K-Best Feature Selection

In [16]:
selector_f = SelectKBest(f_regression, k=3) # Select the top 3 features
selector_f.fit(x_train, y_train)

# Get the columns of the best 3 features
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)

# Create a model that uses these 3 best features only
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)

# Make predictions and look at results
preds = model.predict(xt_test)
print_regression_error_report(preds, y_test)

MSE, MAE, R^2, EVS: [19.988618207227002, 4.1125419122460265, 0.6673504425122332, 0.7324619718569146]


### 5. Use Recursive Feature Elimination with Cross Validation

In [18]:
# Use RFECV to arrive at approximately the best set of features. RFECV is a greedy method.
selector_f = RFECV(estimator=linear_model.LinearRegression(), 
                   scoring=make_scorer(r2_score),
                   cv=5) 
selector_f.fit(x_train, y_train)

# Get the columns of the best 3 features
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)

# Create a model that uses these 3 best features only
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)

# Make predictions and look at results
preds = model.predict(xt_test)
print_regression_error_report(preds, y_test)

MSE, MAE, R^2, EVS: [20.98193480610998, 4.632496505789522, 0.6879483251172415, 0.7495733392019129]
