# Feature Selection

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from error_metrics import print_reg_error_metrics
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer

cars = pd.read_csv('./data/cars.csv')
data_x = cars[list(cars)[1:]]
data_y = cars['mpg']
#create training and test splits
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y,test_size = 0.2, random_state = 4)

In [2]:
base_model = linear_model.LinearRegression()
base_model.fit(x_train, y_train)
preds = base_model.predict(x_test)
pd.DataFrame({'Actual':y_test,'Predicted':preds})#bruh
print_reg_error_metrics(y_test, preds)

MSE, MAE, R^2, EVS: [24.51493293197121, 4.3870307737591006, 0.636872671247911, 0.767292589077751]


## Use F-Selection (top 45%)

In [3]:
selector_f = SelectPercentile(f_regression, percentile=45)
selector_f.fit(x_train, y_train)
for name, score, pv in zip(list(cars), selector_f.scores_, selector_f.pvalues_):
    print('F-score, p-value, (' + name + '):' + str(pv))

F-score, p-value, (mpg):1.4285485646903708e-08
F-score, p-value, (cyl):2.288405556460013e-07
F-score, p-value, (disp):1.918656593805583e-05
F-score, p-value, (hp):0.0007751208908700202
F-score, p-value, (drat):1.1242991837306483e-09
F-score, p-value, (wt):0.09370523006136659
F-score, p-value, (qsec):0.0015714445870867975
F-score, p-value, (vs):0.008549929449842912
F-score, p-value, (am):0.01511038230276851
F-score, p-value, (gear):0.023739272527108935


In [4]:
#get the features-selected data
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)
preds = model.predict(xt_test)
print_reg_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test,'Predicted':preds})#bruh

MSE, MAE, R^2, EVS: [19.846641819159014, 3.923571746993119, 0.6850648293222865, 0.7543657939488126]


Unnamed: 0,Actual,Predicted
20,21.5,25.423572
15,10.4,9.409473
17,32.4,26.361547
2,22.8,25.917334
11,16.4,13.31324
19,33.9,27.901512
16,14.7,9.144796


### F-Selection (K-best, k=3)

In [5]:
#create a feature selecter that uses the top 3 features by the F metric.
selector_f = SelectKBest(f_regression, k=3)
selector_f.fit(x_train, y_train)

SelectKBest(k=3, score_func=<function f_regression at 0x0000020FF029D7B8>)

In [6]:
#get the features-selected data
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)
preds = model.predict(xt_test)
print_reg_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test,'Predicted':preds})#bruh

MSE, MAE, R^2, EVS: [19.988618207227002, 4.1125419122460265, 0.6673504425122332, 0.7324619718569146]


Unnamed: 0,Actual,Predicted
20,21.5,25.612542
15,10.4,9.764495
17,32.4,26.195267
2,22.8,26.073464
11,16.4,13.10984
19,33.9,27.663263
16,14.7,9.835112


## Recursive Feature Elimination with Cross-Validation
### (remember K-fold)

In [8]:
#Build the RFECV selector with 5-fold CV and the R-squared score as the error metric
selector_f = RFECV(estimator=linear_model.LinearRegression(), cv=5, scoring=make_scorer(r2_score))
selector_f.fit(x_train, y_train)

#get the features-selected data
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)
preds = model.predict(xt_test)
print_reg_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test,'Predicted':preds})#bruh

MSE, MAE, R^2, EVS: [20.98193480610998, 4.632496505789522, 0.6879483251172415, 0.7495733392019129]


Unnamed: 0,Actual,Predicted
20,21.5,26.132497
15,10.4,8.407512
17,32.4,26.437259
2,22.8,25.950664
11,16.4,14.27784
19,33.9,27.91732
16,14.7,8.727854
