In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import  RFECV
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from error_metrics import print_regression_error_metrics

# Read in and set the data.
cars = pd.read_csv('./data/cars.csv')
data_x = cars[list(cars)[1:]]
data_y = cars[list(cars)[0]]

print(list(data_x))

# Create training and test splits.
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=4)

['cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']


In [19]:
base_model = linear_model.LinearRegression()
base_model.fit(x_train, y_train)
preds = base_model.predict(x_test)
print_regression_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test, 'Predicted':preds})


MSE, MAE, R^2, EVS: [24.514932931971277, 4.3870307737591077, 0.63687267124791014, 0.76729258907775111]


Unnamed: 0,Actual,Predicted
20,21.5,25.161462
15,10.4,8.551049
17,32.4,25.465516
2,22.8,24.508777
11,16.4,12.012969
19,33.9,27.86578
16,14.7,7.763519


### Use f-selection (top 45%)

In [20]:
selector_f = SelectPercentile(f_regression, percentile=45)
selector_f.fit(x_train, y_train)
for name, score, pv in zip(list(cars), selector_f.scores_, selector_f.pvalues_):
    print('F-score, p-value, (' + name + '):' + str(score) + ", " + str(pv))

F-score, p-value, (mpg):72.6089727717, 1.42854856469e-08
F-score, p-value, (cyl):52.3792693627, 2.28840555646e-07
F-score, p-value, (disp):28.7288601032, 1.91865659381e-05
F-score, p-value, (hp):14.9824183955, 0.00077512089087
F-score, p-value, (drat):95.9791939506, 1.12429918373e-09
F-score, p-value, (wt):3.05743335949, 0.0937052300614
F-score, p-value, (qsec):12.8430880798, 0.00157144458709
F-score, p-value, (vs):8.26676293126, 0.00854992944984
F-score, p-value, (am):6.89474176714, 0.0151103823028
F-score, p-value, (gear):5.86451890569, 0.0237392725271


In [21]:
# Get the feature-selected data
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)

# Build and test the new model with the selected features
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)
preds = model.predict(xt_test)
print_regression_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test, 'Predicted':preds})

MSE, MAE, R^2, EVS: [19.846641819159014, 3.9235717469931188, 0.68506482932228652, 0.75436579394881265]


Unnamed: 0,Actual,Predicted
20,21.5,25.423572
15,10.4,9.409473
17,32.4,26.361547
2,22.8,25.917334
11,16.4,13.31324
19,33.9,27.901512
16,14.7,9.144796


### Use f-selection (using best 3 features)

In [22]:
# Create a feature selectior to get the top 3 features by the F metric.
selector_f = SelectKBest(f_regression, k=3)


# Get the feature-selected data
xt_train, xt_test = selector_f.fit_transform(x_train, y_train), selector_f.transform(x_test)

# Build and test the new model with the selected features
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)
preds = model.predict(xt_test)
print_regression_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test, 'Predicted':preds})

MSE, MAE, R^2, EVS: [19.988618207227002, 4.1125419122460265, 0.66735044251223319, 0.73246197185691464]


Unnamed: 0,Actual,Predicted
20,21.5,25.612542
15,10.4,9.764495
17,32.4,26.195267
2,22.8,26.073464
11,16.4,13.10984
19,33.9,27.663263
16,14.7,9.835112


### Use Recursive Feature Elimination with Cross Validation

In [25]:
# Build the RFECV selector with 5-fold CV and the R-squared score as the score metric
selector_f = RFECV(estimator=linear_model.LinearRegression(), cv=5, scoring=make_scorer(r2_score))
selector_f.fit(x_train, y_train)

# Get the feature-selected data
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)

# Build and test the new model with the selected features
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)
preds = model.predict(xt_test)
print_regression_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test, 'Predicted':preds})

MSE, MAE, R^2, EVS: [20.98193480610998, 4.632496505789522, 0.68794832511724147, 0.74957333920191294]


Unnamed: 0,Actual,Predicted
20,21.5,26.132497
15,10.4,8.407512
17,32.4,26.437259
2,22.8,25.950664
11,16.4,14.27784
19,33.9,27.91732
16,14.7,8.727854
