# Stepwise Regression
Below taken from: <br>
Machine Learning for Business Analytics<br>
Concepts, Techniques, and Applications in Python<br>
by Galit Shmueli, Peter C. Bruce, Peter Gedeck, Nitin R. Patel

Publisher: Wiley; 2nd edition (2024) <br>
<!-- ISBN-13: 978-3031075650 -->

(c) 2024 Galit Shmueli, Peter C. Bruce, Peter Gedeck, Nitin R. Patel


In [3]:
# install mlba package (Machine Learning for Business Analytics)
! pip install mlba

Collecting mlba
  Downloading mlba-2.0.2-py3-none-any.whl.metadata (1.7 kB)
Downloading mlba-2.0.2-py3-none-any.whl (18.6 MB)
   ---------------------------------------- 0.0/18.6 MB ? eta -:--:--
   ---- ----------------------------------- 2.1/18.6 MB 12.1 MB/s eta 0:00:02
   ------- -------------------------------- 3.7/18.6 MB 8.9 MB/s eta 0:00:02
   --------------- ------------------------ 7.1/18.6 MB 11.1 MB/s eta 0:00:02
   ----------------------- ---------------- 11.0/18.6 MB 13.0 MB/s eta 0:00:01
   ------------------------------- -------- 14.4/18.6 MB 13.7 MB/s eta 0:00:01
   ---------------------------------------  18.4/18.6 MB 14.5 MB/s eta 0:00:01
   ---------------------------------------- 18.6/18.6 MB 14.2 MB/s eta 0:00:00
Installing collected packages: mlba
Successfully installed mlba-2.0.2


In [4]:
import matplotlib.pyplot as plt
import mlba
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from mlxtend.feature_selection import ExhaustiveFeatureSelector, SequentialFeatureSelector
from sklearn.linear_model import BayesianRidge, Lasso, LassoCV, LinearRegression, Ridge, RidgeCV
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [5]:
# reduce data frame to the top 1000 rows and select columns for regression analysis
car_df = mlba.load_data('ToyotaCorolla.csv')
car_df = car_df.iloc[0:999]

predictors = ['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'CC',
              'Doors', 'Quarterly_Tax', 'Weight']
outcome = 'Price'

# partition data
X = pd.get_dummies(car_df[predictors], drop_first=True)
y = car_df[outcome]
train_X, holdout_X, train_y, holdout_y = train_test_split(X, y, test_size=0.4,
                                                    random_state=314)

# train a linear regression model using scikit-learn
car_lm = LinearRegression()
car_lm.fit(train_X, train_y)

# print coefficients
print(pd.DataFrame({'Predictor': X.columns, 'coefficient': car_lm.coef_}))


           Predictor  coefficient
0          Age_08_04  -131.938191
1                 KM    -0.019219
2                 HP    67.853061
3          Met_Color    97.689520
4          Automatic   372.551930
5                 CC    -4.095614
6              Doors   -32.400709
7      Quarterly_Tax    15.528026
8             Weight    14.251452
9   Fuel_Type_Diesel  4010.094356
10  Fuel_Type_Petrol  2553.361060


In [6]:
# Forward model selection
sfs_forward = SequentialFeatureSelector(LinearRegression(),
            k_features=(1, 11),
            forward=True, floating=False,
            cv=5, scoring='neg_root_mean_squared_error',
            n_jobs=-1)

sfs_forward = sfs_forward.fit(train_X, train_y)

best_subset = sfs_forward.subsets_[1]
for v in sfs_forward.subsets_.values():
    if v['avg_score'] > best_subset['avg_score']:
        best_subset = v

print(f"Best accuracy score: {- best_subset['avg_score']:.2f}")
print(f"Best subset (indices): {best_subset['feature_idx']}")
print(f"Best subset (corresponding names):\n{best_subset['feature_names']}")

Best accuracy score: 1446.29
Best subset (indices): (0, 1, 2, 5, 6, 7, 8, 9, 10)
Best subset (corresponding names):
('Age_08_04', 'KM', 'HP', 'CC', 'Doors', 'Quarterly_Tax', 'Weight', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol')


In [7]:
# Backwards selection
sfs_backward = SequentialFeatureSelector(LinearRegression(),
            k_features=(1, 11),
            forward=False, floating=False,
            cv=5, scoring='neg_root_mean_squared_error',
            n_jobs=-1)

sfs_backward = sfs_backward.fit(train_X, train_y)

best_subset = sfs_backward.subsets_[1]
for v in sfs_backward.subsets_.values():
    if v['avg_score'] > best_subset['avg_score']:
        best_subset = v

print(f"Best accuracy score: {- best_subset['avg_score']:.2f}")
print(f"Best subset (indices): {best_subset['feature_idx']}")
print(f"Best subset (corresponding names):\n{best_subset['feature_names']}")

Best accuracy score: 1442.03
Best subset (indices): (0, 1, 2, 5, 7, 8, 9, 10)
Best subset (corresponding names):
('Age_08_04', 'KM', 'HP', 'CC', 'Quarterly_Tax', 'Weight', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol')


In [None]:
# Stepwise selection
sfs_stepwise = SequentialFeatureSelector(LinearRegression(),
            k_features=(1, 11),
            forward=True, floating=True,
            cv=5, scoring='neg_root_mean_squared_error',
            n_jobs=-1)

sfs_stepwise = sfs_stepwise.fit(train_X, train_y)

best_subset = sfs_stepwise.subsets_[1]
for v in sfs_stepwise.subsets_.values():
    if v['avg_score'] > best_subset['avg_score']:
        best_subset = v

print(f"Best accuracy score: {- best_subset['avg_score']:.2f}")
print(f"Best subset (indices): {best_subset['feature_idx']}")
print(f"Best subset (corresponding names):\n{best_subset['feature_names']}")