# Auto MPG dataset Model Training and Selection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Column Names
cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

num_cols = ['MPG','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year','Cylinders']

cat_cols = ['Origin']

df = pd.read_csv('dataset/auto-mpg.data', names=cols, na_values = "?",
                comment = '\t',
                sep= " ",
                skipinitialspace=True)

#making a copy of the dataframe
data = df.copy()

## Data Preprocessing

In [3]:
# handling missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
imputer.fit_transform(data[num_cols])

array([[ 18. , 307. , 130. , ...,  12. ,  70. ,   8. ],
       [ 15. , 350. , 165. , ...,  11.5,  70. ,   8. ],
       [ 18. , 318. , 150. , ...,  11. ,  70. ,   8. ],
       ...,
       [ 32. , 135. ,  84. , ...,  11.6,  82. ,   4. ],
       [ 28. , 120. ,  79. , ...,  18.6,  82. ,   4. ],
       [ 31. , 119. ,  82. , ...,  19.4,  82. ,   4. ]])

In [4]:
data.isnull().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [5]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns
encoder = OneHotEncoder()
encoded = encoder.fit_transform(data[cat_cols]).toarray()

# Create a DataFrame with the encoded columns
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(cat_cols))

# Concatenate the original data with the one-hot encoded columns
data = pd.concat([data, encoded_df], axis=1).drop(cat_cols, axis=1)

In [6]:
data.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin_1,Origin_2,Origin_3
0,18.0,8,307.0,130.0,3504.0,12.0,70,1.0,0.0,0.0
1,15.0,8,350.0,165.0,3693.0,11.5,70,1.0,0.0,0.0
2,18.0,8,318.0,150.0,3436.0,11.0,70,1.0,0.0,0.0
3,16.0,8,304.0,150.0,3433.0,12.0,70,1.0,0.0,0.0
4,17.0,8,302.0,140.0,3449.0,10.5,70,1.0,0.0,0.0


As a best practice, data transformations should be included in a pipeline, so we'll create a class that handles adding features that can be included in the pipeline

In [7]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureAdder(BaseEstimator, TransformerMixin):
    """
    A custom transformer that adds new features based on existing ones in the dataset.
    
    """
    def __init__(self, add_acc_on_power=True):
        self.add_acc_on_power = add_acc_on_power

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Define indices for feature columns
        ACC_INDEX = 4
        HPOWER_INDEX = 2
        CYL_INDEX = 0
        
        # Calculate new features
        acc_per_cyl = X[:, ACC_INDEX] / X[:, CYL_INDEX]
        acc_per_hp = X[:, ACC_INDEX] / X[:, HPOWER_INDEX]
        
        return np.c_[X, acc_per_hp, acc_per_cyl]


In [8]:
feature_adder = FeatureAdder()
transformed_data = feature_adder.transform(data.values)
transformed_data[0]

array([1.80000000e+01, 8.00000000e+00, 3.07000000e+02, 1.30000000e+02,
       3.50400000e+03, 1.20000000e+01, 7.00000000e+01, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.14136808e+01, 1.94666667e+02])

## Setting up Data Transformation Pipeline

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Map origin values to strings
# data["Origin"] = data["Origin"].map({1: "USA", 2: "Europe", 3: "Asia"}) 
    

# Select numerical features
num_attrs = ['Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year']

# Numerical pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('feature_adder', FeatureAdder()),
    ('scaler', StandardScaler()),
])
    
# Categorical features
cat_attrs = ["Origin"]
    

# Final Pipeline for preprocessing    
preprocessing = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attrs),
        ('cat', OneHotEncoder(), cat_attrs),
    ]
)

In [10]:
data = df.copy()

In [11]:
from sklearn.model_selection import train_test_split

strat_train_set, strat_test_set = train_test_split(
 data, test_size=0.2, stratify=data['Cylinders'], random_state=42)

In [12]:
strat_train_set.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,32.0,4,83.0,61.0,2003.0,19.0,74,3
151,31.0,4,79.0,67.0,2000.0,16.0,74,2
388,26.0,4,156.0,92.0,2585.0,14.5,82,1
48,18.0,6,250.0,88.0,3139.0,14.5,71,1
114,26.0,4,98.0,90.0,2265.0,15.5,73,2


## Model Training and Selection

We will train these models and see how they perform:

* Linear Regression
* Decision Tree Regressor
* Random Forest Regressor
  
The evaluation metric will be the Root Mean Squared Error RMSE

In [13]:
train = strat_train_set
test = strat_test_set

In [14]:
X_train = train[['Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']]
y_train = train[['MPG']]

In [15]:
X_test = test[['Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']]
y_test = test[['MPG']]

### Linear Regression

In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(X_train, y_train)

y_pred = lin_reg.predict(X_train)
lin_rmse = root_mean_squared_error(y_train,y_pred)
print('Linear Regression RMSE on the training set:', lin_rmse)

Linear Regression RMSE on the training set: 2.959040222576087


In [17]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(lin_reg, 
                         X_train, 
                         y_train, 
                         scoring="neg_mean_squared_error", 
                         cv = 3)
lin_reg_rmse_scores = np.sqrt(-scores)
print(lin_reg_rmse_scores)
print('Average: ',lin_reg_rmse_scores.sum()/3)

[3.49462629 2.90299322 3.1744324 ]
Average:  3.190683971358469


### Decision Tree Regressor

In [18]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(X_train, y_train)

y_pred = tree_reg.predict(X_train)
tree_rmse = root_mean_squared_error(y_train,y_pred)
print('Decision Tree RMSE on the training set:', tree_rmse)

Decision Tree RMSE on the training set: 0.0


In [19]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, 
                         X_train, 
                         y_train, 
                         scoring="neg_mean_squared_error", 
                         cv = 3)
tree_reg_rmse_scores = np.sqrt(-scores)
print(tree_reg_rmse_scores)
print('Average: ',tree_reg_rmse_scores.sum()/3)

[3.84780016 3.61901425 3.53127536]
Average:  3.666029921938703


### Raandom Forest

In [20]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))
forest_reg.fit(X_train, y_train)

y_pred = forest_reg.predict(X_train)
forest_rmse = root_mean_squared_error(y_train,y_pred)
print('Random Forest RMSE on the training set:', tree_rmse)

  return fit_method(estimator, *args, **kwargs)


Random Forest RMSE on the training set: 0.0


In [21]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(forest_reg, 
                         X_train, 
                         y_train, 
                         scoring="neg_mean_squared_error", 
                         cv =3)
forest_reg_rmse_scores = np.sqrt(-scores)
print(forest_reg_rmse_scores)
print('Average: ',forest_reg_rmse_scores.sum()/3)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


[2.55725244 2.77836241 2.95615479]
Average:  2.763923217493146


  return fit_method(estimator, *args, **kwargs)


### Support Vector Machine

In [67]:
 from sklearn.svm import SVR

svm_reg = make_pipeline(preprocessing, SVR(kernel="rbf", gamma=0.1, C=5))
svm_reg.fit(X_train, y_train)

y_pred = svm_reg.predict(X_train)
svm_rmse = root_mean_squared_error(y_train,y_pred)
print('SVM RMSE on the training set:', tree_rmse)

SVM RMSE on the training set: 0.0


  y = column_or_1d(y, warn=True)


In [68]:
scores = cross_val_score(svm_reg, 
                         X_train, 
                         y_train, 
                         scoring="neg_mean_squared_error", 
                         cv = 3)
svm_reg_rmse_scores = np.sqrt(-scores)
print(svm_reg_rmse_scores)
print('Average: ',svm_reg_rmse_scores.sum()/3)

[2.83732974 3.30829886 2.81309278]
Average:  2.986240458961202


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


## Fine-tuning the Final Model
Random Forest seems to be the most promising model, so we will proceed and fine-tune the model using **Grid Search Cross Validation**

In [None]:
 from sklearn.model_selection import GridSearchCV
 rf_pipeline = Pipeline([
 ("preprocessing", preprocessing),
 ("random_forest", RandomForestRegressor(random_state=42)),
 ])
 param_grid = [
 {
     'random_forest__max_features':[4,6,8],
     'random_forest__max_depth':[7,9,11,13],
     'random_forest__min_samples_split':[3,5,7],
 }
 ]
 grid_search = GridSearchCV(rf_pipeline, param_grid, cv=3,
 scoring='neg_root_mean_squared_error')
 grid_search.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


In [None]:
grid_search.best_params_

We have found the best parameters for the model, it's time to train the Final Model and calculate its score on the test set

In [None]:
 rf_reg = Pipeline([
 ("preprocessing", preprocessing),
 ("random_forest", RandomForestRegressor(max_depth=13,max_features=8,min_samples_split=3,random_state=42)),
 ])

rf_reg.fit(X_train,y_train)

In [None]:
scores = cross_val_score(rf_reg, 
                         X_train, 
                         y_train, 
                         scoring="neg_mean_squared_error", 
                         cv = 3)
rf_reg_rmse_scores = np.sqrt(-scores)
print(rf_reg_rmse_scores)
print('Average: ',rf_reg_rmse_scores.sum()/3)

In [None]:
y_pred = rf_reg.predict(X_test)
rmse = root_mean_squared_error(y_test,y_pred)
print('Fine-tuned Random Forest RMSE on the test set:', rmse)

### Ensemble Model: Linear Regression, Random Forest, Support Vector Machine

In [None]:
from sklearn.ensemble import VotingRegressor

ens_model = VotingRegressor([('lr',lin_reg),('rf',rf_reg),('svm',svm_reg)])
ens_model.fit(X_train,y_train)

scores = cross_val_score(ens_model, 
                         X_train, 
                         y_train, 
                         scoring="neg_mean_squared_error", 
                         cv = 3)
forest_reg_rmse_scores = np.sqrt(-scores)
print(forest_reg_rmse_scores)
print('Average: ',forest_reg_rmse_scores.sum()/3)

In [None]:
y_pred = ens_model.predict(X_test)
rmse = root_mean_squared_error(y_test,y_pred)
print('Ensemble Model RMSE on the test set:', rmse)