## Predicting Fuel Efficiency of Vehicles - Part 3
1. Selecting and Training Models
2. Select and Train a few Algorithms(Linear Regression, Decision Tree, RandomForest)
3. Evaluation using Mean Squared Error
4. Model Evaluation using Cross Validation
5. Hyperparameter Tuning using GridSearchCV
6. Check Feature Importance
7. Evaluate the Final System on test data
8. Saving the Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
# reading the .data file using pandas

cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names = cols, comment = '\t', sep = " ", na_values = "?", skipinitialspace = True)

data = df.copy()

In [4]:
data.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [5]:
split = StratifiedShuffleSplit(n_splits = 1, random_state = 42, test_size = 0.2)

for train_index, test_index in split.split(data, data['Cylinders']):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [6]:
#segregating the features

data = strat_train_set.drop(['MPG'], axis = 1)
data_labels = strat_train_set['MPG'].copy()

data.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2


In [7]:
data_labels.head()

145    32.0
151    31.0
388    26.0
48     18.0
114    26.0
Name: MPG, dtype: float64

In [8]:
#preprocessing origin column in data

def preprocess_origin_col(df):
    df['Origin'] = df['Origin'].map({1 : 'India', 2 : 'USA', 3 : 'Germany'})
    return df

In [9]:
##creating custom attribute adder class
acc_ix, hpower_ix, cyl_ix = 4,2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True): # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        
        return np.c_[X, acc_on_cyl]

In [10]:
def num_pipeline_transformer(data):
    ''''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    numerics = ['float64', 'int64']
    num_attrs = data.select_dtypes(include = numerics)
    num_pipeline = Pipeline([
        ('imputer',SimpleImputer(strategy = 'median')),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),     
    ])
    
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    ''''
    Complete transformation pipeline for both
    nuerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ['Origin']
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

## From raw data to processed data in 2 steps

In [11]:
preprocessed_df = preprocess_origin_col(data)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

# Selecting and Training Models

1. Linear Regression
2. Decision Tree
3. Random Forest
4. SVM regressor

In [12]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression()

In [13]:
##testing the predictions with the 

sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)
print("Prediction of sample data", lin_reg.predict(sample_data_prepared))

Prediction of sample data [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [14]:
print("Actual Labels of Samples", list(sample_labels))

Actual Labels of Samples [32.0, 31.0, 26.0, 18.0, 26.0]


## Mean squared Error

In [15]:
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.9590402225760863

## Decision Tree

In [16]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor()

In [17]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_mse

0.0

In [18]:
## Model Evaluation using cross Validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, 
                          prepared_data,
                          data_labels,
                          cv = 10,
                          scoring = "neg_mean_squared_error")

tree_reg_rmse_scores = np.sqrt(-scores)
tree_reg_rmse_scores.mean()

3.277620949296741

In [19]:
scores = cross_val_score(lin_reg,
                        prepared_data,
                        data_labels,
                        cv = 10,
                        scoring = "neg_mean_squared_error")
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores.mean()

3.0757081793709324

## Random Forest Model

In [20]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)

forest_reg_cv_scores = cross_val_score(forest_reg,
                        prepared_data,
                        data_labels,
                        cv = 10,
                        scoring = "neg_mean_squared_error")

forest_reg_rmse_scores = np.sqrt(-scores)
forest_reg_rmse_scores.mean()

3.0757081793709324

## Support Vector Machine

In [21]:
from sklearn.svm import SVR

svr_reg = SVR(kernel = 'linear')
svr_reg.fit(prepared_data, data_labels)

svr_cv_scores = cross_val_score(svr_reg,
                               prepared_data,
                               data_labels,
                               cv = 10,
                               scoring = "neg_mean_squared_error")
svr_rmse_scores = np.sqrt(-svr_cv_scores)
svr_rmse_scores.mean()

3.08659162080283

## Hyper Parameter tuning using Grid Search CV

In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators' : [3, 10, 30],'max_features' : [2, 4, 6, 8]},
             {'bootstrap' : [False], 'n_estimators' : [3, 10], 'max_features' : [2, 3, 4]},]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                          scoring = "neg_mean_squared_error",
                          cv = 10,
                          return_train_score = True)

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [23]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30)

In [24]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [25]:
cv_scores = grid_search.cv_results_

cv_scores

{'mean_fit_time': array([0.00765219, 0.02276103, 0.06908264, 0.00932422, 0.02626801,
        0.07698615, 0.01021836, 0.02904472, 0.08139844, 0.00781066,
        0.032634  , 0.08818383, 0.00850298, 0.02231731, 0.00718646,
        0.02190194, 0.01019173, 0.02816811]),
 'std_fit_time': array([0.00514976, 0.00892527, 0.00613852, 0.00740249, 0.0067179 ,
        0.00509047, 0.00705253, 0.00495654, 0.00763141, 0.00781067,
        0.00320434, 0.00554577, 0.00792117, 0.00361774, 0.00423971,
        0.00475458, 0.00477045, 0.00621326]),
 'mean_score_time': array([1.77233219e-03, 1.66521072e-03, 2.60105133e-03, 8.13603401e-04,
        0.00000000e+00, 2.36229897e-03, 0.00000000e+00, 1.56247616e-03,
        3.16557884e-03, 1.56164169e-03, 2.21946239e-03, 3.81436348e-03,
        1.80914402e-03, 1.59807205e-03, 2.36902237e-03, 3.79126072e-03,
        8.03089142e-04, 6.63757324e-05]),
 'std_score_time': array([0.00316331, 0.00465706, 0.00400693, 0.00244081, 0.        ,
        0.00502277, 0.        , 

In [26]:
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores['params']):
    print(np.sqrt(-mean_score), params)

3.415635301979565 {'max_features': 2, 'n_estimators': 3}
3.0737528991577454 {'max_features': 2, 'n_estimators': 10}
2.9360996694664876 {'max_features': 2, 'n_estimators': 30}
3.197488373652291 {'max_features': 4, 'n_estimators': 3}
2.8605128931883477 {'max_features': 4, 'n_estimators': 10}
2.773545636881714 {'max_features': 4, 'n_estimators': 30}
3.333111148509064 {'max_features': 6, 'n_estimators': 3}
2.7983926442029237 {'max_features': 6, 'n_estimators': 10}
2.678192297609511 {'max_features': 6, 'n_estimators': 30}
3.340528977357674 {'max_features': 8, 'n_estimators': 3}
2.730399625867152 {'max_features': 8, 'n_estimators': 10}
2.64840100686153 {'max_features': 8, 'n_estimators': 30}
3.262893160094818 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.863411052018655 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.0589043517791352 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.860563523041367 {'bootstrap': False, 'max_features': 3, 'n_estimators'

## Checking Feature importance

In [27]:
# feature importances 

feature_importance = grid_search.best_estimator_.feature_importances_
feature_importance

array([0.08087611, 0.35268113, 0.10792442, 0.25379498, 0.01155216,
       0.13044228, 0.02391978, 0.03373684, 0.00159646, 0.00218224,
       0.0012936 ])

In [28]:
extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ['float64', 'int64']
numeric_attrs = list(data.select_dtypes(include = numerics))

attr = extra_attrs + numeric_attrs
sorted(zip(attr, feature_importance), reverse = True)

[('acc_on_power', 0.08087610847178564),
 ('acc_on_cyl', 0.35268112604875496),
 ('Weight', 0.13044227803054828),
 ('Model Year', 0.033736838693057734),
 ('Horsepower', 0.01155215657446422),
 ('Displacement', 0.2537949848022582),
 ('Cylinders', 0.10792442429755222),
 ('Acceleration', 0.02391977825045578)]

## Evaluating the entire system on Test data

In [29]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop(["MPG"], axis = 1)
y_test = strat_test_set["MPG"].copy()

X_test_preprocessed = preprocess_origin_col(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

final_prediction = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_prediction)
final_rmse = np.sqrt(final_mse)
final_rmse

2.92918790888457

## Creating a fuction to cover entire process

In [30]:
def predict_mpg(config, model):
    
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
        
        
    preproc_df = preprocess_origin_col(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [31]:
## Checkking it on random sample

vehicle_config = {
    'Cylinders' : [4, 6, 8],
    'Displacement' : [155.0, 160.0, 165.5],
    'Horsepower' : [93.0, 130.0, 98.0],
    'Weight' : [2500.0, 3150.0, 2600.0],
    'Acceleration' : [15.0, 14.0, 16.0],
    'Model Year' : [81, 80, 78],
    'Origin' : [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

array([33.42, 16.52, 21.06])

## Save the Model

In [32]:
import pickle

with open('model_bin', "wb") as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [49]:
#Loading the model from the saved file

with open("model_bin", "rb") as f_in:
    model = pickle.load(f_in)
    
predict_mpg(vehicle_config, model)

array([33.42, 16.52, 21.06])

In [55]:
import requests

url = 'http://127.0.0.1:5000/'
r = requests.post(url, json = vehicle_config)
r.text.strip()

'{\n  "mpg_predictions": [\n    33.42000000000001, \n    16.519999999999996, \n    21.06\n  ]\n}'