In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer


In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
# acc_ix, hpower_ix, cyl_ix = 4, 2, 0
class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power  
    
    def fit(self, x_data, y=None):
        return self  
    
    def transform(self, x_data):
        acc_on_cyl = x_data[:, acc_ix] / x_data[:, cyl_ix] 
        if self.acc_on_power:
            acc_on_power = x_data[:, acc_ix] / x_data[:, hpower_ix]    
            return np.c_[x_data, acc_on_power, acc_on_cyl] 
        else:
            return np.c_[x_data,acc_on_cyl]
            
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    nuerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

def preprocess_origin_cols(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})    
    return df



In [3]:
def load_data(file_path):
    cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                    'Acceleration', 'Model Year', 'Origin']
    # reading the .data file using pandas
    df = pd.read_csv(f'{file_path}', names=cols, na_values = "?",
                    comment = '\t',
                    sep= " ",
                    skipinitialspace=True)
    return df


In [35]:
data =  load_data('../data/raw/auto-mpg.data')
data.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [5]:
def strat_split(df,target,test_size,seed):
        
    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    for train_index, test_index in split.split(data, data[f"{target}"]):
        strat_train_set = data.loc[train_index]
        strat_test_set = data.loc[test_index]
    return strat_train_set,strat_test_set

In [6]:
train_df,test_df = strat_split(data,'Cylinders',0.2,42)
print(len(train_df),len(test_df))
train_df.head()

318 80


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,32.0,4,83.0,61.0,2003.0,19.0,74,3
151,31.0,4,79.0,67.0,2000.0,16.0,74,2
388,26.0,4,156.0,92.0,2585.0,14.5,82,1
48,18.0,6,250.0,88.0,3139.0,14.5,71,1
114,26.0,4,98.0,90.0,2265.0,15.5,73,2


In [7]:
def get_feat_labels(df,target):
    features = df.drop(f'{target}',axis=1)
    labels = df[f'{target}'].copy()
    return features,labels

In [30]:
features,labels = get_feat_labels('MPG')
acc_ix=features.columns.get_loc('Acceleration')
hpower_ix=features.columns.get_loc('Horsepower')
cyl_ix=features.columns.get_loc('Cylinders')


In [31]:
features.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2


In [15]:
preprocessed_df = preprocess_origin_cols(features)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_df = pd.DataFrame(prepared_data)
prepared_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.856578,-1.078045,-1.15193,-1.172203,1.215869,-0.544364,1.709527,1.295655,1.0,0.0,0.0
1,-0.856578,-1.117458,-0.990035,-1.175813,0.130698,-0.544364,0.798675,0.666186,0.0,0.0,1.0
2,-0.856578,-0.358749,-0.315474,-0.471828,-0.411887,1.63652,-0.219068,0.351451,0.0,1.0,0.0
3,0.322607,0.567467,-0.423404,0.194851,-0.411887,-1.362195,-0.129273,-0.662693,0.0,1.0,0.0
4,-0.856578,-0.930244,-0.369439,-0.856914,-0.050164,-0.816974,-0.035899,0.561274,0.0,0.0,1.0


In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
forest_reg = RandomForestRegressor()
scores = cross_val_score(forest_reg, 
                         prepared_data, 
                         labels, 
                         scoring="neg_mean_squared_error", 
                         cv = 10)
forest_reg_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(forest_reg_rmse_scores)


Scores: [2.22572585 2.48373427 2.6724985  2.43303632 2.03372188 2.50982771
 2.62963422 2.57435831 4.16152903 1.96420253]
Mean: 2.568826863055762
Standard deviation: 0.5787886560994301


In [20]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 
    'max_features': [2, 4, 6, 8,10],
     'max_depth': [4,8],
     'random_state':[11],
    'bootstrap': [False], 
    'n_estimators': [3, 10],
    'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=10,
                          )

grid_search.fit(prepared_data, labels)
print(grid_search.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 4, 'n_estimators': 10, 'random_state': 11}


In [21]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

3.56868265270271 {'bootstrap': False, 'max_depth': 4, 'max_features': 2, 'n_estimators': 3, 'random_state': 11}
3.3549405487655855 {'bootstrap': False, 'max_depth': 4, 'max_features': 2, 'n_estimators': 10, 'random_state': 11}
3.492826605509153 {'bootstrap': False, 'max_depth': 4, 'max_features': 3, 'n_estimators': 3, 'random_state': 11}
3.1913390381081013 {'bootstrap': False, 'max_depth': 4, 'max_features': 3, 'n_estimators': 10, 'random_state': 11}
3.412598172644645 {'bootstrap': False, 'max_depth': 4, 'max_features': 4, 'n_estimators': 3, 'random_state': 11}
3.0949386854262064 {'bootstrap': False, 'max_depth': 4, 'max_features': 4, 'n_estimators': 10, 'random_state': 11}
3.341668453895685 {'bootstrap': False, 'max_depth': 8, 'max_features': 2, 'n_estimators': 3, 'random_state': 11}
3.052410220141972 {'bootstrap': False, 'max_depth': 8, 'max_features': 2, 'n_estimators': 10, 'random_state': 11}
3.0732651159580215 {'bootstrap': False, 'max_depth': 8, 'max_features': 3, 'n_estimators':

In [22]:
# feature importances
feature_importances = grid_search.best_estimator_.feature_importances_

extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.0036082060082722384),
 ('acc_on_cyl', 0.002282929098929391),
 ('Weight', 0.01674068735227941),
 ('Origin', 0.1324414752950371),
 ('Model Year', 0.07602133125546247),
 ('MPG', 0.29834442059908123),
 ('Horsepower', 0.12956544112755816),
 ('Displacement', 0.09913408104399593),
 ('Cylinders', 0.12633298308645366),
 ('Acceleration', 0.1147987941587284)]

In [24]:
##capturing the best configuration
final_model = grid_search.best_estimator_

##segregating the target variable from test set
X_test = test_df.drop("MPG", axis=1)
y_test = test_df["MPG"].copy()

##preprocessing the test data origin column
X_test_preprocessed = preprocess_origin_cols(X_test)

##preparing the data with final transformation
X_test_prepared = pipeline_transformer(X_test_preprocessed)

##making final predictions
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

3.1315890591212447

In [None]:
import joblib
joblib.dump(final_model, '../model_checkpoints/rand_model.pkl')