In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from numpy import arange
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import plotly.express as px
from sklearn.linear_model import LinearRegression
%matplotlib inline


# Metrics Functions

In [2]:
def calculate_metrics(df, method, y_test, y_pred):
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    mape = metrics.mean_absolute_percentage_error(y_test,y_pred)
    df.loc[method] = [mae, mse, rmse, mape]

# Function to calculate cross validation
def calculate_cross_validation_scores (df, model, method, X, y):
    my_cv = 10
    mae = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=my_cv).mean()
    mse = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=my_cv).mean()
    rmse = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=my_cv).mean()
    mape = cross_val_score(model, X, y, scoring= "neg_mean_absolute_percentage_error", cv=my_cv).mean()
    df.loc[method] = [mae, mse, rmse, mape]

# Load customer dataset

In [3]:

df = pd.read_csv("auto-mpg.csv", header=None, names=["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model year", "origin", "car name"])
df = df.fillna(np.nan)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


# Encoding

In [4]:
df["origin"] = df["origin"].astype(str)
df["cylinders"] = df["cylinders"].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    object 
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   car name      398 non-null    object 
dtypes: float64(3), int64(2), object(4)
memory usage: 28.1+ KB


In [5]:
categorical_cols = ["origin", "cylinders"]
def encode_and_bind(original_dataframe, feature_to_encode):
    # Convert categorical variable into dummy/indicator variables.
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]], dummy_na=False)
    # concatenate the result to the original dataframe
    res = pd.concat([original_dataframe, dummies], axis=1)
    return(res)



# one hot encode columns
for col in categorical_cols:
    df = encode_and_bind(df, col)


# delete the origin columns
for col in categorical_cols:
    del df[col]


df

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,car name,origin_1,origin_2,origin_3,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8
0,18.0,307.0,130,3504,12.0,70,chevrolet chevelle malibu,1,0,0,0,0,0,0,1
1,15.0,350.0,165,3693,11.5,70,buick skylark 320,1,0,0,0,0,0,0,1
2,18.0,318.0,150,3436,11.0,70,plymouth satellite,1,0,0,0,0,0,0,1
3,16.0,304.0,150,3433,12.0,70,amc rebel sst,1,0,0,0,0,0,0,1
4,17.0,302.0,140,3449,10.5,70,ford torino,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,140.0,86,2790,15.6,82,ford mustang gl,1,0,0,0,1,0,0,0
394,44.0,97.0,52,2130,24.6,82,vw pickup,0,1,0,0,1,0,0,0
395,32.0,135.0,84,2295,11.6,82,dodge rampage,1,0,0,0,1,0,0,0
396,28.0,120.0,79,2625,18.6,82,ford ranger,1,0,0,0,1,0,0,0


# Missing Values

In [6]:
missing_value = pd.DataFrame(df.horsepower.str.isdigit()) 
df[missing_value['horsepower'] == False]

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,car name,origin_1,origin_2,origin_3,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8
32,25.0,98.0,?,2046,19.0,71,ford pinto,1,0,0,0,1,0,0,0
126,21.0,200.0,?,2875,17.0,74,ford maverick,1,0,0,0,0,0,1,0
330,40.9,85.0,?,1835,17.3,80,renault lecar deluxe,0,1,0,0,1,0,0,0
336,23.6,140.0,?,2905,14.3,80,ford mustang cobra,1,0,0,0,1,0,0,0
354,34.5,100.0,?,2320,15.8,81,renault 18i,0,1,0,0,1,0,0,0
374,23.0,151.0,?,3035,20.5,82,amc concord dl,1,0,0,0,1,0,0,0


In [7]:
df = df.replace('?', np.nan)
df[missing_value['horsepower'] == False]

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,car name,origin_1,origin_2,origin_3,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8
32,25.0,98.0,,2046,19.0,71,ford pinto,1,0,0,0,1,0,0,0
126,21.0,200.0,,2875,17.0,74,ford maverick,1,0,0,0,0,0,1,0
330,40.9,85.0,,1835,17.3,80,renault lecar deluxe,0,1,0,0,1,0,0,0
336,23.6,140.0,,2905,14.3,80,ford mustang cobra,1,0,0,0,1,0,0,0
354,34.5,100.0,,2320,15.8,81,renault 18i,0,1,0,0,1,0,0,0
374,23.0,151.0,,3035,20.5,82,amc concord dl,1,0,0,0,1,0,0,0


In [8]:
# I decided to be in arbitrary year to calculate how old that car is.
# For me, it made more sense to how old the car was rather than looking at its manufacturing year.
df["age"] = 99 - df["model year"]
df = df.drop(columns=["car name", "model year"])
df.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin_1,origin_2,origin_3,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,age
0,18.0,307.0,130,3504,12.0,1,0,0,0,0,0,0,1,29
1,15.0,350.0,165,3693,11.5,1,0,0,0,0,0,0,1,29
2,18.0,318.0,150,3436,11.0,1,0,0,0,0,0,0,1,29
3,16.0,304.0,150,3433,12.0,1,0,0,0,0,0,0,1,29
4,17.0,302.0,140,3449,10.5,1,0,0,0,0,0,0,1,29


# Imputer

In [9]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
df_x = pd.DataFrame(imputer.fit_transform(df[["horsepower"]]), columns=["hp"])
df = pd.concat((df,df_x), axis=1)
df = df.drop(columns=["horsepower"])

In [10]:
df.head()

Unnamed: 0,mpg,displacement,weight,acceleration,origin_1,origin_2,origin_3,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,age,hp
0,18.0,307.0,3504,12.0,1,0,0,0,0,0,0,1,29,130.0
1,15.0,350.0,3693,11.5,1,0,0,0,0,0,0,1,29,165.0
2,18.0,318.0,3436,11.0,1,0,0,0,0,0,0,1,29,150.0
3,16.0,304.0,3433,12.0,1,0,0,0,0,0,0,1,29,150.0
4,17.0,302.0,3449,10.5,1,0,0,0,0,0,0,1,29,140.0


# Determine the independent variables

Selecting the independent variables, which are cylinders, displacement, horsepower, and acceleration

In [11]:
X = df.loc[:,df.columns != "mpg"]
display(pd.DataFrame(X))

Unnamed: 0,displacement,weight,acceleration,origin_1,origin_2,origin_3,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,age,hp
0,307.0,3504,12.0,1,0,0,0,0,0,0,1,29,130.0
1,350.0,3693,11.5,1,0,0,0,0,0,0,1,29,165.0
2,318.0,3436,11.0,1,0,0,0,0,0,0,1,29,150.0
3,304.0,3433,12.0,1,0,0,0,0,0,0,1,29,150.0
4,302.0,3449,10.5,1,0,0,0,0,0,0,1,29,140.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,140.0,2790,15.6,1,0,0,0,1,0,0,0,17,86.0
394,97.0,2130,24.6,0,1,0,0,1,0,0,0,17,52.0
395,135.0,2295,11.6,1,0,0,0,1,0,0,0,17,84.0
396,120.0,2625,18.6,1,0,0,0,1,0,0,0,17,79.0


# Determine the dependent variable

The mpg column is the dependent variable. The data is stored in a vector y.


In [12]:
y = df["mpg"]
display(pd.DataFrame(y))

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0
...,...
393,27.0
394,44.0
395,32.0
396,28.0


# Scaling

In [13]:
#Feature standardizing of X
from sklearn.preprocessing import MinMaxScaler
sc_X = MinMaxScaler()
X = sc_X.fit_transform(X)

# Splitting the dataset into training and test set

Splitting the dataset into training (X_train, y_train) and test set (X_test, y_test).

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)
display(pd.DataFrame(X_train))
display(pd.DataFrame(y_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.645995,0.715055,0.327381,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.833333,0.565217
1,0.074935,0.062659,0.654762,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.916667,0.076087
2,0.074935,0.163028,0.464286,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.166667,0.173913
3,0.074935,0.095549,0.773810,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.750000,0.000000
4,0.134367,0.289765,0.613095,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.083333,0.152174
...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,0.227390,0.336547,0.380952,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.166667,0.320652
314,0.470284,0.493337,0.386905,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.500000,0.320652
315,0.000000,0.072016,0.684524,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.750000,0.016304
316,0.470284,0.473207,0.416667,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.916667,0.293478


Unnamed: 0,mpg
64,15.0
55,27.0
317,34.3
102,26.0
358,31.6
...,...
323,27.9
192,22.0
117,29.0
47,19.0


# Creating Dataframe for errors and cross validation errors

In [15]:
# create error dataframe
error_df = pd.DataFrame(columns = ['MAE', 'MSE', 'RMSE', "MAPE"], index = ['Polynomial Regression', 'KNN Regression', "Random Forest Regression"])
display(error_df)

Unnamed: 0,MAE,MSE,RMSE,MAPE
Polynomial Regression,,,,
KNN Regression,,,,
Random Forest Regression,,,,


In [16]:
# create cross valudation dataframe
cv_df = pd.DataFrame(columns = ['MAE', 'MSE', 'RMSE', "MAPE"], index = ['Polynomial Regression', 'KNN Regression', "Random Forest Regression"])
display(cv_df)

Unnamed: 0,MAE,MSE,RMSE,MAPE
Polynomial Regression,,,,
KNN Regression,,,,
Random Forest Regression,,,,


# Method 1: Polynomial Regression

In [20]:
# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 3)
X_poly_train = poly_reg.fit_transform(X_train)
X_poly_test = poly_reg.fit_transform(X_test)
poly_reg.fit(X_poly_train, y_train)

PolynomialFeatures(degree=3)

In [21]:
# the code to compare the results of polynomial regression to linear regression
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly_train, y_train)

# Predict test set result
y_pred = lin_reg_2.predict(X_poly_test)
display(pd.DataFrame(y_pred))

Unnamed: 0,0
0,13.556070
1,24.286188
2,15.452666
3,18.638644
4,17.324851
...,...
75,30.883998
76,31.224143
77,24.438040
78,22.067797


In [22]:
method = 'Polynomial Regression'
calculate_metrics(error_df, method, y_test, y_pred)
display(error_df)
calculate_cross_validation_scores (cv_df, lin_reg_2, method, X, y)
display(cv_df)

Unnamed: 0,MAE,MSE,RMSE,MAPE
Polynomial Regression,3.13019,22.119904,4.70318,0.13127
KNN Regression,,,,
Random Forest Regression,,,,


Unnamed: 0,MAE,MSE,RMSE,MAPE
Polynomial Regression,-2.577159,-11.707441,-3.301421,-0.118408
KNN Regression,,,,
Random Forest Regression,,,,


# Method 2: KNN Regression

In [23]:
#Fitting KNN regression to dataset
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=10)
regressor.fit(X_train, y_train)

#prediction of test values
y_pred = regressor.predict(X_test)
display(pd.DataFrame(y_pred))

Unnamed: 0,0
0,13.70
1,24.10
2,14.00
3,24.20
4,18.70
...,...
75,29.99
76,32.41
77,30.05
78,26.87


In [24]:
method = 'KNN Regression'
calculate_metrics(error_df, method, y_test, y_pred)
display(error_df)
calculate_cross_validation_scores (cv_df, regressor, method, X, y)
display(cv_df)

Unnamed: 0,MAE,MSE,RMSE,MAPE
Polynomial Regression,3.13019,22.119904,4.70318,0.13127
KNN Regression,2.073,8.974182,2.995694,0.084713
Random Forest Regression,,,,


Unnamed: 0,MAE,MSE,RMSE,MAPE
Polynomial Regression,-2.577159,-11.707441,-3.301421,-0.118408
KNN Regression,-2.260281,-10.05361,-3.018319,-0.096489
Random Forest Regression,,,,


# Method 3: Random Forest

In [25]:
#Fitting random forest regression to dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor.fit(X_train, y_train)

#prediction of new value
y_pred = regressor.predict(X_test)
display(pd.DataFrame(y_pred, columns=["mpg"]))

Unnamed: 0,mpg
0,14.290
1,24.711
2,14.355
3,20.608
4,18.580
...,...
75,28.441
76,27.470
77,31.217
78,30.352


In [26]:
method = 'Random Forest Regression'
calculate_metrics(error_df, method , y_test, y_pred)
display(error_df)
calculate_cross_validation_scores (cv_df, regressor, method, X, y)
display(cv_df)

Unnamed: 0,MAE,MSE,RMSE,MAPE
Polynomial Regression,3.13019,22.119904,4.70318,0.13127
KNN Regression,2.073,8.974182,2.995694,0.084713
Random Forest Regression,1.844888,6.664843,2.581636,0.077204


Unnamed: 0,MAE,MSE,RMSE,MAPE
Polynomial Regression,-2.577159,-11.707441,-3.301421,-0.118408
KNN Regression,-2.260281,-10.05361,-3.018319,-0.096489
Random Forest Regression,-2.04196,-8.138112,-2.692875,-0.089276


# My Conclusion

From the Mean Absolute Percentage Error, or MAPE for short, we know that for this dataset and my given parameters for each method the Random Forest Regression was the most accurate in predicting the MPG of the different cars.