# MODEL CREATION

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#### Reading the features csv file 

In [2]:
data = pd.read_csv('dataset_used/features_car.csv')
data.head()

Unnamed: 0,length,width,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,wheel-base,bore,drive-wheels,price
0,0.811148,0.890278,2548,130,111,21,27,88.6,3.47,rwd,13495.0
1,0.811148,0.890278,2548,130,111,21,27,88.6,3.47,rwd,16500.0
2,0.822681,0.909722,2823,152,154,19,26,94.5,2.68,rwd,16500.0
3,0.84863,0.919444,2337,109,102,24,30,99.8,3.19,fwd,13950.0
4,0.84863,0.922222,2824,136,115,18,22,99.4,3.19,4wd,17450.0


In [3]:
# BASED ON THE ANALYSIS PART OF THIS NOTEBOOK SERIES WE PICKED THE FEATURES WITH THE HIGHEST CORRELATION
x = data[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg', 'drive-wheels', 'length', 'width', 'city-mpg']].values

# Label Encoder for one-hot-encoding
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['rwd','fwd', '4wd'])
x[:,4] = le_sex.transform(x[:,4]) 

# StandardScaler Preprocessor for scaling the dataset
x = preprocessing.StandardScaler().fit(x).transform(x.astype(float))
y = data.price

# splitting the dataset into training and testing values.
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.1, random_state = 2)

Linear Regression
Random Forest Regressor
Decision Tree Regressor
Lasso
Ridge
ElasticNet

In [4]:
# Using a MultipleLinearRegressor import the necessary library
from sklearn.linear_model import LinearRegression

# training the model
lm = LinearRegression()
lm.fit(train_x, train_y)

# making a prediction
m1 = lm.predict(val_x)
m1

array([ 7877.54726206,  9067.47823695, 18985.34002774, 20965.53608216,
        5981.23505078,  6250.19213406,  6141.54122825, 10741.50295779,
        9592.6704247 , 11484.92892058, 18022.77368062, 24394.01454975,
       23894.3692766 , 15601.03577252, 17529.61123966, 33478.34473006,
       17565.22026952, 16102.86598586, 12829.71767772, 16871.14647111,
        7888.17801255])

In [5]:
# Using a RandomForestRegressor import the necessary library
from sklearn.ensemble import RandomForestRegressor

# trainig the model
rf = RandomForestRegressor()
rf.fit(train_x, train_y)

# making a prepdiction
m2 = rf.predict(val_x)
m2

array([11678.92      ,  9369.97      , 16071.13      , 15416.8       ,
        6583.748     ,  6782.3       ,  5480.69      , 14222.08666667,
        8211.39      , 10298.20833333, 19034.74      , 29470.47      ,
       29132.34      , 16779.8       , 17654.27      , 36010.51      ,
       17676.64      , 13094.78      , 13208.78833333, 15639.135     ,
       11644.68      ])

In [6]:
# Using a DecisionTreeRegressor import the neccesary library
from sklearn.tree import DecisionTreeRegressor

# training the model
dt = DecisionTreeRegressor()
dt.fit(train_x, train_y)

# making a prediction
m3 = dt.predict(val_x)
m3

array([10945.,  9980., 17199., 14399.,  6229.,  6649.,  5399., 16430.,
        8195., 10245., 19045., 28176., 28176., 16500., 18920., 35550.,
       20970.,  9989., 12170., 16515., 10945.])

In [7]:
# Using a Lasso model, import the necessary library
from sklearn.linear_model import Lasso

# training the model
ls = Lasso()
ls.fit(train_x, train_y)

# making a predictin
m4 = ls.predict(val_x)
m4

array([ 7872.89721256,  9065.23310374, 18989.2712049 , 20965.25576953,
        5970.78316307,  6257.5752486 ,  6137.990701  , 10749.53638953,
        9588.85554582, 11485.09055527, 18011.63112021, 24380.0336739 ,
       23888.4335712 , 15603.26924259, 17521.59922777, 33468.49218267,
       17557.20275027, 16100.81216218, 12829.49380646, 16873.68120476,
        7883.35678921])

In [8]:
# Using a Lasso model, import the necessary library
from sklearn.linear_model import Ridge

# training the model
rd = Ridge()
rd.fit(train_x, train_y)

# making a prediction
m5 = rd.predict(val_x)
m5

array([ 7952.92651799,  9067.58964496, 18992.13366239, 20943.19044173,
        5951.86477659,  6256.95455393,  6083.25995571, 10790.61515067,
        9575.15796018, 11472.21760082, 18016.86171915, 24363.73221054,
       23868.98521557, 15652.98046513, 17507.39421568, 33395.79645566,
       17523.06560581, 16089.72153955, 12847.3194403 , 16899.50023157,
        7963.4530498 ])

In [9]:
# Uisng the ElasticNet model, import the necessary library
from sklearn.linear_model import ElasticNet

# training the model
en = ElasticNet()
en.fit(train_x, train_y)

# making a prediction
m6 = en.predict(val_x)
m6

array([11380.45329829,  9563.22951294, 19054.07650653, 19832.80885352,
        6036.21453339,  6681.38126958,  2906.71131119, 12418.09085926,
        9858.12355944, 11699.81058698, 18668.28414624, 22997.73314908,
       22527.22717517, 16553.27087359, 17884.35671064, 30079.51914009,
       16851.72658167, 15673.37374643, 13981.95777806, 17407.52393629,
       11390.46406369])

### MODEL EVALUATION

In [10]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [11]:
# mean_absolute_error
lm_mae = mean_absolute_error(val_y, m1)
rf_mae = mean_absolute_error(val_y, m2)
df_mae = mean_absolute_error(val_y, m3)
ls_mae = mean_absolute_error(val_y, m4)
rd_mae = mean_absolute_error(val_y, m5)
en_mae = mean_absolute_error(val_y, m6)

# r2_score
lm_score = r2_score(val_y, m1)
rf_score = r2_score(val_y, m2)
dt_score = r2_score(val_y, m3)
ls_score = r2_score(val_y, m4)
rd_score = r2_score(val_y, m5)
en_score = r2_score(val_y, m6)

lst = [m1, m2, m3, m4, m5, m6]
lst2 = ["Linear Regression", "Random Forest", "Decision Tree", "Lasso", "Ridge", "Elastic Net"]

for i in range(len(lst)):
    print(f"Mean absolute error of {lst2[i]} = {round(mean_absolute_error(val_y, lst[i]), 3)}\n")

    
# Checking for the accuracy of our models to know which is best used on our testing data.
accuracy = {
    "Multiple Linear Regression": lm_score,
    "Random Forest Regressor": rf_score,
    "Decision Tree Regressor": dt_score,
    "Lasso": ls_score,
    "Ridge": rd_score,
    "ElasticNet": en_score
}

print(f"\nThe {max(accuracy, key=accuracy.get)} model has the highest accuracy with value {round(max(accuracy.values()), 3) * 100}%")

Mean absolute error of Linear Regression = 3080.675

Mean absolute error of Random Forest = 2091.521

Mean absolute error of Decision Tree = 1661.762

Mean absolute error of Lasso = 3082.47

Mean absolute error of Ridge = 3073.337

Mean absolute error of Elastic Net = 2934.218


The Decision Tree Regressor model has the highest accuracy with value 89.7%


In [12]:
print(f"Based on the above it is safe to say the {max(accuracy, key=accuracy.get)} model performed the best "
      "so we can use it for our test data.")

Based on the above it is safe to say the Decision Tree Regressor model performed the best so we can use it for our test data.


<h2>The purpose of this notebook was to try out multiple regression models and pick the one that performs the best</h2>