# MODEL CREATION

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#### Reading the features csv file 

In [2]:
data = pd.read_csv('dataset_used/features_car.csv')
data.head()

Unnamed: 0,length,width,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,wheel-base,bore,drive-wheels,price
0,0.811148,0.890278,2548,130,111,21,27,88.6,3.47,rwd,13495.0
1,0.811148,0.890278,2548,130,111,21,27,88.6,3.47,rwd,16500.0
2,0.822681,0.909722,2823,152,154,19,26,94.5,2.68,rwd,16500.0
3,0.84863,0.919444,2337,109,102,24,30,99.8,3.19,fwd,13950.0
4,0.84863,0.922222,2824,136,115,18,22,99.4,3.19,4wd,17450.0


In [3]:
# BASED ON THE ANALYSIS PART OF THIS NOTEBOOK SERIES WE PICKED THE FEATURES WITH THE HIGHEST CORRELATION
x = data[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg', 'drive-wheels', 'length', 'width', 'city-mpg']].values

# Label Encoder for one-hot-encoding
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['rwd','fwd', '4wd'])
x[:,4] = le_sex.transform(x[:,4]) 

# StandardScaler Preprocessor for scaling the dataset
x = preprocessing.StandardScaler().fit(x).transform(x.astype(float))
y = data.price

# splitting the dataset into training and testing values.
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.1, random_state=4)

Linear Regression
Random Forest Regressor
Decision Tree Regressor
Lasso
Ridge
ElasticNet

In [4]:
# Using a MultipleLinearRegressor import the necessary library
from sklearn.linear_model import LinearRegression

# training the model
lm = LinearRegression()
lm.fit(train_x, train_y)

# making a prediction
m1 = lm.predict(val_x)
m1

array([17347.06371158, 21456.43207742, 13035.11659145, 22378.75829396,
       13536.95583022, 18902.29390513,  6173.96806124, 20760.91703639,
        6044.77462379, 14617.50571307, 11524.98810672, 18609.09842177,
       17354.37668332, 24343.86558323, 10453.49516396,  9487.05588871,
        6694.14926574, 11932.79338995,  9106.95540263, 11500.36136622,
       18419.2963781 ])

In [5]:
# Using a RandomForestRegressor import the necessary library
from sklearn.ensemble import RandomForestRegressor

# trainig the model
rf = RandomForestRegressor()
rf.fit(train_x, train_y)

# making a prepdiction
m2 = rf.predict(val_x)
m2

array([18473.5525    , 15031.765     , 13210.265     , 17272.33      ,
       13444.38      , 15994.86      ,  7789.96      , 16631.67      ,
        6574.34      , 14635.38      ,  9779.05666667, 18807.88      ,
       15692.04      , 34352.58      ,  9779.65      , 11658.74      ,
        6557.89      , 10663.68      ,  7827.34      ,  9647.18      ,
       14359.56      ])

In [6]:
# Using a DecisionTreeRegressor import the neccesary library
from sklearn.tree import DecisionTreeRegressor

# training the model
dt = DecisionTreeRegressor()
dt.fit(train_x, train_y)

# making a prediction
m3 = dt.predict(val_x)
m3

array([21105., 13499., 15250., 16558., 13495., 13860.,  7999., 15690.,
        6849., 11199., 10898., 19045., 16515., 34028.,  8845., 12290.,
        6229., 10898.,  7975.,  9279., 14869.])

In [7]:
# Using a Lasso model, import the necessary library
from sklearn.linear_model import Lasso

# training the model
ls = Lasso()
ls.fit(train_x, train_y)

# making a predictin
m4 = ls.predict(val_x)
m4

array([17340.38045554, 21452.95700573, 13034.82925257, 22382.10311851,
       13533.64402738, 18933.64315437,  6184.63690412, 20767.92893636,
        6052.90625291, 14622.08385281, 11525.37065353, 18599.43759142,
       17358.44847472, 24331.07444613, 10456.05688287,  9490.07434797,
        6697.10847215, 11939.45387076,  9118.41051787, 11495.59636891,
       18413.88304148])

In [8]:
# Using a Lasso model, import the necessary library
from sklearn.linear_model import Ridge

# training the model
rd = Ridge()
rd.fit(train_x, train_y)

# making a prediction
m5 = rd.predict(val_x)
m5

array([17332.41128442, 21439.3725746 , 13067.41606212, 22372.65199725,
       13540.28426747, 18962.73616752,  6204.08660795, 20782.74723311,
        6061.53143701, 14631.78869127, 11530.49591368, 18612.56886732,
       17378.78550963, 24288.76192781, 10465.46030486,  9505.79703719,
        6676.10724011, 11940.31638391,  9128.96968413, 11509.17983086,
       18393.75407978])

In [9]:
# Uisng the ElasticNet model, import the necessary library
from sklearn.linear_model import ElasticNet

# training the model
en = ElasticNet()
en.fit(train_x, train_y)

# making a prediction
m6 = en.predict(val_x)
m6

array([16823.8676252 , 20524.20531668, 14207.44048375, 21381.46416289,
       13881.86076471, 19651.0477913 ,  7071.46844552, 20587.60585469,
        6645.59972324, 15224.92198705, 11924.36604657, 19073.50560546,
       17761.59882281, 21719.0125515 , 10883.72208515, 10051.48663888,
        6496.55153853, 12056.93255818,  9718.2242554 , 12011.19164901,
       17487.53671527])

### MODEL EVALUATION

In [10]:
from sklearn.metrics import r2_score
# from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

In [11]:
# mean_absolute_error
lst = [m1, m2, m3, m4, m5, m6]
lst2 = ["Linear Regression", "Random Forest", "Decision Tree", "Lasso", "Ridge", "Elastic Net"]

for i in range(len(lst)):
    print(f"Mean absolute error of {lst2[i]} = {round(mse(val_y, lst[i]), 3)}")

# r2_score
lm_score = r2_score(val_y, m1)
rf_score = r2_score(val_y, m2)
dt_score = r2_score(val_y, m3)
ls_score = r2_score(val_y, m4)
rd_score = r2_score(val_y, m5)
en_score = r2_score(val_y, m6)

# Checking for the accuracy of our models to know which is best used on our testing data.
accuracy = {
    "Multiple Linear Regression": lm_score,
    "Random Forest Regressor": rf_score,
    "Decision Tree Regressor": dt_score,
    "Lasso": ls_score,
    "Ridge": rd_score,
    "ElasticNet": en_score
}
# accuracy
print(f"\nThe {max(accuracy, key=accuracy.get)} model has the highest accuracy with value {round(max(accuracy.values()), 3) * 100}%")

Mean absolute error of Linear Regression = 13800377.219
Mean absolute error of Random Forest = 2589602.064
Mean absolute error of Decision Tree = 3204153.571
Mean absolute error of Lasso = 13824678.119
Mean absolute error of Ridge = 13852885.963
Mean absolute error of Elastic Net = 15182740.243

The Random Forest Regressor model has the highest accuracy with value 92.7%


In [12]:
print(f"Based on the above it is safe to say the {max(accuracy, key=accuracy.get)} model performed the best "
      "so we can use it for our test data.")

Based on the above it is safe to say the Random Forest Regressor model performed the best so we can use it for our test data.


<h2>The purpose of this notebook was to try out multiple regression models and pick the one that performs the best</h2>