# Model Training

## 1.1 Import Data and Required Packages

In [79]:
# Basic Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Model Training imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

In [80]:
# Load dataset

df=pd.read_csv("Cleaned_Car_Data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,name,company,year,price,kms_driven,fuel_type
0,0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,4,Ford Figo,Ford,2012,175000,41000,Diesel


In [81]:
df.drop("name",axis=1,inplace=True)

In [82]:
print("Categories in 'company' variable:     ",end=" " )
print(df['company'].unique())

print("Categories in 'fuel_type' variable:     ",end=" " )
print(df['fuel_type'].unique())

Categories in 'company' variable:      ['Hyundai' 'Mahindra' 'Ford' 'Maruti' 'Skoda' 'Audi' 'Toyota' 'Renault'
 'Honda' 'Datsun' 'Mitsubishi' 'Tata' 'Volkswagen' 'Chevrolet' 'Mini'
 'BMW' 'Nissan' 'Hindustan' 'Fiat' 'Force' 'Mercedes' 'Land' 'Jaguar'
 'Jeep' 'Volvo']
Categories in 'fuel_type' variable:      ['Petrol' 'Diesel' 'LPG']


### Preparing X and y variables

In [83]:
X=df.drop("price",axis=1)
X.head()

Unnamed: 0.1,Unnamed: 0,company,year,kms_driven,fuel_type
0,0,Hyundai,2007,45000,Petrol
1,1,Mahindra,2006,40,Diesel
2,2,Hyundai,2014,28000,Petrol
3,3,Ford,2014,36000,Diesel
4,4,Ford,2012,41000,Diesel


In [84]:
y=df["price"]
y.head()

0     80000
1    425000
2    325000
3    575000
4    175000
Name: price, dtype: int64

In [85]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [86]:
X=preprocessor.fit_transform(X)

In [87]:
X.shape

(816, 31)

In [88]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=5)
X_train.shape, X_test.shape

((652, 31), (164, 31))

In [89]:
def evaluate_model(true,predicted):
    mse=mean_squared_error(true,predicted)
    mae=mean_absolute_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2score=r2_score(true,predicted)
    return mae, mse, rmse, r2score

In [90]:
models={
    "LinearRegression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "SVR":SVR(),
    "XGBRegressor":XGBRegressor()
}

model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train) # Training model

    # Make Predictions
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    # Evaluate train and test datasets
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2score=evaluate_model(y_train,y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2score=evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2score))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2score))
    r2_list.append(model_test_r2score)
    
    print('='*35)
    print('\n')

LinearRegression
Model performance for Training set
- Root Mean Squared Error: 373527.2375
- Mean Absolute Error: 151447.4542
- R2 Score: 0.4327
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 205731.1455
- Mean Absolute Error: 137688.3435
- R2 Score: 0.7056


Ridge
Model performance for Training set
- Root Mean Squared Error: 376906.8404
- Mean Absolute Error: 157807.4403
- R2 Score: 0.4224
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 222298.1272
- Mean Absolute Error: 146827.0593
- R2 Score: 0.6562


Lasso
Model performance for Training set
- Root Mean Squared Error: 373527.2439
- Mean Absolute Error: 151451.1124
- R2 Score: 0.4327
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 205733.3344
- Mean Absolute Error: 137689.3868
- R2 Score: 0.7055


DecisionTreeRegressor
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolu

## Results

In [91]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
0,LinearRegression,0.70555
2,Lasso,0.705544
5,RandomForestRegressor,0.691667
1,Ridge,0.656218
8,XGBRegressor,0.629053
4,KNeighborsRegressor,0.491647
3,DecisionTreeRegressor,0.421501
7,SVR,-0.057699
6,AdaBoostRegressor,-0.184633
