# Model Experiments

In this notebook, we train several machine learning algorithms on our dataset and compare them. This is a regression problem: the target is the `Price(k)` column.

## Importing the required packages and the data

In [1]:
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Suppress every warning for the rest of the session to keep cell output compact.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
# Load the preprocessed dataset and preview its first five rows
data = pd.read_csv(filepath_or_buffer="../data/data_preprocessed.csv")
data.head(n=5)

Unnamed: 0,Year,Distance,Owner,Fuel,Drive,Type,State,Brand,Model Name,Price(k)
0,2022,3878,1,PETROL,Manual,HatchBack,HR,Maruti,S,514
1,2018,32041,1,PETROL,Manual,Sedan,TN,Hyundai,Xcent,674
2,2021,96339,1,DIESEL,Automatic,SUV,TS,Tata,Safari,1952
3,2019,51718,1,DIESEL,Manual,SUV,WB,Maruti,Vitara,690
4,2021,19811,1,PETROL,Manual,HatchBack,HR,Tata,Tiago,526


In [3]:
# Preview the first five rows of the loaded frame.
# NOTE(review): this repeats the preview already displayed by the loading cell.
data.head(n=5)

Unnamed: 0,Year,Distance,Owner,Fuel,Drive,Type,State,Brand,Model Name,Price(k)
0,2022,3878,1,PETROL,Manual,HatchBack,HR,Maruti,S,514
1,2018,32041,1,PETROL,Manual,Sedan,TN,Hyundai,Xcent,674
2,2021,96339,1,DIESEL,Automatic,SUV,TS,Tata,Safari,1952
3,2019,51718,1,DIESEL,Manual,SUV,WB,Maruti,Vitara,690
4,2021,19811,1,PETROL,Manual,HatchBack,HR,Tata,Tiago,526


In [4]:
# Report the dataset size and its column names
print(
    f"Total Number of samples in our dataset: {len(data)}",
    f"Features in the dataset: {list(data.columns)}",
    sep="\n",
)

Total Number of samples in our dataset: 7800
Features in the dataset: ['Year', 'Distance', 'Owner', 'Fuel', 'Drive', 'Type', 'State', 'Brand', 'Model Name', 'Price(k)']


## Data Preprocessing 

This section covers scaling and encoding the data so that it is ready for the models.

### Splitting the data into features and target

In [5]:
# Separate the regression target from the predictor columns
labels = data["Price(k)"]
features = data.drop(columns=["Price(k)"])
len(features), len(labels)

(7800, 7800)

### Splitting the data into training and validation sets

In [6]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
tuple(len(split) for split in (X_train, y_train, X_val, y_val))

(6240, 6240, 1560, 1560)

## Data Transformation

In [7]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric
cat_features = features.select_dtypes(include="object").columns
num_features = features.select_dtypes(exclude="object").columns

print(num_features, cat_features, sep="\n")

Index(['Year', 'Distance', 'Owner'], dtype='object')
Index(['Fuel', 'Drive', 'Type', 'State', 'Brand', 'Model Name'], dtype='object')


### Create a transformer object and apply it on the data

In [8]:
# Create transformers for numerical and categorical columns
num_transformer = StandardScaler()
# handle_unknown="ignore": a category not seen while fitting is encoded as all
# zeros instead of raising when the validation split is transformed
cat_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Create preprocessor object: scale the numeric columns, one-hot encode the rest
preprocessor = ColumnTransformer(
    [("num_transformer", num_transformer, num_features),
     ("cat_transformer", cat_transformer, cat_features)]
)

# X_train / X_val are rebound to the transformed arrays below. Keep references
# to the raw DataFrames so that re-running this cell refits on the raw splits
# instead of feeding already-transformed arrays to a transformer that selects
# columns by name.
if isinstance(X_train, pd.DataFrame):
    X_train_raw, X_val_raw = X_train, X_val

# Fit on the training split only, then apply the fitted transform to validation
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)

In [9]:
# Shapes of the transformed feature matrices and their targets
tuple(split.shape for split in (X_train, y_train, X_val, y_val))

((6240, 151), (6240,), (1560, 151), (1560,))

In [23]:
# Names of the one-hot columns produced by the fitted categorical encoder
(
    preprocessor
    .named_transformers_["cat_transformer"]
    .get_feature_names_out(cat_features)
)

array(['Fuel_CNG', 'Fuel_DIESEL', 'Fuel_LPG', 'Fuel_PETROL',
       'Drive_Automatic', 'Drive_Manual', 'Type_HatchBack',
       'Type_Lux_SUV', 'Type_Lux_sedan', 'Type_SUV', 'Type_Sedan',
       'State_AP', 'State_BR', 'State_CH', 'State_DL', 'State_GJ',
       'State_HR', 'State_KA', 'State_KL', 'State_MH', 'State_MP',
       'State_PB', 'State_RJ', 'State_TN', 'State_TS', 'State_UP',
       'State_WB', 'Brand_Datsun', 'Brand_Ford', 'Brand_Honda',
       'Brand_Hyundai', 'Brand_Jeep', 'Brand_KIA', 'Brand_MG',
       'Brand_Mahindra', 'Brand_Maruti', 'Brand_Nissan', 'Brand_Renault',
       'Brand_Skoda', 'Brand_Tata', 'Brand_Toyota', 'Brand_Volkswagen',
       'Model Name_A', 'Model Name_ALCAZAR', 'Model Name_ALTROZ',
       'Model Name_AURA', 'Model Name_Alto', 'Model Name_Amaze',
       'Model Name_Ameo', 'Model Name_BOLERO', 'Model Name_BR-V',
       'Model Name_BREZZA', 'Model Name_Baleno', 'Model Name_Bolero',
       'Model Name_Bolt', 'Model Name_Brio', 'Model Name_CARENS',
     

## Modelling Experiments

In [9]:
def evaluate_model(true, predicted):
    """Return the root mean squared error between ``true`` and ``predicted``."""
    mse = mean_squared_error(true, predicted)
    return np.sqrt(mse)

In [10]:
# Candidate regressors, keyed by the display name used in the results table.
# The randomized scikit-learn estimators are seeded (same seed as the
# train/validation split) so the comparison is repeatable across runs.
# CatBoost and XGBoost are left on their library defaults so this table stays
# comparable with the separate CatBoost refit later in the notebook.
models = {
    "Linear Regression": LinearRegression(),
    "Elastic Net": ElasticNet(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
    "Adaboost": AdaBoostRegressor(random_state=42),
    "Decision tree": DecisionTreeRegressor(random_state=42),
    "Catboost": CatBoostRegressor(verbose=0),
    "xgboost": XGBRegressor()
}

In [11]:
# Fit every candidate model, print its train/validation RMSE, and collect the
# validation RMSE per model name for the results table.
model_list = []
rmse_list = []

# Iterate the name/estimator pairs directly instead of rebuilding and indexing
# the key and value lists on every pass
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # Evaluate on the training and validation sets
    train_rmse = evaluate_model(y_train, y_train_pred)
    val_rmse = evaluate_model(y_val, y_val_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE(review): "{:4f}" sets a minimum field width of 4, not 4 decimal
    # places — confirm whether "{:.4f}" was intended.
    print("Model performance on the Training set")
    print("Root mean squared error: {:4f}".format(train_rmse))

    print("-" * 35)
    print("Model performance on the validation set")
    print("Root mean squared error : {:4f}".format(val_rmse))
    rmse_list.append(val_rmse)

    print("=" * 35)
    print("\n")

Linear Regression
Model performance on the Training set
Root mean squared error: 104.854761
-----------------------------------
Model performance on the validation set
Root mean squared error : 3121305636763.971680


Elastic Net
Model performance on the Training set
Root mean squared error: 192.467055
-----------------------------------
Model performance on the validation set
Root mean squared error : 187.456023


Ridge
Model performance on the Training set
Root mean squared error: 106.287865
-----------------------------------
Model performance on the validation set
Root mean squared error : 106.485684


Lasso
Model performance on the Training set
Root mean squared error: 139.513746
-----------------------------------
Model performance on the validation set
Root mean squared error : 131.312335


RandomForest
Model performance on the Training set
Root mean squared error: 40.882288
-----------------------------------
Model performance on the validation set
Root mean squared error : 99.9

In [12]:
# Render floats with five decimal places in DataFrame output
pd.options.display.float_format = lambda value: '%.5f' % value


In [13]:
# Tabulate the validation RMSE per model, best (lowest) first
results = pd.DataFrame(
    {"Model Name": model_list, "RMSE Score": rmse_list}
).sort_values("RMSE Score")
results

Unnamed: 0,Model Name,RMSE Score
8,Catboost,90.44929
9,xgboost,94.26382
4,RandomForest,99.91922
2,Ridge,106.48568
5,Gradient Boosting Regressor,117.71009
3,Lasso,131.31234
7,Decision tree,131.72328
1,Elastic Net,187.45602
6,Adaboost,205.59849
0,Linear Regression,3121305636763.972


CatBoost is the best-performing model: it has the lowest validation RMSE.
## Get the best performing model

In [14]:
# Refit a fresh CatBoost regressor on the training split and score it on the
# validation split.
# NOTE(review): this trains a new model rather than reusing models["Catboost"],
# which was already fitted in the comparison loop.
cat_boost = CatBoostRegressor(verbose=False)
# `catboost` holds whatever fit() returns; the prediction below goes through `cat_boost`
catboost = cat_boost.fit(X_train, y_train)
y_pred = cat_boost.predict(X_val)
score = evaluate_model(y_val, y_pred)
print(" Root mean square error of the model is %.2f" % score)

 Root mean square error of the model is 90.45


Compute the difference between the actual price and the best-performing model's prediction.

In [15]:
# Compare validation targets with the predictions; Difference = actual - predicted
pred_df = pd.DataFrame(
    {
        'Actual Value': y_val,
        'Predicted Value': y_pred,
        'Difference': y_val.sub(y_pred),
    }
)
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
2114,603,602.65411,0.34589
2080,1531,1395.54914,135.45086
1078,714,759.32030,-45.32030
5395,514,533.86182,-19.86182
101,281,287.37380,-6.37380
...,...,...,...
6304,577,584.00376,-7.00376
4984,555,542.88700,12.11300
2685,767,674.51168,92.48832
5934,667,624.96433,42.03567
