In [1]:
import pandas as pd
from pycaret.regression import setup

In [2]:
# read the data
path = r'data\used_cars_UK.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0.1,Unnamed: 0,title,Price,Mileage(miles),Registration_Year,Previous Owners,Fuel type,Body type,Engine,Gearbox,Doors,Seats,Emission Class,Service history
0,0,SKODA Fabia,6900,70189,2016,3.0,Diesel,Hatchback,1.4L,Manual,5.0,5.0,Euro 6,
1,1,Vauxhall Corsa,1495,88585,2008,4.0,Petrol,Hatchback,1.2L,Manual,3.0,5.0,Euro 4,Full
2,2,Hyundai i30,949,137000,2011,,Petrol,Hatchback,1.4L,Manual,5.0,5.0,Euro 5,
3,3,MINI Hatch,2395,96731,2010,5.0,Petrol,Hatchback,1.4L,Manual,3.0,4.0,Euro 4,Full
4,4,Vauxhall Corsa,1000,85000,2013,,Diesel,Hatchback,1.3L,Manual,5.0,5.0,Euro 5,


In [3]:
df.shape

(3685, 14)

In [4]:
df.isnull().sum()

Unnamed: 0              0
title                   0
Price                   0
Mileage(miles)          0
Registration_Year       0
Previous Owners      1409
Fuel type               0
Body type               0
Engine                 45
Gearbox                 0
Doors                  25
Seats                  35
Emission Class         87
Service history      3145
dtype: int64

# Data Preprocessing

In [6]:
df.drop(['Unnamed: 0', 'Previous Owners', 'Service history'], axis=1, inplace=True)

In [7]:
df.isnull().sum()

title                 0
Price                 0
Mileage(miles)        0
Registration_Year     0
Fuel type             0
Body type             0
Engine               45
Gearbox               0
Doors                25
Seats                35
Emission Class       87
dtype: int64

In [19]:
df['Registration_Year'] = 2025 - df['Registration_Year']

In [20]:
df['Engine'].unique()

array(['1.4L', '1.2L', '1.3L', '2.0L', '1.6L', '2.3L', '1.8L', '1.0L',
       '3.0L', '1.5L', '1.9L', '2.4L', '2.2L', '3.5L', '3.2L', '2.5L',
       '1.1L', '3.1L', '3.7L', '2.6L', '2.1L', '2.8L', '5.0L', '0.8L',
       '2.7L', '1.7L', '0.9L', '4.2L', nan, '4.4L', '4.3L', '3.3L',
       '5.5L', '4.8L', '6.3L'], dtype=object)

In [25]:
def engineCleaner(var):
    if isinstance(var, float):
        return var
    else:
        return float(var.replace("L", ""))

df['Engine'] = df['Engine'].apply(engineCleaner)

In [27]:
df

Unnamed: 0,title,Price,Mileage(miles),Registration_Year,Fuel type,Body type,Engine,Gearbox,Doors,Seats,Emission Class
0,SKODA Fabia,6900,70189,9,Diesel,Hatchback,1.4,Manual,5.0,5.0,Euro 6
1,Vauxhall Corsa,1495,88585,17,Petrol,Hatchback,1.2,Manual,3.0,5.0,Euro 4
2,Hyundai i30,949,137000,14,Petrol,Hatchback,1.4,Manual,5.0,5.0,Euro 5
3,MINI Hatch,2395,96731,15,Petrol,Hatchback,1.4,Manual,3.0,4.0,Euro 4
4,Vauxhall Corsa,1000,85000,12,Diesel,Hatchback,1.3,Manual,5.0,5.0,Euro 5
...,...,...,...,...,...,...,...,...,...,...,...
3680,Renault Megane,1395,76202,19,Petrol,Hatchback,1.6,Manual,5.0,5.0,Euro 4
3681,Audi A4,6990,119000,13,Petrol,Saloon,2.0,Manual,4.0,5.0,Euro 5
3682,BMW 3 Series,3995,139000,12,Diesel,Saloon,2.0,Manual,4.0,5.0,Euro 5
3683,Honda Accord,1390,179190,18,Diesel,Estate,2.2,Manual,5.0,5.0,Euro 4


In [28]:
df.drop('title', axis=1, inplace=True)

In [29]:
df.isnull().sum()

Price                 0
Mileage(miles)        0
Registration_Year     0
Fuel type             0
Body type             0
Engine               45
Gearbox               0
Doors                25
Seats                35
Emission Class       87
dtype: int64

In [30]:
df.dropna(inplace=True)

In [31]:
df.shape

(3591, 10)

In [33]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
Fuel type,3591,5,Petrol,2318
Body type,3591,9,Hatchback,2223
Gearbox,3591,2,Manual,2842
Emission Class,3591,6,Euro 5,1255


In [51]:
set_up = setup(
    data=df,
    target='Price',
    train_size=0.8,
    numeric_features=df.select_dtypes('number').columns.tolist()[1:],
    categorical_features=df.select_dtypes('object').columns.tolist(),
    remove_outliers=True,
    normalize=True,
    experiment_name='fastapi_practice',
    # log_experiment=True,
    n_jobs=-1
)

Unnamed: 0,Description,Value
0,Session id,7158
1,Target,Price
2,Target type,Regression
3,Original data shape,"(3591, 10)"
4,Transformed data shape,"(3447, 26)"
5,Transformed train set shape,"(2728, 26)"
6,Transformed test set shape,"(719, 26)"
7,Numeric features,5
8,Categorical features,4
9,Preprocess,True


In [52]:
from pycaret.regression import compare_models

In [53]:
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,935.6946,2196786.9105,1469.4531,0.8927,0.2924,0.2267,0.174
gbr,Gradient Boosting Regressor,1005.6322,2245559.9307,1485.2777,0.8905,0.3108,0.2548,0.137
rf,Random Forest Regressor,903.9831,2292493.3588,1501.7048,0.8883,0.2941,0.2191,0.268
et,Extra Trees Regressor,952.9534,2560383.2409,1592.7921,0.875,0.3094,0.2289,0.21
knn,K Neighbors Regressor,1115.448,3933071.0036,1961.4073,0.8095,0.3443,0.2537,0.109
dt,Decision Tree Regressor,1211.2716,4091548.1167,2015.5683,0.7977,0.3884,0.2799,0.077
huber,Huber Regressor,1365.649,4249013.7316,2040.6839,0.7907,0.5344,0.3501,0.091
ridge,Ridge Regression,1398.7714,4277573.0699,2043.9289,0.7881,0.5646,0.3751,0.082
llar,Lasso Least Angle Regression,1398.7983,4277961.037,2044.0568,0.7881,0.5641,0.3751,0.107
lasso,Lasso Regression,1398.8485,4278311.9674,2044.1618,0.788,0.5642,0.3751,0.494


In [54]:
best_model

In [55]:
from pycaret.regression import evaluate_model

In [56]:
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [57]:
from pycaret.regression import finalize_model, create_api, save_model

In [None]:
# save_model(best_model, 'best_model')

In [60]:
df.dtypes

Price                  int64
Mileage(miles)         int64
Registration_Year      int64
Fuel type             object
Body type             object
Engine               float64
Gearbox               object
Doors                float64
Seats                float64
Emission Class        object
dtype: object

In [6]:
import pandas as pd
import numpy as np
from pycaret.classification import load_model

# Load the saved PyCaret model
model = load_model("best_model")

df = pd.read_csv(r"data\Cleaned_used_cars_data.csv")
df = df.head()
df.drop('Price', axis=1, inplace=True)

preds = model.predict(df)
np.round(preds.tolist(), 3)

Transformation Pipeline and Model Successfully Loaded


array([6802.998, 1948.997, 2323.665, 2498.055, 3375.245])

In [None]:

df.head(1)

Unnamed: 0,Price,Mileage(miles),Registration_Year,Fuel type,Body type,Engine,Gearbox,Doors,Seats,Emission Class
0,6900,70189,9,Diesel,Hatchback,1.4,Manual,5.0,5.0,Euro 6


In [86]:
df.to_dict()

{'Price': {0: 6900,
  1: 1495,
  2: 949,
  3: 2395,
  4: 1000,
  5: 800,
  6: 798,
  7: 1995,
  9: 1299,
  10: 1495,
  11: 2950,
  12: 6900,
  13: 19800,
  14: 1100,
  15: 1295,
  16: 999,
  17: 1289,
  18: 1200,
  19: 1949,
  20: 1195,
  21: 14498,
  22: 1995,
  23: 990,
  24: 2950,
  25: 15900,
  26: 8975,
  27: 3399,
  28: 1990,
  29: 1200,
  30: 1490,
  31: 2250,
  32: 2800,
  33: 2490,
  34: 1695,
  35: 1795,
  36: 2500,
  37: 2495,
  38: 15900,
  39: 10698,
  40: 1795,
  41: 1250,
  42: 988,
  43: 1995,
  44: 1999,
  45: 1449,
  46: 1290,
  47: 1449,
  48: 3295,
  49: 1400,
  50: 1950,
  51: 10698,
  52: 19800,
  53: 2249,
  54: 1390,
  55: 1795,
  56: 1699,
  57: 999,
  58: 700,
  59: 3250,
  60: 3245,
  61: 1250,
  62: 1900,
  63: 3495,
  64: 10000,
  65: 19800,
  66: 1000,
  67: 1490,
  68: 1950,
  69: 1695,
  70: 1750,
  71: 2495,
  72: 1389,
  73: 4490,
  74: 990,
  75: 2849,
  76: 2799,
  77: 9149,
  78: 15900,
  79: 1990,
  80: 3495,
  81: 1600,
  82: 2650,
  83: 2990,
  8

In [84]:
df.isnull().sum()

Price                0
Mileage(miles)       0
Registration_Year    0
Fuel type            0
Body type            0
Engine               0
Gearbox              0
Doors                0
Seats                0
Emission Class       0
dtype: int64

In [85]:
df.to_csv("data/Cleaned_used_cars_data.csv", index=False)

In [66]:
from pycaret.regression import load_model
best_model = load_model("best_model")
best_model

Transformation Pipeline and Model Successfully Loaded


In [81]:
best_model.predict(
pd.DataFrame({
  "Mileage(miles)": 52000,
  "Registration_Year": 8,
  "Fuel type": "Diesel",
  "Body type": "Hatchback",
  "Engine": 1.6,
  "Gearbox": "Automatic",
  "Doors": 5,
  "Seats": 5,
  "Emission Class": "Euro 6"
}, index=[0])
).tolist()[0]

12440.975812814775

In [7]:
df.columns

Index(['Mileage(miles)', 'Registration_Year', 'Fuel type', 'Body type',
       'Engine', 'Gearbox', 'Doors', 'Seats', 'Emission Class'],
      dtype='object')

In [None]:
import pandas as pd
import random

# Define possible values for categorical columns
fuel_types = ["Petrol", "Diesel", "Hybrid", "Electric"]
body_types = ["Sedan", "Hatchback", "SUV", "Convertible", "Coupe"]
gearbox_types = ["Manual", "Automatic"]
emission_classes = ["Euro 4", "Euro 5", "Euro 6"]

# Generate random data
num_samples = 20  # Adjust this for more samples

data = {
    "Mileage(miles)": [random.randint(1000, 200000) for _ in range(num_samples)],
    "Registration_Year": [random.randint(2000, 2024) for _ in range(num_samples)],
    "Fuel type": [random.choice(fuel_types) for _ in range(num_samples)],
    "Body type": [random.choice(body_types) for _ in range(num_samples)],
    "Engine": [round(random.uniform(1.0, 5.0), 1) for _ in range(num_samples)],
    "Gearbox": [random.choice(gearbox_types) for _ in range(num_samples)],
    "Doors": [random.choice([2, 3, 4, 5]) for _ in range(num_samples)],
    "Seats": [random.choice([2, 4, 5, 7]) for _ in range(num_samples)],
    "Emission Class": [random.choice(emission_classes) for _ in range(num_samples)]
}

# Convert to DataFrame and save as CSV for testing
df = pd.DataFrame(data)
df.to_csv("../test_car_data.csv", index=False)

print(df)


    Mileage(miles)  Registration_Year Fuel type    Body type  Engine  \
0            18716               2006    Diesel  Convertible     4.0   
1           120859               2015  Electric        Coupe     4.3   
2           190520               2005  Electric    Hatchback     2.0   
3           108506               2011    Diesel          SUV     4.9   
4           117085               2021    Diesel  Convertible     4.0   
5             8302               2008  Electric        Coupe     2.7   
6           160000               2007    Hybrid  Convertible     4.3   
7            33743               2023    Diesel        Sedan     2.2   
8           193363               2015    Hybrid        Coupe     1.7   
9           172674               2004    Hybrid        Sedan     1.5   
10          158155               2016  Electric          SUV     2.2   
11           54126               2009    Petrol          SUV     1.0   
12          153984               2011    Hybrid  Convertible    

: 