# Malaysia Car List Price
Objective: Build a machine learning model to accurately predict the market price of used cars in Malaysia based on vehicle attributes and market conditions.
<br>
<br>
Target: Price

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Data Loading and Overview

In [36]:
# load raw dataset
df = pd.read_csv('Malaysia_Final_CarList_Compiled.csv')
df.head()

Unnamed: 0,Desc,Link,Make,Model,Year,Engine.Cap,Transm,Mileage,Color,Car.Type,Updated,Price
0,2015 Perodua Myvi 1.5 S.E WITH CRAZY DISKAUN U...,/new-cars/2521373/2015-perodua-myvi-1-5-s-e-wi...,Perodua,Myvi,2015,1495cc,Automatic,,Others,NewCar,2015-12-12,47468
1,2015 Perodua Myvi 1.5 ADVANCE WITH CRAZY DISKA...,/new-cars/2519379/2015-perodua-myvi-1-5-advanc...,Perodua,Myvi,2015,1495cc,Automatic,,Others,NewCar,2015-12-12,55668
2,2015 Perodua Alza 1.5 (A) SE WITH DISKAUN UP T...,/new-cars/2519365/2015-perodua-alza-1-5-a-se-w...,Perodua,Alza,2015,1495cc,Automatic,,Others,NewCar,2015-12-12,56132
3,2015 Perodua Myvi 1.5 S.E WITH CRAZY DISKAUN U...,/new-cars/2121602/2015-perodua-myvi-1-5-s-e-wi...,Perodua,Myvi,2015,1495cc,Automatic,,Others,NewCar,2015-12-12,47468
4,2015 Perodua Alza 1.5 Advance Wagon,/new-cars/2611991/2015-perodua-alza-1-5-advanc...,Perodua,Alza,2015,1495cc,Automatic,,Red,NewCar,2015-12-12,65000


In [37]:
# remove uneccesary columns
df = df.drop(columns=['Desc', 'Link'])
df

Unnamed: 0,Make,Model,Year,Engine.Cap,Transm,Mileage,Color,Car.Type,Updated,Price
0,Perodua,Myvi,2015,1495cc,Automatic,,Others,NewCar,2015-12-12,47468
1,Perodua,Myvi,2015,1495cc,Automatic,,Others,NewCar,2015-12-12,55668
2,Perodua,Alza,2015,1495cc,Automatic,,Others,NewCar,2015-12-12,56132
3,Perodua,Myvi,2015,1495cc,Automatic,,Others,NewCar,2015-12-12,47468
4,Perodua,Alza,2015,1495cc,Automatic,,Red,NewCar,2015-12-12,65000
...,...,...,...,...,...,...,...,...,...,...
5995,Perodua,Viva,2007,989cc,Automatic,22500.0,Silver,UsedCar,2015-04-06,18000
5996,Perodua,Rusa,1997,1590cc,Manual,102500.0,Maroon,UsedCar,2015-05-12,8000
5997,Perodua,Myvi,2014,1495cc,Automatic,22500.0,White,UsedCar,2015-06-16,45500
5998,Perodua,Myvi,2014,1298cc,Automatic,17500.0,Silver,UsedCar,2015-08-06,42000


In [38]:
# check structure and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Make        6000 non-null   object 
 1   Model       6000 non-null   object 
 2   Year        6000 non-null   int64  
 3   Engine.Cap  6000 non-null   object 
 4   Transm      6000 non-null   object 
 5   Mileage     3000 non-null   float64
 6   Color       6000 non-null   object 
 7   Car.Type    6000 non-null   object 
 8   Updated     6000 non-null   object 
 9   Price       6000 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 468.9+ KB


In [39]:
df.describe(include='all')

Unnamed: 0,Make,Model,Year,Engine.Cap,Transm,Mileage,Color,Car.Type,Updated,Price
count,6000,6000,6000.0,6000,6000,3000.0,6000,6000,6000,6000.0
unique,1,10,,12,2,,18,2,399,
top,Perodua,Myvi,,-,Automatic,,Others,NewCar,2014-06-15,
freq,6000,3000,,1482,4042,,2843,3000,947,
mean,,,2011.191833,,,79661.648,,,,33959.385333
std,,,3.934753,,,44520.370966,,,,14870.941685
min,,,1994.0,,,22.0,,,,3000.0
25%,,,2009.0,,,52500.0,,,,22000.0
50%,,,2014.0,,,72500.0,,,,34346.0
75%,,,2014.0,,,97500.0,,,,45000.0


In [40]:
# check numerical columns
numeric_cols = df.select_dtypes(include=np.number)
numeric_cols.columns

Index(['Year', 'Mileage', 'Price'], dtype='object')

In [41]:
# check categorical columns
categorical_cols = df.select_dtypes(exclude=np.number)
categorical_cols.columns

Index(['Make', 'Model', 'Engine.Cap', 'Transm', 'Color', 'Car.Type',
       'Updated'],
      dtype='object')

# Data Cleaning

In [42]:
# check unique values for each categorical columns
for col in categorical_cols:
    print(f"\n{col}:\n{df[col].unique()}\n")


Make:
['Perodua']


Model:
['Myvi' 'Alza' 'Axia' 'Viva' 'Kancil' 'Kelisa' 'Kenari' 'Rusa' 'Kembara'
 'Nautica']


Engine.Cap:
['1495cc' '1298cc' '998cc' '1500cc' '1000cc' '1300cc' '-' '847cc' '989cc'
 '659cc' '1296cc' '1590cc']


Transm:
['Automatic' 'Manual']


Color:
['Others' 'Red' 'White' 'Silver' 'Green' 'Gold' 'Blue' 'Purple' 'Orange'
 'Yellow' 'Maroon' 'Black' 'Pink' 'Grey' 'Bronze' 'Beige' 'Magenta'
 'Brown']


Car.Type:
['NewCar' 'UsedCar']


Updated:
['2015-12-12' '2015-12-11' '2015-12-10' '2015-12-09' '2015-12-08'
 '2015-12-07' '2015-12-06' '2015-12-05' '2015-12-04' '2015-12-03'
 '2015-12-01' '2015-11-28' '2015-11-27' '2015-11-25' '2015-11-24'
 '2015-11-21' '2015-11-20' '2015-11-18' '2015-11-17' '2015-11-15'
 '2015-11-14' '2015-11-13' '2015-11-12' '2015-11-11' '2015-11-09'
 '2015-11-08' '2015-11-07' '2015-11-06' '2015-11-05' '2015-11-04'
 '2015-11-03' '2015-11-02' '2015-11-26' '2015-11-01' '2015-10-31'
 '2015-10-29' '2015-10-28' '2015-10-27' '2015-10-24' '2015-10-23'
 '2015

**All categorical features have unique values**

In [43]:
# List of categorical features
cat_features = ['Model', 'Transm', 'Color']  # adjust as needed

# Limit brand to top N to avoid clutter
top_brands = df['Model'].value_counts().head(10).index
df_top = df[df['Model'].isin(top_brands)]

fig, axes = plt.subplots(1, len(cat_features), figsize=(6 * len(cat_features), 6))

if len(cat_features) == 1:
    axes = [axes]

for i, col in enumerate(cat_features):
    if col == 'Model':
        data = df_top
    else:
        data = df
    
    sns.boxplot(data=data, x=col, y='Price', ax=axes[i])
    axes[i].set_title(f'Price by {col.title()}')
    axes[i].tick_params(axis='x', rotation=45)
    axes[i].set_ylabel('Price')

plt.tight_layout()
plt.show()

<Figure size 1800x600 with 3 Axes>

In [44]:
# average price by brand (top 10)
avg_price_by_brand = df_top.groupby('Model')['Price'].mean().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=avg_price_by_brand.index, y=avg_price_by_brand.values)
plt.title('Average Price by Brand (Top 10)')
plt.ylabel('Average Price (RM)')
plt.xlabel('Model')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

<Figure size 1000x600 with 1 Axes>

In [45]:
# update using datetime standard format
df['Updated'] = pd.to_datetime(df['Updated'])

In [46]:
# Use Label Encoder for categorical columns
from sklearn.preprocessing import LabelEncoder

# Initialize and fit LabelEncoder
encoders = {}
for col in ['Model', 'Engine.Cap', 'Transm', 'Color', 'Car.Type']:
  le = LabelEncoder()
  df[col] = le.fit_transform(df[col])
  encoders[col] = le

df.head(20)

Unnamed: 0,Make,Model,Year,Engine.Cap,Transm,Mileage,Color,Car.Type,Updated,Price
0,Perodua,6,2015,5,0,,11,0,2015-12-12,47468
1,Perodua,6,2015,5,0,,11,0,2015-12-12,55668
2,Perodua,0,2015,5,0,,11,0,2015-12-12,56132
3,Perodua,6,2015,5,0,,11,0,2015-12-12,47468
4,Perodua,0,2015,5,0,,14,0,2015-12-12,65000
5,Perodua,6,2015,3,0,,16,0,2015-12-12,43862
6,Perodua,1,2015,11,0,,11,0,2015-12-12,39637
7,Perodua,6,2015,3,1,,11,0,2015-12-12,40862
8,Perodua,1,2015,11,1,,11,0,2015-12-12,36637
9,Perodua,1,2015,11,1,,11,0,2015-12-12,24437


In [47]:
# see the le mapping for each columns
for col, le in encoders.items():
  mapping = dict(zip(le.classes_, le.transform(le.classes_)))
  print('______')
  print(f"Mapping for {col}:")
  for i, label in enumerate(le.classes_):
    print(f"{i} → {label}")

______
Mapping for Model:
0 → Alza
1 → Axia
2 → Kancil
3 → Kelisa
4 → Kembara
5 → Kenari
6 → Myvi
7 → Nautica
8 → Rusa
9 → Viva
______
Mapping for Engine.Cap:
0 → -
1 → 1000cc
2 → 1296cc
3 → 1298cc
4 → 1300cc
5 → 1495cc
6 → 1500cc
7 → 1590cc
8 → 659cc
9 → 847cc
10 → 989cc
11 → 998cc
______
Mapping for Transm:
0 → Automatic
1 → Manual
______
Mapping for Color:
0 → Beige
1 → Black
2 → Blue
3 → Bronze
4 → Brown
5 → Gold
6 → Green
7 → Grey
8 → Magenta
9 → Maroon
10 → Orange
11 → Others
12 → Pink
13 → Purple
14 → Red
15 → Silver
16 → White
17 → Yellow
______
Mapping for Car.Type:
0 → NewCar
1 → UsedCar


In [48]:
# handle missing values
df.isnull().sum()

Make             0
Model            0
Year             0
Engine.Cap       0
Transm           0
Mileage       3000
Color            0
Car.Type         0
Updated          0
Price            0
dtype: int64

**Mileage have 300 missing values**

In [49]:
# fill missing values in Mileage with mean
df['Mileage'] = df['Mileage'].fillna(df['Mileage'].median())
df.isnull().sum()

Make          0
Model         0
Year          0
Engine.Cap    0
Transm        0
Mileage       0
Color         0
Car.Type      0
Updated       0
Price         0
dtype: int64

In [50]:
# Identify outliers using IQR
print("\nIdentifying outliers using IQR:")
outlier_indices = {}
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)].index
    outlier_indices[col] = outliers
    print(f"  {col:>20}: {len(outliers)} outliers identified")

# Combine indices from all columns
all_outlier_indices = [idx for indices in outlier_indices.values() for idx in indices]
all_outlier_indices = list(set(all_outlier_indices)) # Remove duplicates

print(f"\nTotal unique outliers identified across numeric columns: {len(all_outlier_indices)}")


Identifying outliers using IQR:
                  Year: 153 outliers identified
               Mileage: 2861 outliers identified
                 Price: 1 outliers identified

Total unique outliers identified across numeric columns: 2861


**The outlier is >10% of existing data (6000 rows)**
- Mileage have the highest outlier values

In [51]:
# replace the Mileage outlier with lower bound for values below lower bound
# replace with upper bound for values above upper bound 
df['Mileage'] = np.clip(df['Mileage'], lower_bound, upper_bound)

In [52]:
# check duplicated rows
df.duplicated().sum()

1835

In [53]:
# remove duplicated rows
df = df.drop_duplicates()
df.duplicated().sum()

0

In [54]:
# visualize relationships between features and price
# check numerical columns
numeric_cols = df.select_dtypes(include=np.number)
numeric_cols.columns

plt.figure(figsize=(18,8))
sns.heatmap(numeric_cols.corr(), annot=True, cmap='coolwarm')
plt.show()

<Figure size 1800x800 with 2 Axes>

**The highest correlation is Price with Year, 0.81**

In [55]:
# analyze target variable (price) distribution and skewness
import seaborn as sns
from scipy.stats import skew

# Check skewness
price_skew = skew(df['Price'])
print(f"Skewness of price: {price_skew:.2f}")

# Plot distribution
plt.figure(figsize=(12, 5))

# Histogram + KDE
plt.subplot(1, 2, 1)
sns.histplot(df['Price'], kde=True, bins=50)
plt.title(f'Price Distribution (Skew = {price_skew:.2f})')
plt.xlabel('Price')

Skewness of price: 1.03


Text(0.5, 0, 'Price')

In [56]:
# scatter plot of features to Price
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# List of numerical features to plot
numerical_features = ['Year', 'Mileage', 'Engine.Cap', 'Color']

# Remove 'price' if accidentally included
numerical_features = [col for col in numerical_features if col in df.columns and col != 'price']

# Plot scatter plots
fig, axes = plt.subplots(1, len(numerical_features), figsize=(5 * len(numerical_features), 5))

if len(numerical_features) == 1:
    axes = [axes]

for i, col in enumerate(numerical_features):
    sns.scatterplot(x=df[col], y=df['Price'], ax=axes[i], alpha=0.6)
    axes[i].set_title(f'Price vs. {col}')
    axes[i].set_xlabel(col)
    axes[i].set_ylabel('Price')

plt.tight_layout()
plt.show()

<Figure size 1200x500 with 1 Axes>

<Figure size 2000x500 with 4 Axes>

# Model Training

**R2: higher is better<br>MSE/MAE/RMSE: lower is better**


## 1. Linear Regression

In [57]:
from sklearn.model_selection import train_test_split

# data splitting
feature = ['Year']
target = 'Price'

X = df[feature]
y = df[target]

# 80% train set, 20% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=100)

In [58]:
X_train.shape

(3332, 1)

In [59]:
X_test.shape

(833, 1)

In [60]:
# linear reg model training
from sklearn.linear_model import LinearRegression

model_linear = LinearRegression()

#fitting = learning from the training set

model_linear.fit(X_train, y_train)

#predicting

y_pred = model_linear.predict(X_test)

In [61]:
# evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

print('R2 Score: ', r2_score(y_test, y_pred))
print('MSE: ', mean_squared_error(y_test, y_pred))
print('MAE: ', mean_absolute_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

R2 Score:  0.5688015626259091
MSE:  118546656.92104197
MAE:  6583.643196504114
RMSE:  10887.91334099615


## 2. Multiple Linear Regression

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

feature = ['Model', 'Year', 'Color', 'Engine.Cap'] # select all the highest correlative
target = 'Price'

X = df[feature]
y = df[target]

# Step 1: Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_multilinear = LinearRegression()

# training
model_multilinear.fit(X_train, y_train)

# predict
y_pred = model_multilinear.predict(X_test) # which later can be used to compare y_pred with y_test

# evaluation
print("R2 Score:", r2_score(y_test, y_pred)*100)
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

R2 Score: 77.03998725024894
MSE: 53138065.10242495
MAE: 5395.278946436077
RMSE:  7289.586072091127


## 3. Random Forest

In [63]:
from sklearn.ensemble import RandomForestRegressor

model_RF = RandomForestRegressor()

# training
model_RF.fit(X_train, y_train)

# predict
y_pred = model_RF.predict(X_test) # which later can be used to compare y_pred with y_test

# evaluation
print("R2 Score:", r2_score(y_test, y_pred)*100)
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

R2 Score: 91.78305481405756
MSE: 19017087.35934395
MAE: 3020.251292972944
RMSE:  4360.858557594359


## 4. Decision Tree

In [64]:
from sklearn.tree import DecisionTreeRegressor

model_DT = DecisionTreeRegressor()

# training
model_DT.fit(X_train, y_train)

# predict
y_pred = model_DT.predict(X_test) # which later can be used to compare y_pred with y_test

# evaluation
print("R2 Score:", r2_score(y_test, y_pred)*100)
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

R2 Score: 91.88206690888077
MSE: 18787936.304508373
MAE: 3051.7593495306623
RMSE:  4334.505312548177


In [65]:
models = {
    'Simple Linear Regression': model_linear,
    'Multiple Linear Regression': model_multilinear,
    'Random Forest Regressor': model_RF,
    'Decision Tree Regressor': model_DT,
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results[name] = {'R2 Score': r2, 'MSE': mse, 'MAE': mae, 'RMSE': rmse}
    print(f'{name}:')
    print(f'R2 Score: {r2*100}')
    print(f'MSE: {mse}')
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')
    print('\n')

# Find the best model based on R2 Score
best_model_name = max(results, key=lambda k: results[k]['R2 Score'])
print(f'The best model based on R2 Score is: {best_model_name} with R2 Score of {results[best_model_name]["R2 Score"]*100:.2f}%')

Simple Linear Regression:
R2 Score: 77.03998725024894
MSE: 53138065.10242495
MAE: 5395.278946436077
RMSE: 7289.586072091127


Multiple Linear Regression:
R2 Score: 77.03998725024894
MSE: 53138065.10242495
MAE: 5395.278946436077
RMSE: 7289.586072091127


Random Forest Regressor:
R2 Score: 91.92469264753795
MSE: 18689284.38734739
MAE: 3015.486432717007
RMSE: 4323.1104990906015


Decision Tree Regressor:
R2 Score: 91.87586943077021
MSE: 18802279.58902218
MAE: 3055.973035004852
RMSE: 4336.159543769369


The best model based on R2 Score is: Random Forest Regressor with R2 Score of 91.92%
Random Forest Regressor:
R2 Score: 91.92469264753795
MSE: 18689284.38734739
MAE: 3015.486432717007
RMSE: 4323.1104990906015


Decision Tree Regressor:
R2 Score: 91.87586943077021
MSE: 18802279.58902218
MAE: 3055.973035004852
RMSE: 4336.159543769369


The best model based on R2 Score is: Random Forest Regressor with R2 Score of 91.92%


# Deployment

This notebook trains multiple regression models and selects the best model by R2 score.
To make the chosen model available for a lightweight interactive demo, we save the best model to disk (Joblib) and load it from a small Gradio app (`app.py`) in the project root.

Notes:
- The Gradio demo expects the model to be saved as `best_model.joblib`.
- The demo currently uses the features: `Model`, `Year`, `Color`, `Engine.Cap` (these are the features used in the notebook's multiple regression / comparison section). If you change feature columns, update `app.py` accordingly.

In [66]:
# Save the best model to disk so external apps (e.g., Gradio) can load it.
import joblib

# Retrieve the best model object from the `models` dictionary
try:
    best_model = models[best_model_name]
    joblib.dump(best_model, 'best_model.joblib')
    print(f'Saved best model ({best_model_name}) to best_model.joblib')
except Exception as e:
    print('Could not save best model automatically. Make sure `models` and `best_model_name` exist and are fitted.')
    print('Error:', e)

# Save label encoders (if present) so external apps can map human-readable categories
try:
    joblib.dump(encoders, 'encoders.joblib')
    print('Saved encoders to encoders.joblib')
except Exception as e:
    print('No `encoders` dict found or could not save encoders:', e)

Saved best model (Random Forest Regressor) to best_model.joblib
Saved encoders to encoders.joblib


In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import tempfile
import gradio as gr

# Feature columns used by the notebook's final comparison (adjust if you changed features)
FEATURE_COLS = ['Model', 'Year', 'Color', 'Engine.Cap']


def load_model(path='best_model.joblib'):
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Model file '{path}' not found. Run the notebook cell that saves the best model to 'best_model.joblib' and ensure it's in the project root."
        )
    return joblib.load(path)


try:
    model = load_model()
except Exception as exc:
    model = None
    load_error = str(exc)
else:
    load_error = None

# Try to load encoders (saved from the notebook) if available
encoders = None
enc_load_error = None
if os.path.exists('encoders.joblib'):
    try:
        encoders = joblib.load('encoders.joblib')
    except Exception as e:
        enc_load_error = str(e)


def safe_encode(col_name, val):
    """
    If encoders exist and contain col_name, try to transform val using the encoder.
    If encoder missing, attempt to cast to float and return numeric value.
    On unknown label, raise ValueError with informative message.
    """
    if encoders is not None and col_name in encoders:
        le = encoders[col_name]
        try:
            return le.transform([val])[0]
        except Exception:
            # unknown label
            raise ValueError(f"Unknown {col_name} '{val}'. Allowed values: {list(le.classes_)}")
    else:
        # no encoder: try numeric fallback
        try:
            return float(val)
        except Exception:
            raise ValueError(f"No encoder for '{col_name}' and provided value is not numeric. Provide numeric value or save encoders.joblib.")


def predict_price(model_name: str, year: float, color: str, engine_cap: float):
    """
    Simple wrapper to predict price for a single example.
    Returns a string formatted with two decimal places (e.g. '43281.21').
    """
    if model is None:
        return f"Model not loaded: {load_error}"

    try:
        model_encoded = safe_encode('Model', model_name)
        color_encoded = safe_encode('Color', color)
        engine_encoded = safe_encode('Engine.Cap', engine_cap)
    except ValueError as ve:
        return str(ve)

    # Year should be numeric; try to coerce
    try:
        year_val = float(year)
    except Exception:
        return "Year must be numeric."

    X = pd.DataFrame([[model_encoded, year_val, color_encoded, engine_encoded]], columns=FEATURE_COLS)
    pred = model.predict(X)[0]
    # Return formatted string with two decimals so the UI shows exactly two decimals
    return f"{pred:.2f}"


def batch_predict(csv_file):
    """Accepts a CSV file with the same feature columns and returns predictions appended.

    Output columns (display): Model, Year, Color, Engine Capacity, Price (Predicted)
    """
    if model is None:
        raise ValueError(f"Model not loaded: {load_error}")

    # Read uploaded CSV (gr.File gives .name)
    df = pd.read_csv(csv_file.name)

    # Accept either 'Engine.Cap' or 'Engine Capacity' as input; normalize to 'Engine.Cap'
    if 'Engine Capacity' in df.columns and 'Engine.Cap' not in df.columns:
        df['Engine.Cap'] = df['Engine Capacity']

    required = ['Model', 'Year', 'Color', 'Engine.Cap']
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Uploaded CSV is missing required columns: {missing}")

    # Preserve original human-readable columns for display
    df['_orig_Model'] = df['Model'].astype(str)
    df['_orig_Color'] = df['Color'].astype(str)
    df['_orig_Engine_Cap'] = df['Engine.Cap']

    # Prepare encoded columns for prediction
    def prepare_encoding(col):
        enc_col = f"{col}_enc"
        if encoders is not None and col in encoders:
            # If column contains object (strings), check for unknowns and transform
            if df[col].dtype == object:
                unknowns = set(df[col][~df[col].isin(encoders[col].classes_)])
                if len(unknowns) > 0:
                    raise ValueError(f"Uploaded CSV contains unknown {col} labels: {list(unknowns)}. Allowed: {list(encoders[col].classes_)}")
                df[enc_col] = encoders[col].transform(df[col])
            else:
                try:
                    # try transforming stringified values
                    df[enc_col] = encoders[col].transform(df[col].astype(str))
                except Exception:
                    df[enc_col] = pd.to_numeric(df[col], errors='coerce')
        else:
            df[enc_col] = pd.to_numeric(df[col], errors='coerce')

    prepare_encoding('Model')
    prepare_encoding('Color')
    prepare_encoding('Engine.Cap')

    # Ensure Year is numeric
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

    # Build the prediction DataFrame using the SAME feature names used during training.
    # This is the critical fix: rename encoded columns back to the original feature names.
    X_for_pred = pd.DataFrame({
        'Model': df['Model_enc'],
        'Year': df['Year'],
        'Color': df['Color_enc'],
        'Engine.Cap': df['Engine.Cap_enc']
    })

    if X_for_pred.isnull().any(axis=1).any():
        bad = X_for_pred.isnull().any(axis=1)
        raise ValueError("Uploaded CSV contains non-numeric or missing values in required columns after processing. Problem rows (sample):\n"
                         + df.loc[bad, ['Model', 'Year', 'Color', 'Engine.Cap']].head(5).to_csv(index=False))

    # Now call model.predict with the DataFrame that has the same column names as at fit time
    preds = model.predict(X_for_pred)
    # Format predicted prices as two-decimal strings to guarantee display (e.g. '43281.21')
    df['Predicted Price (RM)'] = [f"{v:.2f}" for v in preds]

    # Build the display DataFrame with requested column names/order
    # Keep human-readable Model and Color values
    out = pd.DataFrame({
        'Model': df['_orig_Model'],
        'Year': df['Year'].astype(int) if not df['Year'].isnull().any() else df['Year'],
        'Color': df['_orig_Color'],
        'Engine Capacity': df['_orig_Engine_Cap'],
        'Predicted Price (RM)': df['Predicted Price (RM)']
    })
    return out

def batch_predict_file(csv_file):
    """
    Run batch prediction and write results to a temporary CSV file.
    Returns the path to the generated CSV, suitable for Gradio File output.
    """
    if csv_file is None:
        raise ValueError("No CSV file provided for download.")
    # Reuse batch_predict to produce the DataFrame (this will raise informative errors if input invalid)
    out_df = batch_predict(csv_file)
    # Write to a temp file and return its path (Gradio accepts a filepath for File output)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    out_df.to_csv(tmp.name, index=False)
    tmp.close()
    return tmp.name


def batch_predict_and_file(csv_file):
    """
    Run batch prediction and return both the display DataFrame and a temp CSV path
    so a single Gradio action can update the Dataframe and provide a downloadable file.
    """
    if csv_file is None:
        raise ValueError("No CSV file provided for download.")

    # Produce DataFrame using existing batch_predict (this raises clear errors on invalid input)
    out_df = batch_predict(csv_file)

    # Write results to a temp CSV and return both outputs
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    out_df.to_csv(tmp.name, index=False)
    tmp.close()
    return out_df, tmp.name

def clear_batch_outputs():
    """
    Clear the batch outputs displayed in the UI.
    Returns an empty DataFrame for the table and None for the file output so Gradio clears both.
    """
    return pd.DataFrame(), None


# Prepare dropdown options based on encoders if available (fallbacks handled in UI creation)
model_choices = list(encoders['Model'].classes_) if encoders is not None and 'Model' in encoders else []
color_choices = list(encoders['Color'].classes_) if encoders is not None and 'Color' in encoders else []
engine_choices = list(encoders['Engine.Cap'].classes_) if encoders is not None and 'Engine.Cap' in encoders else [] 
engine_choices = list(encoders['Engine.Cap'].classes_) if encoders is not None and 'Engine.Cap' in encoders else []

title = "Perodua Car Price Predictor"
description = "Single prediction or upload a CSV for batch predictions."

# Build Gradio app using Blocks with Tabs
with gr.Blocks() as demo:
    gr.Markdown(f"## {title}\n\n{description}")

    with gr.Tab("Single"):
        # Model input: dropdown if encoder present, otherwise textbox
        if model_choices:
            model_input = gr.Dropdown(choices=model_choices, label="Model", value=model_choices[0])
        else:
            model_input = gr.Textbox(label="Model (no encoders found; enter encoded numeric or save encoders)", value="")

        year_input = gr.Number(label="Year", value=2016)

        # Color input
        if color_choices:
            color_input = gr.Dropdown(choices=color_choices, label="Color", value=color_choices[0])
        else:
            color_input = gr.Textbox(label="Color (no encoders found; enter encoded numeric or save encoders)", value="")

        # Engine Capacity: if encoder exists, use dropdown; else use Number input
        if engine_choices:
            engine_input = gr.Dropdown(choices=engine_choices, label="Engine Capacity", value=engine_choices[0])
        else:
            engine_input = gr.Number(label="Engine Capacity", value=1600)

        predict_btn = gr.Button("Predict")
        # Use a Textbox to display formatted price (exactly two decimals)
        single_out = gr.Textbox(label="Predicted Price (RM)")

        predict_btn.click(fn=predict_price, inputs=[model_input, year_input, color_input, engine_input], outputs=single_out)

    with gr.Tab("Batch"):
        gr.Markdown("Upload CSV with columns: Model, Year, Color, Engine.Cap (or 'Engine Capacity').")
        csv_input = gr.File(label="Upload CSV", file_types=['.csv'])

        # Predict -> show table and generate downloadable CSV
        batch_btn = gr.Button("Predict & Prepare Download")
        df_out = gr.Dataframe(headers=None)
        file_out = gr.File(label="Download Predictions (CSV)")

        # Single click will update both the table and provide a downloadable CSV
        batch_btn.click(fn=batch_predict_and_file, inputs=csv_input, outputs=[df_out, file_out])

        # Clear (✖) button — clears the table and removes the file link
        clear_btn = gr.Button("✖ Clear Output", variant="secondary")
        clear_btn.click(fn=clear_batch_outputs, inputs=[], outputs=[df_out, file_out])

# Launch app
def main():
    demo.launch(share=True)


if __name__ == '__main__':
    main()

* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://a428e3a4d4556d377d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
* Running on public URL: https://a428e3a4d4556d377d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\uvicorn\protocols\http\h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\uvicorn\middleware\proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastapi\applications.py", line 1134, in __call__
    await super().__call__(scope, receive, send)
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\starlette\applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\sit