# Russian Plate Price Prediction using Ensemble Learning
by Ken Quach
> This code also uses the feature engineering and AutoGluon code from [Solomon Andryushenko](https://www.kaggle.com/lumium) and [liangkaixin](https://www.kaggle.com/liangkaixin)

**If my notebook helps you, please consider upvoting it :)**

# Import libraries and download AutoGluon

In [1]:
import os
import sys
import string

import pandas as pd




!pip install autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer

import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
import torch
import numpy as np
from tqdm import tqdm  # Import tqdm for progress bar

import warnings
warnings.filterwarnings('ignore')

Collecting autogluon
  Downloading autogluon-1.2-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core==1.2 (from autogluon.core[all]==1.2->autogluon)
  Downloading autogluon.core-1.2-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.2 (from autogluon)
  Downloading autogluon.features-1.2-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.2 (from autogluon.tabular[all]==1.2->autogluon)
  Downloading autogluon.tabular-1.2-py3-none-any.whl.metadata (14 kB)
Collecting autogluon.multimodal==1.2 (from autogluon)
  Downloading autogluon.multimodal-1.2-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.timeseries==1.2 (from autogluon.timeseries[all]==1.2->autogluon)
  Downloading autogluon.timeseries-1.2-py3-none-any.whl.metadata (12 kB)
Collecting scikit-learn<1.5.3,>=1.4.0 (from autogluon.core==1.2->autogluon.core[all]==1.2->autogluon)
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collec

# What does the plate look like?

The plate consists of a series and a number, followed by a region code made up of the last two or three digits. Image courtesy of [Skoda-storyboard](https://www.skoda-storyboard.com/en/models/deciphering-number-plates-russia/).
![image](https://cdn.skoda-storyboard.com/2019/04/Russian-license-plate-english.jpg#?s_aid=bd5wemcb-uz36-l03s-yigd-6qaeks5bqrk7_004_1)

# Define all functions for feature extraction/engineering

We will use the supplementary files, split each plate into its characters, and then extract the date/time features.

In [2]:
from supplemental_english import *  # REGION_CODES, GOVERNMENT_CODES

# Ensure the logs directory exists
os.makedirs("./logs", exist_ok=True)

# Define log file path
log_file_path = "./logs/training_log.txt"


def printt(message):
    """Print ``message`` to the console and append it to the training log.

    Args:
        message: Value to log. It is formatted with ``str()`` semantics, so
            anything ``print`` accepts (numbers, DataFrames, ...) can also be
            written to the log file.
    """
    print(message)
    # Explicit UTF-8 keeps the log independent of the platform's locale.
    with open(log_file_path, "a", encoding="utf-8") as log_file:
        log_file.write(f"{message}\n")

# SMAPE
def smape(y_true, y_pred):
    """Return the SMAPE (in percent) between two log-scale price vectors.

    The ``price`` target is transformed with ``np.log1p`` before training and
    predictions are mapped back with ``np.expm1``, so the same inverse is
    applied here before the metric is computed on the original price scale.

    Args:
        y_true: Array-like of ``log1p``-transformed true prices.
        y_pred: Array-like of ``log1p``-scale predictions, same length.

    Returns:
        Mean of ``2 * |pred - true| / (|true| + |pred|)`` times 100.
    """
    # Plain arrays make the comparison positional; pandas inputs would
    # otherwise be aligned on their index labels.
    true_price = np.expm1(np.asarray(y_true, dtype=float))
    pred_price = np.expm1(np.asarray(y_pred, dtype=float))
    # The small epsilon avoids 0/0 when both prices are exactly zero.
    denominator = np.abs(true_price) + np.abs(pred_price) + 1e-8
    return np.mean(2 * np.abs(pred_price - true_price) / denominator) * 100

def find_importance_values_for_plate(plate: str, gov_codes: dict) -> tuple:
    """Look up a plate's importance values in a government-code table.

    The plate is split by position: character 0 plus characters 4-5 form the
    letter series, characters 1-3 form the number, and everything from
    character 6 onwards is the region code.

    Args:
        plate: Plate string to classify.
        gov_codes: Mapping whose keys are ``(letters, number_range, region)``
            tuples, where ``number_range[0]`` and ``number_range[1]`` are the
            inclusive bounds, and whose values are indexable records.

    Returns:
        Items 2 and 3 of the first matching record, or ``(0, 0)`` when no
        entry matches (an ordinary plate with no government affiliation).

    Raises:
        ValueError: If characters 1-3 of ``plate`` are not an integer.
    """
    series = plate[0] + plate[4:6]
    number = int(plate[1:4])
    region_part = plate[6:]

    # Lazily scan the table; the bounds are only inspected once the series
    # and region already match.
    hits = (
        (record[2], record[3])
        for (entry_series, bounds, entry_region), record in gov_codes.items()
        if series == entry_series
        and region_part == entry_region
        and bounds[0] <= number <= bounds[1]
    )
    return next(hits, (0, 0))


def add_advantage_on_road_and_significance(data: pd.DataFrame) -> pd.DataFrame:
    """Add ``advantage_on_road`` and ``significance`` columns to ``data``.

    Each row's ``plate`` value is looked up in ``GOVERNMENT_CODES`` through
    ``find_importance_values_for_plate``. The two columns are written into
    ``data`` itself, and the same DataFrame is returned.
    """
    new_columns = ["advantage_on_road", "significance"]

    def _importance_for_row(record):
        values = find_importance_values_for_plate(record["plate"], GOVERNMENT_CODES)
        return pd.Series(dict(zip(new_columns, values)))

    data[new_columns] = data.apply(_importance_for_row, axis=1)
    return data

def encode_plate(plate: str) -> list[int]:
    """Translate every character of ``plate`` into its ``char2idx`` index.

    Characters that are missing from ``char2idx`` are encoded as 0.
    """
    # NOTE(review): 0 is also the index of the first character of ALL_CHARS,
    # so unknown characters and "A" get the same code -- confirm this is
    # intended.
    return [char2idx.get(symbol, 0) for symbol in plate]

# Constants shared by the plate-encoding helpers
RANDOM_STATE = 37
PLATE_POSSIBLE_LETTERS = "ABEKMHOPCTYX"  # the 12 letters a plate may contain
ALL_CHARS = PLATE_POSSIBLE_LETTERS + string.digits  # 12 letters + 10 digits = 22 symbols
# Position of each symbol inside ALL_CHARS, used as its integer identifier.
char2idx = dict(zip(ALL_CHARS, range(len(ALL_CHARS))))

# Pre-process data

In [3]:
# preprocess data
def get_region_code(plate):
    """Return the name of the region a plate's region code belongs to.

    Everything from character 6 onwards is parsed as an integer (which drops
    leading zeros) and converted back to a string. The first ``REGION_CODES``
    entry whose codes contain that string gives the region name; if no entry
    does, ``"Unknown"`` is returned.

    Raises:
        ValueError: If the region part of ``plate`` is not an integer.
    """
    normalized_code = str(int(plate[6:]))
    return next(
        (
            name
            for name, region_codes in REGION_CODES.items()
            if normalized_code in region_codes
        ),
        "Unknown",
    )

def is_zoro(num_str):
    """Detect repdigits: return 1 if every character is the same, else 0."""
    distinct_chars = set(num_str)
    return 1 if len(distinct_chars) == 1 else 0

# Hand-picked three-digit numbers that are treated as "VIP".
# NOTE(review): "500" is absent while every other round hundred is listed --
# confirm whether that is deliberate.
VIP_NUMBERS = set(
    "001 002 003 007 "
    "100 200 300 400 600 700 800 900 911".split()
)


def is_vip(num_str):
    """Return 1 if ``num_str`` is one of the predefined VIP numbers, else 0."""
    return 1 if num_str in VIP_NUMBERS else 0

# Three-digit runs of consecutive digits: ascending 123..789 and
# descending 321..987.
# NOTE(review): runs that involve 0 ("012", "210") are not included --
# confirm whether that is deliberate.
SEQUENTIAL_NUMBERS = {
    f"{d}{d + 1}{d + 2}" for d in range(1, 8)
} | {
    f"{d}{d - 1}{d - 2}" for d in range(3, 10)
}


def is_sequential(num_str):
    """Detect consecutive-digit runs: 1 if ``num_str`` is in SEQUENTIAL_NUMBERS, else 0."""
    return 1 if num_str in SEQUENTIAL_NUMBERS else 0

def process_data(csv_link, region_price_dict):
    """Load a plates CSV and turn it into the model's feature table.

    Args:
        csv_link: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            with ``id``, ``plate`` and ``date`` columns. A ``price`` column
            is optional, so a file without prices can be processed too.
        region_price_dict: Mapping from region name (as produced by
            ``get_region_code``) to a raw price statistic for that region.
            Regions missing from the mapping get NaN.

    Returns:
        DataFrame without the ``id``, ``date`` and ``plate`` columns, with
        date parts, government-plate features, plate components, pattern
        flags and ``region_avg_price`` added. ``price`` (when present) and
        ``region_avg_price`` are ``log1p``-transformed.
    """
    df = pd.read_csv(
        csv_link,
        dtype={"id": int, "plate": str},
        parse_dates=["date"],
    )

    # Unparseable dates become NaT instead of leaving the column as strings.
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

    # Date features
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["weekday"] = df["date"].dt.weekday
    df = df.drop(columns=["date", "id"])

    # Government-plate features; looked up on the plate exactly as read from
    # the CSV, i.e. before the region code is padded below.
    df = add_advantage_on_road_and_significance(df)

    # Standardize to 9 characters by inserting a "0" in front of the region
    # code of shorter plates.
    df["plate"] = df["plate"].apply(
        lambda plate: plate if len(plate) == 9 else f"{plate[:6]}0{plate[6:]}"
    )

    # Region name derived from the region code
    df["region_name"] = df["plate"].apply(get_region_code).astype(str)

    # Number: characters 1-3
    df["plate_number"] = df["plate"].str[1:4].astype(str)

    # Series: character 0 plus characters 4-5
    df["plate_series"] = (df["plate"].str[0] + df["plate"].str[4:6]).astype(str)

    # Region code: characters 6 onwards
    df["plate_region"] = df["plate"].str[6:].astype(str)

    # Repdigit / VIP / sequential flags on the three-digit number
    df["is_zoro"] = df["plate_number"].apply(is_zoro)
    df["is_vip"] = df["plate_number"].apply(is_vip)
    df["is_seq"] = df["plate_number"].apply(is_sequential)

    df = df.drop(columns=["plate"], errors="ignore")

    # Attach the per-region price statistic to every record
    df["region_avg_price"] = df["region_name"].map(region_price_dict)

    # Log-transform the prices; the target is only present in labelled data.
    if "price" in df.columns:
        df["price"] = np.log1p(df["price"])
    df["region_avg_price"] = np.log1p(df["region_avg_price"])

    return df

# Build the region -> price lookup from the raw training prices.
# Despite the "avg" in the names, the statistic is the per-region median.
train_link = "/kaggle/input/russian-car-plates-prices-prediction/train.csv"
df = pd.read_csv(train_link, dtype={"id": int, "plate": str}, parse_dates=["date"])

df["region_code"] = df["plate"].apply(get_region_code)
region_avg_price_dict = df.groupby("region_code")["price"].median().to_dict()
df["region_avg_price"] = df["region_code"].map(region_avg_price_dict)

# Build the training feature table
dataset_link = "/kaggle/input/russian-car-plates-prices-prediction/train.csv"
train_df = process_data(dataset_link, region_avg_price_dict)
train_df.head(2)

Unnamed: 0,price,year,month,day,weekday,advantage_on_road,significance,region_name,plate_number,plate_series,plate_region,is_zoro,is_vip,is_seq,region_avg_price
0,11.082158,2024,12,26,3,0,0,Moscow,59,XCP,797,0,0,0,11.918397
1,11.512935,2024,7,12,4,0,0,Moscow Oblast,800,YMH,790,0,1,0,11.982935


# Training phase

In [4]:
# Define smape as the evaluation metric for AutoGluon.
# SMAPE is an error: lower is better and a perfect prediction scores 0, so
# the optimum is stated explicitly instead of relying on make_scorer's default.
smape_scorer = make_scorer(
    name='smape1',
    score_func=smape,
    optimum=0,
    greater_is_better=False,
)

# Train on the processed table for at most one hour.
predictor = TabularPredictor(label='price', eval_metric=smape_scorer).fit(
    train_df,
    time_limit=3600,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20250501_044352"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Nov 10 10:07:59 UTC 2024
CPU Count:          4
Memory Avail:       29.91 GB / 31.35 GB (95.4%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accu

[1000]	valid_set's l2: 0.365787	valid_set's smape1: -39.7328
[2000]	valid_set's l2: 0.357285	valid_set's smape1: -38.8644
[3000]	valid_set's l2: 0.355215	valid_set's smape1: -38.5948
[4000]	valid_set's l2: 0.355312	valid_set's smape1: -38.5115
[5000]	valid_set's l2: 0.355626	valid_set's smape1: -38.4401
[6000]	valid_set's l2: 0.356714	valid_set's smape1: -38.3461
[7000]	valid_set's l2: 0.357532	valid_set's smape1: -38.2792
[8000]	valid_set's l2: 0.358988	valid_set's smape1: -38.289
[9000]	valid_set's l2: 0.359081	valid_set's smape1: -38.2174
[10000]	valid_set's l2: 0.360534	valid_set's smape1: -38.2128


	-38.1865	 = Validation score   (-smape1)
	47.35s	 = Training   runtime
	3.44s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 3546.74s of the 3546.74s of remaining time.


[1000]	valid_set's l2: 0.335524	valid_set's smape1: -37.5641
[2000]	valid_set's l2: 0.333142	valid_set's smape1: -37.2942
[3000]	valid_set's l2: 0.331978	valid_set's smape1: -37.2229
[4000]	valid_set's l2: 0.332986	valid_set's smape1: -37.1905
[5000]	valid_set's l2: 0.333798	valid_set's smape1: -37.187
[6000]	valid_set's l2: 0.334543	valid_set's smape1: -37.1775
[7000]	valid_set's l2: 0.335132	valid_set's smape1: -37.1879


	-37.1529	 = Validation score   (-smape1)
	53.49s	 = Training   runtime
	3.05s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 3489.43s of the 3489.43s of remaining time.
	-43.5771	 = Validation score   (-smape1)
	38.65s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 3448.97s of the 3448.97s of remaining time.
	-37.0728	 = Validation score   (-smape1)
	424.75s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 3022.74s of the 3022.74s of remaining time.
	-46.3956	 = Validation score   (-smape1)
	16.9s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 3002.67s of the 3002.67s of remaining time.
Metric smape1 is not supported by this model - using mean_squared_error instead
No improvement since epoch 4: early stopping
	-42.7786	 = Validation score   (-smape1)
	84.31s	 = Training   r

[1000]	valid_set's l2: 0.323735	valid_set's smape1: -36.3675
[2000]	valid_set's l2: 0.323791	valid_set's smape1: -36.2176
[3000]	valid_set's l2: 0.323019	valid_set's smape1: -36.1528
[4000]	valid_set's l2: 0.323199	valid_set's smape1: -36.1531
[5000]	valid_set's l2: 0.323187	valid_set's smape1: -36.1293
[6000]	valid_set's l2: 0.32339	valid_set's smape1: -36.1161
[7000]	valid_set's l2: 0.323469	valid_set's smape1: -36.1106
[8000]	valid_set's l2: 0.323392	valid_set's smape1: -36.0916
[9000]	valid_set's l2: 0.323401	valid_set's smape1: -36.0949
[10000]	valid_set's l2: 0.323441	valid_set's smape1: -36.0947


	-36.0903	 = Validation score   (-smape1)
	162.26s	 = Training   runtime
	10.63s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.00s of the 2552.96s of remaining time.
	Ensemble Weights: {'LightGBMLarge': 0.455, 'CatBoost': 0.318, 'NeuralNetTorch': 0.136, 'LightGBM': 0.091}
	-34.9246	 = Validation score   (-smape1)
	0.04s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 1047.13s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 180.6 rows/s (2500 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/kaggle/working/AutogluonModels/ag-20250501_044352")


# Generate Prediction

In [5]:
# Read the raw test file to keep the ids, which process_data drops.
test_data = pd.read_csv(
    "/kaggle/input/russian-car-plates-prices-prediction/test.csv",
    parse_dates=["date"],
    dtype={"id": int, "plate": str},
)
test_ids = test_data["id"].copy()

# Build the same feature table that was used for training.
xgb_df_test = process_data(
    "/kaggle/input/russian-car-plates-prices-prediction/test.csv",
    region_avg_price_dict,
)

# Predict on the log1p scale, then map back to whole prices.
test_data = TabularDataset(xgb_df_test)
test_pred = np.round(np.expm1(predictor.predict(test_data)))

# Assemble and write the submission file.
submission = pd.DataFrame({"id": test_ids}).assign(price=test_pred)
submission.to_csv('submission.csv', index=False)
print('Done producing submission.csv')

Done producing submission.csv


**Now it is your turn to beat this 🚗💨**