In [514]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.model_selection import train_test_split
from clean_data import create_new_features, get_plate_info, get_government_series
from data.supplemental_english import GOVERNMENT_CODES, REGION_CODES
import seaborn as sns
import re

**1. Load the dataset**:

In [515]:
# Read the train and test datasets
print("Read the datasets...")
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print(f"Train data shape: {train.shape}")
print(f"Test data shape: {test.shape}")

# Add a column 'train' to determine if the row is from the train or test set, it is easy to process both of the train and test sets 
# at the same time
train['train'] = 1
test['train'] = 0
df = pd.concat([train, test], ignore_index=True)
print(f"Combined data shape: {df.shape}")

Read the datasets...
Train data shape: (51635, 4)
Test data shape: (7695, 4)
Combined data shape: (59330, 5)


**2. Feature Engineering**

In [516]:
# All functions to extract the information from the plate
def is_all_latin(s):
    return bool(re.fullmatch(r'[A-Za-z0-9]+', s))

def get_government_series():
    gov_series = set()
    for key in GOVERNMENT_CODES:
        if is_all_latin(key[0]):
            gov_series.add(key[0])
    return gov_series

def get_region(region_code):
    for region in REGION_CODES:
        for code in REGION_CODES[region]:
            if code == region_code:
                return region
    return "NAN"


def get_plate_info(plate):
    """
    The output will be: government_plate, region, first_three, last_three, full_number, series
    """
    series1 = plate[0]
    series2 = plate[4:6]
    region_code = plate[6:]
    gov_series = get_government_series()
    series = series1 + series2
    government_plate = 1 if (series) in gov_series else 0
    region = get_region(region_code)
    first_three = plate[1:4]
    last_three = plate[6:9] if len(plate) > 8 else plate[6:8]
    full_number = first_three + last_three
    series = series
    return government_plate, region, first_three, last_three, full_number, series

print("Extracting features from the plate...")
df[['government_plate', 'region', 'first_three', 'last_three', 'full_number', 'series']] = \
    df['plate'].apply(get_plate_info).apply(pd.Series)
print("Extracted: government_plate, region, first_three, last_three, full_number, series")

Extracting features from the plate...
Extracted: government_plate, region, first_three, last_three, full_number, series


In [517]:
# Extract the features from the date
def extract_date_features(df):
    '''
    Extract the features from the date and also cycle the features
    '''
    # Basic date features
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    df['weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
    df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)
    df['quarter'] = df['date'].dt.quarter   
    df['day_name'] = df['date'].dt.day_name()

    # Cycle the features
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)  
    df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    return df

df = extract_date_features(df)
print("Extracted date features: year, month, day, day_of_week, weekend, week_of_year, quarter, day_name")

Extracted date features: year, month, day, day_of_week, weekend, week_of_year, quarter, day_name


In [518]:
# Get supplemental information from the given file
df['government_agency'] = None
df['forbidden_to_buy'] = False
df['road_advantage'] = False
df['significance_level'] = 0

def get_government_info(row):
    """
    Get the government information from the given row
    """
    series = row['series']
    number = int(row['first_three'])
    region_code = row['last_three']
    for (govern_series, (start, end), govern_region_code) in GOVERNMENT_CODES.keys():
        if series == govern_series and start <= number <= end and region_code == govern_region_code:
            agency = GOVERNMENT_CODES[(govern_series, (start, end), govern_region_code)][0]
            forbiden = bool(GOVERNMENT_CODES[(govern_series, (start, end), govern_region_code)][1])
            road_adv = bool(GOVERNMENT_CODES[(govern_series, (start, end), govern_region_code)][2])
            significance = int(GOVERNMENT_CODES[(govern_series, (start, end), govern_region_code)][3])
            return agency, forbiden, road_adv, significance
    return None, False, False, 0

print("Extracting government information...")
gov_info_df = df.apply(get_government_info, axis=1)
df['government_agency'] = [info[0] for info in gov_info_df]
df['forbidden_to_buy'] = [info[1] for info in gov_info_df]
df['road_advantage'] = [info[2] for info in gov_info_df]
df['significance_level'] = [info[3] for info in gov_info_df]
print("Extracted government information: government_agency, forbidden_to_buy, road_advantage, significance_level")

Extracting government information...
Extracted government information: government_agency, forbidden_to_buy, road_advantage, significance_level


In [519]:
# Handle agency names
df['government_agency'] = df['government_agency'].fillna('Non-Government')
print('If the agency is not in the list, it is Non-Government')

# Function to categorize the government agency
def categorize_agency(agency):
    if agency == 'Non-Government':
        return 'Non-Government'
    elif 'president' in agency.lower():
        return 'Presidential'
    elif 'police' in agency.lower() or 'internal affairs' in agency.lower():
        return 'Police/Security'
    elif 'government' in agency.lower():
        return 'Government'
    elif 'military' in agency.lower() or 'army' in agency.lower() or 'defense' in agency.lower():
        return 'Military'
    elif 'federal' in agency.lower():
        return 'Federal Services'
    elif 'judge' in agency.lower() or 'court' in agency.lower() or 'justice' in agency.lower() or 'prosecutor' in agency.lower():
        return 'Judicial'
    elif 'administration' in agency.lower():
        return 'Administration'
    else:
        return 'Other Governmental'
    
df['agency_category'] = df['government_agency'].apply(categorize_agency)
print("Categorized government agency names")

# Create binary variables for the government agency using one-hot encoding
agency_dummies = pd.get_dummies(df['agency_category'], prefix='agency')
df = pd.concat([df, agency_dummies], axis=1)
print("Created binary variables for government agency using one-hot encoding")

If the agency is not in the list, it is Non-Government
Categorized government agency names
Created binary variables for government agency using one-hot encoding


In [520]:
# Check for repeated letters in the 'series' (e.g., 'AAA', 'XXX')
# This feature indicates patterns that might be considered desirable.
df['has_repeated_letters'] = df['series'].apply(lambda x: len(set(x)) == 1)
print("Created 'has_repeated_letters' feature.")

# Check for repeated digits in the 'numbers' (e.g., '111', '777')
# These are often considered "beautiful" or "prestigious" numbers.
df['has_repeated_numbers'] = df['first_three'].str.replace(r'(.)(?=.*\1)', '', regex=True).str.len() < df['first_three'].str.len()


# Check for sequential digits (e.g., '123', '987')
# Another pattern that can indicate prestige.
consecutive_numbers = ["123", "234", "345", "456", "567", "789", "987", "876", "765", "654", "543", "432", "321"]
df['has_sequential_numbers'] = df['first_three'].isin(consecutive_numbers)
print("Created 'has_sequential_numbers' feature.")

# Check for mirror digits (e.g., '121', '303') or palindromic numbers (e.g., '111')
# These are also considered special patterns.
df['has_mirror_numbers'] = df['first_three'].apply(lambda x: x == x[::-1])
print("Created 'has_mirror_numbers' feature.")

# Define a list of prestigious letter series (e.g., specific combinations like 'AAA', 'XXX')
prestigious_letter_series = ["AAA", "MMM", "EEE", "KKK", "OOO", "PPP", "CCC", "TTT", "XXX", "BBB", "YYY", "HHH"]
df['is_beautiful_series'] = df['series'].isin(prestigious_letter_series)
print("Created 'is_beautiful_series' feature based on prestigious letter combinations.")

# Define a list of prestigious number combinations (e.g., single digits, triple digits, hundreds)
prestigious_numbers = ["001", "002", "003", "004", "005", "006", "007", "008", "009", "777",
                   "010", "020", "030", "040", "050", "060", "070", "077", "080", "090", "707",
                   "100", "111", "200", "222", "300", "333", "400", "444", "500", "555", "600", 
                   "666", "700", "800", "888", "900", "999"] 
df['is_prestigious_number'] = df['first_three'].isin(prestigious_numbers)
print("Created 'is_prestigious_number' feature based on specific prestigious number patterns.")

# Check if the last threee digits are 77 or 777
special_last_numbers = ["77", "777"]
df["duplicated_two_last_numbers"] = df['last_three'].isin(special_last_numbers)
print("Created 'duplicated_two_last_numbers' feature based on specific last number patterns.")

# Calculate the complexity of letters based on the number of unique characters
# A lower complexity (e.g., 'AAA') might indicate simplicity and prestige.
df['letter_complexity'] = df['series'].apply(
    lambda x: len(set(x)) if pd.notnull(x) else 0
)
print("Calculated 'letter_complexity' feature.")

# Check for the whole number, eg. '777777', '123456'
df['has_repeated_numbers_full_number'] = df['full_number'].apply(lambda x: len(set(x)) == 1)
print("Created 'has_repeated_numbers_full_number' feature.")

# A column for the longest sequence of repeated characters in the plate
def longest_run_length_str(d):
    digits = np.array(list(d), dtype=int)  
    change_points = np.where(np.diff(digits) != 0)[0] + 1
    run_starts = np.concatenate(([0], change_points))
    run_ends = np.concatenate((change_points, [len(digits)]))
    run_lengths = run_ends - run_starts
    return run_lengths.max()
df['longest_run_length'] = df['full_number'].apply(longest_run_length_str)
print("Calculated 'longest_run_length' feature.")

# Check for the monotone of the  whole number
full_consecutive_numbers = ["01234", "012345", "12345", "123456", "23456", "234567", "34567", "345678", "45678", "456789", "56789",
                            "43210", "543210", "54321", "654321", "65432", "765432", "76543", "876543", "87654", "987654", "98765"]
df['has_sequential_numbers_full_number'] = df['full_number'].isin(full_consecutive_numbers)
print("Created 'has_sequential_numbers_full_number' feature.")

# Create an overall prestige score by weighting different prestige-related features
# This combines multiple signals into a single numeric score.
df['prestige_score'] = (
    (df['is_beautiful_series'].astype(int) * 3) + # Higher weight for beautiful letter series
    (df['is_prestigious_number'].astype(int) * 2) + # Moderate weight for prestigious numbers
    (df['has_repeated_letters'].astype(int) * 1) +
    (df['has_repeated_numbers'].astype(int) * 1) +
    (df['has_sequential_numbers'].astype(int) * 1) +
    (df['has_mirror_numbers'].astype(int) * 1) +
    (df['significance_level'].fillna(0)) # Include governmental significance level
)
print("Calculated 'prestige_score' by combining various prestige indicators.")

Created 'has_repeated_letters' feature.
Created 'has_sequential_numbers' feature.
Created 'has_mirror_numbers' feature.
Created 'is_beautiful_series' feature based on prestigious letter combinations.
Created 'is_prestigious_number' feature based on specific prestigious number patterns.
Created 'duplicated_two_last_numbers' feature based on specific last number patterns.
Calculated 'letter_complexity' feature.
Created 'has_repeated_numbers_full_number' feature.
Calculated 'longest_run_length' feature.
Created 'has_sequential_numbers_full_number' feature.
Calculated 'prestige_score' by combining various prestige indicators.


In [521]:
# Frequency Encoding for the 'numbers' feature
# This replaces the number with its frequency of occurrence in the dataset.
freq_table = df['first_three'].value_counts().reset_index()
freq_table.columns = ['first_three', 'n']
freq_table['freq_enc'] = freq_table['n'] / freq_table['n'].sum()
freq_table['log_freq_enc'] = np.log1p(freq_table['freq_enc']) # Log transform for potential skewed distribution

# Merge frequency encodings back to the main DataFrame
df = df.merge(freq_table[['first_three', 'freq_enc', 'log_freq_enc']], 
              on='first_three', how='left')
print("Applied Frequency Encoding to 'numbers' feature.")

# Target Encoding (Mean Encoding) for categorical features
# This technique replaces a categorical value with the mean of the target variable
# for that category. It's crucial to perform this only on the training data
# to avoid data leakage from the test set.
train_data = df[df['train'] == 1].copy()

# For regions: calculate mean price for each region name
region_mean_price = train_data.groupby('region')['price'].mean().reset_index()
region_mean_price['log_region_mean_price'] = np.log1p(region_mean_price['price'])
region_mean_price.drop(columns=['price'], inplace=True)
df = df.merge(region_mean_price, on='region', how='left')
print("Target encoded 'region_name' with 'region_mean_price'.")

# Logarithmic transformation of the target variable 'price'
# This is a common practice in regression to make the target distribution more normal
# and reduce the impact of outliers, improving model performance.
df['log_price'] = np.log1p(df['price'])
print("Applied logarithmic transformation (log1p) to 'price' to create 'log_price'.")


Applied Frequency Encoding to 'numbers' feature.
Target encoded 'region_name' with 'region_mean_price'.
Applied logarithmic transformation (log1p) to 'price' to create 'log_price'.


In [529]:
from scipy.stats import rankdata
df['prestige_rank'] = rankdata(df['prestige_score'].astype(int), method='average') / len(df)
print("Created 'prestige_rank' based on 'prestige_score'.")

# Interaction Features:
df['letter_number_combo'] = df['series'] + "_" + df['first_three'].astype(str)
# Interaction between 'is_government' and 'prestige_score'
df['is_gov_and_prestige'] = df['government_plate'] * df['prestige_score'].astype(int)
print("Added 'letter_number_combo' and 'is_gov_and_prestige' interaction features.")

# Flag common premium regions (e.g., major cities/oblasts) as a binary feature.
premium_regions = ['Moscow', 'Saint Petersburg', 'Moscow Oblast']
df['is_premium_region'] = df['region'].isin(premium_regions).astype(int)
print("Created 'is_premium_region' feature for major economic centers.")

Created 'prestige_rank' based on 'prestige_score'.
Added 'letter_number_combo' and 'is_gov_and_prestige' interaction features.
Created 'is_premium_region' feature for major economic centers.


In [530]:
from sklearn.base import clone # For cloning estimators in cross-validation
from sklearn.compose import ColumnTransformer # To apply different transformers to different columns
from sklearn.pipeline import Pipeline # To chain multiple processing steps and a final estimator
from sklearn.preprocessing import OrdinalEncoder, KBinsDiscretizer, OneHotEncoder # Various encoding/discretization methods
from sklearn.model_selection import StratifiedKFold # Cross-validation strategy
from xgboost import XGBRegressor # Gradient Boosting Machine from XGBoost
from lightgbm import LGBMRegressor # Gradient Boosting Machine from LightGBM
from catboost import CatBoostRegressor # Gradient Boosting Machine from CatBoost
import category_encoders as ce # Advanced categorical encoders (install with: pip install category-encoders)
 
seed = 42 # Random seed for reproducibility
n_splits = 10

target = 'log_price'
drop_cols = ['id', 'train', 'plate', 'price', 'log_price', 'date', 'government_agency', 'agency_category']


In [531]:
# Separating the concatenated DataFrame back into original training and testing sets
# based on the 'is_train' flag.
train_df = df[df['train'] == 1].copy() # .copy() to avoid SettingWithCopyWarning
test_df = df[df['train'] == 0].copy()

# Defining the features (X) and the target (y) for the training set,
# and features for the test set (X_test).
# 'errors='ignore'' handles cases where a column in DROP_COLS might not exist, preventing errors.
X = train_df.drop(columns=drop_cols, errors='ignore')
y = train_df[target].copy()
X_test = test_df.drop(columns=drop_cols, errors='ignore')

print("Data split into training and testing sets.")
print(f"Training features (X) shape: {X.shape}")
print(f"Training target (y) shape: {y.shape}")
print(f"Test features (X_test) shape: {X_test.shape}")

Data split into training and testing sets.
Training features (X) shape: (51635, 50)
Training target (y) shape: (51635,)
Test features (X_test) shape: (7695, 50)


In [532]:
# This section is crucial for handling different data types dynamically.
# It automatically identifies numerical, boolean, and categorical columns,
# and further segments categorical columns by their cardinality to apply
# appropriate encoding strategies.

def detect_columns(X):
    """
    Detects and segments columns by their data type and cardinality.
    This helps in applying specific preprocessing steps to different column types.
    """
    bool_cols = [c for c in X.columns if X[c].dtype == 'bool'] # Identify boolean columns
    num_cols = [c for c in X.columns if X[c].dtype.kind in 'if' and c not in bool_cols] # Identify numerical (int/float) columns, excluding booleans
    cat_cols = [c for c in X.columns if c not in num_cols + bool_cols]  # Remaining columns are treated as categorical

    # Further segmentation of categorical columns by cardinality (number of unique values)
    # Different encoding strategies are optimal for different cardinalities.
    cat_low = [c for c in cat_cols if X[c].nunique() <= 20] # Low cardinality for One-Hot Encoding
    cat_high = [c for c in cat_cols if X[c].nunique() >= 20] # High cardinality for Target Encoding

    print('\nColumn Summary ➜ Numerical:', len(num_cols),
          '| Boolean:', len(bool_cols),
          '| Low Cardinality Categorical:', len(cat_low),
          '| High Cardinality Categorical:', len(cat_high))

    return num_cols, bool_cols, cat_low, cat_high

# Apply the column detection function to the training features
num_cols, bool_cols, cat_low, cat_high = detect_columns(X)


Column Summary ➜ Numerical: 24 | Boolean: 19 | Low Cardinality Categorical: 1 | High Cardinality Categorical: 6


In [533]:
# The `ColumnTransformer` is the core component here. It allows applying
# different transformations to different subsets of columns in parallel.
# This ensures that each column type is handled appropriately before feeding to the model.

preprocess = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols), # Numerical columns: 'passthrough' means no transformation
        ('bool', 'passthrough', bool_cols), # Boolean columns: 'passthrough' as they are already binary
        ('low', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_low),  # One-Hot Encoding for low cardinality categories
                                                                                        # 'handle_unknown='ignore'' prevents errors if new categories appear in test set
                                                                                        # 'sparse_output=False' returns a dense NumPy array
        ('high', ce.TargetEncoder(cols=cat_high, smoothing=0.2), cat_high)  # Target Encoding for high cardinality categories
                                                                            # Replaces category with mean of target. 'smoothing' helps prevent overfitting.
    ],
    remainder='drop',  # Drops any columns not explicitly specified in `transformers` (safer approach)
    n_jobs=-1  # Utilizes all available CPU cores for parallel processing during transformation
)
print("\nPreprocessing pipeline (ColumnTransformer) defined.")



Preprocessing pipeline (ColumnTransformer) defined.


In [534]:
# Optimized Hyperparameters for various Gradient Boosting Regressors.
# These parameters are typically found through hyperparameter optimization
# techniques like GridSearchCV, RandomizedSearchCV, or more advanced tools like Optuna.

# XGBoost Parameters
xgb_params = {
            'n_estimators': 1495, 
            'max_depth': 11, 
            'learning_rate': 0.01320706464652558, 
            'subsample': 0.6617047575164631, 
            'colsample_bytree': 0.48638458903836135, 
            'reg_alpha': 0.033824195612960836, 
            'reg_lambda': 0.23379853342154155, 
            'gamma': 0.0049614495111536695,
            'objective': 'reg:tweedie',
            'n_jobs': -1,
            'random_state': seed}

# Dictionary of models to be trained. Easily extensible to include more models.
# Uncomment LGBM and CatBoost to include them in the ensemble.
models = {
    'XGB': XGBRegressor(**xgb_params),
    #'LGBM': LGBMRegressor(**lgb_params),
    #'CatBoost': CatBoostRegressor(**cat_params)
}
print("\nModels and their optimized hyperparameters defined.")

# Construct the full pipeline for each model: preprocessing + estimator
# Each pipeline handles all necessary data transformations before training the model.
pipelines = {name: Pipeline(steps=[('prep', preprocess), ('model', model)]) for name, model in models.items()}
print("Pipelines constructed: Preprocessing -> Model.")


Models and their optimized hyperparameters defined.
Pipelines constructed: Preprocessing -> Model.


In [535]:
# The Symmetric Mean Absolute Percentage Error (SMAPE) is often used in forecasting
# and is robust to zero values in the actuals. It's defined once to ensure consistency.

def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE).
    Formula: (1/n) * Sum(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100
    This metric handles cases where y_true or y_pred (or both) are zero.
    """
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred)
    
    # Handle division by zero: if denominator is zero (i.e., both y_true and y_pred are zero),
    # the corresponding term for SMAPE is defined as 0.0 to avoid NaN/Inf.
    smape_term = np.zeros_like(diff, dtype=float)
    non_zero_denom = denominator != 0 # Identify where denominator is not zero
    smape_term[non_zero_denom] = diff[non_zero_denom] / denominator[non_zero_denom]
    
    return np.mean(smape_term) * 100

print("\nSMAPE evaluation metric defined.")


SMAPE evaluation metric defined.


In [538]:
import optuna
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1300, 1500),
        'max_depth': trial.suggest_int('max_depth', 10, 14),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.03, log=True),  
        'subsample': trial.suggest_float('subsample', 0.6, 0.7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 0.05, log=True),          
        'reg_lambda': trial.suggest_float('reg_lambda', 0.2, 0.35),
        'gamma': trial.suggest_float('gamma', 0.001, 0.005, log=True),               
        'tweedie_variance_power': 1.0869464555654937,
        'objective': 'reg:tweedie',
        'n_jobs': -1,
        'random_state': seed,
    }

    model = XGBRegressor(**params)
    pipeline = Pipeline(steps=[('prep', preprocess), ('model', model)])

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    smape_scores = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipe = clone(pipeline)
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_val)
        score = smape(np.exp(y_val.values), np.exp(preds))
        smape_scores.append(score)

    return np.mean(smape_scores)  # minimize mean SMAPE

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20, n_jobs=1)

print("Best params:", study.best_params)
print("Best CV SMAPE:", study.best_value)

[I 2025-05-22 00:36:10,443] A new study created in memory with name: no-name-27573558-ea81-4b7b-83fe-bf932c9f1089
[I 2025-05-22 00:36:35,408] Trial 0 finished with value: 36.747288095125796 and parameters: {'n_estimators': 1345, 'max_depth': 10, 'learning_rate': 0.01441296218544771, 'subsample': 0.6698589896520895, 'colsample_bytree': 0.41517728613056454, 'reg_alpha': 0.011843121686051368, 'reg_lambda': 0.21646763642367575, 'gamma': 0.004229066093403417}. Best is trial 0 with value: 36.747288095125796.
[I 2025-05-22 00:37:06,521] Trial 1 finished with value: 36.89740744183984 and parameters: {'n_estimators': 1425, 'max_depth': 11, 'learning_rate': 0.019360571338285216, 'subsample': 0.6558066957919729, 'colsample_bytree': 0.4076655411460617, 'reg_alpha': 0.035319171432527115, 'reg_lambda': 0.2702059829719194, 'gamma': 0.0032819884694872538}. Best is trial 0 with value: 36.747288095125796.
[I 2025-05-22 00:37:37,535] Trial 2 finished with value: 36.98753237491073 and parameters: {'n_esti

Best params: {'n_estimators': 1493, 'max_depth': 12, 'learning_rate': 0.011731532487467835, 'subsample': 0.663569119260017, 'colsample_bytree': 0.4964471128737775, 'reg_alpha': 0.017258632934593145, 'reg_lambda': 0.2515888473499397, 'gamma': 0.003266699674463174}
Best CV SMAPE: 36.684376911537896


In [536]:
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Get the target variable and bin it into strata for stratification
y_bins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile') \
    .fit_transform(y.values.reshape(-1, 1)).astype(int).ravel()
print(f"\nTarget variable ('price') binned into {y_bins.max() + 1} strata for stratification.")

kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

oof_preds = {name: np.zeros(len(y)) for name in models}
test_preds = {name: np.zeros(len(X_test)) for name in models}
feature_importances = {}

print('\n===== CROSS-VALIDATION TRAINING =====')
for model_name, pipeline in pipelines.items():
    print(f"\nInitiating training for model: {model_name}...")
    try:
        for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_bins), 1):
            print(f"  Fold {fold:02d}/{n_splits}")
            X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

            pipeline.fit(X_tr, y_tr)
            oof_preds[model_name][val_idx] = pipeline.predict(X_val)
            test_preds[model_name] += pipeline.predict(X_test) / n_splits

        # Calculate the SMAPE for the out-of-fold predictions
        cv_smape = smape(np.exp(y), np.exp(oof_preds[model_name]))
        print(f'⮕  Overall CV SMAPE for {model_name}: {cv_smape:.2f}%')

        if hasattr(pipeline['model'], 'feature_importances_'):
            feature_importances[model_name] = pipeline['model'].feature_importances_
        elif hasattr(pipeline['model'], 'get_booster'): 
            feature_importances[model_name] = pipeline['model'].get_booster().get_score(importance_type='weight') 
        else:
            feature_importances[model_name] = None

    except Exception as e:
        print(f"Error during training of model {model_name}: {str(e)}")
        continue



Target variable ('price') binned into 10 strata for stratification.

===== CROSS-VALIDATION TRAINING =====

Initiating training for model: XGB...
  Fold 01/10
  Fold 02/10
  Fold 03/10
  Fold 04/10
  Fold 05/10
  Fold 06/10
  Fold 07/10
  Fold 08/10
  Fold 09/10
  Fold 10/10
⮕  Overall CV SMAPE for XGB: 36.54%


In [537]:
X.columns

Index(['government_plate', 'region', 'first_three', 'last_three',
       'full_number', 'series', 'year', 'month', 'day', 'day_of_week',
       'weekend', 'week_of_year', 'quarter', 'day_name', 'month_sin',
       'month_cos', 'day_sin', 'day_cos', 'day_of_week_sin', 'day_of_week_cos',
       'forbidden_to_buy', 'road_advantage', 'significance_level',
       'agency_Administration', 'agency_Federal Services', 'agency_Government',
       'agency_Judicial', 'agency_Non-Government', 'agency_Other Governmental',
       'agency_Police/Security', 'agency_Presidential', 'has_repeated_letters',
       'has_repeated_numbers', 'has_sequential_numbers', 'has_mirror_numbers',
       'is_beautiful_series', 'is_prestigious_number',
       'duplicated_two_last_numbers', 'letter_complexity',
       'has_repeated_numbers_full_number', 'longest_run_length',
       'has_sequential_numbers_full_number', 'prestige_score', 'freq_enc',
       'log_freq_enc', 'log_region_mean_price', 'prestige_rank',
       '

In [415]:
def get_feature_names(preprocessor):
    output_features = []

    # Add numeric and bool columns (passthrough)
    output_features.extend(num_cols)
    output_features.extend(bool_cols)

    # OneHotEncoder feature names
    onehot = preprocessor.named_transformers_['low']
    onehot_features = onehot.get_feature_names_out(cat_low).tolist()
    output_features.extend(onehot_features)

    # TargetEncoder columns (1 col per high cardinality cat)
    output_features.extend(cat_high)

    return output_features


feature_names = get_feature_names(pipeline.named_steps['prep'])
feat_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances['XGB']
}).sort_values(by='importance', ascending=False)
print(feat_imp_df)  # Top 10 features

                               feature  importance
21                       prestige_rank    0.095047
39               is_prestigious_number    0.079891
41    has_repeated_numbers_full_number    0.070085
17                      prestige_score    0.066061
25                      road_advantage    0.059778
22                 is_gov_and_prestige    0.049516
52                          last_three    0.043619
40         duplicated_two_last_numbers    0.035820
34                has_repeated_letters    0.035089
53                         full_number    0.033891
38                 is_beautiful_series    0.032571
1                                 year    0.028828
0                     government_plate    0.025732
51                         first_three    0.021461
28                   agency_Government    0.018033
15                   letter_complexity    0.018004
54                              series    0.017869
14                  significance_level    0.015788
30               agency_Non-Gov

In [None]:
submission = pd.DataFrame({
    'id': test_df['id'],
    'price': np.exp(test_preds['XGB'])
})
submission.to_csv('submission8.csv', index=False)