In [1]:
# === 1. Imports ===
import sys
import os
sys.path.append(os.path.abspath("../"))

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import Pool
import joblib

from src.load_data import load_train_data, save_processed_data, load_test_data
from src.preprocess.preprocessing_pipeline import PreprocessingPipeline


In [2]:
# === 2. Load data ===
df = load_train_data()

# === 3. Prepare features and target (before preprocessing) ===
# Pull the target out first, then keep every other column as a raw feature.
y_raw = df["price"]
X_raw = df.drop(columns=["price"])

# === 4. Train-test split (on raw data) ===
# Splitting happens before any preprocessing so the pipeline can be fitted
# on the training portion only. 20% of the rows are held out for validation;
# the fixed seed keeps the split reproducible.
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X_raw,
    y_raw,
    random_state=42,
    test_size=0.2,
)

In [3]:
# === 4a. Preprocess training data ===
# Re-attach the target so the pipeline receives features and price in one frame
# (assign returns a new frame, leaving X_train_raw untouched).
train_df = X_train_raw.assign(price=y_train_raw)

pipeline = PreprocessingPipeline(
    encode_data=False,  # CatBoost handles categoricals natively
    drop_low_importance=True,
    use_log_target=True,
)
# Fit on the training split only; later cells reuse this fitted pipeline
# through pipeline.transform().
train_processed = pipeline.fit_transform(train_df)

# Ensure categorical columns are strings
cat_features = ["model", "brand", "transmission", "fuelType"]
present_cats = [name for name in cat_features if name in train_processed.columns]
for name in present_cats:
    train_processed[name] = train_processed[name].astype(str)

# The model is trained on the log-scale target; both price columns are removed
# from the feature matrix (errors="ignore" tolerates either one being absent).
y_train = train_processed["log_price"]
X_train = train_processed.drop(columns=["price", "log_price"], errors="ignore")

print("X_train shape:", X_train.shape)
print("X_train sample columns:", X_train.columns.tolist())
print("y_train description:\n", y_train.describe())

X_train shape: (19748, 14)
X_train sample columns: ['model', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize', 'brand', 'car_age', 'mileage_per_year', 'engine_efficiency', 'power_index', 'age_mileage_interaction', 'log_mileage']
y_train description:
 count    19748.000000
mean         9.838892
std          0.463652
min          6.478510
25%          9.546241
50%          9.862718
75%         10.146277
max         11.918051
Name: log_price, dtype: float64


In [4]:
# === 4b. Preprocess validation data (using fitted pipeline) ===
# The pipeline was fitted on the training split; here it is only applied.
val_df = X_val_raw.copy()
val_df["price"] = y_val_raw
val_processed = pipeline.transform(val_df)

# Same string cast that was applied to the training categoricals.
for col in cat_features:
    if col in val_processed.columns:
        val_processed[col] = val_processed[col].astype(str)

X_val = val_processed.drop(columns=["price", "log_price"], errors='ignore')

# The model predicts on the log scale, so y_val must be on the log scale too.
if "log_price" in val_processed.columns:
    y_val = val_processed["log_price"]
else:
    # Fallback when transform() does not emit log_price: derive it here rather
    # than silently scoring log-scale predictions against raw prices.
    # log1p is the inverse of the np.expm1 used when converting predictions
    # back to prices in the evaluation and test-prediction cells.
    # Prefer the processed frame's price so the target stays row-aligned with
    # X_val even if the pipeline filtered rows.
    price_source = (
        val_processed["price"] if "price" in val_processed.columns else y_val_raw
    )
    y_val = np.log1p(price_source)

In [5]:
# === 5. Hyperparameter tuning ===
# Candidate values searched exhaustively: 3 depths x 3 learning rates x
# 2 iteration counts = 18 combinations.
param_grid = dict(
    depth=[6, 8, 10],
    learning_rate=[0.03, 0.05, 0.1],
    iterations=[500, 1000],
)

# Positional indices of the categorical columns that are present in X_train.
cat_feature_indices = [
    X_train.columns.get_loc(name)
    for name in cat_features
    if name in X_train.columns
]

# Base estimator: silent training, fixed seed, categorical columns declared up front.
cat_model = CatBoostRegressor(
    cat_features=cat_feature_indices,
    random_state=42,
    verbose=0,
)

# 3-fold cross-validated grid search scored by negative RMSE (on the log-scale target).
grid_search = GridSearchCV(
    estimator=cat_model,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
    verbose=3,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
# From here on, cat_model refers to the best estimator found by the search.
cat_model = grid_search.best_estimator_

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV 1/3] END depth=6, iterations=500, learning_rate=0.03;, score=-0.109 total time=   1.3s
[CV 2/3] END depth=6, iterations=500, learning_rate=0.03;, score=-0.112 total time=   1.2s
[CV 3/3] END depth=6, iterations=500, learning_rate=0.03;, score=-0.114 total time=   1.1s
[CV 1/3] END depth=6, iterations=500, learning_rate=0.05;, score=-0.103 total time=   1.2s
[CV 2/3] END depth=6, iterations=500, learning_rate=0.05;, score=-0.106 total time=   1.1s
[CV 3/3] END depth=6, iterations=500, learning_rate=0.05;, score=-0.108 total time=   1.2s
[CV 1/3] END depth=6, iterations=500, learning_rate=0.1;, score=-0.098 total time=   1.2s
[CV 2/3] END depth=6, iterations=500, learning_rate=0.1;, score=-0.101 total time=   1.2s
[CV 3/3] END depth=6, iterations=500, learning_rate=0.1;, score=-0.104 total time=   1.2s
[CV 1/3] END depth=6, iterations=1000, learning_rate=0.03;, score=-0.101 total time=   2.3s
[CV 2/3] END depth=6, iteration

In [6]:
# === 6. Evaluate performance ===
# Predictions come back on the log scale, matching the training target.
val_preds = cat_model.predict(X_val)

pred_low, pred_high = val_preds.min(), val_preds.max()
print("Validation predictions range:", pred_low, "to", pred_high)

# Error on the log scale first, then after mapping both the targets and the
# predictions back to prices with expm1.
rmse = root_mean_squared_error(y_val, val_preds)
print(f"Validation RMSE (log scale): {rmse}")
print(f"Validation RMSE ($): {root_mean_squared_error(np.expm1(y_val), np.expm1(val_preds))}")

Validation predictions range: 7.608216132873414 to 11.672617062798953
Validation RMSE (log scale): 0.09557646862363758
Validation RMSE ($): 1952.9618702506903


In [7]:
# === 7. Save model ===
model_path = "../models/catboost_model.joblib"
# Create the target directory if it is missing so the dump does not fail on
# a fresh checkout where ../models has not been created yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(cat_model, model_path)
print(f"Saved CatBoost model to: {model_path}")


Saved CatBoost model to: ../models/catboost_model.joblib


In [8]:
# === 8. Predict on test set ===
test_df = load_test_data()
# Use the same pipeline that was fitted on training data
test_clean = pipeline.transform(test_df)

# Same string cast that was applied to the training categoricals.
for col in cat_features:
    if col in test_clean.columns:
        test_clean[col] = test_clean[col].astype(str)

# Ensure test columns match training set (same columns, same order)
test_clean = test_clean[X_train.columns]

# IDs are taken from the raw test frame while predictions come from the
# transformed frame, so the two must have the same number of rows. Checking
# here gives a clear error before any prediction is made.
# NOTE(review): this does not detect row reordering inside transform() — confirm
# the pipeline preserves row order.
if len(test_clean) != len(test_df):
    raise ValueError(
        f"Preprocessing changed the number of test rows "
        f"({len(test_df)} -> {len(test_clean)}); IDs would not line up with predictions."
    )

test_preds_log = cat_model.predict(test_clean)

# === 9. Convert back to original scale and save ===
# expm1 maps the log-scale predictions back to prices.
test_preds = np.expm1(test_preds_log)
# NOTE(review): the prediction column is named "Actual" — presumably required by
# the submission format; confirm before renaming.
submission = pd.DataFrame({
    "ID": test_df["ID"],
    "Actual": test_preds
})

submission_path = "../results/catboost_test_preds.csv"
# Create the output directory if it is missing so to_csv does not fail on a
# fresh checkout.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"Saved CatBoost predictions to: {submission_path}")

Saved CatBoost predictions to: ../results/catboost_test_preds.csv
