In [None]:
# === 1. Imports ===
# Sets up the import path, force-reloads the project's preprocessing modules,
# and imports everything the later cells use.
import sys
import os
# Add the parent of the current working directory to the import path so the
# `src` package below can be resolved (assumes the notebook runs from a
# sub-folder of the project root — TODO confirm).
sys.path.append(os.path.abspath("../"))

# Reload modules to pick up latest changes
# `encoding` and `feature_engineering` are reloaded before
# `preprocessing_pipeline`; presumably the pipeline module imports from them,
# so it has to be reloaded last to bind their fresh definitions — verify.
import importlib
import src.preprocess.preprocessing_pipeline
import src.preprocess.encoding
import src.preprocess.feature_engineering
importlib.reload(src.preprocess.encoding)
importlib.reload(src.preprocess.feature_engineering)
importlib.reload(src.preprocess.preprocessing_pipeline)

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
# NOTE(review): `Pool` is not referenced in the cells visible here.
from catboost import Pool
import joblib

# NOTE(review): `save_processed_data` is not referenced in the cells visible here.
from src.load_data import load_train_data, save_processed_data, load_test_data
# Imported after the reloads above, so this name binds to the reloaded class.
from src.preprocess.preprocessing_pipeline import PreprocessingPipeline

print("✅ Modules reloaded - ready to use new features!")


In [None]:
# === 2. Load data ===
df = load_train_data()

# === 3. Separate target and predictors (nothing is preprocessed yet) ===
y_raw = df["price"]
X_raw = df.drop("price", axis="columns")

# === 4. Hold out 20% of the raw rows for validation ===
# Splitting before any preprocessing keeps the validation rows out of
# whatever the pipeline learns when it is fitted later.
split_parts = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42)
X_train_raw, X_val_raw, y_train_raw, y_val_raw = split_parts

In [None]:
# === 4a. Preprocess training data ===
# The pipeline is fitted on predictors and target together, so the target is
# re-attached to a copy of the raw training split first.
train_df = X_train_raw.copy()
train_df["price"] = y_train_raw

pipeline_options = dict(
    use_log_target=True,
    drop_low_importance=True,
    encode_data=False,         # CatBoost handles categoricals natively
    use_target_encoding=True,  # Add target encoding as additional features
)
pipeline = PreprocessingPipeline(**pipeline_options)
train_processed = pipeline.fit_transform(train_df)

# Cast every categorical column that survived preprocessing to str.
cat_features = ["model", "brand", "transmission", "fuelType", "brand_model"]
for col in cat_features:
    if col not in train_processed.columns:
        continue
    train_processed[col] = train_processed[col].astype(str)

# Target is the log-scale price; both price columns are removed from the features.
y_train = train_processed["log_price"]
X_train = train_processed.drop(columns=["price", "log_price"], errors='ignore')

# Summarise which engineered feature groups ended up in the design matrix.
column_names = X_train.columns.tolist()
target_enc_cols = [name for name in column_names if 'target_enc' in name]
tax_cols = [name for name in column_names if 'tax' in name.lower()]
poly_cols = [name for name in column_names if 'squared' in name or 'interaction' in name]

print("✅ X_train shape:", X_train.shape)
print("✅ Categorical features (for native handling):", cat_features)
print("✅ Target-encoded features:", target_enc_cols)
print("✅ Tax features:", tax_cols)
print("✅ Polynomial features:", poly_cols)
print("\nX_train sample columns:", column_names[:10], "...")
print("y_train description:\n", y_train.describe())

In [None]:
# === 4b. Preprocess validation data (using fitted pipeline) ===
# Re-attach the target and run the already-fitted pipeline; nothing is
# re-fitted on the validation rows.
val_df = X_val_raw.copy()
val_df["price"] = y_val_raw
val_processed = pipeline.transform(val_df)

# Ensure categorical columns are strings (same list as training)
cat_features = ["model", "brand", "transmission", "fuelType", "brand_model"]
for col in cat_features:
    if col in val_processed.columns:
        val_processed[col] = val_processed[col].astype(str)

# y_val must be on the log scale: later cells compare it with the model's
# log-scale predictions and call np.expm1(y_val). Falling back to the raw
# price here would silently produce meaningless (overflowing) metrics, so
# derive the log target from `price` instead, or fail loudly.
# Assumes the pipeline's `log_price` is log1p(price), as the np.expm1 calls
# elsewhere in this notebook also assume — TODO confirm.
if "log_price" in val_processed.columns:
    y_val = val_processed["log_price"]
elif "price" in val_processed.columns:
    y_val = np.log1p(val_processed["price"]).rename("log_price")
else:
    raise KeyError(
        "pipeline.transform returned neither 'log_price' nor 'price' for the "
        "validation data; cannot build a log-scale validation target"
    )

X_val = val_processed.drop(columns=["price", "log_price"], errors='ignore')
# Use exactly the training columns, in training order (the test frame is
# aligned the same way before prediction). Raises KeyError if the validation
# frame is missing a training feature.
X_val = X_val[X_train.columns]

In [None]:
# === 5. Hyperparameter tuning - Best Generalization Approach ===
# Strategy: Start from known good config (depth=8, lr=0.1, iterations=1000, RMSE ~2023)
# Only tune regularization (l2_leaf_reg) to prevent overfitting
# This minimizes hyperparameter space and reduces overfitting risk
#
# NOTE(review): l2_leaf_reg is selected on the same holdout split whose RMSE
# is reported below, so the "best" figure is an optimistic estimate.

# Define categorical features (must match what was used in preprocessing)
cat_features = ["model", "brand", "transmission", "fuelType", "brand_model"]
# Positional indices of the categorical columns that are present in X_train.
cat_feature_indices = [X_train.columns.get_loc(col) for col in cat_features if col in X_train.columns]

# Test different regularization values with known good base config
l2_values = [1, 3, 5, 7, 10]
best_val_rmse = float('inf')
best_val_rmse_raw = None
best_l2 = None
best_model = None

# Validation target back on the price scale; identical for every candidate,
# so compute it once instead of inside the loop.
y_val_price = np.expm1(y_val)

print("Testing regularization values with base config:")
print("Base: depth=8, learning_rate=0.1, iterations=1000")
print("-" * 60)

for l2 in l2_values:
    model = CatBoostRegressor(
        depth=8,
        learning_rate=0.1,
        iterations=1000,
        l2_leaf_reg=l2,
        verbose=0,
        random_state=42,
        cat_features=cat_feature_indices,
        loss_function='RMSE'
    )

    # Train on training set
    model.fit(X_train, y_train)

    # Evaluate on the single holdout validation split (no cross-validation)
    val_preds = model.predict(X_val)
    val_rmse = root_mean_squared_error(y_val, val_preds)
    val_rmse_raw = root_mean_squared_error(y_val_price, np.expm1(val_preds))

    print(f"l2_leaf_reg={l2:2d}: Val RMSE (log)={val_rmse:.5f}, Val RMSE ($)={val_rmse_raw:.2f}")

    # Selection is on the log-scale RMSE; the price-scale RMSE of the winner
    # is kept alongside so it does not have to be recomputed afterwards.
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_val_rmse_raw = val_rmse_raw
        best_l2 = l2
        best_model = model

# A NaN RMSE never compares below inf, which would leave best_model unset.
if best_model is None:
    raise RuntimeError(
        "No l2_leaf_reg candidate produced a validation RMSE below infinity; "
        "check X_val / y_val for invalid values"
    )

print("-" * 60)
print(f"\n✅ Best regularization: l2_leaf_reg={best_l2}")
print(f"Best validation RMSE (log scale): {best_val_rmse:.5f}")
print(f"Best validation RMSE ($): {best_val_rmse_raw:.2f}")

cat_model = best_model

In [None]:
# === 6. Evaluate performance ===
# Score the selected model on the holdout split.
val_preds = cat_model.predict(X_val)

pred_low, pred_high = val_preds.min(), val_preds.max()
print("Validation predictions range:", pred_low, "to", pred_high)

# First on the log scale of y_val, then on the price scale after undoing the
# transform with expm1 on both truth and predictions.
rmse_log = root_mean_squared_error(y_val, val_preds)
rmse_raw = root_mean_squared_error(
    np.expm1(y_val),
    np.expm1(val_preds),
)
print(f"Validation RMSE (log scale): {rmse_log:.5f}")
print(f"Validation RMSE ($): {rmse_raw:.2f}")

In [None]:
# === 7. Save model ===
# Persist the selected CatBoost model with joblib.
model_path = "../models/catboost_model.joblib"
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(cat_model, model_path)
print(f"Saved CatBoost model to: {model_path}")


In [None]:
# === 8. Predict on test set ===
test_df = load_test_data()
# Use the same pipeline that was fitted on training data
test_clean = pipeline.transform(test_df)

# Ensure categorical columns are strings (same list as training)
cat_features = ["model", "brand", "transmission", "fuelType", "brand_model"]
for col in cat_features:
    if col in test_clean.columns:
        test_clean[col] = test_clean[col].astype(str)

# Ensure test columns match training set (same columns, same order).
# Raises KeyError if the pipeline did not produce one of the training features.
test_clean = test_clean[X_train.columns]

test_preds_log = cat_model.predict(test_clean)

# === 9. Convert back to original scale and save ===
# Predictions are on the log scale; expm1 maps them back to prices.
test_preds = np.expm1(test_preds_log)

# Predictions are paired with IDs by position, so the pipeline must not have
# dropped or added rows. Fail with an explicit message if it did.
if len(test_preds) != len(test_df):
    raise ValueError(
        f"Got {len(test_preds)} predictions for {len(test_df)} test rows; "
        "pipeline.transform changed the number of rows, so IDs cannot be matched"
    )

submission = pd.DataFrame({
    "ID": test_df["ID"],
    "Actual": test_preds
})

submission_path = "../results/catboost_test_preds.csv"
# to_csv does not create missing parent directories; create the folder first.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"Saved CatBoost predictions to: {submission_path}")