In [None]:
import logging
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor, Pool, cv

# Make the project importable: the root is taken to be two levels above the
# current working directory, and is put at the front of sys.path only once.
project_root = Path.cwd().parent.parent
_project_root_str = str(project_root)
if _project_root_str not in sys.path:
    sys.path.insert(0, _project_root_str)

from config import (
    DATA_DIR_GOLD,
    MODELS_DIR,
    RESULTS_DIR,
    MODEL_CATBOOST,
    MODEL_CONFIG,
    NUMERIC_FEATURES,
    CATEGORICAL_FEATURES,
    TARGET_VARIABLE
)
from elferspot_listings.utils.helpers import setup_logging, load_data, ensure_dir

# Setup logging
logger = setup_logging(level='INFO')
logger.info("CatBoost modeling initialized")

# Set random seed for reproducibility (shared by the split and CV cells below)
RANDOM_SEED = MODEL_CONFIG['random_state']
np.random.seed(RANDOM_SEED)

# Output folder for the prediction export.
# NOTE(review): ensure_dir is a project helper; presumably it creates the
# directory and returns its path — confirm.
RESULTS_MODEL_DIR = ensure_dir(RESULTS_DIR / "model_predictions")

# UTC timestamp that tags every artifact written by this run.
# datetime.utcnow() is deprecated since Python 3.12; the timezone-aware call
# below produces the same formatted string.
RUN_ID = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

## Step 1: Load Gold Data

In [None]:
# Candidate gold exports, sorted by path. Taking the last entry as the "most
# recent" file relies on the file names sorting chronologically
# (e.g. a timestamp suffix) — TODO confirm the naming convention.
gold_files = sorted(DATA_DIR_GOLD.glob("listings_gold*.xlsx"))
if not gold_files:
    raise FileNotFoundError(f"No gold files found in {DATA_DIR_GOLD}")

gold_path = gold_files[-1]
logger.info(f"Using gold file: {gold_path.name}")

# Read the selected file and report what was loaded
df_gold = load_data(gold_path)
print(f"✓ Loaded {len(df_gold):,} rows from gold layer")
print(f"  File: {gold_path.name}")

df_gold.head()

## Step 2: Define Features for Modeling

In [None]:
# Keep only the configured features that actually exist in the Gold data
numeric_features = [name for name in NUMERIC_FEATURES if name in df_gold.columns]
categorical_features = [name for name in CATEGORICAL_FEATURES if name in df_gold.columns]
all_features = numeric_features + categorical_features

# Model on 'log_price' when the Gold data provides it, otherwise on the configured target
if 'log_price' in df_gold.columns:
    target_column = 'log_price'
else:
    target_column = TARGET_VARIABLE

if len(all_features) == 0:
    raise ValueError("No modeling features were found in the Gold dataset. Verify feature engineering output.")

print(f"Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"\nCategorical features ({len(categorical_features)}): {categorical_features}")
print(f"\nTarget column: {target_column}")

## Step 3: Prepare Modeling Dataset

In [None]:
# Columns needed downstream: model features, the modeling target, and the raw price.
# dict.fromkeys() removes repeated labels while keeping their order. Without it,
# a Gold file lacking 'log_price' makes target_column == TARGET_VARIABLE, the
# label is selected twice, and df_model[target_column] becomes a two-column
# DataFrame instead of a Series, which breaks the split and training cells.
required_columns = list(dict.fromkeys(all_features + [target_column, TARGET_VARIABLE]))
missing_columns = [col for col in required_columns if col not in df_gold.columns]
if missing_columns:
    raise ValueError(f"Missing required columns in Gold dataset: {missing_columns}")

# Work on a copy so df_gold stays untouched; rows without a target or price are unusable
df_model = df_gold[required_columns].copy()
df_model = df_model.dropna(subset=list(dict.fromkeys([target_column, TARGET_VARIABLE])))

# Categorical features: missing values become the explicit level 'Unknown', all values as str
for col in categorical_features:
    df_model[col] = df_model[col].fillna('Unknown').astype(str)

# Numeric features: non-numeric entries are coerced to NaN, then filled with the column median.
# NOTE(review): the median is computed on all rows, i.e. before the train/test
# split, so test rows influence the imputed values — consider imputing after the split.
for col in numeric_features:
    df_model[col] = pd.to_numeric(df_model[col], errors='coerce')
    df_model[col] = df_model[col].fillna(df_model[col].median())

null_counts = df_model.isnull().sum()
print(f"Dataset ready with shape: {df_model.shape}")
print(f"Remaining nulls:\n{null_counts[null_counts > 0]}")

## Step 4: Train-Test Split

In [None]:
# Feature matrix, modeling target, and the raw price kept for price-scale evaluation
X = df_model[all_features]
y = df_model[target_column]
price_series = df_model[TARGET_VARIABLE]

# All three objects go through one call so their rows are split identically
(
    X_train, X_test,
    y_train, y_test,
    price_train, price_test,
) = train_test_split(
    X, y, price_series,
    test_size=MODEL_CONFIG['test_size'],
    random_state=RANDOM_SEED,
)

print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")
print(f"Split ratio: {MODEL_CONFIG['test_size']*100:.0f}% test")

# Positional indices of the categorical columns, as passed to CatBoost below
cat_feature_indices = []
for col in categorical_features:
    if col in X_train.columns:
        cat_feature_indices.append(X_train.columns.get_loc(col))
print(f"\nCategorical feature indices: {len(cat_feature_indices)}")

## Step 5: Train CatBoost Model

In [None]:
# Early stopping needs an evaluation set, but using the test set for it would
# tune the number of trees on the very data that is later reported as held-out.
# A validation split is therefore carved out of the training data instead; the
# configured test fraction is reused as the validation fraction.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=MODEL_CONFIG['test_size'],
    random_state=RANDOM_SEED
)

# Initialize CatBoost model with config
model = CatBoostRegressor(
    iterations=MODEL_CONFIG['catboost']['iterations'],
    learning_rate=MODEL_CONFIG['catboost']['learning_rate'],
    depth=MODEL_CONFIG['catboost']['depth'],
    l2_leaf_reg=MODEL_CONFIG['catboost']['l2_leaf_reg'],
    random_seed=MODEL_CONFIG['catboost']['random_seed'],
    cat_features=cat_feature_indices,
    verbose=100,
    eval_metric='RMSE',
    early_stopping_rounds=50
)

print("Training CatBoost model...")
print(f"Configuration: {MODEL_CONFIG['catboost']}")
print(f"Fit rows: {len(X_fit):,} | early-stopping validation rows: {len(X_val):,}")

# Train model; X_test / y_test stay untouched until the evaluation cell
model.fit(
    X_fit, y_fit,
    eval_set=(X_val, y_val),
    plot=False
)

print("\n✓ Model training complete!")

## Step 6: Model Evaluation

In [None]:
# Predictions on the scale the model was trained on (target_column)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

if target_column != TARGET_VARIABLE:
    # Model was fit on a separate target column; np.exp maps it back to price,
    # which assumes that column holds natural-log prices — TODO confirm.
    y_train_price, y_test_price = price_train, price_test
    y_train_pred_price, y_test_pred_price = np.exp(y_train_pred), np.exp(y_test_pred)
    y_train_log, y_test_log = y_train, y_test
    y_train_pred_log, y_test_pred_log = y_train_pred, y_test_pred
else:
    # Model was fit directly on price; log-scale values are derived from it,
    # flooring at 1 so the logarithm is never taken of a value below 1.
    y_train_price, y_test_price = y_train, y_test
    y_train_pred_price, y_test_pred_price = y_train_pred, y_test_pred
    y_train_log = np.log(np.maximum(y_train_price, 1))
    y_test_log = np.log(np.maximum(y_test_price, 1))
    y_train_pred_log = np.log(np.maximum(y_train_pred_price, 1))
    y_test_pred_log = np.log(np.maximum(y_test_pred_price, 1))

def summarize_metrics(y_true, y_pred, label: str) -> dict:
    """Print and return RMSE, MAE and R² for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, on the same scale as ``y_true``.
        label: Heading printed above the metric lines.

    Returns:
        dict with the keys ``'RMSE'``, ``'MAE'`` and ``'R2'``.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    print(f"\n{label} Metrics:")
    print(f"  RMSE: {rmse:,.4f}")
    print(f"  MAE:  {mae:,.4f}")
    print(f"  R²:   {r2:.4f}")

    return dict(RMSE=rmse, MAE=mae, R2=r2)

# Metrics on the log scale
train_metrics_log = summarize_metrics(y_train_log, y_train_pred_log, 'Training (log)')
test_metrics_log = summarize_metrics(y_test_log, y_test_pred_log, 'Test (log)')

# Metrics on the price scale
train_metrics_price = summarize_metrics(y_train_price, y_train_pred_price, 'Training (price)')
test_metrics_price = summarize_metrics(y_test_price, y_test_pred_price, 'Test (price)')

# A train R² more than 0.1 above the test R² is flagged as potential overfitting
overfit_detected = (train_metrics_price['R2'] - test_metrics_price['R2']) > 0.1
print("\nOverfitting check (price scale):")
print(f"  R² gap: {train_metrics_price['R2'] - test_metrics_price['R2']:.4f}")
print("  ⚠️ Potential overfitting" if overfit_detected else "  ✓ Generalization looks healthy")

# Test-set residuals (actual minus predicted) on both scales, reused by later cells
residuals_log = y_test_log - y_test_pred_log
residuals_price = y_test_price - y_test_pred_price

## Step 7: Prediction Visualization

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_fit, ax_resid = axes

# Left panel: predicted vs. actual price, with the identity line as reference
price_bounds = [y_test_price.min(), y_test_price.max()]
ax_fit.scatter(y_test_price, y_test_pred_price, alpha=0.5)
ax_fit.plot(price_bounds, price_bounds, 'r--', lw=2)
ax_fit.set(
    xlabel='Actual Price (EUR)',
    ylabel='Predicted Price (EUR)',
    title=f'Predicted vs Actual (Test)\nR² = {test_metrics_price["R2"]:.4f}',
)

# Right panel: residuals against the predicted price, with a zero reference line
ax_resid.scatter(y_test_pred_price, residuals_price, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set(
    xlabel='Predicted Price (EUR)',
    ylabel='Residuals (EUR)',
    title='Residual Plot',
)

for panel in (ax_fit, ax_resid):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nResidual Statistics (price scale):")
print(f"  Mean: {residuals_price.mean():.2f}")
print(f"  Std:  {residuals_price.std():.2f}")
print(f"  Min:  {residuals_price.min():.2f}")
print(f"  Max:  {residuals_price.max():.2f}")

## Step 8: Feature Importance Analysis

In [None]:
# Pair each training column with the importance reported by the fitted model, highest first
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)
top_features = feature_importance.head(20)

print("=== Top 20 Most Important Features ===")
display(top_features)

# Horizontal bar chart of the same top rows; the y axis is inverted so the
# most important feature is drawn at the top
bar_positions = range(len(top_features))
importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
importance_ax.barh(bar_positions, top_features['importance'])
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(top_features['feature'])
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Top 20 Feature Importances')
importance_ax.invert_yaxis()
plt.tight_layout()
plt.show()

## Step 9: Cross-Validation

In [None]:
# Perform cross-validation on the training data only
n_folds = 5
print(f"Performing {n_folds}-fold cross-validation...")

# Create Pool for CatBoost CV
cv_data = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_feature_indices
)

# CV parameters.
# 'loss_function' must be given explicitly: catboost.cv() rejects a params dict
# without it. RMSE also yields the 'test-RMSE-*' result columns read below.
# NOTE(review): 'iterations' is fixed at 500 here rather than taken from
# MODEL_CONFIG['catboost'] — confirm this is intentional.
cv_params = {
    'loss_function': 'RMSE',
    'iterations': 500,
    'learning_rate': MODEL_CONFIG['catboost']['learning_rate'],
    'depth': MODEL_CONFIG['catboost']['depth'],
    'l2_leaf_reg': MODEL_CONFIG['catboost']['l2_leaf_reg'],
    'random_seed': RANDOM_SEED,
    'verbose': False
}

# Run CV
cv_results = cv(
    cv_data,
    cv_params,
    fold_count=n_folds,
    shuffle=True,
    partition_random_seed=RANDOM_SEED,
    plot=False,
    verbose=False
)

# Display CV results (last row = metrics after the final iteration)
cv_rmse_mean = cv_results['test-RMSE-mean']
cv_rmse_std = cv_results['test-RMSE-std']
print(f"\nCross-Validation Results:")
print(f"  Mean RMSE: {cv_rmse_mean.iloc[-1]:.4f}")
print(f"  Std RMSE:  {cv_rmse_std.iloc[-1]:.4f}")

# Plot CV learning curve: fold-mean RMSE with a +/- one standard deviation band
cv_steps = range(len(cv_results))
plt.figure(figsize=(10, 6))
plt.plot(cv_steps, cv_rmse_mean, label='CV Mean RMSE')
plt.fill_between(
    cv_steps,
    cv_rmse_mean - cv_rmse_std,
    cv_rmse_mean + cv_rmse_std,
    alpha=0.3
)
plt.xlabel('Iterations')
plt.ylabel('RMSE')
plt.title('Cross-Validation Learning Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Persist predictions
Store model-level prediction intervals for downstream analysis notebooks (price opportunity scans, KPI reporting).

In [None]:
# Interval half-width in log space: z * std of the test residuals (log scale).
# z = 1.645 corresponds to a two-sided 90% interval under a normal assumption.
ci_z = 1.645
residual_std_log = residuals_log.std(ddof=1)
pred_lower = np.exp(y_test_pred_log - ci_z * residual_std_log)
pred_upper = np.exp(y_test_pred_log + ci_z * residual_std_log)

# Descriptive listing columns to export; any that the Gold data lacks are skipped
base_cols = [
    'URL',
    'Title',
    'Model',
    'Series',
    'Year of construction',
    'Mileage_km',
    'price_in_eur',
    'Car location',
]
available_cols = [col for col in base_cols if col in df_gold.columns]
results_df = df_gold.loc[X_test.index, available_cols].copy()

# The under/overpriced flags must compare against the price the model was
# evaluated on. Keying them on the literal 'price_in_eur' raises a KeyError
# whenever that column is not exported, so use TARGET_VARIABLE and add it
# (aligned on the test index) when it is not among the exported columns.
if TARGET_VARIABLE not in results_df.columns:
    results_df[TARGET_VARIABLE] = y_test_price

results_df['pred_price'] = y_test_pred_price
results_df['pred_lower'] = pred_lower
results_df['pred_upper'] = pred_upper
results_df['residual_price'] = residuals_price
results_df['residual_log'] = residuals_log
results_df['model'] = 'catboost'
results_df['run_id'] = RUN_ID

# Listings whose actual price falls outside the prediction interval
underpriced_df = results_df[results_df[TARGET_VARIABLE] < results_df['pred_lower']].copy()
overpriced_df = results_df[results_df[TARGET_VARIABLE] > results_df['pred_upper']].copy()

# One workbook per run, one sheet per view, each sorted by price residual
prediction_file = RESULTS_MODEL_DIR / f"catboost_predictions_{RUN_ID}.xlsx"
with pd.ExcelWriter(prediction_file) as writer:
    results_df.sort_values('residual_price').to_excel(writer, sheet_name='all_results', index=False)
    underpriced_df.sort_values('residual_price').to_excel(writer, sheet_name='underpriced', index=False)
    overpriced_df.sort_values('residual_price').to_excel(writer, sheet_name='overpriced', index=False)

print(f"✓ Prediction export saved to {prediction_file}")
print(f"  Underpriced listings: {len(underpriced_df)}")
print(f"  Overpriced listings: {len(overpriced_df)}")

## Step 10: Save Trained Model

In [None]:
# Make sure the output folder exists before anything is written
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Every artifact of this run carries the same RUN_ID suffix
model_path = MODELS_DIR / f"catboost_model_{RUN_ID}.cbm"
importance_path = MODELS_DIR / f"feature_importance_{RUN_ID}.csv"
metrics_path = MODELS_DIR / f"model_metrics_{RUN_ID}.csv"

model.save_model(str(model_path))
feature_importance.to_csv(importance_path, index=False)

# Price-scale metrics, one row per metric with train and test side by side
metric_names = ['RMSE', 'MAE', 'R2']
metrics_df = pd.DataFrame({
    'metric': metric_names,
    'train': [train_metrics_price[name] for name in metric_names],
    'test': [test_metrics_price[name] for name in metric_names]
})
metrics_df.to_csv(metrics_path, index=False)

print(f"✓ Model saved to: {model_path}")
print(f"✓ Feature importance saved to: {importance_path}")
print(f"✓ Metrics saved to: {metrics_path}")

## Summary

✓ **CatBoost Model Training Complete**

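### Model Performance
Markdown cells do not evaluate Python expressions, so the values are not rendered here. Read them from the "Test (price)" output of Step 6 or from the saved metrics CSV:
- **Test R²:** `test_metrics_price['R2']`
- **Test RMSE:** `test_metrics_price['RMSE']` (EUR)
- **Test MAE:** `test_metrics_price['MAE']` (EUR)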

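### Files Saved
File names carry the run timestamp (`RUN_ID`); the exact paths are printed by the prediction export and Step 10 cells.
- Model: `catboost_model_<RUN_ID>.cbm`
- Feature Importance: `feature_importance_<RUN_ID>.csv`
- Metrics: `model_metrics_<RUN_ID>.csv`
- Predictions: `catboost_predictions_<RUN_ID>.xlsx`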

**Next Steps:**
- Compare with other models (Ridge, ElasticNet)
- Feed `model_predictions/catboost_predictions_*.xlsx` into analysis notebooks
- Deploy model in Streamlit app