In [None]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import logging
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor, Pool, cv

# Expose the directory two levels above the working directory to the import
# system, so the project-local imports that follow can be resolved.
project_root = Path.cwd().parent.parent
root_entry = str(project_root)
if not any(entry == root_entry for entry in sys.path):
    sys.path.insert(0, root_entry)

from config import (DATA_DIR_GOLD, MODELS_DIR, MODEL_CATBOOST, 
                    MODEL_CONFIG, CATEGORICAL_FEATURES, TARGET_VARIABLE)
from elferspot_listings.utils.helpers import setup_logging, load_data

# Configure the project logger at INFO level and record that the notebook started
logger = setup_logging(level='INFO')
logger.info("CatBoost modeling initialized")

# Reproducibility: the seed lives in the shared model configuration and is
# applied to NumPy's global random state here.
RANDOM_SEED = MODEL_CONFIG['random_state']
np.random.seed(RANDOM_SEED)

## Step 1: Load Gold Data

In [None]:
# Collect every gold-layer workbook. The list is sorted by path, so the last
# entry is treated as the newest one.
# NOTE(review): this presumes the file names sort chronologically - confirm.
gold_files = sorted(DATA_DIR_GOLD.glob("listings_gold*.xlsx"))

# Fail fast when there is nothing to load
if not gold_files:
    raise FileNotFoundError(f"No gold files found in {DATA_DIR_GOLD}")

gold_path = gold_files[-1]
logger.info(f"Using gold file: {gold_path.name}")

# Read the selected workbook through the project helper
df_gold = load_data(gold_path)
print(f"✓ Loaded {len(df_gold):,} rows from gold layer")
print(f"  File: {gold_path.name}")

# Last expression of the cell: preview the first rows
df_gold.head()

## Step 2: Define Features for Modeling

In [None]:
# Candidate predictors, split into a numeric group and a categorical group
numeric_candidates = [
    'Mileage_km',
    'Mileage_sq',
    'log_mileage',
    'listing_score',
    'Year of construction'
]

categorical_candidates = [
    'Series',
    'model_category',
    'Transmission',
    'Drive',
    'Ready to drive',
    'Car location',
    'Matching numbers',
    'Interior color',
    'Paint-to-Sample (PTS)',
    'is_fully_restored',
    'owners_known'
]

# Keep only the candidates that are actually present in the gold table;
# absent columns are dropped silently.
gold_columns = df_gold.columns
numeric_features = [name for name in numeric_candidates if name in gold_columns]
categorical_features = [name for name in categorical_candidates if name in gold_columns]

all_features = [*numeric_features, *categorical_features]

# Use the log-price column as target when it exists, otherwise the EUR price
if 'log_price' in gold_columns:
    target = 'log_price'
else:
    target = 'price_in_eur'

print(f"Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"\nCategorical features ({len(categorical_features)}): {categorical_features}")
print(f"\nTarget variable: {target}")

## Step 3: Prepare Modeling Dataset

In [None]:
# Restrict to the modeling columns and discard rows that have no target value
df_model = df_gold[all_features + [target]].dropna(subset=[target]).copy()

# Report the shape and the per-column missing counts (non-zero entries only)
missing_counts = df_model.isnull().sum()
print(f"Dataset shape: {df_model.shape}")
print(f"Missing values:\n{missing_counts[missing_counts > 0]}")

# Categorical gaps become an explicit 'Unknown' level; every value is then
# converted to a string.
for col in (c for c in categorical_features if c in df_model.columns):
    df_model[col] = df_model[col].fillna('Unknown').astype(str)

# Numeric gaps are filled with the column median.
# NOTE(review): the median is taken over all rows, before the train/test
# split in the next step, so held-out rows contribute to the fill value.
for col in (c for c in numeric_features if c in df_model.columns):
    column_median = df_model[col].median()
    df_model[col] = df_model[col].fillna(column_median)

print(f"\n✓ Prepared {len(df_model):,} samples for modeling")

## Step 4: Train-Test Split

In [None]:
# Separate the predictors from the target column
X = df_model[all_features]
y = df_model[target]

# Hold out a test partition; its size and the seed come from the configuration
holdout_fraction = MODEL_CONFIG['test_size']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=holdout_fraction, random_state=RANDOM_SEED
)

print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")
print(f"Split ratio: {holdout_fraction*100:.0f}% test")

# Positional indices of the categorical columns, in categorical_features order
train_columns = X_train.columns
cat_feature_indices = []
for name in categorical_features:
    if name in train_columns:
        cat_feature_indices.append(train_columns.get_loc(name))
print(f"\nCategorical feature indices: {len(cat_feature_indices)}")

## Step 5: Train CatBoost Model

In [None]:
# Initialize CatBoost model with config
model = CatBoostRegressor(
    iterations=MODEL_CONFIG['catboost']['iterations'],
    learning_rate=MODEL_CONFIG['catboost']['learning_rate'],
    depth=MODEL_CONFIG['catboost']['depth'],
    l2_leaf_reg=MODEL_CONFIG['catboost']['l2_leaf_reg'],
    random_seed=MODEL_CONFIG['catboost']['random_seed'],
    cat_features=cat_feature_indices,
    verbose=100,
    eval_metric='RMSE',
    early_stopping_rounds=50
)

print("Training CatBoost model...")
print(f"Configuration: {MODEL_CONFIG['catboost']}")

# Early stopping selects the number of trees from whatever is passed as
# eval_set. Passing the test set there would tune the model on the very data
# used for the final evaluation and make the reported test metrics
# optimistic, so a validation split is carved out of the training data
# instead and the test set stays untouched until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=MODEL_CONFIG['test_size'],
    random_state=RANDOM_SEED
)
print(f"Early-stopping validation set: {len(X_val):,} samples (held out from the training set)")

# Train model
model.fit(
    X_fit, y_fit,
    eval_set=(X_val, y_val),
    plot=False
)

print("\n✓ Model training complete!")

## Step 6: Model Evaluation

In [None]:
# Score both partitions with the fitted model (training first, then test)
y_train_pred, y_test_pred = (model.predict(part) for part in (X_train, X_test))

# Calculate metrics
def calculate_metrics(y_true, y_pred, label=''):
    """Compute RMSE, MAE and R² for a set of predictions and print them.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        label: Text placed in front of "Metrics:" in the printed header.

    Returns:
        dict with the keys ``'RMSE'``, ``'MAE'`` and ``'R2'``.
    """
    scores = {
        # RMSE is the square root of the mean squared error
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }

    print(f"\n{label} Metrics:")
    print(f"  RMSE: {scores['RMSE']:,.2f}")
    print(f"  MAE:  {scores['MAE']:,.2f}")
    print(f"  R²:   {scores['R2']:.4f}")

    return scores

train_metrics = calculate_metrics(y_train, y_train_pred, 'Training')
test_metrics = calculate_metrics(y_test, y_test_pred, 'Test')

# Heuristic overfitting signal: training R² exceeding test R² by more than 0.1
r2_gap = train_metrics['R2'] - test_metrics['R2']
print(f"\nOverfitting check:")
print(f"  R² difference: {r2_gap:.4f}")
verdict = (
    "  ⚠️ Possible overfitting detected"
    if r2_gap > 0.1
    else "  ✓ Model generalization looks good"
)
print(verdict)

## Step 7: Prediction Visualization

In [None]:
# Two side-by-side panels: fit quality on the left, residuals on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_fit, ax_resid = axes

# Left panel: test-set predictions against actual values, with the identity
# line drawn across the range of the actual values for reference
actual_range = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_test_pred, alpha=0.5)
ax_fit.plot(actual_range, actual_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual')
ax_fit.set_ylabel('Predicted')
ax_fit.set_title(f'Predicted vs Actual (Test)\nR² = {test_metrics["R2"]:.4f}')
ax_fit.grid(True, alpha=0.3)

# Right panel: residuals (actual minus predicted) against the prediction
residuals = y_test - y_test_pred
ax_resid.scatter(y_test_pred, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel('Predicted')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residual Plot')
ax_resid.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics of the test residuals
print(f"\nResidual Statistics:")
for stat_label, stat_value in (
    ("Mean:", residuals.mean()),
    ("Std: ", residuals.std()),
    ("Min: ", residuals.min()),
    ("Max: ", residuals.max()),
):
    print(f"  {stat_label} {stat_value:.4f}")

## Step 8: Feature Importance Analysis

In [None]:
# Pair each training column with the importance reported by the fitted model
# and rank the pairs from most to least important
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("=== Top 20 Most Important Features ===")
top_features = feature_importance.head(20)
display(top_features)

# Horizontal bar chart of the same rows, most important feature at the top
importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
bar_positions = range(len(top_features))
importance_ax.barh(bar_positions, top_features['importance'])
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(top_features['feature'])
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Top 20 Feature Importances')
importance_ax.invert_yaxis()
plt.tight_layout()
plt.show()

## Step 9: Cross-Validation

In [None]:
# Perform cross-validation on the training partition only
n_folds = 5  # single source of truth for the message and the cv() call
print(f"Performing {n_folds}-fold cross-validation...")

# Create Pool for CatBoost CV
cv_data = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_feature_indices
)

# CV parameters.
# 'loss_function' must be present: catboost.cv() rejects a params dict
# without it, and the 'test-RMSE-*' result columns read below are named
# after the metric it selects.
cv_params = {
    'loss_function': 'RMSE',
    'iterations': 500,
    'learning_rate': MODEL_CONFIG['catboost']['learning_rate'],
    'depth': MODEL_CONFIG['catboost']['depth'],
    'l2_leaf_reg': MODEL_CONFIG['catboost']['l2_leaf_reg'],
    'random_seed': RANDOM_SEED,
    'verbose': False
}

# Run CV (returns one row per boosting iteration)
cv_results = cv(
    cv_data,
    cv_params,
    fold_count=n_folds,
    shuffle=True,
    partition_random_seed=RANDOM_SEED,
    plot=False,
    verbose=False
)

# Display CV results for the final iteration
cv_rmse_mean = cv_results['test-RMSE-mean']
cv_rmse_std = cv_results['test-RMSE-std']
print(f"\nCross-Validation Results:")
print(f"  Mean RMSE: {cv_rmse_mean.iloc[-1]:.4f}")
print(f"  Std RMSE:  {cv_rmse_std.iloc[-1]:.4f}")

# The last iteration is not necessarily the best one, so report the minimum too
best_position = int(cv_rmse_mean.values.argmin())
print(f"  Best mean RMSE: {cv_rmse_mean.iloc[best_position]:.4f} (iteration {best_position})")

# Plot CV learning curve with a +/- one standard deviation band
plt.figure(figsize=(10, 6))
plt.plot(cv_rmse_mean, label='CV Mean RMSE')
plt.fill_between(
    range(len(cv_results)),
    cv_rmse_mean - cv_rmse_std,
    cv_rmse_mean + cv_rmse_std,
    alpha=0.3
)
plt.xlabel('Iterations')
plt.ylabel('RMSE')
plt.title('Cross-Validation Learning Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Step 10: Save Trained Model

In [None]:
# Persist the model, its feature importances and its metrics.
# One timestamp is taken up front and shared by all three files, so the
# artifacts of a single run always carry the same suffix even when saving
# crosses a second boundary.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
run_stamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')

# Save model
model_path = MODELS_DIR / f"catboost_model_{run_stamp}.cbm"
model.save_model(str(model_path))
print(f"✓ Model saved to: {model_path}")

# Save feature importance
importance_path = MODELS_DIR / f"feature_importance_{run_stamp}.csv"
feature_importance.to_csv(importance_path, index=False)
print(f"✓ Feature importance saved to: {importance_path}")

# Save model metrics: one row per metric, train and test side by side
metric_names = ['RMSE', 'MAE', 'R2']
metrics_df = pd.DataFrame({
    'metric': metric_names,
    'train': [train_metrics[name] for name in metric_names],
    'test': [test_metrics[name] for name in metric_names]
})
metrics_path = MODELS_DIR / f"model_metrics_{run_stamp}.csv"
metrics_df.to_csv(metrics_path, index=False)
print(f"✓ Model metrics saved to: {metrics_path}")

## Summary

✓ **CatBoost Model Training Complete**

### Model Performance
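- **Test R²:** see the "Test Metrics" output of Step 6 (`test_metrics['R2']`)
- **Test RMSE:** see the "Test Metrics" output of Step 6 (`test_metrics['RMSE']`)
- **Test MAE:** see the "Test Metrics" output of Step 6 (`test_metrics['MAE']`)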

### Files Saved
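- Model: `catboost_model_<timestamp>.cbm` (full path printed in Step 10, held in `model_path`)
- Feature Importance: `feature_importance_<timestamp>.csv` (held in `importance_path`)
- Metrics: `model_metrics_<timestamp>.csv` (held in `metrics_path`)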

**Next Steps:**
- Compare with other models (Ridge, ElasticNet) in `04_model_comparison.ipynb`
- Use model for predictions in analysis notebooks
- Deploy model in Streamlit app