# %% [markdown]
"""
# 🧠 Modélisation Prédictive - Performance Commerciale

**Objectifs** :
1. Prédire la probabilité de conversion par appel
2. Identifier les drivers clés de performance
3. Générer des recommandations actionnables

**Approche** :
- XGBoost (classification binaire)
- Feature Importance + SHAP Values
- Validation temporelle
"""

In [None]:
import sys
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (classification_report, roc_auc_score,
                             ConfusionMatrixDisplay, RocCurveDisplay)
from xgboost import XGBClassifier
import shap
import mlflow
import joblib

# Global configuration: matplotlib style, seaborn palette and MLflow tracking URI.
plt.style.use('ggplot')
sns.set_palette("husl")
# NOTE(review): assumes an MLflow tracking server is reachable at this local
# address — confirm before running the training and deployment cells.
mlflow.set_tracking_uri("http://localhost:5000")

In [None]:
## 2. Chargement des Données
# %%
# Relative path of the campaign CSV that is passed to load_data.
DATA_PATH = "../data/generated/campaign_data.csv"

def load_data(path: str) -> pd.DataFrame:
    """Read the raw call data and return it ordered by call date.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A DataFrame sorted by ``call_date`` in which ``region``, ``product``
        and ``script_version`` are categorical, ``converted`` is boolean and
        ``call_hour`` is a categorical copy of ``call_time``.
    """
    categorical_columns = ('region', 'product', 'script_version')
    raw = pd.read_csv(
        path,
        parse_dates=['call_date'],
        dtype=dict.fromkeys(categorical_columns, 'category'),
    )
    # `converted` keeps its column position; `call_hour` is appended last.
    prepared = raw.assign(
        converted=raw['converted'].astype('bool'),
        call_hour=raw['call_time'].astype('category'),
    )
    return prepared.sort_values(by='call_date')

# Load the dataset (load_data returns it sorted by call_date) and preview it.
df = load_data(DATA_PATH)
print(f"📊 Dimensions initiales : {df.shape}")
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter builtin — confirm if this is ever run as a plain script.
display(df.head(3))

In [None]:
## 3. Feature Engineering
# %%
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar, business and interaction features to the call data.

    The new columns are written onto ``df`` itself and the same frame is
    returned.

    Args:
        df: Call-level data with a datetime ``call_date`` column plus
            ``call_time``, ``duration``, ``script_version`` and ``region``.

    Returns:
        The input frame with ``day_of_week``, ``is_weekend``, ``peak_hour``,
        ``duration_min`` and ``script_region`` added.
    """
    # Calendar features
    df['day_of_week'] = df['call_date'].dt.day_name()
    df['is_weekend'] = df['call_date'].dt.weekday >= 5  # 5 = Saturday, 6 = Sunday

    # Business features
    # NOTE(review): assumes call_time is the hour of day as a number — confirm.
    # between() is inclusive on both ends, so 14, 15 and 16 all count as peak.
    df['peak_hour'] = df['call_time'].between(14, 16)
    # NOTE(review): presumably converts seconds to minutes — confirm the unit.
    df['duration_min'] = df['duration'] / 60

    # Interaction feature. These columns are read as 'category' by load_data,
    # and categorical Series do not support `+`; casting to object makes the
    # concatenation work while leaving missing values missing.
    df['script_region'] = (
        df['script_version'].astype(object) + '_' + df['region'].astype(object)
    )
    return df

# Enrich the loaded frame with the engineered columns.
df = create_features(df)

In [None]:
# %% [markdown]
## 4. Préparation des Données
# %%
# Label column and the model input columns.
TARGET = 'converted'
FEATURES = [
    'duration_min', 'peak_hour', 'region',
    'product', 'script_version', 'day_of_week',
]

# Chronological hold-out: load_data returns rows sorted by call_date, so the
# first 80 % of rows form the training set and the remaining 20 % the test set.
train_size = int(len(df) * 0.8)
X_train = df[FEATURES].iloc[:train_size]
X_test = df[FEATURES].iloc[train_size:]
y_train = df[TARGET].iloc[:train_size]
y_test = df[TARGET].iloc[train_size:]


In [None]:
# %% [markdown]
## 5. Pipeline de Préprocessing
# %%
# Scale the numeric column, one-hot encode the categorical ones, and pass any
# remaining column (peak_hour, given FEATURES) through unchanged.
preprocessor = make_column_transformer(
    (StandardScaler(), ['duration_min']),
    (
        OneHotEncoder(handle_unknown='ignore', drop='if_binary'),
        ['region', 'product', 'script_version', 'day_of_week'],
    ),
    remainder='passthrough',
)

# Fit once on the training split so the encoder can report its output columns.
preprocessor.fit(X_train)
ohe = preprocessor.named_transformers_['onehotencoder']
# Hand-built labels in the order the transformers are declared above:
# scaled duration, one-hot columns, then the passthrough remainder.
feature_names = ['duration_scaled', *ohe.get_feature_names_out(), 'peak_hour']

In [None]:
# %% [markdown]
## 6. Entraînement du Modèle
# %%
def train_model(X, y):
    """Grid-search an XGBoost pipeline using time-ordered cross-validation.

    The search and its results (best parameters, best CV AUC, best model) are
    logged to MLflow inside a single run.

    Args:
        X: Training features; rows are expected in chronological order because
            the folds come from ``TimeSeriesSplit``.
        y: Binary target aligned with ``X``.

    Returns:
        The best pipeline (preprocessor + classifier) found by the search.
    """
    # No `early_stopping_rounds` on the classifier: XGBoost requires an
    # `eval_set` in fit() for early stopping, and neither Pipeline nor
    # GridSearchCV supplies one here, so every fit would fail.
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
        )),
    ])

    param_grid = {
        'classifier__max_depth': [3, 5],
        'classifier__learning_rate': [0.05, 0.1],
        'classifier__subsample': [0.8, 1.0],
    }

    # Each fold trains on earlier rows and validates on later ones.
    cv = TimeSeriesSplit(n_splits=3)

    with mlflow.start_run():
        search = GridSearchCV(
            pipe,
            param_grid,
            cv=cv,
            scoring='roc_auc',
            verbose=2,
        )
        search.fit(X, y)

        # MLflow logging
        mlflow.log_params(search.best_params_)
        mlflow.log_metric("best_auc", search.best_score_)
        mlflow.sklearn.log_model(search.best_estimator_, "model")

        return search.best_estimator_

# Run the grid search on the training split only; the test split stays unseen.
best_model = train_model(X_train, y_train)

In [None]:
# %% [markdown]
## 7. Évaluation
# %%
def evaluate_model(model, X, y):
    """Print classification metrics and draw the confusion matrix and ROC curve.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X: Feature rows to score.
        y: True labels aligned with ``X``.
    """
    predicted_labels = model.predict(X)
    # Second column of predict_proba is used as the score for the ROC/AUC.
    positive_scores = model.predict_proba(X)[:, 1]

    print(classification_report(y, predicted_labels))
    auc = roc_auc_score(y, positive_scores)
    print(f"\nAUC Score: {auc:.2f}")

    # Side-by-side panels: confusion matrix on the left, ROC curve on the right.
    _, (matrix_ax, roc_ax) = plt.subplots(1, 2, figsize=(12, 5))
    ConfusionMatrixDisplay.from_predictions(y, predicted_labels, ax=matrix_ax)
    RocCurveDisplay.from_predictions(y, positive_scores, ax=roc_ax)
    plt.tight_layout()

# Report metrics and plots on the chronological hold-out split.
print("🚀 Performances sur le Test Set :")
evaluate_model(best_model, X_test, y_test)

In [None]:
# %% [markdown]
## 8. Explicabilité avec SHAP
# %%
def explain_model(model, X):
    """Explain the classifier's predictions with SHAP plots.

    Args:
        model: Fitted pipeline with ``preprocessor`` and ``classifier`` steps.
        X: Raw (un-preprocessed) feature rows to explain.

    Returns:
        A ``(summary_figure, dependence_figure)`` tuple of matplotlib figures.
        They are drawn but not shown, so the caller decides how to display or
        save them.
    """
    # Data preparation: apply the same preprocessing the classifier was fit on.
    processed_data = model.named_steps['preprocessor'].transform(X)
    # The transformer can return a scipy sparse matrix when the one-hot block
    # dominates; densify it so the plotting code can index columns.
    if hasattr(processed_data, 'toarray'):
        processed_data = processed_data.toarray()

    explainer = shap.TreeExplainer(model.named_steps['classifier'])
    shap_values = explainer.shap_values(processed_data)

    # show=False: otherwise SHAP shows the figure itself, and the figure
    # returned to the caller may already have been displayed and closed.
    fig1 = plt.figure()
    shap.summary_plot(
        shap_values,
        processed_data,
        feature_names=feature_names,
        show=False,
    )

    # dependence_plot opens its own figure unless it is given an axis, which
    # would leave the returned figure blank; hand it an axis we own.
    fig2, dependence_ax = plt.subplots()
    shap.dependence_plot(
        'peak_hour',
        shap_values,
        processed_data,
        feature_names=feature_names,
        interaction_index=None,
        ax=dependence_ax,
        show=False,
    )

    return fig1, fig2

print("🔍 Interprétabilité du modèle :")
# Cap the sample at the size of the test set: DataFrame.sample raises when
# asked for more rows than exist. The fixed seed keeps the plots reproducible.
explain_model(
    best_model,
    X_test.sample(n=min(1000, len(X_test)), random_state=42),
)

In [None]:
# %% [markdown]
## 9. Déploiement
# %%
def deploy_model(model, version: str = "1.0.0"):
    """Save the model to disk and record the release in MLflow.

    Args:
        model: Fitted model object to serialise with joblib.
        version: Version label used in the file name and logged to MLflow.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Local save; create the target directory so a fresh checkout does not fail.
    model_path = Path("models") / f"model_v{version}.pkl"
    model_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, str(model_path))

    # Artifact logging
    with mlflow.start_run():
        # Nothing in this file writes the importance plot, so only log it when
        # it is actually present instead of aborting the deployment.
        artifact = Path("feature_importance.png")
        if artifact.is_file():
            mlflow.log_artifact(str(artifact))
        else:
            print(f"⚠️ Artefact introuvable, non journalisé : {artifact}")
        mlflow.log_param("version", version)

    print(f"✅ Modèle version {version} déployé avec succès")

# Export the tuned pipeline under the default version label.
deploy_model(best_model)

In [None]:
# %% [markdown]
## 10. Recommandations Stratégiques
# %%
def generate_recommendations(model, features):
    """Summarise the best script / time-slot / region combinations as Markdown.

    Args:
        model: Object exposing ``predict_proba``; the second column is used as
            the conversion probability.
        features: DataFrame accepted by ``model`` that contains the
            ``script_version``, ``peak_hour`` and ``region`` columns.

    Returns:
        A Markdown string listing up to five combinations ranked by mean
        predicted conversion probability.
    """
    # Score every row without touching the caller's frame.
    df_analysis = features.copy()
    df_analysis['pred_proba'] = model.predict_proba(features)[:, 1]

    # observed=True: with categorical keys, pandas can otherwise emit every
    # category combination, including unseen ones whose mean is NaN.
    recommendations = (
        df_analysis
        .groupby(['script_version', 'peak_hour', 'region'], observed=True)
        ['pred_proba'].mean()
        .dropna()
        .sort_values(ascending=False)
        .head(5)
        .reset_index()
    )

    # Markdown formatting
    parts = ["## 🎯 Top 5 des Stratégies Recommandées\n\n"]
    ranked_rows = enumerate(recommendations.itertuples(index=False), start=1)
    for rank, row in ranked_rows:
        # Describe the slot the group really belongs to; 14-16 is the window
        # used to define peak_hour.
        if row.peak_hour:
            slot = "entre **14h et 16h**"
        else:
            slot = "hors créneau **14h-16h**"
        parts.append(
            f"{rank}. **Script {row.script_version}** en **{row.region}** "
            f"{slot} : "
            f"Taux de conversion estimé à **{row.pred_proba * 100:.1f}%**\n\n"
        )

    return "".join(parts)

# Print the Markdown summary computed on the hold-out rows.
print(generate_recommendations(best_model, X_test))