In [1]:
import os 

In [3]:
# Run from the project root so relative paths used later (config/config.yaml,
# params.yaml, schema.yaml, artifacts/...) resolve correctly.
# Set the MLOPS_PROJECT_ROOT environment variable to point at your own checkout;
# when it is unset, the original machine-specific location is used.
os.chdir(os.environ.get("MLOPS_PROJECT_ROOT", "C:/Users/User/Desktop/3-IDSD/mlops/MLOPS/MlopsProject"))

In [4]:
%pwd

'C:\\Users\\User\\Desktop\\3-IDSD\\mlops\\MLOPS\\MlopsProject'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    Instances are frozen: fields cannot be reassigned after construction.
    """
    root_dir: Path  # output directory of the training stage (trained models are written here)
    train_data_path: Path  # location of the training split
    test_data_path: Path  # location of the test split
    model_name: str  # model name as given in the `model_trainer` config section
    target_column: str  # name of the label column to separate from the features
    all_params: dict  # hyperparameters of every model


In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and exposes stage-specific configuration.

    On construction the config, params and schema files are read and the
    top-level artifacts directory is created.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the hyperparameter YAML.
            schema_filepath: Path of the data-schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the training-stage configuration and create its output directory.

        Returns:
            A ModelTrainerConfig filled from the `model_trainer` config section,
            the `TARGET_COLUMN` schema entry and the complete params mapping.
        """
        trainer_section = self.config.model_trainer
        # Expected to hold one section per model (RandomForest, XGBoost,
        # LightGBM, CatBoost) according to the original author's note.
        hyperparameters = self.params
        target_section = self.schema.TARGET_COLUMN

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            train_data_path=trainer_section.train_data_path,
            test_data_path=trainer_section.test_data_path,
            model_name=trainer_section.model_name,
            target_column=target_section.name,
            all_params=hyperparameters,  # every model's parameters, unfiltered
        )

    def get_all_model_params(self) -> dict:
        """Return the complete contents of the params YAML as loaded."""
        return self.params




In [8]:
import pandas as pd
import os
import joblib
from mlProject import logger
from imblearn.over_sampling import SMOTE

# Modèles
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [9]:
class SMOTEHandler:
    """Small wrapper around imblearn's SMOTE with a fixed random seed."""

    def __init__(self, random_state=42):
        """Remember the seed handed to every SMOTE instance this handler creates."""
        self.random_state = random_state

    def fit_resample(self, X_train, y_train):
        """Oversample the given training data with SMOTE.

        A new SMOTE object is created on each call, seeded with
        `self.random_state`.

        Args:
            X_train: Training features.
            y_train: Training labels.

        Returns:
            Tuple `(X_resampled, y_resampled)` as produced by SMOTE.
        """
        sampler = SMOTE(random_state=self.random_state)
        features_balanced, labels_balanced = sampler.fit_resample(X_train, y_train)
        return features_balanced, labels_balanced


class ModelTrainer:
    """Trains the baseline classifiers on a SMOTE-balanced training set and saves them.

    Four models are trained (RandomForest, XGBoost, LightGBM, CatBoost). Each one
    is written with joblib to `<config.root_dir>/<ModelName>.pkl`.
    """

    def __init__(self, config: "ModelTrainerConfig", all_params: dict, random_state: int = 42):
        """Store the configuration and prepare the SMOTE handler.

        Args:
            config: Training-stage settings (paths, target column, output directory).
            all_params: Mapping of model name -> hyperparameter mapping. A missing
                or empty section means "use the library defaults".
            random_state: Seed shared by SMOTE and every model (default 42).
        """
        self.config = config
        self.all_params = all_params  # hyperparameters for each model, keyed by model name
        self.random_state = random_state
        self.smote_handler = SMOTEHandler(random_state=random_state)

    @staticmethod
    def _merge_params(configured, defaults):
        """Return a new dict of `defaults` overridden by `configured` (which may be None)."""
        merged = dict(defaults)
        merged.update(configured or {})
        return merged

    def _params_for(self, model_name, **defaults):
        """Return the keyword arguments for one model: built-in defaults plus its configured section.

        Merging into a single dict, instead of passing `**section` next to explicit
        keywords, avoids a "multiple values for keyword argument" TypeError when the
        params file sets a keyword that also has a built-in default (for example
        `random_state`). Configured values take precedence over the defaults.
        """
        configured = (self.all_params or {}).get(model_name)
        return self._merge_params(configured, defaults)

    def _build_models(self):
        """Instantiate every baseline model with its merged hyperparameters."""
        seed = self.random_state
        return {
            'RandomForest': RandomForestClassifier(
                **self._params_for('RandomForest', random_state=seed, n_jobs=-1)
            ),
            'XGBoost': XGBClassifier(
                **self._params_for('XGBoost', random_state=seed, eval_metric='logloss')
            ),
            'LightGBM': LGBMClassifier(
                **self._params_for('LightGBM', random_state=seed)
            ),
            'CatBoost': CatBoostClassifier(
                **self._params_for('CatBoost', random_seed=seed, verbose=0)
            ),
        }

    def train(self):
        """Load the training split, balance it with SMOTE, then fit and save each model.

        The test split is not loaded here because nothing in this stage uses it.
        """
        # 1. Load the training data
        train_data = pd.read_csv(self.config.train_data_path)

        # 2. Split features / target
        X_train = train_data.drop([self.config.target_column], axis=1)
        y_train = train_data[self.config.target_column]

        # 3. Apply SMOTE to the training set only
        X_train_res, y_train_res = self.smote_handler.fit_resample(X_train, y_train)

        logger.info("Target distribution AFTER SMOTE:")
        logger.info(pd.Series(y_train_res).value_counts().to_string())
        logger.info(f"Train before SMOTE: {X_train.shape}")
        logger.info(f"Train after SMOTE: {X_train_res.shape}")

        # 4. Build all models up front so an invalid hyperparameter fails
        #    before any model has been fitted
        baseline_models = self._build_models()

        # 5. Train and persist every model
        os.makedirs(self.config.root_dir, exist_ok=True)
        for name, model in baseline_models.items():
            logger.info(f"Training {name}...")
            model.fit(X_train_res, y_train_res)
            model_path = os.path.join(self.config.root_dir, f"{name}.pkl")
            joblib.dump(model, model_path)
            logger.info(f"{name} saved at {model_path}")


In [10]:
# Build the configuration, then train and save every baseline model.
# Errors are left to propagate unchanged so the notebook shows the original traceback.
config = ConfigurationManager()
model_trainer_config_obj = config.get_model_trainer_config()  # trainer-specific settings
all_params = config.get_all_model_params()  # hyperparameters of every model from the params YAML

model_trainer = ModelTrainer(
    config=model_trainer_config_obj,
    all_params=all_params
)
model_trainer.train()


[2025-12-24 20:20:45,797: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-12-24 20:20:45,800: INFO: common: yaml file: params.yaml loaded successfully]
[2025-12-24 20:20:45,803: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-12-24 20:20:45,803: INFO: common: created directory at: artifacts]
[2025-12-24 20:20:45,803: INFO: common: created directory at: artifacts/model_trainer]
[2025-12-24 20:20:46,049: INFO: 144711623: Target distribution AFTER SMOTE:]
[2025-12-24 20:20:46,053: INFO: 144711623: Churned
0    28440
1    28440]
[2025-12-24 20:20:46,054: INFO: 144711623: Train before SMOTE: (40000, 21)]
[2025-12-24 20:20:46,054: INFO: 144711623: Train after SMOTE: (56880, 21)]
[2025-12-24 20:20:46,056: INFO: 144711623: Training RandomForest...]
[2025-12-24 20:20:48,462: INFO: 144711623: RandomForest saved at artifacts/model_trainer\RandomForest.pkl]
[2025-12-24 20:20:48,462: INFO: 144711623: Training XGBoost...]
[2025-12-24 20:20:49,292: INFO: 1447116