In [4]:
import os 

In [5]:
# Switch the working directory to the project root so the relative paths used
# further down (e.g. "artifacts/...") resolve from there.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact directory exists.
os.chdir(r"C:\Users\User\Desktop\3-IDSD\mlops\MLOPS\MlopsProject")

In [6]:
%pwd

'C:\\Users\\User\\Desktop\\3-IDSD\\mlops\\MLOPS\\MlopsProject'

In [7]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage."""

    # Directory the stage writes its output files into.
    # NOTE(review): the value is taken straight from the loaded YAML config and
    # is not converted to Path in this notebook; confirm the runtime type.
    root_dir: Path
    # Location of the input CSV that the stage reads.
    data_path: Path

In [8]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the main configuration YAML.
            params_filepath: path of the parameters YAML.
            schema_filepath: path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the stage settings.

        Returns:
            A DataTransformationConfig filled from the ``data_transformation``
            section of the main configuration.
        """
        section = self.config.data_transformation
        stage_root = section.root_dir

        create_directories([stage_root])

        return DataTransformationConfig(
            root_dir=stage_root,
            data_path=section.data_path,
        )

In [15]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib

In [16]:
class Preprocessor:
    """Column-wise preprocessing learned on training data and replayed on new data.

    Numeric columns are imputed with the median and clipped to
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``; categorical columns are integer-encoded
    with one ``LabelEncoder`` per column. Every statistic is learned in
    ``fit_transform`` and reused unchanged by ``transform``, so new data never
    influences its own imputation value or clipping bounds.
    """

    def __init__(self, num_cols, cat_cols, drop_cols=None):
        """Store the column roles.

        Args:
            num_cols: names of the numeric columns to impute and clip.
            cat_cols: names of the categorical columns to label-encode.
            drop_cols: names of columns removed before any processing
                (defaults to no columns).
        """
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        self.label_encoders = {}
        self.drop_cols = drop_cols if drop_cols is not None else []
        # Training statistics per numeric column:
        # {column: {"median": ..., "lower": ..., "upper": ...}}
        self.num_stats = {}

    def fit_transform(self, X_train):
        """Learn the preprocessing statistics on ``X_train`` and apply them.

        Args:
            X_train: training feature frame; it is copied, not modified.

        Returns:
            A new frame without the dropped columns, with numeric columns
            imputed and clipped and categorical columns label-encoded.
        """
        X_train = X_train.copy()

        # Remove unwanted columns and forget them in the column-role lists.
        X_train = X_train.drop(columns=self.drop_cols, errors="ignore")
        self.num_cols = [c for c in self.num_cols if c not in self.drop_cols]
        self.cat_cols = [c for c in self.cat_cols if c not in self.drop_cols]

        # Numeric: median imputation, then IQR-based clipping of outliers.
        self.num_stats = {}
        for col in self.num_cols:
            median = X_train[col].median()
            filled = X_train[col].fillna(median)
            q1 = filled.quantile(0.25)
            q3 = filled.quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            # Keep the fitted values so transform() can replay them exactly.
            self.num_stats[col] = {"median": median, "lower": lower, "upper": upper}
            X_train[col] = filled.clip(lower, upper)

        # Categorical: one LabelEncoder per column, kept for transform().
        for col in self.cat_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col])
            self.label_encoders[col] = le

        return X_train

    def transform(self, X_test):
        """Apply the statistics learned by ``fit_transform`` to new data.

        Args:
            X_test: feature frame to preprocess; it is copied, not modified.

        Returns:
            A new frame preprocessed with the training median, clipping bounds
            and label encoders.

        Raises:
            ValueError: if a numeric column has no fitted statistics
                (``fit_transform`` was not called first).
            KeyError: if a categorical column has no fitted encoder.
        """
        X_test = X_test.copy()

        # Remove unwanted columns.
        X_test = X_test.drop(columns=self.drop_cols, errors="ignore")

        # Numeric: reuse the training median and bounds. Recomputing them on
        # X_test would make the result depend on the batch being transformed.
        for col in self.num_cols:
            if col not in self.num_stats:
                raise ValueError(
                    f"No fitted statistics for column '{col}'; "
                    "call fit_transform() before transform()."
                )
            stats = self.num_stats[col]
            X_test[col] = (
                X_test[col]
                .fillna(stats["median"])
                .clip(stats["lower"], stats["upper"])
            )

        # Categorical: encode with the encoders saved during fitting.
        # NOTE(review): LabelEncoder.transform raises on categories that were
        # not present in the training data.
        for col in self.cat_cols:
            le = self.label_encoders[col]
            X_test[col] = le.transform(X_test[col])

        return X_test

    def save(self, filepath="preprocessor.pkl"):
        """Dump the fitted state as a plain dict with joblib.

        Args:
            filepath: destination file of the dump.
        """
        joblib.dump({
            "num_cols": self.num_cols,
            "cat_cols": self.cat_cols,
            "label_encoders": self.label_encoders,
            "drop_cols": self.drop_cols,
            # Needed to reproduce the numeric preprocessing at inference time.
            "num_stats": self.num_stats,
        }, filepath)
        print(f"Preprocessor sauvegardé dans {filepath} !")

class DataTransformation:
    """Splits the configured dataset and writes preprocessed train/test CSVs."""

    def __init__(self, config, target: str):
        """Keep the stage settings and the target name, then load the input CSV.

        Args:
            config: object exposing ``data_path`` and ``root_dir`` attributes.
            target: name of the label column in the dataset.
        """
        self.config = config
        self.target = target
        self.df = pd.read_csv(self.config.data_path)

    def transform_and_split(self, test_size=0.20, random_state=42):
        """Split the data, preprocess both parts and persist every artefact.

        Files written under ``config.root_dir``: ``feature_order.pkl`` (column
        names of the raw training features, i.e. before any column is
        dropped), ``preprocessor.pkl``, ``train.csv`` and ``test.csv``.

        Args:
            test_size: fraction of rows assigned to the test split.
            random_state: seed forwarded to ``train_test_split``.

        Returns:
            ``(train_frame, test_frame, y_train, y_test)``; both frames already
            contain the target column as their last column.
        """
        # Column roles.
        numeric_features = [
            "Age", "Session_Duration_Avg", "Pages_Per_Session", "Wishlist_Items",
            "Days_Since_Last_Purchase", "Discount_Usage_Rate", "Returns_Rate",
            "Email_Open_Rate", "Customer_Service_Calls", "Product_Reviews_Written",
            "Social_Media_Engagement_Score", "Mobile_App_Usage",
            "Payment_Method_Diversity", "Credit_Balance",
        ]
        columns_to_drop = ["Gender", "Signup_Quarter", "Country"]
        # Only the categorical columns that survive the drop are encoded.
        categorical_features = [
            name
            for name in ["Gender", "Country", "City", "Signup_Quarter"]
            if name not in columns_to_drop
        ]

        # Separate the features from the target.
        features = self.df.drop(columns=[self.target])
        labels = self.df[self.target]

        # Stratified train/test split.
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            labels,
            test_size=test_size,
            random_state=random_state,
            stratify=labels,
        )

        out_dir = self.config.root_dir

        # Persist the order of the raw feature columns.
        raw_feature_order = X_train.columns.tolist()
        os.makedirs(out_dir, exist_ok=True)
        joblib.dump(raw_feature_order, os.path.join(out_dir, "feature_order.pkl"))

        # Fit on the training part only, then replay on the test part.
        preprocessor = Preprocessor(
            numeric_features, categorical_features, drop_cols=columns_to_drop
        )
        train_frame = preprocessor.fit_transform(X_train)
        test_frame = preprocessor.transform(X_test)
        preprocessor.save(os.path.join(out_dir, "preprocessor.pkl"))

        # Append the target by position (``.values`` discards the index).
        train_frame = train_frame.assign(**{self.target: y_train.values})
        test_frame = test_frame.assign(**{self.target: y_test.values})

        # Write both splits as CSV.
        train_frame.to_csv(os.path.join(out_dir, "train.csv"), index=False)
        test_frame.to_csv(os.path.join(out_dir, "test.csv"), index=False)

        logger.info("Data transformed and split into training and test sets")
        print(f"Train shape: {train_frame.shape}, Test shape: {test_frame.shape}")

        return train_frame, test_frame, y_train, y_test


In [17]:
# Run the data-transformation stage: load the configuration, then split and
# preprocess the dataset.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    
    # Name of the label column handed to DataTransformation as its target.
    target_column = "Churned"
    data_transformation = DataTransformation(config=data_transformation_config, target=target_column)
    
    # The returned frames are not captured here.
    data_transformation.transform_and_split()
except Exception as e:
    # The exception is re-raised as is; this handler adds no handling of its own.
    raise e


[2025-12-24 20:02:59,943: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-12-24 20:02:59,944: INFO: common: yaml file: params.yaml loaded successfully]
[2025-12-24 20:02:59,947: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-12-24 20:02:59,948: INFO: common: created directory at: artifacts]
[2025-12-24 20:02:59,949: INFO: common: created directory at: artifacts/data_transformation]
Preprocessor sauvegardé dans artifacts/data_transformation\preprocessor.pkl !
[2025-12-24 20:03:00,406: INFO: 3851889913: Data transformed and split into training and test sets]
Train shape: (40000, 22), Test shape: (10000, 22)


In [38]:
import pandas as pd

# Read back train.csv from the data-transformation artifacts directory
# (the path is relative to the current working directory).
# NOTE(review): the saved outputs of the inspection cells below list 108 columns
# (MinTemp, MaxTemp, ...), whereas the transformation run above reported a train
# shape of (40000, 22). They look stale; re-run after a kernel restart.
train = pd.read_csv("artifacts/data_transformation/train.csv")


In [39]:
train.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,Weekday
0,15.1,23.9,0.0,2.251292,4.219508,19.0,22.0,38.0,68.0,1001.9,...,False,False,False,False,False,False,True,False,False,2
1,9.7,14.2,2.151762,2.251292,3.931826,15.0,28.0,91.0,56.0,1008.2,...,False,False,False,False,False,False,True,False,False,0
2,13.2,25.4,0.0,2.282382,3.433987,6.0,17.0,79.0,63.0,1025.2,...,False,False,False,False,False,False,False,False,False,1
3,7.6,14.8,0.0,2.079442,4.394449,30.0,35.0,52.0,45.0,1004.6,...,False,False,False,False,False,False,False,True,False,1
4,12.9,22.2,0.0,2.186051,3.637586,15.0,20.0,69.0,52.0,1023.0,...,False,False,False,True,False,False,False,False,False,0


In [40]:
train.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Sunshine', 'WindGustSpeed',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am',
       ...
       'WindDir3pm_NW', 'WindDir3pm_S', 'WindDir3pm_SE', 'WindDir3pm_SSE',
       'WindDir3pm_SSW', 'WindDir3pm_SW', 'WindDir3pm_W', 'WindDir3pm_WNW',
       'WindDir3pm_WSW', 'Weekday'],
      dtype='object', length=108)

In [41]:
train.isnull().sum()

MinTemp           0
MaxTemp           0
Rainfall          0
Sunshine          0
WindGustSpeed     0
                 ..
WindDir3pm_SW     0
WindDir3pm_W      0
WindDir3pm_WNW    0
WindDir3pm_WSW    0
Weekday           0
Length: 108, dtype: int64