In [1]:
import os 

In [2]:
# NOTE(review): machine-specific absolute path — this cell only runs as-is on the
# author's machine. The relative paths used later in the notebook
# (e.g. "artifacts/data_transformation/train.csv") resolve against this directory.
os.chdir(r"C:\Users\User\Desktop\3-IDSD\mlops\MLOPS\MlopsProject")

In [3]:
# Display the current working directory to confirm the chdir in the previous cell.
%pwd

'C:\\Users\\User\\Desktop\\3-IDSD\\mlops\\MLOPS\\MlopsProject'

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage."""
    # Directory under which the stage writes its artifacts
    # (feature_order.pkl, preprocessor.pkl, train.csv, test.csv).
    root_dir: Path
    # Location of the input CSV loaded by DataTransformation.
    data_path: Path

In [5]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # read_yaml is a project helper; its result is accessed by attribute below
        # (presumably an attribute-style mapping — confirm in mlProject.utils.common).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and ensure its output directory exists.

        Returns:
            DataTransformationConfig whose ``root_dir`` and ``data_path`` are real
            ``pathlib.Path`` objects, matching the dataclass' declared field types.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # The YAML values are wrapped in Path so the frozen dataclass actually holds
        # what its annotations promise; Path is accepted by os.path.join, os.makedirs
        # and pandas.read_csv, so existing consumers keep working.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
        )

In [7]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib

In [None]:
class Preprocessor:
    """Preprocessing for tabular features, fitted on the training split only.

    Numeric columns are median-imputed and clipped to the ``Q1 - 1.5*IQR`` /
    ``Q3 + 1.5*IQR`` fences; categorical columns are integer-encoded with one
    ``LabelEncoder`` per column. Every statistic is learned in
    :meth:`fit_transform` and reused unchanged by :meth:`transform`, so held-out
    data never influences its own preprocessing.

    Attributes:
        num_cols: Numeric columns to impute and clip (``drop_cols`` excluded).
        cat_cols: Categorical columns to label-encode (``drop_cols`` excluded).
        drop_cols: Columns removed from every frame before processing.
        label_encoders: Fitted ``LabelEncoder`` per categorical column.
        num_stats: Per numeric column, the training ``median``, ``lower`` and
            ``upper`` clipping bounds.
    """

    def __init__(self, num_cols, cat_cols, drop_cols=None):
        self.drop_cols = list(drop_cols) if drop_cols is not None else []
        # Exclude dropped columns up front so transform() never depends on
        # fit_transform() having pruned these lists first.
        self.num_cols = [c for c in num_cols if c not in self.drop_cols]
        self.cat_cols = [c for c in cat_cols if c not in self.drop_cols]
        self.label_encoders = {}
        self.num_stats = {}

    def fit_transform(self, X_train):
        """Learn the preprocessing statistics from ``X_train`` and apply them to it.

        Args:
            X_train: Training features; the input frame is not modified.

        Returns:
            A new DataFrame without ``drop_cols``, with numeric columns imputed and
            clipped and categorical columns label-encoded.
        """
        X_train = X_train.copy()

        # Remove unused columns
        X_train = X_train.drop(columns=self.drop_cols, errors="ignore")
        self.num_cols = [c for c in self.num_cols if c not in self.drop_cols]
        self.cat_cols = [c for c in self.cat_cols if c not in self.drop_cols]

        # Numeric: median imputation, then clip outliers to the IQR fences.
        # The statistics are stored so transform() can reuse them.
        self.num_stats = {}
        for col in self.num_cols:
            median = X_train[col].median()
            filled = X_train[col].fillna(median)
            q1 = filled.quantile(0.25)
            q3 = filled.quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            self.num_stats[col] = {"median": median, "lower": lower, "upper": upper}
            X_train[col] = filled.clip(lower, upper)

        # Categorical: fit one LabelEncoder per column
        for col in self.cat_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col])
            self.label_encoders[col] = le

        return X_train

    def transform(self, X_test):
        """Apply the statistics learned by :meth:`fit_transform` to new data.

        Args:
            X_test: Features to preprocess; the input frame is not modified.

        Returns:
            A new DataFrame processed with the training medians, clipping bounds
            and label encoders.

        Raises:
            RuntimeError: If numeric statistics are missing because
                :meth:`fit_transform` has not been called.
            KeyError: If a categorical column has no fitted encoder.
        """
        unfitted = [c for c in self.num_cols if c not in self.num_stats]
        if unfitted:
            raise RuntimeError(
                f"Preprocessor is not fitted for numeric columns {unfitted}; "
                "call fit_transform() on the training data first."
            )

        X_test = X_test.copy()

        # Remove unused columns
        X_test = X_test.drop(columns=self.drop_cols, errors="ignore")

        # Numeric: impute and clip with the TRAINING statistics. Recomputing them
        # on the incoming frame would leak test information and process train and
        # test data inconsistently.
        for col in self.num_cols:
            stats = self.num_stats[col]
            X_test[col] = X_test[col].fillna(stats["median"]).clip(
                stats["lower"], stats["upper"]
            )

        # Categorical: reuse the encoders fitted on the training data.
        # NOTE(review): LabelEncoder.transform raises on labels unseen during fit.
        for col in self.cat_cols:
            le = self.label_encoders[col]
            X_test[col] = le.transform(X_test[col])

        return X_test

    def save(self, filepath="preprocessor.pkl"):
        """Persist the fitted state as a plain dict using ``joblib.dump``.

        Args:
            filepath: Destination file for the serialized dict.
        """
        preprocessor_dict = {
            "num_cols": self.num_cols,
            "cat_cols": self.cat_cols,
            "drop_cols": self.drop_cols,
            "label_encoders": self.label_encoders,
            # Training medians and clipping bounds, so that consumers of the saved
            # dict can reproduce the numeric preprocessing.
            "num_stats": self.num_stats,
        }
        joblib.dump(preprocessor_dict, filepath)
        print(f"Preprocessor sauvegardé sous forme de dict dans {filepath}")


class DataTransformation:
    """Split the raw dataset into train/test sets and preprocess both.

    The CSV at ``config.data_path`` is loaded in the constructor;
    :meth:`transform_and_split` writes its artifacts under ``config.root_dir``.
    """

    # Default column roles, used when transform_and_split() gets no overrides.
    DEFAULT_NUM_COLS = (
        "Age", "Session_Duration_Avg", "Pages_Per_Session", "Wishlist_Items",
        "Days_Since_Last_Purchase", "Discount_Usage_Rate", "Returns_Rate",
        "Email_Open_Rate", "Customer_Service_Calls", "Product_Reviews_Written",
        "Social_Media_Engagement_Score", "Mobile_App_Usage",
        "Payment_Method_Diversity", "Credit_Balance",
    )
    DEFAULT_CAT_COLS = ("Gender", "Country", "City", "Signup_Quarter")
    DEFAULT_DROP_COLS = ("Gender", "Signup_Quarter", "Country")

    def __init__(self, config, target: str):
        """Store the settings and load the dataset.

        Args:
            config: Object exposing ``data_path`` (input CSV) and ``root_dir``
                (output directory).
            target: Name of the label column in the dataset.
        """
        self.config = config
        self.target = target
        self.df = pd.read_csv(self.config.data_path)

    def transform_and_split(self, test_size=0.20, random_state=42,
                            num_cols=None, cat_cols=None, drop_cols=None):
        """Split the data, fit the preprocessor on the train split and persist everything.

        Files written under ``config.root_dir``: ``feature_order.pkl``,
        ``preprocessor.pkl``, ``train.csv`` and ``test.csv``.

        Args:
            test_size: Fraction of rows assigned to the test split.
            random_state: Seed passed to ``train_test_split``.
            num_cols: Numeric columns to impute/clip; defaults to ``DEFAULT_NUM_COLS``.
            cat_cols: Categorical columns to encode; defaults to ``DEFAULT_CAT_COLS``.
                Columns also listed in ``drop_cols`` are excluded.
            drop_cols: Columns to remove; defaults to ``DEFAULT_DROP_COLS``.

        Returns:
            Tuple ``(X_train_prep, X_test_prep, y_train, y_test)``; both preprocessed
            frames include the target column.

        Raises:
            KeyError: If the target column is not present in the dataset.
        """
        drop_cols = list(self.DEFAULT_DROP_COLS if drop_cols is None else drop_cols)
        num_cols = list(self.DEFAULT_NUM_COLS if num_cols is None else num_cols)
        cat_candidates = self.DEFAULT_CAT_COLS if cat_cols is None else cat_cols
        # Categorical columns that are going to be dropped need no encoder.
        cat_cols = [c for c in cat_candidates if c not in drop_cols]

        if self.target not in self.df.columns:
            raise KeyError(
                f"Target column {self.target!r} not found in {self.config.data_path}"
            )

        # Split features/target
        X = self.df.drop(columns=[self.target])
        y = self.df[self.target]

        # Stratified train/test split keeps the class balance in both parts
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        # Save the feature order.
        # NOTE(review): this is the RAW column order, captured before drop_cols are
        # removed — confirm that consumers of feature_order.pkl expect that.
        feature_order = X_train.columns.tolist()
        os.makedirs(self.config.root_dir, exist_ok=True)
        joblib.dump(feature_order, os.path.join(self.config.root_dir, "feature_order.pkl"))

        # Preprocessing: fit on the train split, then apply to the test split
        preprocessor = Preprocessor(num_cols, cat_cols, drop_cols=drop_cols)
        X_train_prep = preprocessor.fit_transform(X_train)
        X_test_prep = preprocessor.transform(X_test)
        preprocessor.save(os.path.join(self.config.root_dir, "preprocessor.pkl"))

        # Attach the target positionally (.values) so the shuffled split index
        # cannot misalign labels; assign() returns a new frame.
        X_train_prep = X_train_prep.assign(**{self.target: y_train.values})
        X_test_prep = X_test_prep.assign(**{self.target: y_test.values})

        # Save CSVs
        X_train_prep.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        X_test_prep.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Data transformed and split into training and test sets")
        print(f"Train shape: {X_train_prep.shape}, Test shape: {X_test_prep.shape}")

        return X_train_prep, X_test_prep, y_train, y_test


In [9]:
# Run the data-transformation stage end to end: load the configuration, build the
# stage for the "Churned" target, then split, preprocess and persist the data.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()

    target_column = "Churned"
    data_transformation = DataTransformation(config=data_transformation_config, target=target_column)

    data_transformation.transform_and_split()
except Exception:
    # Record the failure through the project logger, then re-raise with a bare
    # `raise` so the original traceback is preserved.
    logger.exception("Data transformation stage failed")
    raise


[2025-12-25 18:56:02,985: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-12-25 18:56:02,987: INFO: common: yaml file: params.yaml loaded successfully]
[2025-12-25 18:56:02,989: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-12-25 18:56:02,990: INFO: common: created directory at: artifacts]
[2025-12-25 18:56:02,991: INFO: common: created directory at: artifacts/data_transformation]
Preprocessor sauvegardé dans artifacts/data_transformation\preprocessor.pkl
[2025-12-25 18:56:03,486: INFO: 623445119: Data transformed and split into training and test sets]
Train shape: (40000, 22), Test shape: (10000, 22)


In [10]:
import pandas as pd

# Reload the training split written by transform_and_split() to inspect it.
# The relative path resolves against the current working directory.
train = pd.read_csv("artifacts/data_transformation/train.csv")


In [11]:
# Preview the first rows of the saved training split.
train.head()

Unnamed: 0,Age,City,Membership_Years,Login_Frequency,Session_Duration_Avg,Pages_Per_Session,Cart_Abandonment_Rate,Wishlist_Items,Total_Purchases,Average_Order_Value,...,Returns_Rate,Email_Open_Rate,Customer_Service_Calls,Product_Reviews_Written,Social_Media_Engagement_Score,Mobile_App_Usage,Payment_Method_Diversity,Lifetime_Value,Credit_Balance,Churned
0,24.0,13,1.2,9.0,38.9,7.5,53.9,9.0,11.0,67.3,...,2.8,46.1,9.0,2.0,22.2,25.5,2.0,617.37,2187.0,0
1,49.0,27,3.3,28.0,54.95,18.2,12.2,12.0,31.0,116.97,...,3.4,42.4,3.0,6.0,72.6,41.7,1.0,3500.79,3173.0,0
2,56.0,12,1.7,9.0,26.8,11.5,67.6,3.0,13.0,172.88,...,5.0,12.4,7.0,4.0,26.2,19.2,2.0,2426.02,2582.0,1
3,67.5,18,1.8,11.0,44.2,13.4,36.0,4.0,24.7,93.46,...,8.6,16.2,8.0,8.0,13.7,26.9,4.0,1433.75,3789.0,0
4,47.0,14,4.9,12.0,33.4,12.5,60.6,9.0,15.0,133.1,...,2.5,39.1,2.0,4.0,42.3,21.2,2.0,2318.7,1540.0,0


In [12]:
# List the columns present in the saved training split.
train.columns

Index(['Age', 'City', 'Membership_Years', 'Login_Frequency',
       'Session_Duration_Avg', 'Pages_Per_Session', 'Cart_Abandonment_Rate',
       'Wishlist_Items', 'Total_Purchases', 'Average_Order_Value',
       'Days_Since_Last_Purchase', 'Discount_Usage_Rate', 'Returns_Rate',
       'Email_Open_Rate', 'Customer_Service_Calls', 'Product_Reviews_Written',
       'Social_Media_Engagement_Score', 'Mobile_App_Usage',
       'Payment_Method_Diversity', 'Lifetime_Value', 'Credit_Balance',
       'Churned'],
      dtype='object')

In [13]:
# Count missing values per column in the saved training split.
train.isnull().sum()

Age                              0
City                             0
Membership_Years                 0
Login_Frequency                  0
Session_Duration_Avg             0
Pages_Per_Session                0
Cart_Abandonment_Rate            0
Wishlist_Items                   0
Total_Purchases                  0
Average_Order_Value              0
Days_Since_Last_Purchase         0
Discount_Usage_Rate              0
Returns_Rate                     0
Email_Open_Rate                  0
Customer_Service_Calls           0
Product_Reviews_Written          0
Social_Media_Engagement_Score    0
Mobile_App_Usage                 0
Payment_Method_Diversity         0
Lifetime_Value                   0
Credit_Balance                   0
Churned                          0
dtype: int64