In [1]:
import os

In [2]:
%pwd

'd:\\AI Personal Projects\\Machine Learning Projects\\Customer Churn Prediction\\Customer-Churn-Prediction-\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\AI Personal Projects\\Machine Learning Projects\\Customer Churn Prediction\\Customer-Churn-Prediction-'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    balancing_method: str

In [6]:
from CustomerChurn.constants import *
from CustomerChurn.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        
        create_directories([self.config.artifacts_root])
        
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        params = self.params.DataTransformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            balancing_method=params.balancing_method
        )

        return data_transformation_config

In [8]:
import os
import pandas as pd
from typing import Union
from CustomerChurn import logger
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTEENN

In [9]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
    
    def load_data(self) -> pd.DataFrame:
        try:
            data = pd.read_csv(self.config.data_path)
            logger.info("Data loaded successfully.")
            return data
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise e

    def encode_data(self, data: pd.DataFrame) -> pd.DataFrame:
        try:
            binary_columns = ['Partner', 'Dependents', 'PaperlessBilling', 'Churn', 'PhoneService']
            data[binary_columns] = data[binary_columns].map(lambda x: 1 if x == 'Yes' else 0)
            
            data['gender'] = data['gender'].apply(lambda x: 1 if x == 'Female' else 0)

            data['MultipleLines'] = data['MultipleLines'].map({'No phone service': 0, 'No': 0, 'Yes': 1})

            internet_service_columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
            data[internet_service_columns] = data[internet_service_columns].replace({'No internet service': 0, 'No': 0, 'Yes': 1})

            categorical_columns = ['InternetService', 'Contract', 'PaymentMethod']
            data = pd.get_dummies(data, columns=categorical_columns, drop_first=True, dtype='int')

            return data

        except Exception as e:
            logger.error(f"Error during data encoding: {str(e)}")
            raise e

    def feature_engineering(self, data: pd.DataFrame) -> pd.DataFrame:
        try:
            data['avg_monthly_value'] = data['TotalCharges'] / (data['tenure'] + 1)
            data['tenure_ratio'] = data['tenure'] / (data['TotalCharges'] + 1)
            data['service_density'] = data['OnlineSecurity'] + data['OnlineBackup'] + data['TechSupport']
            data['tenure_MonthlyCharges'] = data['tenure'] * data['MonthlyCharges']
            logger.info("Feature engineering complete.")
            return data
        
        except Exception as e:
            logger.error(f"Error in feature engineering: {str(e)}")
            raise e

    def data_balancing(self, data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]:
        try:
            X = data.drop('Churn', axis=1)
            y = data['Churn']
            method = self.config.balancing_method
            
            if method == 'SMOTEENN':
                smoteenn = SMOTEENN()
                X_res, y_res = smoteenn.fit_resample(X, y)
                logger.info("Data balanced using SMOTEENN.")
            elif method == 'ADASYN':
                adasyn = ADASYN()
                X_res, y_res = adasyn.fit_resample(X, y)
                logger.info("Data balanced using ADASYN.")
            else:
                raise ValueError(f"Invalid method: {method}. Choose either 'SMOTEENN' or 'ADASYN'.")
            
            logger.info(f"Before balancing: {y.value_counts()}")
            logger.info(f"After balancing: {y_res.value_counts()}")
            
            balanced_data = pd.DataFrame(X_res, columns=X.columns)
            balanced_data['Churn'] = y_res.values
            return balanced_data

        except Exception as e:
            logger.error(f"Error during data balancing: {str(e)}")
            raise e

    def train_test_splitting(self, data: pd.DataFrame):
        
        try:
            train, test = train_test_split(data, test_size=0.2, random_state=42, stratify=data['Churn'])

            if isinstance(train, pd.DataFrame) and isinstance(test, pd.DataFrame):
                train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
                test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

                logger.info("Data split into training and test sets.")
                logger.info(f"Train set shape: {train.shape}")
                logger.info(f"Test set shape: {test.shape}")
                logger.info(f"Train CSV saved to: {os.path.join(self.config.root_dir, 'train.csv')}")
                logger.info(f"Test CSV saved to: {os.path.join(self.config.root_dir, 'test.csv')}")
            else:
                raise ValueError("Train-test split did not return DataFrames.")

        except Exception as e:
            logger.error(f"Error during train-test splitting: {str(e)}")
            raise e

In [10]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data = data_transformation.load_data()
    data = data_transformation.encode_data(data=data)
    data = data_transformation.feature_engineering(data=data)
    balanced_data = data_transformation.data_balancing(data=data)
    data_transformation.train_test_splitting(data=balanced_data)
except Exception as e:
    logger.error(f"An error occurred during the transformation process: {e}")
    raise e

[2025-07-19 00:49:29,045] 32 CustomerChurnLogger - common - INFO - yaml file: config\config.yaml loaded successfully
[2025-07-19 00:49:29,046] 32 CustomerChurnLogger - common - INFO - yaml file: params.yaml loaded successfully
[2025-07-19 00:49:29,061] 32 CustomerChurnLogger - common - INFO - yaml file: schema.yaml loaded successfully
[2025-07-19 00:49:29,062] 52 CustomerChurnLogger - common - INFO - Created directory at: artifacts
[2025-07-19 00:49:29,063] 52 CustomerChurnLogger - common - INFO - Created directory at: artifacts/data_transformation
[2025-07-19 00:49:29,363] 8 CustomerChurnLogger - 3657965283 - INFO - Data loaded successfully.
[2025-07-19 00:49:29,503] 41 CustomerChurnLogger - 3657965283 - INFO - Feature engineering complete.


  data[internet_service_columns] = data[internet_service_columns].replace({'No internet service': 0, 'No': 0, 'Yes': 1})


[2025-07-19 00:49:36,525] 61 CustomerChurnLogger - 3657965283 - INFO - Data balanced using ADASYN.
[2025-07-19 00:49:36,531] 65 CustomerChurnLogger - 3657965283 - INFO - Before balancing: Churn
0    5163
1    1869
Name: count, dtype: int64
[2025-07-19 00:49:36,532] 66 CustomerChurnLogger - 3657965283 - INFO - After balancing: Churn
1    5293
0    5163
Name: count, dtype: int64
[2025-07-19 00:49:36,670] 85 CustomerChurnLogger - 3657965283 - INFO - Data split into training and test sets.
[2025-07-19 00:49:36,671] 86 CustomerChurnLogger - 3657965283 - INFO - Train set shape: (8364, 28)
[2025-07-19 00:49:36,672] 87 CustomerChurnLogger - 3657965283 - INFO - Test set shape: (2092, 28)
[2025-07-19 00:49:36,673] 88 CustomerChurnLogger - 3657965283 - INFO - Train CSV saved to: artifacts/data_transformation\train.csv
[2025-07-19 00:49:36,673] 89 CustomerChurnLogger - 3657965283 - INFO - Test CSV saved to: artifacts/data_transformation\test.csv
