In [1]:
import os

In [2]:
%pwd

'd:\\New Projects\\Customer Churn Prediction - End to End (ML)\\Customer-Churn-Prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\New Projects\\Customer Churn Prediction - End to End (ML)\\Customer-Churn-Prediction'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [6]:
from CustomerChurn.constants import *
from CustomerChurn.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        
        create_directories([self.config.artifacts_root])
        
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
        )

        return data_transformation_config

In [9]:
import os
import pandas as pd
from typing import Union
from CustomerChurn import logger
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

In [12]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
    
    def load_data(self) -> pd.DataFrame:
        try:
            data = pd.read_csv(self.config.data_path)
            logger.info("Data loaded successfully.")
            return data
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise e
        
    def encode_data(self, data: pd.DataFrame) -> pd.DataFrame:
        try:
            binary_columns = ['Partner', 'Dependents', 'PaperlessBilling', 'Churn', 'PhoneService']
            data[binary_columns] = data[binary_columns].applymap(lambda x: 1 if x == 'Yes' else 0)
            
            data['gender'] = data['gender'].apply(lambda x: 1 if x == 'Female' else 0)

            data['MultipleLines'] = data['MultipleLines'].map({'No phone service': 0, 'No': 0, 'Yes': 1})

            internet_service_columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
            data[internet_service_columns] = data[internet_service_columns].replace({'No internet service': 0, 'No': 0, 'Yes': 1})

            categorical_columns = ['InternetService', 'Contract', 'PaymentMethod']
            data = pd.get_dummies(data, columns=categorical_columns, drop_first=True, dtype='int')

            return data

        except Exception as e:
            logger.error(f"Error during data encoding: {str(e)}")
            raise e

    def feature_engineering(self, data: pd.DataFrame) -> pd.DataFrame:
        try:
            data['TotalCharges_per_month'] = data['TotalCharges'] / (data['tenure'] + 1)  # Avoiding division by zero
            logger.info("Feature engineering complete.")
            return data
        
        except Exception as e:
            logger.error(f"Error in feature engineering: {str(e)}")
            raise e

    def data_balancing(self, data: pd.DataFrame, method: str) -> Union[pd.DataFrame, pd.Series]:
        try:
            X = data.drop('Churn', axis=1)
            y = data['Churn']
            
            if method == 'SMOTE':
                smote = SMOTE()
                X_res, y_res = smote.fit_resample(X, y)
                logger.info("Data balanced using SMOTE.")
            elif method == 'SMOTEENN':
                smoteenn = SMOTEENN()
                X_res, y_res = smoteenn.fit_resample(X, y)
                logger.info("Data balanced using SMOTEENN.")
            else:
                raise ValueError(f"Invalid method: {method}. Choose either 'SMOTE' or 'SMOTEENN'.")
            
            logger.info(f"Before balancing: {y.value_counts()}")
            logger.info(f"After balancing: {y_res.value_counts()}")
            
            balanced_data = pd.DataFrame(X_res)
            balanced_data['Churn'] = y_res.values
            return balanced_data

        except Exception as e:
            logger.error(f"Error during data balancing: {str(e)}")
            raise e

    def train_test_splitting(self, data: pd.DataFrame):
        
        try:
            train, test = train_test_split(data)

            if isinstance(train, pd.DataFrame) and isinstance(test, pd.DataFrame):
                train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
                test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

                logger.info("Data split into training and test sets.")
                logger.info(f"Train set shape: {train.shape}")
                logger.info(f"Test set shape: {test.shape}")
            else:
                raise ValueError("Train-test split did not return DataFrames.")

        except Exception as e:
            logger.error(f"Error during train-test splitting: {str(e)}")
            raise e
        

In [13]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data = data_transformation.load_data()
    data = data_transformation.encode_data(data=data)
    data = data_transformation.feature_engineering(data=data)
    balanced_data = data_transformation.data_balancing(data=data, method='SMOTE')
    data_transformation.train_test_splitting(data=balanced_data)
except Exception as e:
    logger.error(f"An error occurred during the transformation process: {e}")
    raise e

[2024-09-20 12:30:28,865] 32 CustomerChurnLogger - common - INFO - yaml file: config\config.yaml loaded successfully
[2024-09-20 12:30:28,869] 32 CustomerChurnLogger - common - INFO - yaml file: params.yaml loaded successfully
[2024-09-20 12:30:28,875] 32 CustomerChurnLogger - common - INFO - yaml file: schema.yaml loaded successfully
[2024-09-20 12:30:28,877] 52 CustomerChurnLogger - common - INFO - Created directory at: artifacts
[2024-09-20 12:30:28,879] 52 CustomerChurnLogger - common - INFO - Created directory at: artifacts/data_transformation
[2024-09-20 12:30:28,929] 8 CustomerChurnLogger - 2969946110 - INFO - Data loaded successfully.
[2024-09-20 12:30:28,984] 38 CustomerChurnLogger - 2969946110 - INFO - Feature engineering complete.
[2024-09-20 12:30:29,114] 53 CustomerChurnLogger - 2969946110 - INFO - Data balanced using SMOTE.
[2024-09-20 12:30:29,118] 61 CustomerChurnLogger - 2969946110 - INFO - Before balancing: Churn
0    5163
1    1869
Name: count, dtype: int64


  data[binary_columns] = data[binary_columns].applymap(lambda x: 1 if x == 'Yes' else 0)
  data[internet_service_columns] = data[internet_service_columns].replace({'No internet service': 0, 'No': 0, 'Yes': 1})


[2024-09-20 12:30:29,122] 62 CustomerChurnLogger - 2969946110 - INFO - After balancing: Churn
0    5163
1    5163
Name: count, dtype: int64
[2024-09-20 12:30:29,420] 81 CustomerChurnLogger - 2969946110 - INFO - Data split into training and test sets.
[2024-09-20 12:30:29,421] 82 CustomerChurnLogger - 2969946110 - INFO - Train set shape: (7744, 25)
[2024-09-20 12:30:29,423] 83 CustomerChurnLogger - 2969946110 - INFO - Test set shape: (2582, 25)
