In [1]:
# Display the kernel's current working directory before anything changes it.
%pwd

'f:\\GitHub\\NLP-Emotion-Classification-End-to-End-Project\\NLP-Emotion-Classification-End-to-End-Project\\research'

In [2]:
import os
os.chdir("../")
%pwd

'f:\\GitHub\\NLP-Emotion-Classification-End-to-End-Project\\NLP-Emotion-Classification-End-to-End-Project'

In [20]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """
    Immutable set of directory settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager`` from the
    ``data_transformation`` section of the configuration file.
    NOTE(review): the values are passed through from the loaded YAML
    unchanged, so they are presumably plain strings rather than ``Path``
    objects - confirm before relying on ``Path`` methods.
    """

    # Directory of the stage; created by ConfigurationManager.
    root_dir: Path
    # Location the cleaned dataset is loaded from (``load_from_disk``).
    data_cleaned_dir: Path
    # Location the tokenized dataset is saved to (``save_to_disk``).
    transformation_dir: Path

In [21]:
from pathlib import Path

from src.emotionClassification.constants import *
from src.emotionClassification.utils.common import read_yaml_file, create_directories


class ConfigurationManager:
    """
    Load the project's configuration and parameter YAML files and build
    stage-specific configuration objects from them.
    """

    def __init__(
        self,
        config_file_path: Path = CONFIG_FILE_PATH,
        params_file_path: Path = PARAMS_FILE_PATH,
    ) -> None:
        """
        Read both YAML files and ensure the artifacts root directory exists.

        Args:
            config_file_path: Location of the configuration YAML file.
            params_file_path: Location of the parameters YAML file.
        """

        self.config = read_yaml_file(config_file_path)
        self.params = read_yaml_file(params_file_path)

        create_directories(filepath_list=[self.config.artifacts_root])

    def get_data_transformation_config_and_params(
        self,
    ) -> "tuple[DataTransformationConfig, ConfigBox]":
        """
        Build the data-transformation configuration and fetch the model parameters.

        The stage's ``root_dir`` and ``transformation_dir`` are created as a
        side effect.

        Returns:
            A 2-tuple ``(config, params)``: the ``DataTransformationConfig``
            filled from the ``data_transformation`` section of the config
            file, and the ``model_params`` section of the params file
            (consumed as a ``ConfigBox`` by ``DataTransformation``).
        """
        stage = self.config.data_transformation

        create_directories(filepath_list=[stage.root_dir, stage.transformation_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=stage.root_dir,
            data_cleaned_dir=stage.data_cleaned_dir,
            transformation_dir=stage.transformation_dir,
        )

        # Two values are returned on purpose: callers unpack config and params.
        return data_transformation_config, self.params.model_params


In [28]:
from typing import Any, Mapping

import datasets
from box import ConfigBox
from datasets import load_from_disk
from transformers import AutoTokenizer
from transformers import BatchEncoding

from src.emotionClassification.logging import logger
from src.emotionClassification.entity import DataTransformationConfig


class DataTransformation:
    """
    Tokenize the cleaned dataset and attach a label vector to every example.
    """

    def __init__(self, config: "DataTransformationConfig", params: "ConfigBox") -> None:
        """
        Store the settings and load the tokenizer.

        Args:
            config: Directory settings; ``data_cleaned_dir`` is read from and
                ``transformation_dir`` is written to.
            params: Model parameters; ``model_checkpoint`` selects the
                tokenizer and ``labels`` lists the label column names.
        """
        self.config = config
        self.params = params
        self.tokenizer = AutoTokenizer.from_pretrained(params.model_checkpoint)
        self.class_labels = params.labels

    def tokenize(self, batch: "Mapping[str, Any]") -> "BatchEncoding":
        """
        Tokenize one example and attach its label vector.

        Despite the parameter name, this is mapped with ``batched=False``, so
        ``batch`` is a single example (column name -> value), not a batch.

        Args:
            batch: One example holding the ``"Tweet"`` text and one entry per
                name in ``self.class_labels``.
        Returns:
            The tokenizer output for the tweet with an added ``"labels"`` key:
            a list of ints, one per class label, in ``self.class_labels`` order.
        Raises:
            KeyError: If ``"Tweet"`` or any label column is missing.
        """

        # One int per label column, in the configured label order.
        # NOTE(review): if training uses a multi-label loss that expects float
        # targets, a cast is needed downstream - confirm.
        labels = [int(batch[label]) for label in self.class_labels]

        # With a single text, padding=True (pad to the longest sequence in the
        # batch) has nothing to pad against, so sequences keep their own length.
        tokens = self.tokenizer(batch["Tweet"], padding=True, truncation=True)

        tokens.update({"labels": labels})

        return tokens

    def save_and_return_transformed_data(self) -> "datasets.DatasetDict":
        """
        Load the cleaned data, tokenize it, drop the raw columns, save and return it.

        The result keeps only the tokenizer outputs and ``"labels"``; the
        ``"ID"``, ``"Tweet"`` and per-label columns are removed. It is written
        to ``config.transformation_dir``.

        Returns:
            The transformed dataset. NOTE(review): annotated as a
            ``DatasetDict`` because the recorded run processes three splits;
            ``load_from_disk`` would return a single ``Dataset`` if that is
            what was saved - confirm.
        """
        loaded_data = load_from_disk(self.config.data_cleaned_dir)

        # list(...) keeps the concatenation valid whatever sequence type the
        # params file yields for the labels.
        raw_columns = ["ID", "Tweet"] + list(self.class_labels)

        transformed_data = loaded_data.map(self.tokenize, batched=False).remove_columns(
            raw_columns
        )

        transformed_data.save_to_disk(self.config.transformation_dir)

        return transformed_data


In [29]:
class DataTransformationPipeline:
    """Run the data-transformation stage from configuration to saved output."""

    def __init__(self) -> None:
        """Nothing to set up; the pipeline holds no state."""

    def main(self) -> dict:
        """
        Build the stage configuration, run the transformation and return its result.

        Returns:
            Whatever ``DataTransformation.save_and_return_transformed_data`` returns.
        """
        manager = ConfigurationManager()
        stage_config, stage_params = manager.get_data_transformation_config_and_params()

        transformer = DataTransformation(config=stage_config, params=stage_params)
        return transformer.save_and_return_transformed_data()


In [30]:
from src.emotionClassification.logging import logger

# Name used in the log messages that bracket the stage.
STAGE_NAME = "Data Transformation"

# Run the stage; on any failure, log it with its traceback and let it propagate
# so the cell still fails visibly.
try:
    logger.info(f">>>> Stage {STAGE_NAME} Started <<<<")
    # Variable name kept as in the original (including its spelling) so any
    # cell that refers to it keeps working.
    data_transformtion = DataTransformationPipeline()
    transformed_data = data_transformtion.main()
    logger.info(f">>>> Stage {STAGE_NAME} Completed Successfully <<<<")
except Exception as e:
    logger.error(f">>>> Stage {STAGE_NAME} Failed <<<<")
    logger.exception(e)
    # Bare raise re-raises the active exception with its traceback untouched.
    raise


[2024-08-31 10:50:24,581: INFO: 1067048094: >>>> Stage Data Transformation Started <<<<]
[2024-08-31 10:50:24,591: INFO: common: YAML file config\config.yaml loaded successfully!]
[2024-08-31 10:50:24,596: INFO: common: YAML file params.yaml loaded successfully!]
[2024-08-31 10:50:24,598: INFO: common: Directory artifacts already exists!]
[2024-08-31 10:50:24,598: INFO: common: Directory artifacts/data_transformation already exists!]
[2024-08-31 10:50:24,603: INFO: common: Directory artifacts/data_transformation/sem_eval_2018_task_1 already exists!]


Map: 100%|██████████| 6838/6838 [00:05<00:00, 1339.70 examples/s]
Map: 100%|██████████| 3259/3259 [00:01<00:00, 1714.44 examples/s]
Map: 100%|██████████| 886/886 [00:00<00:00, 1146.85 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6838/6838 [00:00<00:00, 225133.45 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3259/3259 [00:00<00:00, 315373.57 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 886/886 [00:00<00:00, 75921.98 examples/s] 

[2024-08-31 10:50:33,143: INFO: 1067048094: >>>> Stage Data Transformation Completed Successfully <<<<]



