In [None]:
# Show the kernel's current working directory (IPython line magic).
%pwd

In [1]:
import os
os.chdir("../")
%pwd

'f:\\GitHub\\NLP-Emotion-Classification-End-to-End-Project\\NLP-Emotion-Classification-End-to-End-Project'

In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataCleaningConfig:
    """
    Immutable bundle of the filesystem locations used by the data cleaning stage.

    Instances are built by ``ConfigurationManager.get_data_cleaning_config_and_params``
    from the ``data_cleaning`` section of the loaded configuration.
    """
    # Directory of the stage; it is created when the configuration object is built.
    root_dir: Path
    # Location of the ingested dataset that ``DataCleaning`` reads with ``load_from_disk``.
    data_ingestion_dir: Path
    # Destination that ``DataCleaning`` writes the cleaned dataset to with ``save_to_disk``.
    cleaned_dir: Path
    # NOTE(review): the values are passed through from the parsed YAML unchanged, so at
    # runtime they are presumably plain strings rather than Path objects - confirm.

In [21]:
from pathlib import Path

from src.emotionClassification.constants import *
from src.emotionClassification.utils.common import read_yaml_file, create_directories


class ConfigurationManager:
    """
    Load the project YAML files and hand out per-stage configuration objects.

    Attributes:
        config: Parsed contents of the configuration file.
        params: Parsed contents of the parameters file.
    """

    def __init__(
        self,
        config_file_path: Path = CONFIG_FILE_PATH,
        params_file_path: Path = PARAMS_FILE_PATH,
    ) -> None:
        """
        Read both YAML files and make sure the output directories exist.

        Args:
            config_file_path: Location of the configuration YAML file.
            params_file_path: Location of the parameters YAML file.
        """
        self.config = read_yaml_file(config_file_path)
        self.params = read_yaml_file(params_file_path)

        # The artifacts root and the cleaned-data directory are needed by the
        # cleaning stage, so they are created up front.
        create_directories(
            filepath_list=[
                self.config.artifacts_root,
                self.config.data_cleaning.cleaned_dir,
            ]
        )

    def get_data_cleaning_config_and_params(self) -> tuple:
        """
        Build the data cleaning configuration and fetch the matching parameters.

        The stage's root directory is created as a side effect.

        Returns:
            A 2-tuple ``(config, params)``: a ``DataCleaningConfig`` holding the
            stage's directories, and the ``data_cleaning`` section of the
            parameters file.
        """
        config = self.config.data_cleaning

        create_directories([config.root_dir])

        data_cleaning_config = DataCleaningConfig(
            root_dir=config.root_dir,
            data_ingestion_dir=config.data_ingestion_dir,
            cleaned_dir=config.cleaned_dir,
        )
        return data_cleaning_config, self.params.data_cleaning


In [22]:
import os
from pathlib import Path
from box import ConfigBox

import re
import nltk
from nltk.corpus import stopwords
import emoji
import contractions
from autocorrect import Speller
import pandas as pd
import string
from datasets import load_from_disk

from src.emotionClassification.logging import logger


class DataCleaning:
    """
    Data cleaning stage: loads the ingested dataset, normalises the ``Tweet``
    column batch by batch and stores the cleaned result on disk.
    """

    def __init__(self, config: DataCleaningConfig, params: ConfigBox) -> None:
        """
        Store the stage configuration and parameters.

        Args:
            config: Directories the stage reads from and writes to.
            params: Stage parameters; ``batch_size`` is read when mapping.
        """
        self.config = config
        self.params = params

    def clean_data(self, batch: dict) -> dict:
        """
        Clean the text stored under the ``'Tweet'`` key of a batch.

        Steps, in order: expand contractions, convert emojis to their textual
        names, lowercase, drop URLs, drop @mentions, drop non-word characters,
        shorten character runs to at most two, drop remaining punctuation
        (at that point only underscores are left to remove).

        Args:
            batch: Mapping whose ``'Tweet'`` entry is a list of tweet texts.
                Entries are presumably always strings - confirm against the
                ingested data.
        Returns:
            The same ``batch`` object, with ``'Tweet'`` replaced by the list of
            cleaned texts.
        """
        tweets = pd.Series(batch['Tweet'])  # Series gives access to the .str operations

        tweets = tweets.apply(contractions.fix)  # Expand contractions
        # Emojis are not word characters, so they have to be turned into text
        # BEFORE the special-character removal below; otherwise they are simply
        # deleted and the conversion never sees them. It also runs before
        # lowercasing so that emoji names end up lowercased too. Space
        # delimiters keep the name separated from neighbouring words; the final
        # punctuation step strips the underscores inside the name.
        tweets = tweets.apply(lambda tweet: emoji.demojize(tweet, delimiters=(' ', ' ')))
        tweets = tweets.str.lower()  # Lowercase
        tweets = tweets.str.replace(r'http\S+|www\S+|https\S+', '', regex=True)  # Remove URLs
        tweets = tweets.str.replace(r'@\w+', '', regex=True)  # Remove mentions
        tweets = tweets.str.replace(r'[^\w\s]', '', regex=True)  # Remove special characters
        tweets = tweets.str.replace(r'(.)\1+', r'\1\1', regex=True)  # Handle elongation
        # Escape the punctuation so that ']', '\\', '^' and '-' are treated as
        # literal members of the character class.
        tweets = tweets.str.replace(f'[{re.escape(string.punctuation)}]', '', regex=True)

        batch['Tweet'] = tweets.tolist()

        return batch

    def save_and_return_cleaned_data(self) -> dict:
        """
        Clean the ingested dataset, save it to disk and return it.

        The dataset is loaded from ``config.data_ingestion_dir``, passed through
        ``clean_data`` in batches of ``params.batch_size`` and written to
        ``config.cleaned_dir``.

        Returns:
            The cleaned dataset as produced by ``map``.
        """
        loaded_data = load_from_disk(self.config.data_ingestion_dir)
        cleaned_data = loaded_data.map(self.clean_data, batched=True, batch_size=self.params.batch_size)
        cleaned_data.save_to_disk(self.config.cleaned_dir)

        return cleaned_data

In [23]:
class DataCleaningPipeline:
    """Wires configuration loading and the cleaning step into one runnable stage."""

    def __init__(self) -> None:
        """Nothing to set up: the pipeline keeps no state of its own."""

    def main(self) -> dict:
        """
        Run the data cleaning stage end to end.

        Returns:
            The cleaned dataset returned by ``DataCleaning.save_and_return_cleaned_data``.
        """
        stage_config, stage_params = ConfigurationManager().get_data_cleaning_config_and_params()
        cleaner = DataCleaning(config=stage_config, params=stage_params)
        return cleaner.save_and_return_cleaned_data()


In [24]:
from src.emotionClassification.logging import logger

# Human-readable stage label used in the log messages below.
STAGE_NAME = "Data Cleaning/Preprocessing"

# Run the stage and log its start, success or failure.
try:
    logger.info(f">>>> Stage {STAGE_NAME} Started <<<<")
    data_cleaning = DataCleaningPipeline()
    cleaned_data = data_cleaning.main()
    logger.info(f">>>> Stage {STAGE_NAME} Completed Successfully <<<<")
except Exception as e:
    logger.error(f">>>> Stage {STAGE_NAME} Failed <<<<")
    logger.exception(e)
    # A bare `raise` re-raises the active exception with its original
    # traceback, so the failure still surfaces in the notebook.
    raise


[2024-08-29 18:51:11,031: INFO: 1559613091: >>>> Stage Data Cleaning/Preprocessing Started <<<<]
[2024-08-29 18:51:11,049: INFO: common: YAML file config\config.yaml loaded successfully!]
[2024-08-29 18:51:11,059: INFO: common: YAML file params.yaml loaded successfully!]
[2024-08-29 18:51:11,064: INFO: common: Directory artifacts already exists!]
[2024-08-29 18:51:11,064: INFO: common: Directory artifacts/data_cleaning/sem_eval_2018_task_1 already exists!]
[2024-08-29 18:51:11,069: INFO: common: Directory artifacts/data_cleaning already exists!]


Map: 100%|██████████| 6838/6838 [00:04<00:00, 1628.44 examples/s]
Map: 100%|██████████| 3259/3259 [00:01<00:00, 2406.60 examples/s]
Map: 100%|██████████| 886/886 [00:00<00:00, 2167.68 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6838/6838 [00:00<00:00, 110204.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3259/3259 [00:00<00:00, 112284.98 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 886/886 [00:00<00:00, 61848.27 examples/s]

[2024-08-29 18:51:24,404: INFO: 1559613091: >>>> Stage Data Cleaning/Preprocessing Completed Successfully <<<<]



