In [1]:
# Show the kernel's current working directory (IPython magic); the next cell
# changes it with os.chdir so that `src.` imports resolve.
%pwd

'f:\\GitHub\\NLP-Emotion-Classification-End-to-End-Project\\NLP-Emotion-Classification-End-to-End-Project\\research'

In [2]:
import os
os.chdir("../")
%pwd

'f:\\GitHub\\NLP-Emotion-Classification-End-to-End-Project\\NLP-Emotion-Classification-End-to-End-Project'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Immutable settings bundle for the data ingestion stage.

    The dataclass is frozen, so fields cannot be reassigned after
    construction. No validation or type coercion is performed on the values.
    """

    # Directory of the ingestion stage; created by ConfigurationManager.
    root_dir: Path
    # Dataset identifier handed to `load_dataset` as its first argument.
    source_url: str
    # Location the downloaded dataset is saved to; it is created as a
    # directory and its size is used to decide whether to download.
    local_data_file: Path
    # Not read by any code visible in this notebook — presumably a target
    # for extracted data; TODO confirm against the config file.
    extract_dir: Path

In [4]:
from pathlib import Path

from src.emotionClassification.constants import *
from src.emotionClassification.utils.common import read_yaml_file, create_directories
# from src.emotionClassification.entity import DataIngestionConfig


class ConfigurationManager:
    """
    Loads the project's YAML configuration and parameter files and hands out
    stage-specific configuration objects built from them.
    """

    def __init__(
        self,
        config_file_path: Path = CONFIG_FILE_PATH,
        params_file_path: Path = PARAMS_FILE_PATH,
    ) -> None:
        """
        Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_file_path: Path of the main configuration YAML file.
            params_file_path: Path of the parameters YAML file.
        """
        self.config = read_yaml_file(config_file_path)
        self.params = read_yaml_file(params_file_path)

        # The loaded config exposes its keys as attributes.
        artifacts_root = self.config.artifacts_root
        create_directories(filepath_list=[artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the data ingestion settings from the `data_ingestion` section.

        Side effect: `root_dir` and `local_data_file` are both created as
        directories before the settings object is returned.

        Returns:
            DataIngestionConfig populated from the loaded configuration.
        """
        section = self.config.data_ingestion

        required_dirs = [section.root_dir, section.local_data_file]
        create_directories(required_dirs)

        ingestion_config = DataIngestionConfig(
            root_dir=section.root_dir,
            source_url=section.source_url,
            local_data_file=section.local_data_file,
            extract_dir=section.extract_dir,
        )
        return ingestion_config


In [5]:
import os

from datasets import load_dataset

# from src.emotionClassification.entity import DataIngestionConfig
from src.emotionClassification.logging import logger
from src.emotionClassification.utils.common import get_directory_size


class DataIngestion:
    """
    Downloads the configured dataset and stores it on disk, skipping the
    download when the target location already holds data.
    """

    def __init__(self, config: DataIngestionConfig) -> None:
        """
        Store the ingestion settings.

        Args:
            config: Settings providing `source_url` and `local_data_file`.
        """
        self.config = config

    def download_data(self, dataset_config_name: str = "subtask5.english") -> None:
        """
        Download the dataset named in the configuration unless it is already
        present locally.

        The target `local_data_file` is treated as a directory: the dataset is
        only downloaded when that directory is missing or its measured size is
        zero.

        Args:
            dataset_config_name: Passed to `load_dataset` as its second
                positional argument (the dataset configuration to load).
                Defaults to the value that was previously hard-coded.
        """
        print(f"Downloading data from {self.config.source_url}")

        local_file = self.config.local_data_file

        # Measure once and report the same number the decision is based on.
        # os.path.getsize() was wrong here: on a directory it returns the size
        # of the directory entry (not its contents) and it raises when the
        # path does not exist yet.
        existing_size = (
            get_directory_size(local_file) if os.path.exists(local_file) else 0
        )
        print(f"Size of {local_file} is {existing_size}")

        if existing_size != 0:
            logger.info(
                f"Data file {local_file} already exists with size {existing_size}"
            )
            return

        # SECURITY NOTE(review): trust_remote_code=True lets the dataset
        # repository's loading script execute on this machine. Kept because
        # the original relies on it; only use with sources you trust.
        ds = load_dataset(
            self.config.source_url, dataset_config_name, trust_remote_code=True
        )

        # Save the dataset to the local target directory
        ds.save_to_disk(local_file)

        print(f"File downloaded successfully as {local_file}")
        logger.info(f"Downloaded {local_file}")


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# from src.emotionClassification.config.configuration import ConfigurationManager
# from src.emotionClassification.components.data_ingestion import DataIngestion


class DataIngestionTrainingPipeline:
    """
    Pipeline stage that wires the configuration manager to the data
    ingestion component and triggers the download.
    """

    def __init__(self) -> None:
        """Create the pipeline stage; it holds no state."""

    def main(self) -> None:
        """
        Run the stage: load the configuration, build the ingestion settings
        and download the data.
        """
        ingestion_settings = ConfigurationManager().get_data_ingestion_config()
        DataIngestion(config=ingestion_settings).download_data()


In [7]:
from src.emotionClassification.logging import logger
# from src.emotionClassification.pipeline.stage_01_data_ingestion import (
#     DataIngestionTrainingPipeline,
# )

STAGE_NAME = "Data Ingestion"

# Run the data ingestion stage, logging its start and successful completion.
try:
    logger.info(f">>>> Stage {STAGE_NAME} Started <<<<")
    data_ingestion = DataIngestionTrainingPipeline()
    data_ingestion.main()
    logger.info(f">>>> Stage {STAGE_NAME} Completed Successfully <<<<")
except Exception as e:
    # Record the failure with its traceback, then propagate it unchanged.
    # A bare `raise` re-raises the active exception without adding this
    # handler's frame to the traceback the way `raise e` does.
    logger.exception(e)
    raise


[2024-08-29 14:46:38,656: INFO: 3148584053: >>>> Stage Data Ingestion Started <<<<]
[2024-08-29 14:46:38,662: INFO: common: YAML file config\config.yaml loaded successfully!]
[2024-08-29 14:46:38,668: INFO: common: YAML file params.yaml loaded successfully!]
[2024-08-29 14:46:38,671: INFO: common: Directory artifacts created successfully!]
[2024-08-29 14:46:38,697: INFO: common: Directory artifacts/data_ingestion created successfully!]
[2024-08-29 14:46:38,702: INFO: common: Directory artifacts/data_ingestion/sem_eval_2018_task_1 created successfully!]
Downloading data from SemEvalWorkshop/sem_eval_2018_task_1
Size of artifacts/data_ingestion/sem_eval_2018_task_1 is 0


Saving the dataset (1/1 shards): 100%|██████████| 6838/6838 [00:00<00:00, 119142.31 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3259/3259 [00:00<00:00, 98281.15 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 886/886 [00:00<00:00, 49223.18 examples/s]

File downloaded successfully as artifacts/data_ingestion/sem_eval_2018_task_1
[2024-08-29 14:46:42,357: INFO: 723864665: Downloaded artifacts/data_ingestion/sem_eval_2018_task_1]
[2024-08-29 14:46:42,359: INFO: 3148584053: >>>> Stage Data Ingestion Completed Successfully <<<<]



