In [1]:
import os

# Move the working directory up one level so that the relative paths used by
# later cells resolve from the parent directory (presumably the project root
# -- confirm). The path is relative, so re-running this cell without
# restarting the kernel moves up one more level each time.
os.chdir("../")

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    The annotations describe how each value is used in this notebook; a
    dataclass does not enforce them at runtime.
    """

    # Directory under which the tokenized dataset is saved.
    root_dir: Path
    # Base directory that is joined with each entry of ALL_REQUIRED_DATA.
    data_path: Path
    # Dataset directory names; the transformation step indexes entries [0] and [1].
    ALL_REQUIRED_DATA: list
    # Identifier handed to AutoTokenizer.from_pretrained.
    tokenizer_name: str

In [3]:
from summarylm.constants import *
from summarylm.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Load the project configuration files and build per-stage config objects."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded configuration.

        The stage's ``root_dir`` is created before the settings are returned.

        Returns:
            A DataTransformationConfig populated from the ``data_transformation``
            section of the configuration.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            # Coerce to Path so the values match the dataclass annotations
            # regardless of how the YAML loader represents them.
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            ALL_REQUIRED_DATA=section.ALL_REQUIRED_DATA,
            tokenizer_name=section.tokenizer_name,
        )

In [7]:
import os
import sys
from summarylm.logging import logger
from summarylm.exception import CustomException
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk
from datasets import concatenate_datasets, DatasetDict

In [24]:
class DataTransformation:
    """Merge two on-disk datasets into one and tokenize it.

    The tokenizer is loaded once from ``config.tokenizer_name`` and reused for
    both the source texts and the target summaries.
    """

    # Truncation limits passed to the tokenizer as ``max_length``.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    # Splits that are merged, in the key order of the resulting DatasetDict.
    SPLITS = ("train", "validation", "test")

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration and load the tokenizer it names."""
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_data_into_right_format(self, datasets: list) -> DatasetDict:
        """Load two datasets from disk and merge them split by split.

        Args:
            datasets: Paths of saved datasets; the first two entries are used.
                The first dataset must provide ``article`` and ``summary``
                columns, the second a ``document`` column.

        Returns:
            A DatasetDict with ``train``, ``validation`` and ``test`` splits
            whose text column is named ``text``.

        Raises:
            IndexError: If fewer than two paths are given.
        """
        print("Loading the dataset")
        dataset1 = load_from_disk(datasets[0])
        dataset2 = load_from_disk(datasets[1])
        print("Dataset loaded")

        merged = {}
        for split in self.SPLITS:
            # Drop every column of the first dataset except the two needed,
            # then align its text column name with the shared schema.
            first = (
                dataset1[split]
                .select_columns(['article', 'summary'])
                .rename_column('article', 'text')
            )
            # NOTE(review): the second dataset is only renamed, not filtered;
            # presumably its remaining columns already match -- confirm.
            second = dataset2[split].rename_column('document', 'text')
            merged[split] = concatenate_datasets([first, second])

        return DatasetDict(merged)

    def convert_examples_to_features(self, example_batch):
        """Tokenize a batch of examples.

        Args:
            example_batch: Mapping with ``text`` and ``summary`` entries.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` from the encoded
            texts and ``labels`` from the encoded summaries.
        """
        input_encodings = self.tokenizer(
            example_batch['text'],
            max_length=self.MAX_INPUT_LENGTH,
            truncation=True,
        )

        # ``text_target`` is the supported replacement for the deprecated
        # ``as_target_tokenizer()`` context manager.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=self.MAX_TARGET_LENGTH,
            truncation=True,
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def convert(self):
        """Merge the two configured datasets, tokenize them and save the result.

        The output is written to ``<root_dir>/dataset``.
        """
        required = self.config.ALL_REQUIRED_DATA
        data1 = os.path.join(self.config.data_path, required[0])
        data2 = os.path.join(self.config.data_path, required[1])

        dataset = self.convert_data_into_right_format([data1, data2])
        dataset_pt = dataset.map(self.convert_examples_to_features, batched=True)
        dataset_pt.save_to_disk(os.path.join(self.config.root_dir, "dataset"))

In [25]:
# Run the data-transformation stage end to end: load the configuration, build
# the transformer, then merge, tokenize and save the dataset.
# No try/except here: a failure should surface with its original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.convert()

[2024-05-23 09:04:24,048: INFO: common: Yaml file: config\config.yaml loaded successfully]
[2024-05-23 09:04:24,051: INFO: common: Yaml file: params.yaml loaded successfully]
[2024-05-23 09:04:24,052: INFO: common: Directory created successfully at: artifacts]
[2024-05-23 09:04:24,053: INFO: common: Directory created successfully at: artifacts/data_transformation]
Loading the dataset
Dataset loaded


Map: 100%|██████████| 80747/80747 [11:43<00:00, 114.72 examples/s]
Map: 100%|██████████| 7622/7622 [01:20<00:00, 94.22 examples/s] 
Map: 100%|██████████| 7622/7622 [01:59<00:00, 63.80 examples/s] 
Saving the dataset (3/3 shards): 100%|██████████| 80747/80747 [00:13<00:00, 5803.62 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 7622/7622 [00:01<00:00, 4202.00 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 7622/7622 [00:01<00:00, 6924.25 examples/s]
