In [1]:
import os

In [2]:
# Show the kernel's current working directory before it is changed.
%pwd

'd:\\TextSummarizationProject\\End-to-end-Text-Summarizer-Project\\research'

In [3]:
# Step up from the notebook's `research/` folder to the project root so that
# relative paths (config file, artifacts) resolve. The guard keeps the cell
# idempotent: re-running it no longer climbs one directory higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Display the working directory again to confirm the change of directory.
%pwd

'd:\\TextSummarizationProject\\End-to-end-Text-Summarizer-Project'

In [7]:
"""README > 3. Update entity """

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationEntity:
    """Immutable settings consumed by the data-transformation stage.

    Attributes:
        root_dir: Output folder of this stage,
            e.g. ``artifacts/data_transformation``.
        data_path: Location of the input dataset,
            e.g. ``artifacts/data_receiver/samsum_dataset``.
        tokenizer: Tokenizer reference, e.g. ``google/pegasus-cnn-dailymail``.
            Per the original note, the tokenizer is downloaded automatically.
    """

    root_dir: Path
    data_path: Path
    tokenizer: Path


In [19]:
"""README > 4. Update the configuration manager in src config """
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Loads the project YAML configuration and builds per-stage settings."""

    def __init__(self, config_yaml_path = FILE_PATH_CONFIG):
        """Read the YAML file at ``config_yaml_path`` into ``self.config``."""
        self.config = read_yaml(config_yaml_path)

    def get_config_data_transformation(self) -> DataTransformationEntity:
        """Prepare the data-transformation stage and return its settings.

        Passes the stage's ``root_dir`` to ``create_directories`` and then
        packs the ``data_transformation`` section of the loaded configuration
        into a ``DataTransformationEntity``.

        Returns:
            The entity holding ``root_dir``, ``data_path`` and ``tokenizer``.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        # Original note (translated): attributes can also be accessed this
        # way, with dot access, without using ConfigBox.
        return DataTransformationEntity(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer=section.tokenizer,
        )

In [12]:
"""README > 5. Update components """
import os
from textSummarizer.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk



class DataTransformer:
    """Tokenizes the dialogue/summary dataset and saves the result to disk.

    Class attributes:
        MAX_INPUT_LENGTH: Token cap applied to each dialogue.
        MAX_TARGET_LENGTH: Token cap applied to each summary.
    """

    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    def __init__(self, config: DataTransformationEntity):
        """Keep the stage settings and load the tokenizer they name.

        Args:
            config: Settings providing ``root_dir``, ``data_path`` and
                ``tokenizer``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer)

    def convert_examples_to_features(self, example_batch):
        """Tokenize one batch of examples into model inputs and labels.

        Args:
            example_batch: Batch with ``'dialogue'`` and ``'summary'`` entries.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
            tokenized dialogues, and ``'labels'`` taken from the token ids of
            the tokenized summaries.
        """
        # A tokenizer splits text into the smaller units (tokens) a model can
        # consume. `max_length` is counted in tokens, not characters; with
        # truncation=True anything longer than the cap is cut off.
        input_encodings = self.tokenizer(
            example_batch['dialogue'],
            max_length=self.MAX_INPUT_LENGTH,
            truncation=True,
        )

        # `text_target=` encodes the summaries as targets. It replaces the
        # deprecated `as_target_tokenizer()` context manager.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=self.MAX_TARGET_LENGTH,
            truncation=True,
        )

        # These three entries are added to the dataset as new columns.
        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids'],
        }

    def convert(self):
        """Load the dataset, tokenize it in batches and save it.

        Reads from ``config.data_path`` and writes the mapped dataset to
        ``<config.root_dir>/samsum_dataset``.
        """
        dataset_samsum = load_from_disk(self.config.data_path)
        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched=True)
        saving_dir = os.path.join(self.config.root_dir, "samsum_dataset")
        dataset_samsum_pt.save_to_disk(saving_dir)

In [21]:
# Run the data-transformation stage end to end: load the configuration, build
# the transformer, then tokenize and save the dataset.
try:
    config_manager_obj = ConfigurationManager()
    data_transformation_config = config_manager_obj.get_config_data_transformation()
    data_transformation_instance = DataTransformer(config=data_transformation_config)
    data_transformation_instance.convert()
except Exception as e:
    # Log the failure before propagating it; the bare `raise` re-raises the
    # active exception unchanged.
    # NOTE(review): assumes `logger` is a standard `logging.Logger` — confirm.
    logger.exception(e)
    raise

[2024-05-20 22:11:14,621: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-20 22:11:14,622: INFO: common: Created directory at artifacts/data_transformation]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 14732/14732 [00:02<00:00, 5479.19 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 6314.69 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 4060.64 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 527578.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 163104.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 136669.08 examples/s]
