In [1]:
%pwd

'x:\\deep_learning\\extractive_qna\\research'

In [2]:
import os

# The notebook lives in research/; the project code expects the repository
# root as working directory. Only step up while still inside research/ so
# that re-running this cell does not climb further up the tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
%pwd

'x:\\deep_learning\\extractive_qna'

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    NOTE(review): a later cell imports a class with this same name from
    ``extractive_qna.entity``, which rebinds the name defined here.
    """
    root_dir: Path   # directory the stage writes its output under
    data_path: Path  # location of the saved dataset that the stage loads

In [7]:
from extractive_qna.constants import *
from extractive_qna.utils.common import read_yaml, create_directories
from extractive_qna.entity import DataTransformationConfig

In [8]:
class ConfigurationManager:
    """Reads the project YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the main configuration YAML.
            params_filepath: path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes below this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings, creating the stage directory.

        Returns:
            A ``DataTransformationConfig`` filled from the
            ``data_transformation`` section of the configuration.
        """
        section = self.config.data_transformation
        stage_root = section.root_dir

        create_directories([stage_root])

        return DataTransformationConfig(
            root_dir=stage_root,
            data_path=section.data_path,
        )

In [15]:
import os
from extractive_qna.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk
import pandas as pd
import operator

In [26]:
import operator
class DataTransformation:
    """Tokenise a question/context dataset and label answer token positions.

    Each (question, context) pair is tokenised into one or more overlapping
    windows ("features"); every feature receives a ``start_positions`` /
    ``end_positions`` label pointing at the answer tokens, or ``0`` / ``0``
    when the answer is not fully contained in that window.
    """

    def __init__(self, config: "DataTransformationConfig", max_length=400, stride=100):
        """Store the stage settings and load the tokenizer.

        Args:
            config: stage settings; ``data_path`` and ``root_dir`` are read
                by :meth:`convert`.
            max_length: maximum number of tokens per feature (question +
                context + special tokens); shorter features are padded.
            stride: number of context tokens shared by consecutive windows
                of the same example.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.max_length = max_length
        self.stride = stride

    def rindex(self, lst, value):
        """Return the index of the *last* element of ``lst`` equal to ``value``.

        Raises:
            ValueError: if ``value`` does not occur in ``lst``.
        """
        return len(lst) - operator.indexOf(reversed(lst), value) - 1

    @staticmethod
    def _locate_answer_span(offsets, ctx_first, ctx_last, answer_start, answer_end):
        """Translate a character span into (start_token, end_token) indices.

        Args:
            offsets: per-token ``(char_start, char_end)`` pairs of one feature.
            ctx_first: index of the first context token in the feature.
            ctx_last: index of the last context token in the feature.
            answer_start: first character of the answer in the context.
            answer_end: one past the last character of the answer.

        Returns:
            ``(0, 0)`` when the answer is not fully covered by the context
            tokens of this feature, otherwise the indices of the tokens that
            contain the first and the last answer character.
        """
        if offsets[ctx_first][0] > answer_start or offsets[ctx_last][1] < answer_end:
            return 0, 0

        # Scan forward to the first token starting at or after the answer.
        # The scan is bounded by ctx_last: special and padding tokens behind
        # the context carry offset (0, 0) and would otherwise keep the loop
        # running off the end of the list.
        start_tok = ctx_first
        while start_tok <= ctx_last and offsets[start_tok][0] < answer_start:
            start_tok += 1
        if start_tok > ctx_last or offsets[start_tok][0] != answer_start:
            # The answer begins inside the previous token.
            start_tok -= 1

        # Mirror image for the end: scan backwards, bounded by ctx_first.
        end_tok = ctx_last
        while end_tok >= ctx_first and offsets[end_tok][1] > answer_end:
            end_tok -= 1
        if end_tok < ctx_first or offsets[end_tok][1] != answer_end:
            # The answer ends inside the following token.
            end_tok += 1

        return start_tok, end_tok

    def prepare_dataset(self, examples):
        """Tokenise a batch and attach answer start/end token positions.

        Args:
            examples: batch mapping with ``"question"`` and ``"context"``
                (lists of strings) and ``"answers"`` (list of dicts with
                ``"text"`` and ``"answer_start"`` lists). Only the first
                answer of each example is used. The batch is not modified.

        Returns:
            The tokenizer output extended with ``start_positions`` and
            ``end_positions``, one entry per feature.
        """
        # Some tokenizers don't strip spaces. If there happens to be question
        # text with excessive spaces, the context may not get encoded at all.
        questions = [q.lstrip() for q in examples["question"]]
        contexts = [c.lstrip() for c in examples["context"]]

        # ``answer_start`` refers to the unstripped context, so remember how
        # many leading characters each context lost and shift the answer span
        # by the same amount further down.
        shifts = [len(raw) - len(kept) for raw, kept in zip(examples["context"], contexts)]

        # Tokenize.
        tokenized_examples = self.tokenizer(
            questions,
            contexts,
            truncation="only_second",
            max_length=self.max_length,
            stride=self.stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length"
        )

        start_positions = []
        end_positions = []

        # Work through every feature (one example can yield several).
        for seq_idx in range(len(tokenized_examples["input_ids"])):
            seq_ids = tokenized_examples.sequence_ids(seq_idx)
            offsets = tokenized_examples["offset_mapping"][seq_idx]

            sample_idx = tokenized_examples["overflow_to_sample_mapping"][seq_idx]
            answer = examples["answers"][sample_idx]

            # Unanswerable example (empty answer lists): label with (0, 0).
            if len(answer["text"]) == 0 or len(answer["answer_start"]) == 0:
                start_positions.append(0)
                end_positions.append(0)
                continue

            raw_start = answer["answer_start"][0]
            answer_end = raw_start + len(answer["text"][0]) - shifts[sample_idx]
            # Clamp in case the answer itself began inside the stripped prefix.
            answer_start = max(raw_start - shifts[sample_idx], 0)

            # Context tokens are the ones whose sequence id is 1.
            ctx_first = seq_ids.index(1)
            ctx_last = self.rindex(seq_ids, 1)

            start_tok, end_tok = self._locate_answer_span(
                offsets, ctx_first, ctx_last, answer_start, answer_end
            )
            start_positions.append(start_tok)
            end_positions.append(end_tok)

        tokenized_examples["start_positions"] = start_positions
        tokenized_examples["end_positions"] = end_positions
        return tokenized_examples

    def convert(self):
        """Load the saved dataset, tokenise every split and save the result.

        The dataset is read from ``config.data_path`` and the transformed
        dataset is written to ``<config.root_dir>/squad_dataset``.
        """
        dataset_squad = load_from_disk(self.config.data_path)
        dataset_squad_tf = dataset_squad.map(
            self.prepare_dataset,
            batched=True,
            # The raw columns no longer line up with the features once an
            # example has been split into several windows.
            remove_columns=dataset_squad["train"].column_names,
            num_proc=2,
        ).remove_columns(["offset_mapping", "overflow_to_sample_mapping"])
        dataset_squad_tf.save_to_disk(os.path.join(self.config.root_dir, "squad_dataset"))




In [27]:
# Run the data-transformation stage end to end. Any exception propagates
# unchanged; the former try/except only re-raised it.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.convert()

[2024-08-10 12:43:51,015: INFO :common :yaml file: config\config.yaml loaded successfully]
[2024-08-10 12:43:51,018: INFO :common :yaml file: params.yaml loaded successfully]
[2024-08-10 12:43:51,020: INFO :common :created directory at: artifacts]
[2024-08-10 12:43:51,022: INFO :common :created directory at: artifacts/data_preparation]


Map (num_proc=2): 100%|██████████| 87599/87599 [02:09<00:00, 678.59 examples/s]
Map (num_proc=2): 100%|██████████| 10570/10570 [00:29<00:00, 356.94 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 88331/88331 [00:01<00:00, 61860.43 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 10756/10756 [00:00<00:00, 205654.12 examples/s]
