# Тестирование предобработки данных

## Обработка данных в датасете

In [2]:
import sys
import os

PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [3]:
from src.data_utils import MamsClaraDataset, debug_clara_batch, stack_batch

In [12]:
from transformers import AutoTokenizer

model_id = "/home/jovyan/phi35_local"
NUM_MEM_TOKENS = 8

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

mem_tokens = [f"[M{i}]" for i in range(NUM_MEM_TOKENS)]
tokenizer.add_special_tokens({"additional_special_tokens": mem_tokens})

tokenizer.padding_side = 'right' 

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [14]:
# check special tokens
print(tokenizer.tokenize("[M0] [M1] hello"))
print(tokenizer.convert_tokens_to_ids(["[M0]", "[M1]"]))

['[M0]', '▁▁', '[M1]', '▁', '▁hello']
[32011, 32012]


In [15]:
tokenizer

LlamaTokenizerFast(name_or_path='/home/jovyan/phi35_local', vocab_size=32000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['[M0]', '[M1]', '[M2]', '[M3]', '[M4]', '[M5]', '[M6]', '[M7]']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=False),
	32000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|assistant|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|p

In [16]:
ds = MamsClaraDataset("../data/train.xml", tokenizer, num_mem_tokens=NUM_MEM_TOKENS)
print("Raw sentences:", len(ds.raw_data))
print("Flattened samples:", len(ds))

print(ds.raw_data[0])
print(ds[0].keys())
print(ds[0]["task"], tokenizer.decode(ds[0]["dec_input_ids"]))

Raw sentences: 4297
Flattened samples: 19780
{'text': 'The decor is not special at all but their food and amazing prices make up for it.', 'aspects': [{'term': 'decor', 'polarity': 'negative', 'from': 4, 'to': 9}, {'term': 'food', 'polarity': 'positive', 'from': 42, 'to': 46}, {'term': 'prices', 'polarity': 'positive', 'from': 59, 'to': 65}]}
dict_keys(['enc_input_ids', 'enc_mask', 'dec_input_ids', 'dec_mask', 'labels', 'task'])
rec [M0]  [M1]  [M2]  [M3]  [M4]  [M5]  [M6]  [M7]  Restore text:  The decor is not special at all but their food and amazing prices make up for it.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|e

In [17]:
print("pad_token:", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos_token:", tokenizer.eos_token, tokenizer.eos_token_id)

pad_token: <|endoftext|> 32000
eos_token: <|endoftext|> 32000


In [18]:
item = ds[0]
ids = item["dec_input_ids"]
mask = item["dec_mask"].bool()
print(item["task"], tokenizer.decode(ids[mask], skip_special_tokens=False))

rec [M0]  [M1]  [M2]  [M3]  [M4]  [M5]  [M6]  [M7]  Restore text:  The decor is not special at all but their food and amazing prices make up for it.<|endoftext|>


In [19]:
batch = stack_batch([ds[0], ds[1], ds[2]])
debug_clara_batch(batch, tokenizer)


DIAGNOSTIC REPORT FOR BATCH (Batch Size: 3)

--- SAMPLE 0 | TASK: REC ---
[Encoder Input]: The decor is not special at all but their food and amazing prices make up for it. [M0]  [M1]  [M2]  ... endoftext|><|endoftext|><|endoftext|><|endoftext|>
  > Memory Tokens Detected: ['[M0]', '[M1]', '[M2]', '[M3]', '[M4]', '[M5]', '[M6]', '[M7]']
  > Padding tokens count: 93
[Decoder Input]: [M0]  [M1]  [M2]  [M3]  [M4]  [M5]  [M6]  [M7]  Restore text:  The decor is not special at all but their food and amazing prices make up for it.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|

## Формирование батчей

In [20]:
task_weights = {"rec": 1.0, "ext": 1.0, "reason": 0.5} # decrease frequency of 'reason' task
weights = [task_weights[s['task']] for s in ds.samples]

In [21]:
import torch 

# create sampler to shuffle data
sampler = torch.utils.data.WeightedRandomSampler(
    weights=weights, 
    num_samples=len(ds), 
    replacement=True
)

In [22]:
# transfer to dataloader with sampler
train_loader = torch.utils.data.DataLoader(
    ds, 
    batch_size=16, 
    sampler=sampler, 
    num_workers=12
)

In [23]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [24]:
from collections import Counter

# get first 100 batches and count tasks on types
counts = Counter()
for i, batch in enumerate(train_loader):
    counts.update(batch['task'])
    if i > 100: break

print("Frequency of tasks in train dataset (based on 100 batches):", counts)

Frequency of tasks in train dataset (based on 100 batches): Counter({'reason': 631, 'rec': 522, 'ext': 479})


In [25]:
for i, batch in enumerate(train_loader):
    print(batch)
    break

{'enc_input_ids': tensor([[ 1619,  6532,   322,  ..., 32000, 32000, 32000],
        [  739,   471,  9051,  ..., 32000, 32000, 32000],
        [ 1058,  6140, 15993,  ..., 32000, 32000, 32000],
        ...,
        [  450,  9679, 29879,  ..., 32000, 32000, 32000],
        [  450,   350,  3028,  ..., 32000, 32000, 32000],
        [  450,  3495,  1395,  ..., 32000, 32000, 32000]]), 'enc_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'dec_input_ids': tensor([[32011,   259, 32012,  ..., 32000, 32000, 32000],
        [32011,   259, 32012,  ..., 32000, 32000, 32000],
        [32011,   259, 32012,  ..., 32000, 32000, 32000],
        ...,
        [32011,   259, 32012,  ..., 32000, 32000, 32000],
        [32011,   259, 32012,  ..., 32000, 32000, 32000],
        [32011,   259, 32012,  ..., 32000, 32000, 32000]]), 'dec_mas