In [1]:
import sys
import os

# Resolve the parent of the current working directory to an absolute path and
# put it at the front of the import search path, so that `src.*` modules can be
# imported from this notebook. The membership test keeps re-runs of this cell
# from adding the same entry twice.
PROJECT_ROOT = os.path.abspath("..")
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

In [2]:
from src.data_utils import MamsClaraDataset, debug_clara_batch, stack_batch

In [3]:
from transformers import AutoTokenizer

# Hugging Face model identifier; the tokenizer cell further down passes it to
# AutoTokenizer.from_pretrained.
model_id = "microsoft/Phi-3.5-mini-instruct"
# Number of memory placeholder tokens ([M0], [M1], ...) that the tokenizer cell
# registers as additional special tokens.
NUM_MEM_TOKENS = 4

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load the tokenizer that belongs to the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# One special token per memory slot: [M0], [M1], ..., [M{NUM_MEM_TOKENS - 1}].
mem_tokens = [f"[M{i}]" for i in range(NUM_MEM_TOKENS)]
special_tokens = {"additional_special_tokens": mem_tokens}
tokenizer.add_special_tokens(special_tokens)

# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

# Fall back to the EOS token for padding only when the tokenizer has no pad token.
# NOTE(review): the pad/eos printout later in this notebook reports the same id
# (32000) for both tokens, so padding and end-of-sequence cannot be told apart
# by token id alone -- confirm the dataset code relies on the masks instead.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Sanity check for the memory tokens: show how a string containing them is
# split into tokens, then show the vocabulary ids assigned to [M0] and [M1].
probe_tokens = tokenizer.tokenize("[M0] [M1] hello")
print(probe_tokens)
probe_ids = tokenizer.convert_tokens_to_ids(["[M0]", "[M1]"])
print(probe_ids)

['[M0]', '▁▁', '[M1]', '▁', '▁hello']
[32011, 32012]


In [10]:
# Build the dataset from the training XML with the tokenizer configured above.
ds = MamsClaraDataset("../data/train.xml", tokenizer)

# Compare the number of raw entries with the number of indexable samples.
print("Raw sentences:", len(ds.raw_data))
print("Flattened samples:", len(ds))

# First raw record, exactly as stored on the dataset object.
print(ds.raw_data[0])

# Fetch the first sample once and reuse it: every `ds[0]` goes through the
# dataset's __getitem__ again, so indexing three times repeats the same work.
first_sample = ds[0]
print(first_sample.keys())
# Decodes the full decoder sequence, including any trailing padding tokens.
print(first_sample["task"], tokenizer.decode(first_sample["dec_input_ids"]))

Raw sentences: 4297
Flattened samples: 19780
{'text': 'The decor is not special at all but their food and amazing prices make up for it.', 'aspects': [{'term': 'decor', 'polarity': 'negative', 'from': 4, 'to': 9}, {'term': 'food', 'polarity': 'positive', 'from': 42, 'to': 46}, {'term': 'prices', 'polarity': 'positive', 'from': 59, 'to': 65}]}
dict_keys(['enc_input_ids', 'enc_mask', 'dec_input_ids', 'dec_mask', 'labels', 'task'])
rec [M0]  [M1]  [M2]  [M3]  Restore text:  The decor is not special at all but their food and amazing prices make up for it.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><

In [11]:
# Show the padding and end-of-sequence tokens next to their ids so the two can
# be compared at a glance.
for label, token, token_id in (
    ("pad_token:", tokenizer.pad_token, tokenizer.pad_token_id),
    ("eos_token:", tokenizer.eos_token, tokenizer.eos_token_id),
):
    print(label, token, token_id)

pad_token: <|endoftext|> 32000
eos_token: <|endoftext|> 32000


In [12]:
# Decode only the decoder positions selected by the decoder mask, so the
# printed text is not followed by the long run of padding tokens seen when the
# whole sequence is decoded.
item = ds[0]
ids = item["dec_input_ids"]
# Convert the mask to booleans so it can be used to index `ids`.
mask = item["dec_mask"].bool()
# skip_special_tokens=False keeps special tokens (e.g. the [M*] markers) in the
# decoded string.
print(item["task"], tokenizer.decode(ids[mask], skip_special_tokens=False))

rec [M0]  [M1]  [M2]  [M3]  Restore text:  The decor is not special at all but their food and amazing prices make up for it.<|endoftext|>


In [13]:
# Combine the first three samples with stack_batch and print the diagnostic
# report for the resulting batch.
first_three = [ds[idx] for idx in range(3)]
batch = stack_batch(first_three)
debug_clara_batch(batch, tokenizer)


DIAGNOSTIC REPORT FOR BATCH (Batch Size: 3)

--- SAMPLE 0 | TASK: REC ---
[Encoder Input]: The decor is not special at all but their food and amazing prices make up for it. [M0]  [M1]  [M2]  ... endoftext|><|endoftext|><|endoftext|><|endoftext|>
  > Memory Tokens Detected: ['[M0]', '[M1]', '[M2]', '[M3]']
  > Padding tokens count: 101
[Decoder Input]: [M0]  [M1]  [M2]  [M3]  Restore text:  The decor is not special at all but their food and amazing prices make up for it.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|end