In [2]:
import json
import zlib
from tqdm import tqdm

from transformers import PreTrainedTokenizerFast

MAX_SEQ_LEN = 128+1
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="models/tokenizer.json", 
    pad_token="[PAD]", 
    unk_token="[UNK]", 
    max_len=MAX_SEQ_LEN,
    add_prefix_space=False
)

In [4]:
def save_text_subsequences(
        source,
        dest,
        tokenizer,
        window_size,
):
    with open(source, "r", encoding="utf-8") as f:
        data_in = json.load(f)

    with open(f"{dest}.zlib", "wb") as f:
        data_out = []
        for text in tqdm(data_in):
            offsets = tokenizer(
                text,
                return_offsets_mapping=True,
                return_special_tokens_mask=False,
                add_special_tokens=False,
                truncation=False,
                return_token_type_ids=False,
                return_attention_mask=False,
                return_overflowing_tokens=True
            )["offset_mapping"]

            for i in range(len(offsets) - window_size):
                start = offsets[i][0]
                end = offsets[i + window_size][1]
                subsequence = text[start:end]
                data_out.append(subsequence)
        
        compressed = zlib.compress(data_out)
        f.write(compressed)
        

In [3]:
temp = json.load(open("data/valid-sampled.json", "r", encoding="utf-8"))

In [6]:
tokenizer(
    temp[0],
    return_offsets_mapping=True,
    return_special_tokens_mask=False,
    return_attention_mask=False,
    max_length=MAX_SEQ_LEN,
    truncation=True,
    padding="max_length",
    return_overflowing_tokens=True,
    return_tensors="np",
    stride=MAX_SEQ_LEN - 1
)["offset_mapping"].shape

(186, 129, 2)

In [10]:
temp[0].__len__()

976

In [8]:
int(0.1*MAX_SEQ_LEN)

12

In [10]:
tokenizer(
    temp[0],
    return_offsets_mapping=True,
    return_special_tokens_mask=False,
    return_attention_mask=False,
    max_length=MAX_SEQ_LEN,
    truncation=True,
    padding="max_length",
    return_overflowing_tokens=True,
    return_tensors="np",
    stride=MAX_SEQ_LEN - 2
)["input_ids"].shape

(94, 129)

In [5]:
train_source = "data/train-sampled.json"
train_dest = "data/train-subsequences.json"

save_text_subsequences(
    train_source,
    train_dest,
    tokenizer,
    window_size=MAX_SEQ_LEN,
)

  0%|          | 0/135884 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (412 > 129). Running this sequence through the model will result in indexing errors
100%|██████████| 135884/135884 [01:42<00:00, 1325.32it/s]


TypeError: a bytes-like object is required, not 'list'

In [None]:
valid_source = "data/valid-sampled.json"
valid_dest = "data/valid-subsequences.json"

save_text_subsequences(
    valid_source,
    valid_dest,
    tokenizer,
    window_size=MAX_SEQ_LEN,
)