In [None]:
from datasets import load_dataset

In [None]:
# Load the SNLI dataset through `datasets.load_dataset`; later cells read from `ds`.
ds = load_dataset("snli")

In [None]:
# Display the loaded dataset object to see what it contains.
ds

In [None]:
# Inspect the feature schema of the train split.
ds["train"].features

In [None]:
# Peek at the first three records of the train split.
ds["train"][0:3]

In [21]:
from typing import List


def split_tokens_and_subtokens(sent: str) -> List[str]:
    """Split a sentence into word pieces on whitespace and non-alphanumerics.

    The sentence is first split on whitespace. Each resulting word is then
    scanned character by character: every non-alphanumeric character closes
    the piece collected so far and opens a new piece spelled ``"##" + char``,
    which keeps absorbing the alphanumeric characters that follow it.

    Note that a word *starting* with a non-alphanumeric character therefore
    yields a first piece that also carries the ``##`` prefix
    (``"(hello)"`` -> ``["##(hello", "##)"]``).

    Args:
        sent: Raw sentence text.

    Returns:
        The pieces in reading order; an empty list for empty or
        whitespace-only input. Case is preserved.
    """
    pieces: List[str] = []
    for word in sent.split():
        # Characters of the piece currently being assembled.
        buffer: List[str] = []
        for ch in word:
            if not ch.isalnum():
                # Flush whatever was collected, then start a "##"-marked piece.
                if buffer:
                    pieces.append("".join(buffer))
                buffer = ["##", ch]
                continue
            buffer.append(ch)
        # `str.split()` never yields empty words, so the buffer is non-empty here.
        pieces.append("".join(buffer))
    return pieces


# Sanity check on a sentence containing an apostrophe and a trailing full stop.
split_tokens_and_subtokens("A person who's on a horse jumps over a broken down airplane.")

['A',
 'person',
 'who',
 "##'s",
 'on',
 'a',
 'horse',
 'jumps',
 'over',
 'a',
 'broken',
 'down',
 'airplane',
 '##.']

In [24]:
from typing import Optional
import itertools
import random
from tqdm.auto import tqdm
from hlm12nli.tokenisation import Hlm12NliTextTokeniserConfig


def produce_vocab_for(
    filepath: str,
    sample_size: Optional[int] = None,
    seed: Optional[int] = None,
) -> None:
    """Build a vocabulary from the SNLI training premises and write it to a file.

    Every premise in the module-level ``ds["train"]["premise"]`` (or a random
    sample of them) is split with ``split_tokens_and_subtokens``; the pieces
    are lower-cased, de-duplicated and sorted. The file is written with one
    entry per line and no trailing newline, starting with the four special
    tokens of ``Hlm12NliTextTokeniserConfig`` in the order ``token_pad``,
    ``token_oov``, ``token_str``, ``token_end``.

    Args:
        filepath: Destination path; an existing file is overwritten.
        sample_size: When a positive integer, only that many randomly chosen
            premises are used. ``None`` or a non-positive value uses them all.
        seed: Optional seed for the sampling step so that a sampled
            vocabulary (e.g. a test fixture) can be regenerated exactly.
            ``None`` keeps using the global ``random`` state.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``sample_size``
            exceeds the number of premises.
    """
    ref = Hlm12NliTextTokeniserConfig()
    data = ds["train"]["premise"]
    if sample_size is not None and sample_size > 0:
        # A dedicated generator keeps seeded runs independent of global state.
        sampler = random.Random(seed) if seed is not None else random
        data = sampler.sample(data, sample_size)

    # Stream pieces lazily instead of materialising one list per sentence.
    tokens = itertools.chain.from_iterable(
        split_tokens_and_subtokens(sent) for sent in tqdm(data)
    )
    vocab = sorted({token.lower() for token in tokens})

    # Special tokens go first, in a fixed order.
    specials = [ref.token_pad, ref.token_oov, ref.token_str, ref.token_end]

    # Explicit UTF-8: `isalnum` accepts non-ASCII letters, and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write("\n".join(specials + vocab))

In [25]:
# Build the vocabulary from every training premise (no sampling).
produce_vocab_for("../data/vocab.txt")

100%|██████████| 550152/550152 [00:14<00:00, 38103.18it/s]


In [26]:
# Build a small vocabulary from 10 randomly sampled premises, written under tests/fixtures.
produce_vocab_for("../tests/fixtures/vocab.txt", sample_size=10)

100%|██████████| 10/10 [00:00<00:00, 2529.74it/s]
