In [21]:
from datasets import load_dataset

# Load two configurations of the "wikitext" dataset through Hugging Face `datasets`.
# NOTE(review): later cells index a variable called `dataset`, which is not
# assigned here — confirm which of the two objects below it should refer to.
dataset_small = load_dataset("wikitext", "wikitext-2-v1")
dataset_large = load_dataset("wikitext", "wikitext-103-v1")

In [36]:
# dataset is structured like this
# top level: the splits, e.g. 'train' and 'test' (there is presumably a third,
#            validation split — confirm its exact key with dataset.keys())
# second level: examples, addressed by integer position
# third level: {'text': '...'}
# NOTE(review): `dataset` is not assigned in the cells shown (only
# dataset_small / dataset_large are) — confirm where it comes from.
print(dataset["train"][4]['text'])

 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers . Character designer <unk> Honjou and composer Hitoshi Sakimoto both returned from previous entries , along with Valkyria Chronicles II director Takeshi Ozawa . A large team of writers handled the script . The game 's opening theme was sung by May 'n . 



In [26]:
import torch
import torch.nn as nn
from dataclasses import dataclass, field

@dataclass(repr=True)
class Word2VecParams:
    """
    Class contains all the configurable parameters.

    Every setting is an annotated dataclass field. Without the annotations the
    ``@dataclass`` decorator generates no fields at all, so ``repr`` would print
    an empty ``Word2VecParams()`` and nothing could be overridden through the
    constructor. With them, ``Word2VecParams(MIN_FREQ=5)`` works and ``repr``
    lists every value, while the defaults stay readable as class attributes
    (``Word2VecParams.MIN_FREQ``).
    """
    # skipgram params
    MIN_FREQ: int = 50  # frequency cutoff for vocabulary
    SKIPGRAM_N_WORDS: int = 8  # number of neighboring words on skipgram
    T: int = 85  # distribution percentile for sampling from vocab
    NEG_SAMPLES: int = 50  # negative samples per training example
    NS_ARRAY_LEN: int = 5_000_000  # negative sample vector size
    SPECIALS: str = ""  # placeholder for words excluded due to low freq
    TOKENIZER: str = "basic_english"  # e.g. split text by spaces

    # network params
    BATCH_SIZE: int = 100  # number of documents per batch
    EMBED_DIM: int = 300  # embeddings dimension
    N_EPOCHS: int = 5  # training epochs
    # evaluated once, when the class body runs; shared by all instances
    DEVICE: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # a single loss-module instance is shared by all instances, as before
    CRITERION: nn.Module = nn.BCEWithLogitsLoss()  # loss function


params = Word2VecParams()

In [32]:
import numpy as np
from typing import Union, List, Optional

class Vocab:
    """
    Two-way lookup table between words and integer indices that also keeps
    each word's frequency.

    Attributes:
        stoi: string-to-int map, ``word -> (index, frequency)``.
        itos: int-to-string map, ``index -> (word, frequency)``.
        total_tokens: sum of all stored frequencies, NaN entries ignored.
    """

    def __init__(self, list, specials):
        """
        Args:
            list: sequence of ``(word, frequency)`` pairs; a pair's position in
                the sequence becomes the word's index. (The parameter name
                shadows the builtin inside this method only; it is kept so
                existing callers are unaffected.)
            specials: ``(word, frequency)`` pair describing the placeholder
                used for out-of-vocabulary words. Only ``specials[0]`` is stored.
        """
        # stoi= string-to-int, v[0]= word, k= index, v[1]= frequency
        self.stoi = {v[0]: (k, v[1]) for k, v in enumerate(list)}
        self.itos = {k: (v[0], v[1]) for k, v in enumerate(list)}
        self._specials = specials[0]
        # add up every stored frequency; nansum skips NaN frequencies
        self.total_tokens = np.nansum([f for _, f in self.stoi.values()], dtype=int)

    def __len__(self):
        # NOTE(review): returns one less than the number of stored entries —
        # presumably to leave the special token out of the count; confirm.
        return len(self.stoi) - 1

    def _entry(self, word):
        """Return the ``(index, frequency)`` pair for ``word``, or the special token's pair if ``word`` is unknown."""
        entry = self.stoi.get(word)
        if entry is None:
            entry = self.stoi.get(self._specials)
        return entry

    def _pick(self, word, position):
        """Shared body of get_index / get_freq: select one field of the stored pair(s)."""
        if isinstance(word, str):
            return self._entry(word)[position]
        if isinstance(word, list):
            return [self._entry(w)[position] for w in word]
        raise ValueError(f"Word {word} is not a string or list")

    def get_index(self, word: Union[str, List[str]]):
        """
        Return the index of ``word``, or a list of indices for a list of words.
        Unknown words map to the index of the special token.

        Raises:
            ValueError: if ``word`` is neither a string nor a list.
        """
        return self._pick(word, 0)

    def get_freq(self, word: Union[str, List[str]]):
        """
        Return the frequency of ``word``, or a list of frequencies for a list
        of words. Unknown words get the frequency stored for the special token.

        Raises:
            ValueError: if ``word`` is neither a string nor a list.
        """
        return self._pick(word, 1)

    def lookup_token(self, token: Union[int, List[int]]):
        """
        Return the word stored at index ``token``, or a list of words for a
        list of indices.

        Raises:
            ValueError: if an index is not in the vocabulary, or if ``token``
                is neither an integer nor a list.
        """
        # np.integer covers every NumPy integer width, not only int64
        if isinstance(token, (int, np.integer)):
            if token in self.itos:
                return self.itos[token][0]
            raise ValueError(f"Token {token} out of vocabulary")
        if isinstance(token, list):
            res = []
            for t in token:
                if t not in self.itos:
                    raise ValueError(f"Token {t} is not a valid index")
                res.append(self.itos[t][0])
            return res
        raise ValueError(f"Token {token} is not an int or list")

In [42]:
from collections import Counter, OrderedDict
import re

def yield_tokens(iterator, tokenizer):
    """
    Lazily tokenize each example and yield its filtered token list.

    Args:
        iterator: iterable of mappings that each have a 'text' entry.
        tokenizer: callable turning a string into an iterable of tokens.

    Yields:
        One list per example, holding only the tokens whose first character is
        a lowercase ASCII letter or a digit from 1 to 9.
    """
    # match() is anchored at the start of the token, so only the first
    # character is tested.
    # NOTE(review): tokens beginning with '0' or an uppercase letter are
    # dropped by this pattern — confirm that is intended.
    starts_ok = re.compile("[a-z1-9]").match
    for example in iterator:
        yield [tok for tok in tokenizer(example['text']) if starts_ok(tok)]


def build_vocab(
    iterator,
    tokenizer,
    params: Word2VecParams,
    max_tokens: Optional[int] = None,
):
    """
    Count the tokens produced by ``yield_tokens`` and build a ``Vocab``.

    Args:
        iterator: iterable of examples, forwarded to ``yield_tokens``.
        tokenizer: tokenizer callable, forwarded to ``yield_tokens``.
        params: supplies ``MIN_FREQ`` (minimum count for a token to be kept)
            and ``SPECIALS`` (the out-of-vocabulary placeholder).
        max_tokens: optional cap on the vocabulary size, counting the special
            token; ``None`` (the default) means no cap.

    Returns:
        A ``Vocab`` whose index 0 is the special token (stored with a NaN
        frequency), followed by the kept tokens ordered by descending
        frequency, ties broken lexicographically.
    """
    counter = Counter()
    for tokens in yield_tokens(iterator, tokenizer):
        counter.update(tokens)

    # sort by freq descending, then lexicographically
    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    kept = [(token, freq) for token, freq in sorted_by_freq_tuples if freq >= params.MIN_FREQ]
    if max_tokens is not None:
        # one slot of the budget is reserved for the special token
        kept = kept[:max(max_tokens - 1, 0)]

    specials = (params.SPECIALS, np.nan)
    # Put the special token in front of the kept tokens. Assigning it to
    # tokens[0] instead would discard the most frequent word and raise
    # IndexError when no token reaches MIN_FREQ.
    tokens = [specials] + kept

    return Vocab(tokens, specials)

In [57]:
import nltk
from nltk.tokenize import word_tokenize
# Fetch the 'punkt' resource before tokenizing (presumably needed by
# word_tokenize — confirm against the NLTK documentation).
nltk.download('punkt')
# Build the vocabulary from the training split.
# NOTE(review): params.TOKENIZER is "basic_english", yet NLTK's word_tokenize is
# what is passed here — confirm which tokenizer is intended.
# NOTE(review): `dataset` is not assigned in the cells shown (only
# dataset_small / dataset_large are) — confirm where it comes from.
vocab = build_vocab(dataset['train'], word_tokenize, params)

[nltk_data] Downloading package punkt to /home/timekeeper/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [71]:
# 'Had' is not a key of vocab.stoi, so .get() yields None: yield_tokens keeps
# only tokens whose first character matches [a-z1-9], which excludes every
# capitalised token.
print(vocab.stoi.get('Had'))

None


In [47]:
# Inspect one raw example from the test split; each example is a dict with a
# single 'text' entry.
dataset['test'][3]

{'text': ' Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the <unk> Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n'}