In [1]:
# imports
import datasets
import torch
import sklearn.feature_extraction
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


This notebook follows an example found on GitHub: TurkuNLP/intro-to-nlp/mlp_imdb_hf_dset_and_trainer.ipynb

### Preparation

In [2]:
# load the "imdb" dataset by name via datasets.load_dataset;
# the result is kept in `ds` and used by all following cells
ds = datasets.load_dataset("imdb")

In [3]:
# print the structure of the dataset (its splits, their features and row counts)
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [4]:
# The 'unsupervised' split is not needed here, so drop it.
# pop() with a default is used instead of `del` so the cell can be re-run:
# after the first run `ds` no longer has this key and `del` would raise KeyError.
ds.pop("unsupervised", None)

# Shuffle with a fixed seed so that Restart & Run All yields the same example
# order (and therefore the same printed examples) every time.
ds = ds.shuffle(seed=42)

### Tokenization and mapping

In [5]:
# Bag-of-words vectorizer: binary "word present / absent" features,
# vocabulary limited to the 20000 most common words
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    max_features=20000,
    binary=True,
)

# collect the raw text of every training example into a plain list
texts = [row["text"] for row in ds["train"]]

# fitting builds the vocabulary from the training texts
vectorizer.fit(texts)

In [6]:
def vectorize_example(ex):
    """Convert one example into the ids of the vocabulary words found in its text.

    Args:
        ex: a dataset example; only ex["text"] is read.

    Returns:
        A dict {"input_ids": array} holding the column indices of the non-zero
        features of the text, each shifted by +1.
    """
    # transform() expects an iterable of documents, so the single text is wrapped in a list
    bow = vectorizer.transform([ex["text"]])
    # nonzero() yields (row indices, column indices); only the columns (features) matter
    _, feature_ids = bow.nonzero()
    # shift everything by one so that index 0 is never produced:
    # 0 is reserved for a special meaning (the collator pads with zeros)
    feature_ids += 1
    return {"input_ids": feature_ids}

# sanity check: vectorize the first training example and print the resulting ids
vectorized=vectorize_example(ds["train"][0])
print(vectorized)

{'input_ids': array([   10,    58,    80,   786,   887,  1207,  1702,  1919,  2295,
        2604,  2672,  3050,  3335,  3516,  3599,  4640,  5529,  5991,
        6283,  6370,  6612,  6878,  7127,  7801,  7932,  8241,  8553,
        8809,  9602,  9630,  9631, 10392, 10399, 10475, 10558, 10695,
       11081, 11116, 11190, 11681, 11832, 12030, 12202, 12363, 12504,
       13093, 13209, 13778, 14338, 14340, 14782, 15551, 15763, 15791,
       15835, 16020, 17053, 17635, 17897, 17947, 17968, 17982, 18115,
       18562, 19107, 19112, 19377, 19398, 19712, 19807])}


In [7]:
# Sanity check: translate the ids back into words to see that everything works

# invert the vocabulary (word -> index) into an index -> word lookup table
idx2word = {index: word for word, index in vectorizer.vocabulary_.items()}

words = []
for idx in vectorized["input_ids"]:
    # the ids were shifted by +1 in vectorize_example, so undo the shift before the lookup
    words.append(idx2word[idx - 1])

# the joined words are the bag-of-words representation of the document
pprint(", ".join(words))

('10, 1930, 1950, although, and, as, be, bicycle, br, but, call, chaplin, '
 'classed, cold, comedy, deals, drama, emotional, ettore, except, fairy, film, '
 'for, good, great, hard, hilarious, humanism, is, it, italian, less, letdown, '
 'like, little, lovable, masterpiece, matters, meant, more, music, neo, not, '
 'of, or, person, picture, probably, realism, realistic, resist, scola, '
 'sentimental, serious, sex, should, stone, tale, the, thief, this, though, '
 'to, typical, version, very, warmth, was, with, wouldn')


In [10]:
# apply vectorize_example (the CountVectorizer-based function defined above)
# to the whole dataset using .map(), then show the first training example
ds_tokenized = ds.map(vectorize_example)
pprint(ds_tokenized["train"][0])

{'input_ids': [10,
               58,
               80,
               786,
               887,
               1207,
               1702,
               1919,
               2295,
               2604,
               2672,
               3050,
               3335,
               3516,
               3599,
               4640,
               5529,
               5991,
               6283,
               6370,
               6612,
               6878,
               7127,
               7801,
               7932,
               8241,
               8553,
               8809,
               9602,
               9630,
               9631,
               10392,
               10399,
               10475,
               10558,
               10695,
               11081,
               11116,
               11190,
               11681,
               11832,
               12030,
               12202,
               12363,
               12504,
               13093,
               13209,
     

The label and text remain in every example, but in addition we now have 'input_ids' for every example.

### Batching and padding

In [12]:
# define a collator function that receives a list of examples and returns a batch with padded examples

def collator(list_of_examples):
    """Collate a list of examples into a single batch of zero-padded tensors.

    Args:
        list_of_examples: list of dicts, each with a "label" entry and an
            "input_ids" sequence of integer ids (the sequence may be empty).

    Returns:
        dict with
          "labels":    tensor with the label of every example, in input order
          "input_ids": torch.long tensor of shape (number of examples, max_len),
                       every row right-padded with 0 up to the longest example

    Raises:
        ValueError: if list_of_examples is empty (max() of an empty sequence).
    """
    batch={"labels":torch.tensor([ex["label"] for ex in list_of_examples])} #assign label values to batch

    # pad examples
    tensors = []
    max_len = max(len(example["input_ids"]) for example in list_of_examples)
    for example in list_of_examples:
        # Force an integer dtype: torch.tensor([]) would otherwise be float32, and a
        # single example with no ids would make vstack() promote the whole batch to float.
        ids=torch.tensor(example["input_ids"], dtype=torch.long) #pick the input ids
        padded=torch.nn.functional.pad(ids, (0, max_len-ids.shape[0])) #pad by max - current length, pads with zero by default
        tensors.append(padded) #accumulate the padded ids
    batch["input_ids"]=torch.vstack(tensors) #all rows have the same length now, so vstack() stacks them into one 2-D tensor
    return batch