# Loading Libraries

In [10]:
# !pip install 'portalocker>=2.0.0'

In [1]:
import os
import numpy as np
import pandas as pd
import polars as pl
import spacy

import torch
import torchtext

In [2]:
# Load spaCy's small English pipeline. The return value is not assigned to a
# name, so this cell only displays the loaded object's repr.
spacy.load('en_core_web_sm')

<spacy.lang.en.English at 0x7e9989814d30>

In [3]:
# Print the installed PyTorch version string.
print(torch.__version__)

2.2.1+cu121


# Loading an In-built Dataset

In [4]:
# load imdb dataset
from torchtext.datasets import IMDB

In [5]:
# Request the 'train' and 'test' splits of IMDB in one call and unpack the two
# returned objects into train_iter and test_iter.
train_iter, test_iter = IMDB(split=('train', 'test'))

In [6]:
# Peek at the raw (label, review) records: print the first six, then stop.
for counter, (label, review) in enumerate(train_iter, start=1):
    print(f"Label: {label}")
    print(f"Review: {review}")

    if counter > 5:
        break

Label: 1
Review: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and

# Build Text Preprocessing Pipeline

## Create Tokenizer

In [7]:
from torchtext.data.utils import get_tokenizer

In [8]:
# Create the tokenizer callable that the rest of the notebook uses.

# Built-in `basic_english` tokenizer provided by torchtext
tokenizer = get_tokenizer("basic_english")

# Alternative: an external spaCy-backed tokenizer (`en_core_web_sm`)
# tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

In [9]:
# Tokenize a sample sentence. The printed result shows the tokens are
# lower-cased and the trailing "!" is emitted as its own token.
example_text = "You can now install TorchText using pip!"
tokens = tokenizer(example_text)
print(tokens)

['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']


## Build Vocabulary

In [10]:
from torchtext.vocab import build_vocab_from_iterator

In [36]:
def yield_tokens(data_iter, tokenizer=tokenizer):
    """Lazily tokenize the text of every ``(label, text)`` pair in ``data_iter``.

    Args:
        data_iter: Iterable of two-item records. The first item is ignored and
            the second is handed to ``tokenizer``.
        tokenizer: Callable applied to each text. Defaults to the notebook-level
            ``tokenizer`` object as bound when this function is defined.

    Yields:
        The value returned by ``tokenizer`` for each text, in input order.
    """
    yield from (tokenizer(text) for _label, text in data_iter)

def get_vocab(train_datapipe):
    """Build the vocabulary from the training data.

    Args:
        train_datapipe: Iterable of ``(label, text)`` pairs. It is forwarded to
            ``yield_tokens`` to produce the token stream.

    Returns:
        The object returned by ``build_vocab_from_iterator`` (built with the
        specials ``'<unk>'`` and ``'<pad>'`` and ``max_tokens=20000``), with
        its default index set to the index of ``'<unk>'``.
    """
    token_stream = yield_tokens(train_datapipe)
    built = build_vocab_from_iterator(
        iterator=token_stream,
        specials=['<unk>', '<pad>'],
        max_tokens=20000,
    )

    # Use the '<unk>' index as the default index for lookups.
    unk_index = built['<unk>']
    built.set_default_index(unk_index)
    return built

def text_transform(x):
    """Convert raw text into vocabulary indices.

    The text is tokenized with the notebook-level ``tokenizer`` and the tokens
    are then passed to the notebook-level ``vocab``. Both names are looked up
    when the function is called, so ``vocab`` must be assigned before the
    first call.

    Args:
        x: Raw text to convert.

    Returns:
        Whatever ``vocab`` returns for the token list (a list of integer
        indices in this notebook).
    """
    return vocab(tokenizer(x))
# label_pipeline = lambda x: int(x) - 1

In [37]:
# Build the vocabulary from the training split.
vocab = get_vocab(train_datapipe=train_iter)

In [38]:
# Sanity check: a single word should come back as a one-element list of indices.
text_transform("here")

[129]

### Properties of a Vocabulary

In [None]:
# review vocabulary
# vocab.get_stoi()

In [13]:
# Number of entries in the vocabulary (get_vocab builds it with max_tokens=20000)
len(vocab)

20000

In [14]:
# Show the first ten tokens of the list returned by get_itos()
vocab.get_itos()[:10]

['<UNK>', '<PAD>', '.', 'the', ',', 'a', 'and', "'", 'of', 'to']

In [15]:
# Look up the index (position in the vocab) of a token
# vocab['the']
vocab.get_stoi()['the']

3

# Generate data batch and iterator

To train a model efficiently, we build an iterator that generates batches of data.

We use `torch.utils.data.DataLoader` to generate the batches. We can customize how a batch is assembled by defining a `collate_batch()` function and passing it as the `collate_fn` argument to the `DataLoader` constructor. In `collate_batch()` we process the raw text data and add padding to dynamically match the longest sentence in a batch.

In [27]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [39]:
# Select the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [40]:
# def collate_batch(batch):
#     label_list, text_list = [], []

#     for (_label, _text) in batch:
#         label_list.append(_label)
#         processed_text = torch.tensor(text_transform(_text))
#         text_list.append(processed_text)

#     return torch.tensor(label_list), pad_sequence(text_list, padding_value=3.0)

def collate_batch(batch):
    """Turn a list of ``(label, text)`` samples into padded batch tensors.

    Each text is converted to vocabulary indices with ``text_transform``. The
    resulting sequences are right-padded with the ``'<pad>'`` index up to the
    length of the longest sequence in the batch, so row ``i`` of the text
    tensor belongs to label ``i``.

    Args:
        batch: List of ``(label, text)`` pairs handed over by the DataLoader.

    Returns:
        A ``(labels, texts)`` tuple of ``int64`` tensors moved to ``device``.
        ``labels`` has shape ``(batch_size,)`` and ``texts`` has shape
        ``(batch_size, longest_sequence_in_batch)``.
    """
    label_list, text_list = [], []

    for _label, _text in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_transform(_text), dtype=torch.int64)
        text_list.append(processed_text)

    label_list = torch.tensor(label_list, dtype=torch.int64)
    # Pad rather than concatenate: torch.cat would merge every review into one
    # flat tensor and lose where each sample starts and ends.
    text_list = pad_sequence(
        text_list,
        batch_first=True,
        padding_value=vocab['<pad>'],
    )
    return label_list.to(device), text_list.to(device)

In [41]:
# Re-create the training split, rebinding `train_iter` (the earlier object was
# already iterated by the preview loop and the vocabulary build).
train_iter = IMDB(split="train")

# Batch 8 samples at a time and assemble each batch with collate_batch.
# NOTE(review): shuffle=False keeps the dataset's own ordering; confirm this is
# intended if the loader is later used for training.
train_dataloader = DataLoader(
    train_iter,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_batch
)

# Reference

- https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71