# CS310 Natural Language Processing
# Lab 2: Neural Text Classification

This tutorial is adopted from the official PyTorch tutorial: *Text classification with the torchtext library*
https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html#text-classification-with-the-torchtext-library

### Install torchtext

Url: https://pypi.org/project/torchtext/
```bash
conda install -c pytorch torchtext
```

You may or may not need to manually install the following packages:
    
```bash
pip install chardet
pip install -U portalocker>=2.0.0
```

or with conda

```bash
conda install -c conda-forge 'portalocker>=2.0.0'
```

In [3]:
import torch
from torchtext.datasets import SST2 # SST2 is the sentiment analysis dataset, binary

In [4]:
# Check the raw data
train_iter = iter(SST2(split='train'))

count = 0
for item in train_iter:
    print(item)
    count += 1
    if count > 7:
        break

('hide new secretions from the parental units', 0)
('contains no wit , only labored gags', 0)
('that loves its characters and communicates something rather beautiful about human nature', 1)
('remains utterly satisfied to remain the same throughout', 0)
('on the worst revenge-of-the-nerds clichés the filmmakers could dredge up', 0)
("that 's far too tragic to merit such superficial treatment", 0)
('demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop .', 1)
('of saucy', 1)


### Apply Tokenization

In [5]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)

In [6]:
# Check the output of yield_tokens()
count = 0
for tokens in yield_tokens(iter(SST2(split='train'))): # Use a new iterator
    print(tokens)
    count += 1
    if count > 7:
        break

['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units']
['contains', 'no', 'wit', ',', 'only', 'labored', 'gags']
['that', 'loves', 'its', 'characters', 'and', 'communicates', 'something', 'rather', 'beautiful', 'about', 'human', 'nature']
['remains', 'utterly', 'satisfied', 'to', 'remain', 'the', 'same', 'throughout']
['on', 'the', 'worst', 'revenge-of-the-nerds', 'clichés', 'the', 'filmmakers', 'could', 'dredge', 'up']
['that', "'", 's', 'far', 'too', 'tragic', 'to', 'merit', 'such', 'superficial', 'treatment']
['demonstrates', 'that', 'the', 'director', 'of', 'such', 'hollywood', 'blockbusters', 'as', 'patriot', 'games', 'can', 'still', 'turn', 'out', 'a', 'small', ',', 'personal', 'film', 'with', 'an', 'emotional', 'wallop', '.']
['of', 'saucy']


### Build Vocabulary

In [7]:
vocab = build_vocab_from_iterator(yield_tokens(iter(SST2(split='train'))), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [8]:
# Check the vocab
print(vocab(['here', 'is', 'an', 'example']))
print(vocab(['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units']))
print(vocab(['of', 'saucy']))

# What about unknown words, i.e., out-of-vocabulary (OOV) words?
print(vocab(['here', 'is', 'a', '@#$@!#$%']))

[224, 10, 16, 1567]
[4579, 92, 13266, 38, 1, 7742, 10000]
[5, 7100]
[224, 10, 3, 0]


In [9]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x)

In [10]:
# Test text_pipeline()
tokens = text_pipeline('here is the an example')
print(tokens)

# Test label_pipeline()
lbl = label_pipeline('1')
print(lbl)

[224, 10, 1, 16, 1567]
1


### Data Batch

Define the `Collate_batch` function, which will be used to process the "raw" data batch.

In [11]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, token_ids_list, offsets = [], [], [0]
    for _text, _label in batch:
        label_list.append(label_pipeline(_label))
        token_ids = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        token_ids_list.append(token_ids)
        offsets.append(token_ids.size(0))

    labels = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    token_ids = torch.cat(token_ids_list)

    return labels.to(device), token_ids.to(device), offsets.to(device)

In [12]:
# Use collate_batch to generate the dataloader
train_iter = SST2(split="train")
dataloader = DataLoader(
    train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch
)

In [13]:
# Test the dataloader
for i, (labels, token_ids, offsets) in enumerate(dataloader):
    print(f"batch {i} label: {labels}")
    print(f"batch {i} text: {token_ids}")
    print(f"batch {i} offsets: {offsets}")
    if i == 0:
        break

# What does offsets mean?
print('Number of tokens: ', token_ids.size(0))
print('Number of examples in one batch: ', labels.size(0))
print('Example 1: ', token_ids[offsets[0]:offsets[1]])
print('Example 8: ', token_ids[offsets[7]:])

batch 0 label: tensor([0, 0, 1, 0, 0, 0, 1, 1])
batch 0 text: tensor([ 4579,    92, 13266,    38,     1,  7742, 10000,  2927,    58,   327,
            2,    88,  1995,   548,    11,  1791,    18,    54,     4,  6088,
           95,   184,   262,    36,   176,   624,   591,   679,  6403,     8,
         2010,     1,   287,   701,    25,     1,   252,  5417,   551,     1,
          357,   116,  4856,    53,    11,     7,     9,   171,    50,   780,
            8,  1840,   120,   952,  1037,  2723,    11,     1,   107,     5,
          120,   161,  3473,    14,  7011,  1444,    65,   149,   414,    49,
            3,   394,     2,   529,    17,    15,    16,   205,  3149,     6,
            5,  7100])
batch 0 offsets: tensor([ 0,  7, 14, 26, 34, 44, 55, 80])
Number of tokens:  82
Number of examples in one batch:  8
Example 1:  tensor([ 4579,    92, 13266,    38,     1,  7742, 10000])
Example 8:  tensor([   5, 7100])


### Define the Model

In [15]:
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, token_ids, offsets):
        embedded = self.embedding(token_ids, offsets)
        return self.fc(embedded)

In [16]:
# Build the model
train_iter = iter(SST2(split='train'))
num_class = len(set([label for (_, label) in train_iter]))
vocab_size = len(vocab)
emsize = 64 # embedding size
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [17]:
# Test the model
model.eval()
with torch.no_grad():
    for i, (labels, token_ids, offsets) in enumerate(dataloader):
        output = model(token_ids, offsets)
        # print(f"batch {i} output: {output}")
        if i == 0:
            break

# Examine the output
print('output size:', output.size())
print('output:', output)

output size: torch.Size([8, 2])
output: tensor([[-0.0107, -0.0334],
        [-0.4063, -0.0219],
        [ 0.0674,  0.1346],
        [-0.0334,  0.2256],
        [-0.2274,  0.0812],
        [-0.1080, -0.3154],
        [ 0.0026,  0.2916],
        [-0.4137, -0.0761]])




### Train and Evaluate Functions
Define train() and evaluate()

In [20]:
import time

def train(model, dataloader, optimizer, criterion, epoch: int):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (labels, token_ids, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        output = model(token_ids, offsets)
        try:
            loss = criterion(output, labels)
        except Exception:
            print('Error in loss calculation')
            print('output: ', output.size())
            print('labels: ', labels.size())
            # print('token_ids: ', token_ids)
            # print('offsets: ', offsets)
            raise
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        total_acc += (output.argmax(1) == labels).sum().item()
        total_count += labels.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(model, dataloader, criterion):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            output = model(text, offsets)
            loss = criterion(output, label)
            total_acc += (output.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count

### Hyper-parameters, loss, optimizer, and learning-rate scheduler

In [21]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate
BATCH_SIZE = 8  # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

Test `criterion`, i.e., the loss function

In [22]:
# First, obtain some output and labels
model.eval()
with torch.no_grad():
    for i, (labels, token_ids, offsets) in enumerate(dataloader):
        output = model(token_ids, offsets)
        # print(f"batch {i} output: {output}")
        if i == 0:
            break

loss = criterion(output, labels)
print('loss:', loss)

criterion2 = torch.nn.CrossEntropyLoss(reduction='none')
loss2 = criterion2(output, labels)
print('loss non-reduced:', loss2)
print('mean of loss non-reduced:', torch.mean(loss2))

# Manually calculate the loss
probs = torch.exp(output[0,:]) / torch.exp(output[0,:]).sum()
loss3 = -torch.log(probs[labels[0]])
print('loss manually computed:', loss3)

loss: tensor(0.7035)
loss non-reduced: tensor([0.6819, 0.9037, 0.6601, 0.8310, 0.8593, 0.5948, 0.5590, 0.5385])
mean of loss non-reduced: tensor(0.7035)
loss manually computed: tensor(0.6819)


In [23]:
# Prepare train, valid, and test data
train_iter = SST2(split="train")
test_iter = SST2(split="test")
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)

### Main Training Loop

In [24]:
# Run the training loop
total_accu = None
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model, train_dataloader, optimizer, criterion, epoch)
    accu_val = evaluate(model, valid_dataloader, criterion)

    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

| epoch   1 |   500/ 7998 batches | accuracy    0.588
| epoch   1 |  1000/ 7998 batches | accuracy    0.665
| epoch   1 |  1500/ 7998 batches | accuracy    0.712
| epoch   1 |  2000/ 7998 batches | accuracy    0.750
| epoch   1 |  2500/ 7998 batches | accuracy    0.773
| epoch   1 |  3000/ 7998 batches | accuracy    0.777
| epoch   1 |  3500/ 7998 batches | accuracy    0.789
| epoch   1 |  4000/ 7998 batches | accuracy    0.795
| epoch   1 |  4500/ 7998 batches | accuracy    0.819
| epoch   1 |  5000/ 7998 batches | accuracy    0.821
| epoch   1 |  5500/ 7998 batches | accuracy    0.826
| epoch   1 |  6000/ 7998 batches | accuracy    0.837
| epoch   1 |  6500/ 7998 batches | accuracy    0.848
| epoch   1 |  7000/ 7998 batches | accuracy    0.833
| epoch   1 |  7500/ 7998 batches | accuracy    0.843
-----------------------------------------------------------
| end of epoch   1 | time: 15.72s | valid accuracy    0.809 
-----------------------------------------------------------
| epoch  

In [25]:
# Save the model
torch.save(model.state_dict(), "text_classification_model.pth")

### Evaluate with Test Data

This is a necessary step. But since the `test` split of SST2 is not annotated, we will use the `dev` split here to pretend it is the test data.

In [26]:
accu_test = evaluate(model, valid_dataloader, criterion)
print("test accuracy {:8.3f}".format(accu_test))

test accuracy    0.904


### Predict

Test the model with a few unannotated examples.

In [27]:
sentiment_labels = ['negative', 'positive']

def predict(text, model, vocab, tokenizer, labels):
    model.eval()
    with torch.no_grad():
        text = torch.tensor(vocab(tokenizer(text)), device=device)
        output = model(text, torch.tensor([0], device=device))
        return labels[output.argmax(1).item()]

ex_text_str = "A very well-made, funny and entertaining picture."
print("This is a %s sentiment." % (predict(ex_text_str, model, vocab, tokenizer, sentiment_labels)))

This is a positive sentiment.
