## CS310 Natural Language Processing
## Assignment 1. Neural Text Classification

**Total points**: 50

You should roughly follow the structure of the notebook. Add additional cells if you feel they are needed. 

You can (and you should) re-use the code from Lab 2. 

Make sure your code is readable and well-structured.

### 0. Import Necessary Libraries

In [56]:
import re
import json
import torch
import jieba
from torch import nn
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

### 1. Data Processing

In [57]:
class JSONLinesIterator:
    """In-memory, indexable view of a JSON Lines file.

    The whole file is parsed eagerly in the constructor, one JSON value per
    non-blank line. The object supports ``len()``, indexing and iteration.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    def __init__(self, file_path):
        self.file_path = file_path
        with open(self.file_path, 'r', encoding='utf-8') as file:
            # Blank / whitespace-only lines (e.g. a trailing empty line) are
            # not valid JSON and would make json.loads raise, so skip them.
            self.data = [json.loads(line) for line in file if line.strip()]

    def __getitem__(self, index):
        """Return the parsed record(s) at ``index`` (delegates to the underlying list)."""
        return self.data[index]

    def __len__(self):
        """Return the number of parsed records."""
        return len(self.data)

    def __iter__(self):
        """Return a fresh iterator over the parsed records."""
        return iter(self.data)

train_iter = iter(JSONLinesIterator('train.jsonl'))

# Peek at the first five training records to see the raw JSON structure
remaining = 5
while remaining > 0:
    print(next(train_iter))
    remaining -= 1


{'sentence': '卖油条小刘说：我说', 'choices': ['0', '1'], 'label': [0], 'id': 'train_0'}
{'sentence': '保姆小张说：干啥子嘛？', 'choices': ['0', '1'], 'label': [0], 'id': 'train_1'}
{'sentence': '卖油条小刘说：你看你往星空看月朦胧，鸟朦胧', 'choices': ['0', '1'], 'label': [1], 'id': 'train_2'}
{'sentence': '卖油条小刘说：咱是不是歇一下这双，疲惫的双腿？', 'choices': ['0', '1'], 'label': [0], 'id': 'train_3'}
{'sentence': '卖油条小刘说：快把我累死了', 'choices': ['0', '1'], 'label': [0], 'id': 'train_4'}


In [58]:
#Tokenization 
def basic_tokenizer(sentence):
    """Split a sentence into individual characters in the U+4E00..U+9FFF range.

    Every character outside that range (digits, Latin letters, punctuation,
    whitespace) is dropped. Empty or non-string input yields an empty list.
    """
    if sentence and isinstance(sentence, str):
        return re.findall(r'[\u4e00-\u9fff]', sentence)
    return []

def improved_tokenizer(sentence):
    """Tokenize a sentence while keeping digits, Latin words and punctuation.

    Produces one token per character in the U+4E00..U+9FFF range, one token
    per run of digits, one token per run of ASCII letters, and one token per
    remaining non-whitespace character. Empty or non-string input yields an
    empty list.
    """
    if sentence and isinstance(sentence, str):
        return re.findall(r'[\u4e00-\u9fff]|\d+|[a-zA-Z]+|[^\u4e00-\u9fff\da-zA-Z\s]', sentence)
    return []

def jieba_tokenizer(sentence):
    """Segment a sentence into words with ``jieba.cut_for_search``.

    Every segment jieba yields is kept as-is (punctuation included; nothing
    is filtered). Empty or non-string input yields an empty list.
    """
    if sentence and isinstance(sentence, str):
        # Materialize jieba's segments into a list
        return list(jieba.cut_for_search(sentence))
    return []

def yield_tokens(data_iter):
    for item in data_iter:
        yield jieba_tokenizer(item['sentence'])

# Sanity-check yield_tokens() by printing the first six token lists
count = 0
token_stream = yield_tokens(iter(JSONLinesIterator('train.jsonl')))  # fresh pass over the file
for tokens in token_stream:
    print(tokens)
    count = count + 1
    if count >= 6:
        break

['卖', '油条', '小', '刘说', '：', '我', '说']
['保姆', '小张', '说', '：', '干', '啥子', '嘛', '？']
['卖', '油条', '小', '刘说', '：', '你', '看', '你', '往', '星空', '看', '月', '朦胧', '，', '鸟', '朦胧']
['卖', '油条', '小', '刘说', '：', '咱', '不是', '是不是', '歇', '一下', '这', '双', '，', '疲惫', '的', '双腿', '？']
['卖', '油条', '小', '刘说', '：', '快', '把', '我', '累死', '了']
['卖', '油条', '小', '刘说', '：', '我', '说', '亲爱', '的', '大姐', '你', '贵姓', '啊', '？']


In [59]:
# Build the vocabulary from the tokenized training sentences
vocab = build_vocab_from_iterator(
    yield_tokens(iter(JSONLinesIterator('train.jsonl'))),
    specials=["<unk>"],
)
# Tokens that are not in the vocabulary are mapped to the index of "<unk>"
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)
print(vocab(['卖', '油条', '小', '刘说', '：', '我', '说']))

[440, 574, 80, 767, 1, 3, 2]


In [60]:
def text_pipeline(x):
    """Convert a raw sentence into a list of vocabulary indices."""
    return vocab(jieba_tokenizer(x))


def label_pipeline(x):
    """Convert a raw label value into an ``int``."""
    return int(x)

# Try both pipelines on one sample sentence and one sample label
tokens = text_pipeline('卖油条小刘说：我说')
lbl = label_pipeline('1')

print(tokens)
print(lbl)

[440, 574, 80, 767, 1, 3, 2]
1


In [61]:
# Batch the data
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def collate_batch(batch):
    """Collate records into the flat (labels, token_ids, offsets) triple.

    Args:
        batch: Sequence of records with a ``"label"`` entry (whose first
            element is the class) and a ``"sentence"`` entry.

    Returns:
        A tuple ``(labels, token_ids, offsets)`` of int64 tensors moved to
        ``device``: one label per record, the token ids of all records
        concatenated into one 1-D tensor, and the start position of each
        record inside that concatenated tensor.
    """
    targets, id_chunks, lengths = [], [], []
    for example in batch:
        targets.append(label_pipeline(example["label"][0]))
        ids = torch.tensor(text_pipeline(example["sentence"]), dtype=torch.int64)
        id_chunks.append(ids)
        lengths.append(ids.size(0))

    labels = torch.tensor(targets, dtype=torch.int64)
    token_ids = torch.cat(id_chunks, dim=0)
    # Record i starts where records 0..i-1 end: a running sum of the lengths
    # of all preceding records, with the first record starting at 0.
    starts = torch.tensor([0] + lengths[:-1])
    offsets = torch.cumsum(starts, dim=0)

    return labels.to(device), token_ids.to(device), offsets.to(device)

In [62]:
# Wrap the training records in a DataLoader that batches with collate_batch
train_iter = JSONLinesIterator('train.jsonl')
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

# Inspect only the first batch
labels, token_ids, offsets = next(iter(dataloader))

num_tokens = token_ids.size(0)
num_examples = labels.size(0)
print('Number of tokens in this batch: ', num_tokens)
print('Number of examples in one batch: ', num_examples)
# Each example spans from its own offset up to the next example's offset
print('Example 0: ', token_ids[offsets[0]:offsets[1]])
print('Example 7: ', token_ids[offsets[7]:])

Number of tokens in this batch:  88
Number of examples in one batch:  8
Example 0:  tensor([440, 574,  80, 767,   1,   3,   2])
Example 7:  tensor([ 440,  574,   80,  767,    1, 2347])


### 2. Build the Model

In [63]:
class TextClassificationModel(nn.Module):
    """Bag-of-embeddings text classifier.

    An ``nn.EmbeddingBag`` pools the token embeddings of each example into a
    single vector, which is then passed through a three-layer fully
    connected network with ReLU activations to produce class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        hidden_dim: Width of the two hidden linear layers.
        num_class: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        layers = [
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_class),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, token_ids, offsets):
        """Return logits of shape (number of bags, num_class).

        Args:
            token_ids: 1-D tensor holding the token ids of all examples,
                concatenated.
            offsets: 1-D tensor with the start index of each example inside
                ``token_ids``.
        """
        return self.fc(self.embedding(token_ids, offsets))
    
# Build the model
train_iter = iter(JSONLinesIterator('train.jsonl'))
# The number of classes is the number of distinct first-label values in the training file
num_class = len({item["label"][0] for item in train_iter})
vocab_size = len(vocab)
emsize = 64
hidden_dim = 128
model = TextClassificationModel(vocab_size, emsize, hidden_dim, num_class).to(device)

# Smoke-test the model on the first batch only, without tracking gradients
model.eval()
with torch.no_grad():
    labels, token_ids, offsets = next(iter(dataloader))
    output = model(token_ids, offsets)
print('output size:', output.size())

# Training hyperparameters
EPOCHS = 10
LR = 1
BATCH_SIZE = 8

# Loss, optimizer, and a scheduler that multiplies the LR by 0.1 on each step() call
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.1)

output size: torch.Size([8, 2])


### 3. Train and Evaluate

In [64]:
import time

def train(model, dataloader, optimizer, criterion, epoch: int):
    """Train ``model`` for one pass over ``dataloader``.

    Every 500 batches (excluding batch 0) the accuracy accumulated since the
    previous report is printed and the counters are reset.

    Args:
        model: Module called as ``model(token_ids, offsets)`` returning logits.
        dataloader: Iterable of ``(labels, token_ids, offsets)`` batches;
            ``len(dataloader)`` is used only in the progress message.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        epoch: Epoch number, used only in the progress message.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (labels, token_ids, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        output = model(token_ids, offsets)
        # Flatten to 1-D. Unlike squeeze(), this keeps a batch of size one as
        # shape (1,) instead of collapsing it to a 0-dim tensor, which would
        # break both the loss and labels.size(0) below.
        labels = labels.reshape(-1)
        loss = criterion(output, labels)
        loss.backward()
        # Clip the global gradient norm to 0.1 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        predicted = output.argmax(dim=1)  # index of the largest logit per example
        total_acc += (predicted == labels).sum().item()
        total_count += labels.size(0)
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0

@torch.no_grad()
def evaluate(model, dataloader, criterion):
    """Compute accuracy and weighted precision / recall / F1 over ``dataloader``.

    Args:
        model: Module called as ``model(token_ids, offsets)`` returning logits.
        dataloader: Iterable of ``(labels, token_ids, offsets)`` batches.
        criterion: Unused; kept so existing call sites stay valid.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``; the last three use
        ``average='weighted'``.

    Raises:
        ValueError: If ``dataloader`` yields no examples.
    """
    model.eval()
    total_acc, total_count = 0, 0
    all_labels = []
    all_predicted = []

    for labels, text, offsets in dataloader:
        output = model(text, offsets)
        # Flatten to 1-D. Unlike squeeze(), this keeps a batch of size one as
        # shape (1,), so it can still be iterated and counted below.
        labels = labels.reshape(-1)
        predicted = output.argmax(dim=1)  # index of the largest logit per example

        all_labels.extend(labels.tolist())
        all_predicted.extend(predicted.tolist())

        total_acc += (predicted == labels).sum().item()
        total_count += labels.size(0)

    if total_count == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError
        raise ValueError("evaluate() received a dataloader with no examples")

    accuracy = total_acc / total_count
    precision = precision_score(all_labels, all_predicted, average='weighted')
    recall = recall_score(all_labels, all_predicted, average='weighted')
    f1 = f1_score(all_labels, all_predicted, average='weighted')

    return accuracy, precision, recall, f1

In [65]:
# Prepare train, valid, and test data
train_iter = JSONLinesIterator('train.jsonl')
test_iter = JSONLinesIterator('test.jsonl')

# Hold out 5% of the training file for validation. The split is seeded so the
# same examples land in each part on every Restart & Run All.
SPLIT_SEED = 42
num_train = int(len(train_iter) * 0.95)
split_train_, split_valid_ = random_split(
    train_iter,
    [num_train, len(train_iter) - num_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training loader is shuffled; the metrics computed on the validation
# and test loaders do not depend on batch order, so those keep a fixed order.
train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_iter, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [66]:
# Run the training loop
total_accu = None  # best validation accuracy seen so far
separator = "-" * 59
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model, train_dataloader, optimizer, criterion, epoch)
    accuracy, precision, recall, f1 = evaluate(model, valid_dataloader, criterion)

    # Keep the best accuracy; decay the learning rate only when validation
    # accuracy drops below it.
    if total_accu is None or total_accu <= accuracy:
        total_accu = accuracy
    else:
        scheduler.step()

    print(separator)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accuracy
        )
    )
    print(separator)

# Save the trained weights, then report metrics on the held-out test set
torch.save(model.state_dict(), "text_classification_model.pth")
test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_dataloader, criterion)
test_report = (
    ("Test Accuracy: {:.3f}", test_accuracy),
    ("Test Precision: {:.3f}", test_precision),
    ("Test Recall: {:.3f}", test_recall),
    ("Test F1-Score: {:.3f}", test_f1),
)
for template, value in test_report:
    print(template.format(value))

| epoch   1 |   500/ 1506 batches | accuracy    0.711
| epoch   1 |  1000/ 1506 batches | accuracy    0.712
| epoch   1 |  1500/ 1506 batches | accuracy    0.708
-----------------------------------------------------------
| end of epoch   1 | time:  9.14s | valid accuracy    0.748 
-----------------------------------------------------------
| epoch   2 |   500/ 1506 batches | accuracy    0.720
| epoch   2 |  1000/ 1506 batches | accuracy    0.702
| epoch   2 |  1500/ 1506 batches | accuracy    0.708
-----------------------------------------------------------
| end of epoch   2 | time:  8.30s | valid accuracy    0.744 
-----------------------------------------------------------
| epoch   3 |   500/ 1506 batches | accuracy    0.713
| epoch   3 |  1000/ 1506 batches | accuracy    0.710
| epoch   3 |  1500/ 1506 batches | accuracy    0.720
-----------------------------------------------------------
| end of epoch   3 | time:  7.59s | valid accuracy    0.740 
-------------------------------

### 4. Explore Word Segmentation

In [67]:
# Human-readable class names, indexed by the predicted class id
sentiment_labels = [
    'not humor',
    'humor',
]

def predict(text, model, vocab, labels, tokenizer=None):
    """Classify a single sentence and return its label name.

    Args:
        text: Raw sentence to classify.
        model: Module called as ``model(token_ids, offsets)`` returning
            logits of shape (1, number of classes).
        vocab: Callable mapping a list of tokens to a list of integer ids.
        labels: Sequence of label names indexed by class id.
        tokenizer: Optional callable mapping a sentence to a list of tokens.
            Defaults to ``jieba_tokenizer``; pass ``basic_tokenizer`` or
            ``improved_tokenizer`` to compare segmentation schemes.

    Returns:
        The entry of ``labels`` at the index of the largest logit.
    """
    if tokenizer is None:
        # Resolved at call time so the default keeps the original behaviour
        tokenizer = jieba_tokenizer

    # Build the inputs on the device the model lives on
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    with torch.no_grad():
        # An explicit int64 dtype keeps an empty token list usable as an index
        # tensor; without it torch.tensor([]) would be created as float.
        token_ids = torch.tensor(
            vocab(tokenizer(text)), dtype=torch.int64, device=target_device
        )
        # A single bag that starts at position 0
        offsets = torch.tensor([0], dtype=torch.int64, device=target_device)

        output = model(token_ids, offsets)
        return labels[output.argmax(1).item()]

# Classify one sample sentence with the trained model
ex_text_str = "小刘说：我要卖油条"
predicted_label = predict(ex_text_str, model, vocab, sentiment_labels)
print("This is a %s sentence." % predicted_label)

This is a not humor sentence.
