## Assignment 1. Neural Text Classification
## CS310 Natural Language Processing

**Total points**: 50

You should roughly follow the structure of the notebook. Add additional cells if you feel they are needed. 

You can (and you should) re-use the code from Lab 2. 

Make sure your code is readable and well-structured.

### 0. Import Necessary Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import json
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import jieba
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from sklearn.metrics import precision_score, recall_score, f1_score
import time


  from .autonotebook import tqdm as notebook_tqdm


### 1. Data Processing

In [2]:
class TextDataset(Dataset):
    """Map-style dataset backed by a JSON Lines file.

    Every non-blank line of the file is parsed with ``json.loads`` and kept in
    memory, in file order.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document per
            line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    def __init__(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            # Iterate the file lazily and skip blank lines (for example an
            # extra empty line at the end of the file), which would otherwise
            # make ``json.loads`` raise.
            self.data = [json.loads(line) for line in file if line.strip()]

    def __len__(self):
        """Return the number of records loaded from the file."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the parsed record stored at position ``index``."""
        return self.data[index]


def basic_tokenizer(sentence):
    """Return the characters of ``sentence`` that fall in U+4E00-U+9FFF.

    Each matching character becomes its own token; every other character
    (digits, Latin letters, punctuation, whitespace) is dropped.
    """
    return [match.group() for match in re.finditer(r'[\u4e00-\u9fff]', sentence)]

def improved_tokenizer(sentence):
    """Tokenize ``sentence`` into characters, numbers, words and symbols.

    The regex alternatives are tried in this order:
      1. a single character in U+4E00-U+9FFF,
      2. a run of ASCII digits,
      3. a run of ASCII letters,
      4. any single remaining character that is not whitespace.

    Whitespace never produces a token.
    """
    token_regex = re.compile(
        r'[\u4e00-\u9fff]|[0-9]+|[a-zA-Z]+|[^\u4e00-\u9fff\da-zA-Z\s]'
    )
    return token_regex.findall(sentence)

def yield_tokens(data_iter):
    """Lazily yield the ``improved_tokenizer`` token list of each record.

    Each element of ``data_iter`` is indexed with the key ``'sentence'``.
    """
    yield from (improved_tokenizer(record['sentence']) for record in data_iter)

def collate_batch(batch):
    """Collate a list of records into flat tensors for ``nn.EmbeddingBag``.

    Relies on the module-level ``text_pipeline``, ``label_pipeline`` and
    ``device``, which are looked up when the function is called.

    Args:
        batch: Records indexed with the keys ``'sentence'`` and ``'label'``;
            only the first element of ``'label'`` is used.

    Returns:
        A ``(labels, token_ids, offsets)`` tuple of tensors moved to
        ``device``. ``token_ids`` is the concatenation of every sample's
        token ids and ``offsets[i]`` is the start index of sample ``i``
        inside it.
    """
    encoded = [
        torch.tensor(text_pipeline(record['sentence']), dtype=torch.int64)
        for record in batch
    ]
    labels = torch.tensor(
        [label_pipeline(record['label'][0]) for record in batch],
        dtype=torch.int64,
    )

    # Start offsets are the running sum of the preceding sample lengths:
    # prepend 0, drop the last length, then accumulate.
    lengths = [0] + [ids.size(0) for ids in encoded]
    offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)
    token_ids = torch.cat(encoded)

    return labels.to(device), token_ids.to(device), offsets.to(device)


BATCH_SIZE = 8
train_dataset = TextDataset('train.jsonl')
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,collate_fn=collate_batch)
test_dataset=TextDataset('test.jsonl')
test_dataloader=DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

# Preview the tokenizer on the first 51 training samples. The preview uses its
# own throw-away iterator so that it does not consume samples needed below.
count = 0
for tokens in yield_tokens(iter(train_dataset)):
    print(tokens)
    count += 1
    if count > 50:
        break

# Build the vocabulary from a fresh pass over the WHOLE training set.
# Reusing the preview iterator here would silently leave the previewed
# sentences out of the vocabulary.
train_iterator = iter(train_dataset)
vocab = build_vocab_from_iterator(yield_tokens(train_iterator), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])  # out-of-vocabulary tokens map to <unk>

# collate_batch looks these names up at call time, so defining them after the
# DataLoaders above is fine as long as no batch has been requested yet.
text_pipeline = lambda x: vocab(improved_tokenizer(x))
label_pipeline = lambda x: int(x)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Vocabulary size:", len(vocab))

['卖', '油', '条', '小', '刘', '说', '：', '我', '说']
['保', '姆', '小', '张', '说', '：', '干', '啥', '子', '嘛', '？']
['卖', '油', '条', '小', '刘', '说', '：', '你', '看', '你', '往', '星', '空', '看', '月', '朦', '胧', '，', '鸟', '朦', '胧']
['卖', '油', '条', '小', '刘', '说', '：', '咱', '是', '不', '是', '歇', '一', '下', '这', '双', '，', '疲', '惫', '的', '双', '腿', '？']
['卖', '油', '条', '小', '刘', '说', '：', '快', '把', '我', '累', '死', '了']
['卖', '油', '条', '小', '刘', '说', '：', '我', '说', '亲', '爱', '的', '大', '姐', '你', '贵', '姓', '啊', '？']
['保', '姆', '小', '张', '说', '：', '我', '免', '贵', '姓', '张', '我', '叫', '张', '凤', '姑']
['卖', '油', '条', '小', '刘', '说', '：', '凤', '姑']
['保', '姆', '小', '张', '说', '：', '天', '天', '买', '你', '的', '油', '条', '还', '没', '有', '问', '过', '师', '傅', '，', '你', '贵', '姓', '啊', '？']
['卖', '油', '条', '小', '刘', '说', '：', '我', '免', '贵', '，', '我', '姓', '刘', '，', '我', '叫', '刘', '建', '军']
['卖', '油', '条', '小', '刘', '说', '：', '凤', '姑', '姑']
['卖', '油', '条', '小', '刘', '说', '：', '我', '的', '姑', '啊', '我', '亲', '爱', '的', '姑']
['卖', '油', '条', '小', '刘

### 2. Build the Model

In [3]:
class BoWClassifier(nn.Module):
    """Bag-of-words text classifier: an ``nn.EmbeddingBag`` feeding an MLP.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        hidden_dim: Width of the two hidden linear layers.
        num_classes: Number of output logits per sample.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim,num_classes):
        super().__init__()
        # sparse=True makes the embedding table produce sparse gradients.
        self.embedding_bag = nn.EmbeddingBag(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            sparse=True,
        )
        mlp_layers = [
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        ]
        self.fc = nn.Sequential(*mlp_layers)

    def forward(self, text, offsets):
        """Return one row of ``num_classes`` logits per bag.

        Args:
            text: Flat tensor of token ids for the whole batch.
            offsets: Start index of each sample inside ``text``.
        """
        bag_vectors = self.embedding_bag(text, offsets)
        logits = self.fc(bag_vectors)
        return logits


# Model hyperparameters.
vocab_size = len(vocab)
embed_dim = 64
hidden_dim=128
num_classes = 2
model = BoWClassifier(vocab_size, embed_dim,hidden_dim, num_classes).to(device)

# Training hyperparameters.
EPOCHS = 10  
LR = 0.001

# The embedding table is created with sparse=True, so it gets SparseAdam;
# the dense MLP parameters are optimized with regular Adam.
sparse_parameters = list(model.embedding_bag.parameters())
dense_parameters = list(model.fc.parameters())
optimizer_sparse = optim.SparseAdam(sparse_parameters, lr=LR)
optimizer_dense = optim.Adam(dense_parameters, lr=LR)

criterion = nn.CrossEntropyLoss()
sparse_scheduler = optim.lr_scheduler.StepLR(optimizer_sparse, 1.0, gamma=0.1)
dense_scheduler = optim.lr_scheduler.StepLR(optimizer_dense, 1.0, gamma=0.1)

# Sanity check: push a single batch through the untrained model.
model.eval()
with torch.no_grad():
    for i, (labels, token_ids, offsets) in enumerate(train_dataloader):
        output = model(token_ids, offsets)
        break  # only the first batch is needed

# Examine the output
print('output size:', output.size())
print('output:', output)




output size: torch.Size([8, 2])
output: tensor([[-0.0576,  0.0215],
        [-0.0584,  0.0416],
        [-0.0646,  0.0185],
        [-0.0772,  0.0300],
        [-0.0675,  0.0210],
        [-0.0495,  0.0207],
        [-0.0447,  0.0492],
        [-0.0559,  0.0845]])


### 3. Train and Evaluate

In [4]:

def eval(model, dataloader,criterion):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    Note: this function shadows the built-in ``eval``; the name is kept
    because the training loops in this notebook call it.

    Args:
        model: Module called as ``model(text, offsets)`` that returns one row
            of class scores per sample.
        dataloader: Iterable of ``(label, text, offsets)`` batches.
        criterion: Unused; accepted only so existing call sites keep working.

    Returns:
        Fraction of samples whose arg-max prediction equals the label, or
        ``0.0`` when ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            output = model(text, offsets)
            total_acc += (output.argmax(1) == label).sum().item()
            total_count += label.size(0)

    # An empty dataloader would otherwise raise ZeroDivisionError.
    if total_count == 0:
        return 0.0
    return total_acc / total_count



def train(model, train_dataloader,optimizer_sparse,optimizer_dense, criterion, epoch: int,
          log_interval: int = 100, max_grad_norm: float = 0.1):
    """Train ``model`` for one pass over ``train_dataloader``.

    Args:
        model: Module called as ``model(tokens, offsets)``.
        train_dataloader: Iterable of ``(labels, tokens, offsets)`` batches.
            ``len()`` is only called on it when a progress line is printed.
        optimizer_sparse: Optimizer stepped first on every batch.
        optimizer_dense: Optimizer stepped second on every batch.
        criterion: Loss called as ``criterion(output, labels)``.
        epoch: Epoch number, used only in the progress line.
        log_interval: Print the running accuracy every this many batches
            (never for batch 0). ``0`` or ``None`` disables logging.
        max_grad_norm: Maximum total gradient norm passed to
            ``clip_grad_norm_``.

    Returns:
        None. The running accuracy is reset after each progress line.
    """
    model.train()
    total_acc, total_count = 0, 0

    for idx, (labels, tokens, offsets) in enumerate(train_dataloader):
        optimizer_sparse.zero_grad()
        optimizer_dense.zero_grad()
        output = model(tokens, offsets)
        try:
            loss = criterion(output, labels)
        except Exception:
            # Print the tensor sizes to help diagnose the failure, then let
            # the original exception propagate.
            print('Error in loss calculation')
            print('output: ', output.size())
            print('labels: ', labels.size())
            raise
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer_sparse.step()
        optimizer_dense.step()

        total_acc += (output.argmax(1) == labels).sum().item()
        total_count += labels.size(0)
        if log_interval and idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(train_dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0


# Hold out 20% of the training data for validation. A fixed seed keeps the
# split identical across runs, so validation accuracies are comparable.
SPLIT_SEED = 42
num_train = int(len(train_dataset) * 0.8)
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
# Accuracy does not depend on batch order, so evaluation loaders are not shuffled.
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

# Run the training loop
total_accu = None  # best validation accuracy seen so far
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model, train_dataloader,optimizer_sparse,optimizer_dense, criterion, epoch)
    accu_val = eval(model, valid_dataloader,criterion)

    # Step both LR schedulers when validation accuracy did not improve;
    # otherwise record the new best accuracy.
    if total_accu is not None and total_accu >= accu_val:
        sparse_scheduler.step()
        dense_scheduler.step()
    else:
        total_accu = accu_val

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

# Persist the weights from the final epoch.
torch.save(model.state_dict(), 'model.pth')

| epoch   1 |   100/ 1268 batches | accuracy    0.684
| epoch   1 |   200/ 1268 batches | accuracy    0.711
| epoch   1 |   300/ 1268 batches | accuracy    0.706
| epoch   1 |   400/ 1268 batches | accuracy    0.708
| epoch   1 |   500/ 1268 batches | accuracy    0.694
| epoch   1 |   600/ 1268 batches | accuracy    0.701
| epoch   1 |   700/ 1268 batches | accuracy    0.743
| epoch   1 |   800/ 1268 batches | accuracy    0.718
| epoch   1 |   900/ 1268 batches | accuracy    0.711
| epoch   1 |  1000/ 1268 batches | accuracy    0.736
| epoch   1 |  1100/ 1268 batches | accuracy    0.706
| epoch   1 |  1200/ 1268 batches | accuracy    0.729
-----------------------------------------------------------
| end of epoch   1 | time:  7.12s | accuracy    0.710 
-----------------------------------------------------------
| epoch   2 |   100/ 1268 batches | accuracy    0.686
| epoch   2 |   200/ 1268 batches | accuracy    0.720
| epoch   2 |   300/ 1268 batches | accuracy    0.709
| epoch   2 |  

In [5]:
# Rebuild the classifier and restore the saved weights. collate_batch moves
# every batch to `device`, so the model must live on the same device, and
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model = BoWClassifier(vocab_size, embed_dim, hidden_dim,num_classes).to(device)
model.load_state_dict(torch.load('model.pth', map_location=device))
def result_evaluate(model, dataloader, criterion):
    """Compute accuracy, precision, recall and F1 of ``model`` on ``dataloader``.

    Args:
        model: Module called as ``model(text, offsets)`` that returns one row
            of class scores per sample.
        dataloader: Iterable of ``(labels, text, offsets)`` batches.
        criterion: Not used by this function.

    Returns:
        ``(accuracy, precision, recall, f1)``. Precision, recall and F1 use
        ``average='binary'`` with ``zero_division=0``.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for labels, text, offsets in dataloader:
            scores = model(text, offsets)
            predicted.extend(scores.argmax(dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    # One call yields precision, recall and F1; the fourth value (support)
    # is not needed here.
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='binary', zero_division=0
    )

    return accuracy, precision, recall, f1

# Score the reloaded model on the test set and report each metric on its own line.
test_metrics = result_evaluate(model, test_dataloader, criterion)
accuracy, precision, recall, f1 = test_metrics
print(
    f"Accuracy: {accuracy:.8f}",
    f"Precision: {precision:.8f}",
    f"Recall: {recall:.8f}",
    f"F1 Score: {f1:.8f}",
    sep="\n",
)

Accuracy: 0.72657450
Precision: 0.45000000
Recall: 0.21176471
F1 Score: 0.28800000


### 4. Explore Word Segmentation

In [6]:
def jieba_tokenizer(sentence):
    """Return the segments produced by ``jieba.cut(sentence)`` as a list."""
    return list(jieba.cut(sentence))


def yield_tokens(data_iter):
    """Lazily yield the ``jieba_tokenizer`` token list of each record.

    This rebinds the name ``yield_tokens``, which earlier in the notebook
    referred to the character-level version. Each element of ``data_iter`` is
    indexed with the key ``'sentence'``.
    """
    yield from (jieba_tokenizer(record['sentence']) for record in data_iter)




BATCH_SIZE = 8  
train_dataset = TextDataset('train.jsonl')
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,collate_fn=collate_batch)
test_dataset=TextDataset('test.jsonl')
test_dataloader=DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

# Preview the jieba segmentation on the first 51 training samples. The preview
# uses its own throw-away iterator so that it does not consume samples needed
# below.
count = 0
for tokens in yield_tokens(iter(train_dataset)):
    print(tokens)
    count += 1
    if count > 50:
        break

# Build the vocabulary from a fresh pass over the WHOLE training set.
# Reusing the preview iterator here would silently leave the previewed
# sentences out of the vocabulary.
train_iterator = iter(train_dataset)
vocab = build_vocab_from_iterator(yield_tokens(train_iterator), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])  # out-of-vocabulary tokens map to <unk>

# collate_batch reads text_pipeline at call time, so rebinding it here makes
# every DataLoader batch use jieba tokens and the new vocabulary.
text_pipeline = lambda x: vocab(jieba_tokenizer(x))
label_pipeline = lambda x: int(x)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Vocabulary size:", len(vocab))

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/qd/pr_ybl6s7c7d3nbyj6yt9g0w0000gn/T/jieba.cache
Loading model cost 1.083 seconds.
Prefix dict has been built successfully.


['卖', '油条', '小', '刘说', '：', '我', '说']
['保姆', '小张', '说', '：', '干', '啥子', '嘛', '？']
['卖', '油条', '小', '刘说', '：', '你', '看', '你', '往', '星空', '看', '月', '朦胧', '，', '鸟', '朦胧']
['卖', '油条', '小', '刘说', '：', '咱', '是不是', '歇', '一下', '这', '双', '，', '疲惫', '的', '双腿', '？']
['卖', '油条', '小', '刘说', '：', '快', '把', '我', '累死', '了']
['卖', '油条', '小', '刘说', '：', '我', '说', '亲爱', '的', '大姐', '你', '贵姓', '啊', '？']
['保姆', '小张', '说', '：', '我免', '贵姓', '张', '我', '叫', '张凤姑']
['卖', '油条', '小', '刘说', '：', '凤姑']
['保姆', '小张', '说', '：', '天天', '买', '你', '的', '油条', '还', '没有', '问过', '师傅', '，', '你', '贵姓', '啊', '？']
['卖', '油条', '小', '刘说', '：', '我免', '贵', '，', '我姓', '刘', '，', '我', '叫', '刘建军']
['卖', '油条', '小', '刘说', '：', '凤', '姑姑']
['卖', '油条', '小', '刘说', '：', '我', '的', '姑', '啊', '我', '亲爱', '的', '姑']
['卖', '油条', '小', '刘说', '：', '我', '怎么', '那么', '别扭', '呢', '？']
['卖', '油条', '小', '刘说', '：', '我', '自从', '见', '了', '你', '以后', '我', '的', '这个', '生活', '，', '我', '的', '这个', '事业', '发生', '了', '翻天覆地', '的', '变化']
['卖', '油条', '小', '刘说', '：', '只要', '你', 

In [7]:
# Model hyperparameters; the vocabulary size now reflects the jieba vocabulary.
vocab_size = len(vocab)
embed_dim = 64
hidden_dim=128
num_classes = 2
model = BoWClassifier(vocab_size, embed_dim,hidden_dim, num_classes).to(device)

# Training hyperparameters.
EPOCHS = 10  
LR = 0.001

# The embedding table is created with sparse=True, so it gets SparseAdam;
# the dense MLP parameters are optimized with regular Adam.
sparse_parameters = list(model.embedding_bag.parameters())
dense_parameters = list(model.fc.parameters())
optimizer_sparse = optim.SparseAdam(sparse_parameters, lr=LR)
optimizer_dense = optim.Adam(dense_parameters, lr=LR)

criterion = nn.CrossEntropyLoss()
sparse_scheduler = optim.lr_scheduler.StepLR(optimizer_sparse, 1.0, gamma=0.1)
dense_scheduler = optim.lr_scheduler.StepLR(optimizer_dense, 1.0, gamma=0.1)

# Sanity check: push a single batch through the untrained model.
model.eval()
with torch.no_grad():
    for i, (labels, token_ids, offsets) in enumerate(train_dataloader):
        output = model(token_ids, offsets)
        break  # only the first batch is needed

# Examine the output
print('output size:', output.size())
print('output:', output)

output size: torch.Size([8, 2])
output: tensor([[-0.0821, -0.0335],
        [-0.0669,  0.0211],
        [-0.0743, -0.0733],
        [-0.0807,  0.0118],
        [-0.0721,  0.0080],
        [-0.0567,  0.0034],
        [-0.1384,  0.0338],
        [-0.0365, -0.0220]])


In [8]:
# Hold out 20% of the training data for validation. A fixed seed keeps the
# split identical across runs, so validation accuracies are comparable.
SPLIT_SEED = 42
num_train = int(len(train_dataset) * 0.8)
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
# Accuracy does not depend on batch order, so evaluation loaders are not shuffled.
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)


# Run the training loop
total_accu = None  # best validation accuracy seen so far
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model, train_dataloader,optimizer_sparse,optimizer_dense, criterion, epoch)
    accu_val = eval(model, valid_dataloader,criterion)

    # Step both LR schedulers when validation accuracy did not improve;
    # otherwise record the new best accuracy.
    if total_accu is not None and total_accu >= accu_val:
        sparse_scheduler.step()
        dense_scheduler.step()
    else:
        total_accu = accu_val

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

# Persist the weights from the final epoch.
torch.save(model.state_dict(), 'model_jieba.pth')

| epoch   1 |   100/ 1268 batches | accuracy    0.707
| epoch   1 |   200/ 1268 batches | accuracy    0.705
| epoch   1 |   300/ 1268 batches | accuracy    0.699
| epoch   1 |   400/ 1268 batches | accuracy    0.708
| epoch   1 |   500/ 1268 batches | accuracy    0.700
| epoch   1 |   600/ 1268 batches | accuracy    0.713
| epoch   1 |   700/ 1268 batches | accuracy    0.721
| epoch   1 |   800/ 1268 batches | accuracy    0.700
| epoch   1 |   900/ 1268 batches | accuracy    0.708
| epoch   1 |  1000/ 1268 batches | accuracy    0.710
| epoch   1 |  1100/ 1268 batches | accuracy    0.699
| epoch   1 |  1200/ 1268 batches | accuracy    0.738
-----------------------------------------------------------
| end of epoch   1 | time:  8.85s | accuracy    0.723 
-----------------------------------------------------------
| epoch   2 |   100/ 1268 batches | accuracy    0.703
| epoch   2 |   200/ 1268 batches | accuracy    0.726
| epoch   2 |   300/ 1268 batches | accuracy    0.714
| epoch   2 |  

In [9]:
# Rebuild the classifier and restore the jieba-trained weights. collate_batch
# moves every batch to `device`, so the model must live on the same device,
# and map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model = BoWClassifier(vocab_size, embed_dim,hidden_dim, num_classes).to(device)
model.load_state_dict(torch.load('model_jieba.pth', map_location=device))

# Use the evaluation function
accuracy, precision, recall, f1 = result_evaluate(model, test_dataloader, criterion)
print(f"Accuracy: {accuracy:.8f}")
print(f"Precision: {precision:.8f}")
print(f"Recall: {recall:.8f}")
print(f"F1 Score: {f1:.8f}")

Accuracy: 0.71735791
Precision: 0.44696970
Recall: 0.34705882
F1 Score: 0.39072848
