In [3]:
import jieba
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Parameters
MAX_LEN = 100
BATCH_SIZE = 64

# Load and preprocess data
def load_data(file_path):
    texts, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            text, label = line.strip().split('\t')
            tokens = list(jieba.cut(text))
            texts.append(tokens)
            labels.append(int(label))
    return texts, labels

train_texts, train_labels = load_data('train.txt')
dev_texts, dev_labels = load_data('dev.txt')
test_texts, test_labels = load_data('test.txt')

# Build vocabulary
from collections import Counter
all_tokens = [token for text in train_texts for token in text]
vocab = {word: idx+2 for idx, (word, _) in enumerate(Counter(all_tokens).most_common())}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

# Encoding and padding
def encode_texts(texts):
    encoded_texts = []
    for tokens in texts:
        encoded = [vocab.get(token, vocab['<UNK>']) for token in tokens]
        if len(encoded) < MAX_LEN:
            encoded += [vocab['<PAD>']] * (MAX_LEN - len(encoded))
        else:
            encoded = encoded[:MAX_LEN]
        encoded_texts.append(encoded)
    return np.array(encoded_texts)

train_inputs = encode_texts(train_texts)
dev_inputs = encode_texts(dev_texts)
test_inputs = encode_texts(test_texts)
train_labels = np.array(train_labels)
dev_labels = np.array(dev_labels)
test_labels = np.array(test_labels)

# Create Datasets and Dataloaders
class TextDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = torch.LongTensor(inputs)
        self.labels = torch.LongTensor(labels)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

train_dataset = TextDataset(train_inputs, train_labels)
dev_dataset = TextDataset(dev_inputs, dev_labels)
test_dataset = TextDataset(test_inputs, test_labels)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [4]:
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_classes=2):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, 100, (k, embed_dim)) for k in [3, 4, 5]
        ])
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(300, num_classes)

    def forward(self, x):
        x = self.embedding(x)  # [B, L, D]
        x = x.unsqueeze(1)     # [B, 1, L, D]
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]  # [(B, Co, Lk), ...]
        x = [F.max_pool1d(item, item.size(2)).squeeze(2) for item in x]  # [(B, Co), ...]
        x = torch.cat(x, 1)    # [B, Co * len(Ks)]
        x = self.dropout(x)
        output = self.fc(x)
        return output

In [6]:
# Determine the number of classes
all_labels = train_labels.tolist() + dev_labels.tolist() + test_labels.tolist()
num_classes = len(set(all_labels))
# print(f'Number of classes: {num_classes}')  # Should print 4

# Update the model initialization
VOCAB_SIZE = len(vocab)
model = TextCNN(VOCAB_SIZE, num_classes=num_classes)

# Proceed with the rest of your code
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Early Stopping Parameters
patience = 5
best_dev_loss = float('inf')
counter = 0

# Training Loop
for epoch in range(20):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    # Validation
    model.eval()
    dev_loss = 0
    with torch.no_grad():
        for inputs, labels in dev_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            dev_loss += loss.item()
    dev_loss /= len(dev_loader)
    
    print(f'Epoch {epoch+1}, Dev Loss: {dev_loss}')
    
    # Early Stopping Check
    if dev_loss < best_dev_loss:
        best_dev_loss = dev_loss
        torch.save(model.state_dict(), 'best_model.pt')
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print('Early stopping!')
            break

# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

Number of classes: 4
Epoch 1, Dev Loss: 0.8798480965197086
Epoch 2, Dev Loss: 0.7621156759560108
Epoch 3, Dev Loss: 0.6583794206380844
Epoch 4, Dev Loss: 0.6079136319458485
Epoch 5, Dev Loss: 0.6292803473770618
Epoch 6, Dev Loss: 0.6079052556306124
Epoch 7, Dev Loss: 0.6116425096988678
Epoch 8, Dev Loss: 0.6058465857058764
Epoch 9, Dev Loss: 0.6539240535348654
Epoch 10, Dev Loss: 0.7168174311518669
Epoch 11, Dev Loss: 0.6745563875883818
Epoch 12, Dev Loss: 0.7070010732859373
Epoch 13, Dev Loss: 0.6963232941925526
Early stopping!


<All keys matched successfully>

In [7]:
# Testing
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predictions = torch.argmax(outputs, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

test_accuracy = correct / total * 100
print(f'Test Accuracy: {test_accuracy:.2f}%')

Test Accuracy: 80.40%


# Report

## Config of CNN

The config of CNN is as follows:

```
embed_dim = 128
num_classes = 4
learning_rate = 1e-3
early_stopping_patience = 5
max_len = 100
batch_size = 64
max_epochs = 20
```

## Training

The test accuracy is 80.40%