In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install torch torchvision pandas numpy scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# MLP


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
import json
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

data_path = "/content/drive/MyDrive/CS5242/dataset/"

def load_data(file):
    with open(data_path + file, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    return pd.DataFrame(data)

train_df = load_data('train.json')
dev_df = load_data('dev.json')
test_public_df = load_data('test_public.json')


In [4]:
def build_vocab(sentences, max_vocab_size=10000):
    counter = Counter()
    for sent in sentences:
        counter.update(sent.split())
    vocab = {'<pad>': 0, '<unk>': 1}
    for word, _ in counter.most_common(max_vocab_size - len(vocab)):
        vocab[word] = len(vocab)
    return vocab

all_sentences = pd.concat([train_df, dev_df])['query'] + " " + pd.concat([train_df, dev_df])['title']
vocab = build_vocab(all_sentences)

embedding_dim = 100
embedding_matrix = np.random.normal(scale=0.6, size=(len(vocab), embedding_dim)).astype(np.float32)


In [5]:
def encode_text(df, vocab):
    encoded = [
        torch.tensor([vocab.get(w, vocab['<unk>']) for w in (q + " " + t).split()])
        for q, t in zip(df['query'], df['title'])
    ]
    return pad_sequence(encoded, batch_first=True, padding_value=0)

X_train_ids = encode_text(train_df, vocab)
y_train = torch.tensor(train_df['label'].astype(int).values)

X_dev_ids = encode_text(dev_df, vocab)
y_dev = torch.tensor(dev_df['label'].astype(int).values)

X_test_ids = encode_text(test_public_df, vocab)


In [6]:
class MyMLP(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, num_layers):
        super(MyMLP, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix))
        self.hidden_dim = hidden_dim
        self.layers = nn.ModuleList()

        input_dim = embedding_matrix.shape[1]
        for _ in range(num_layers - 1):
            self.layers.append(nn.Linear(input_dim, hidden_dim))
            input_dim = hidden_dim
        self.out_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids):
        x = self.embedding(input_ids).mean(dim=1)
        for layer in self.layers:
            x = F.relu(layer(x))
        x = self.out_layer(x)
        return x


In [7]:
model = MyMLP(embedding_matrix, hidden_dim=128, output_dim=3, num_layers=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


MyMLP(
  (embedding): Embedding(10000, 100)
  (layers): ModuleList(
    (0): Linear(in_features=100, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=128, bias=True)
  )
  (out_layer): Linear(in_features=128, out_features=3, bias=True)
)

In [8]:
batch_size = 64

train_loader = DataLoader(TensorDataset(X_train_ids, y_train), batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(TensorDataset(X_dev_ids, y_dev), batch_size=batch_size)
test_loader = DataLoader(TensorDataset(X_test_ids), batch_size=batch_size)


In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 5
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {epoch_loss / len(train_loader):.4f}')

    model.eval()
    preds, truths = [], []
    with torch.no_grad():
        for inputs, labels in dev_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            preds.extend(outputs.argmax(dim=1).cpu().numpy())
            truths.extend(labels.numpy())
    print(classification_report(truths, preds, digits=4))


Epoch 1, Loss: 0.8952
              precision    recall  f1-score   support

           0     0.4826    0.0198    0.0381      4894
           1     0.6315    0.9922    0.7718     12592
           2     0.6000    0.0036    0.0071      2514

    accuracy                         0.6300     20000
   macro avg     0.5714    0.3385    0.2723     20000
weighted avg     0.5911    0.6300    0.4961     20000

Epoch 2, Loss: 0.8859
              precision    recall  f1-score   support

           0     0.4868    0.0793    0.1364      4894
           1     0.6365    0.9693    0.7684     12592
           2     0.5769    0.0060    0.0118      2514

    accuracy                         0.6304     20000
   macro avg     0.5667    0.3515    0.3055     20000
weighted avg     0.5924    0.6304    0.5186     20000

Epoch 3, Loss: 0.8737
              precision    recall  f1-score   support

           0     0.5833    0.0930    0.1604      4894
           1     0.6401    0.9726    0.7721     12592
         

In [10]:
model.eval()
test_preds = []
with torch.no_grad():
    for inputs in test_loader:
        inputs = inputs[0].to(device)
        outputs = model(inputs)
        test_preds.extend(outputs.argmax(dim=1).cpu().numpy())

submission_df = pd.DataFrame({
    "id": test_public_df["id"],
    "label": [str(x) for x in test_preds]
})

submission_df.to_json("test_public_predictions.json", orient='records', lines=True, force_ascii=False)


In [12]:
import pandas as pd

df_pred = pd.read_json("test_public_predictions.json", lines=True)

df_pred.head()


Unnamed: 0,id,label
0,13475,1
1,19170,1
2,15378,1
3,11256,1
4,3189,1


In [26]:
from sklearn.metrics import accuracy_score, classification_report

df_true = load_data("test_public.json")
df_true = df_true.sort_values("id").reset_index(drop=True)

df_pred = pd.read_json("test_public_predictions.json", lines=True)
df_pred = df_pred.sort_values("id").reset_index(drop=True)

assert all(df_true["id"].values == df_pred["id"].values), "The IDs do not match, please check!"

y_true = df_true["label"].astype(int)
y_pred = df_pred["label"].astype(int)

acc = accuracy_score(y_true, y_pred)
print("Test Accuracy:", acc)
print("Classification Report:\n", classification_report(y_true, y_pred, digits=4))


Test Accuracy: 0.646
Classification Report:
               precision    recall  f1-score   support

           0     0.6166    0.1290    0.2134      1209
           1     0.6484    0.9725    0.7780      3159
           2     0.2222    0.0032    0.0062       632

    accuracy                         0.6460      5000
   macro avg     0.4957    0.3682    0.3326      5000
weighted avg     0.5868    0.6460    0.5439      5000



# CNN

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import json
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, accuracy_score


In [22]:
DATA_PATH = '/content/drive/MyDrive/CS5242/dataset/'

def load_jsonl(file):
    with open(DATA_PATH + file, 'r', encoding='utf-8') as f:
        return pd.DataFrame([json.loads(line) for line in f])

train_df = load_jsonl('train.json')
dev_df = load_jsonl('dev.json')
test_public_df = load_jsonl('test_public.json')


In [27]:
def build_vocab(sentences, max_size=10000):
    counter = Counter()
    for sent in sentences:
        counter.update(sent.split())
    vocab = {'<pad>': 0, '<unk>': 1}
    for word, _ in counter.most_common(max_size - len(vocab)):
        vocab[word] = len(vocab)
    return vocab

def encode_text(df, vocab, min_len=5):
    df['query'] = df['query'].astype(str)
    df['title'] = df['title'].astype(str)

    encoded = [
        torch.tensor([vocab.get(w, vocab['<unk>']) for w in (q + " " + t).split()])
        for q, t in zip(df['query'], df['title'])
    ]
    padded = [seq if len(seq) >= min_len else F.pad(seq, (0, min_len - len(seq)), value=0) for seq in encoded]
    return pad_sequence(padded, batch_first=True, padding_value=0)

train_text = train_df['query'].astype(str) + " " + train_df['title'].astype(str)
dev_text = dev_df['query'].astype(str) + " " + dev_df['title'].astype(str)
all_text = pd.concat([train_text, dev_text])

vocab = build_vocab(all_text)
embedding_dim = 100
embedding_matrix = np.random.normal(0, 0.1, size=(len(vocab), embedding_dim)).astype(np.float32)

X_train = encode_text(train_df, vocab)
y_train = torch.tensor(train_df['label'].astype(int).values)

X_dev = encode_text(dev_df, vocab)
y_dev = torch.tensor(dev_df['label'].astype(int).values)

X_test = encode_text(test_public_df, vocab)


In [28]:
class TextCNN(nn.Module):
    def __init__(self, embedding_matrix, num_classes, kernel_nums=[100, 100, 100], kernel_sizes=[3, 4, 5]):
        super(TextCNN, self).__init__()
        vocab_size, embed_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix))
        self.convs = nn.ModuleList([
            nn.Conv2d(1, kernel_nums[i], (kernel_sizes[i], embed_dim))
            for i in range(len(kernel_sizes))
        ])
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(sum(kernel_nums), num_classes)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)         # [batch_size, kernel_num, seq_len - k + 1]
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, input_ids):
        x = self.embedding(input_ids)          # [batch_size, seq_len, embed_dim]
        x = x.unsqueeze(1)                     # [batch_size, 1, seq_len, embed_dim]
        x = torch.cat([self.conv_and_pool(x, conv) for conv in self.convs], dim=1)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits


In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=64, shuffle=True)
dev_loader = DataLoader(TensorDataset(X_dev, y_dev), batch_size=64)
test_loader = DataLoader(TensorDataset(X_test), batch_size=64)

model = TextCNN(embedding_matrix, num_classes=3)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    model.eval()
    preds, truths = [], []
    with torch.no_grad():
        for batch_X, batch_y in dev_loader:
            batch_X = batch_X.to(device)
            logits = model(batch_X)
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            truths.extend(batch_y.numpy())
    print("Validation Accuracy:", accuracy_score(truths, preds))
    print(classification_report(truths, preds, digits=4))


Epoch 1, Loss: 2524.9415
Validation Accuracy: 0.6296
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      4894
           1     0.6296    1.0000    0.7727     12592
           2     0.0000    0.0000    0.0000      2514

    accuracy                         0.6296     20000
   macro avg     0.2099    0.3333    0.2576     20000
weighted avg     0.3964    0.6296    0.4865     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2, Loss: 2515.7200
Validation Accuracy: 0.6296
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      4894
           1     0.6296    1.0000    0.7727     12592
           2     0.0000    0.0000    0.0000      2514

    accuracy                         0.6296     20000
   macro avg     0.2099    0.3333    0.2576     20000
weighted avg     0.3964    0.6296    0.4865     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3, Loss: 2510.1681
Validation Accuracy: 0.6296
              precision    recall  f1-score   support

           0     0.5000    0.0002    0.0004      4894
           1     0.6296    0.9999    0.7727     12592
           2     0.0000    0.0000    0.0000      2514

    accuracy                         0.6296     20000
   macro avg     0.3765    0.3334    0.2577     20000
weighted avg     0.5188    0.6296    0.4866     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4, Loss: 2505.7881
Validation Accuracy: 0.6301
              precision    recall  f1-score   support

           0     0.6000    0.0025    0.0049      4894
           1     0.6301    0.9994    0.7729     12592
           2     0.8571    0.0024    0.0048      2514

    accuracy                         0.6301     20000
   macro avg     0.6957    0.3347    0.2608     20000
weighted avg     0.6512    0.6301    0.4884     20000

Epoch 5, Loss: 2502.9500
Validation Accuracy: 0.62985
              precision    recall  f1-score   support

           0     0.5769    0.0031    0.0061      4894
           1     0.6303    0.9975    0.7725     12592
           2     0.4565    0.0084    0.0164      2514

    accuracy                         0.6299     20000
   macro avg     0.5546    0.3363    0.2650     20000
weighted avg     0.5954    0.6299    0.4899     20000



In [30]:
model.eval()
test_preds = []
with torch.no_grad():
    for batch_X in test_loader:
        batch_X = batch_X[0].to(device)
        logits = model(batch_X)
        test_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())

submission = pd.DataFrame({
    "id": test_public_df["id"],
    "label": [str(x) for x in test_preds]
})
submission.to_json("test_public_predictions_cnn.json", orient="records", lines=True, force_ascii=False)


In [32]:
from sklearn.metrics import accuracy_score, classification_report

df_true = load_jsonl("test_public.json")
df_true = df_true.sort_values("id").reset_index(drop=True)

df_pred = pd.read_json("test_public_predictions_cnn.json", lines=True)
df_pred = df_pred.sort_values("id").reset_index(drop=True)

assert all(df_true["id"].values == df_pred["id"].values), "❌ The IDs do not match, please check!"

y_true = df_true["label"].astype(int)
y_pred = df_pred["label"].astype(int)

acc = accuracy_score(y_true, y_pred)
print("Test Accuracy:", acc)
print("Classification Report:\n", classification_report(y_true, y_pred, digits=4))


Test Accuracy: 0.632
Classification Report:
               precision    recall  f1-score   support

           0     0.8333    0.0041    0.0082      1209
           1     0.6322    0.9984    0.7742      3159
           2     0.2000    0.0016    0.0031       632

    accuracy                         0.6320      5000
   macro avg     0.5552    0.3347    0.2618      5000
weighted avg     0.6262    0.6320    0.4915      5000

