In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch
from torchmetrics import Accuracy
from json import load

In [2]:
filename = 'word_id_dict.json'
with open(filename) as f:
    word_id = load(f)
vocab_size = max(word_id.values())+2 # unk, padding

In [3]:
class RNN(nn.Module):
    def __init__(self, vocab_size=vocab_size, embedding_dim=300, hidden_size=50, num_classes=4):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=vocab_size-1)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, input):
        x = torch.argmax(input, dim=-1)
        # print(x.shape)
        x = self.embedding(x)
        # print(x.shape)
        o, x = self.rnn(x)
        # print(x.shape)
        x = self.linear(x[0])
        # print(x.shape)
        out = self.softmax(x)
        return out

In [4]:
df_train = pd.read_csv('train.feature.txt')
df_valid = pd.read_csv('valid.feature.txt')
df_test = pd.read_csv('test.feature.txt')

def tokenize(input):
    return [word_id[word] if word_id.get(word) is not None else 0 for word in input.split()]

class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df.index)
    
    def __getitem__(self, idx):
        return {
            "input": nn.functional.one_hot(torch.tensor(tokenize(self.df["title"][idx]), dtype=torch.int64), num_classes=vocab_size),
            "label": torch.tensor(self.df["category"][idx], dtype=torch.int64)
        }

def collate_fn(batch):
    inputs = []
    labels = []
    max_len = 0
    for data in batch:
        inputs.append(data['input'])
        labels.append(data['label'])
        max_len = max(max_len, len(data['input']))
    pad = [0]*(vocab_size-1)+[1]
    return {
        'input': torch.stack([torch.cat([input, torch.tensor([pad]*(max_len-len(input)), dtype=torch.int64)]) for input in inputs]),
        'label': torch.tensor(labels, dtype=torch.int64)
    }

train_data = CustomDataset(df_train[["title", "category"]])
valid_data = CustomDataset(df_valid[["title", "category"]])
test_data = CustomDataset(df_test[["title", "category"]])
train_dataloader = DataLoader(train_data, batch_size=4, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_data)
test_dataloader = DataLoader(test_data)
print(len(train_dataloader), len(valid_dataloader), len(test_dataloader))

2668 1334 1334


In [5]:
accuracy = Accuracy(task='multiclass', num_classes=4)

In [7]:
model = RNN()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
epochs = 10
for i in range(epochs):
    print("epochs:", i)

    # train
    model.train()
    total_train_loss = 0.0
    preds = []
    labels = []
    for data in train_dataloader:
        x, y = data["input"], data["label"]
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        preds.append(torch.argmax(y_pred, dim=1).numpy())
        labels.append(y.numpy())    
    
    preds = torch.tensor(np.concatenate(preds))
    labels = torch.tensor(np.concatenate(labels))
    print('train loss:', total_train_loss / len(train_dataloader), 'train acc:', float(accuracy(preds, labels)))
    
    # valid
    model.eval()
    total_valid_loss = 0.0
    preds = []
    labels = []
    for data in valid_dataloader:
        x, y = data["input"], data["label"]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        total_valid_loss += loss.item()
        
        preds.append(torch.argmax(y_pred, dim=1).numpy())
        labels.append(y.numpy())
    
    preds = torch.tensor(np.concatenate(preds))
    labels = torch.tensor(np.concatenate(labels))
    print('valid loss:', total_valid_loss / len(valid_dataloader), 'valid acc:', float(accuracy(preds, labels)))
    
# test
preds = []
labels = []
for data in test_dataloader:
    x, y = data["input"], data["label"]
    y_pred = model(x)
    
    preds.append(torch.argmax(y_pred, dim=1).numpy())
    labels.append(y.numpy())

preds = torch.tensor(np.concatenate(preds))
labels = torch.tensor(np.concatenate(labels))
print('test acc:', float(accuracy(preds, labels)))

epochs: 0
train loss: 1.2552933269801705 train acc: 0.4860382378101349
valid loss: 1.250510450245916 valid acc: 0.4790104925632477
epochs: 1
train loss: 1.1678686853738383 train acc: 0.5723388195037842
valid loss: 1.1355501987944836 valid acc: 0.617691159248352
epochs: 2
train loss: 1.1086329900432026 train acc: 0.6346514225006104
valid loss: 1.1301916982905975 valid acc: 0.6101949214935303
epochs: 3
train loss: 1.143845460873315 train acc: 0.6001686453819275
valid loss: 1.1949274832162184 valid acc: 0.5494752526283264
epochs: 4
train loss: 1.1587060728798741 train acc: 0.5833020806312561
valid loss: 1.149800824663271 valid acc: 0.5854572653770447
epochs: 5
train loss: 1.1113551015528602 train acc: 0.6308096051216125
valid loss: 1.1215149699926734 valid acc: 0.6146926283836365
epochs: 6
train loss: 1.0899765434070328 train acc: 0.6530172228813171
valid loss: 1.0956461947897207 valid acc: 0.6454272866249084
epochs: 7
train loss: 1.0778511733576275 train acc: 0.6654797792434692
valid los