In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch
from torchmetrics import Accuracy
from json import load

In [2]:
filename = 'word_id_dict.json'
with open(filename) as f:
    word_id = load(f)
vocab_size = max(word_id.values())+2 # unk, padding

In [3]:
class RNN(nn.Module):
    def __init__(self, vocab_size=vocab_size, embedding_dim=300, hidden_size=50, num_classes=4):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, input):
        x = torch.argmax(input, dim=-1)
        # print(x.shape)
        x = self.embedding(x)
        # print(x.shape)
        o, x = self.rnn(x)
        # print(x.shape)
        x = self.linear(x[0])
        # print(x.shape)
        out = self.softmax(x)
        return out

In [4]:
df_train = pd.read_csv('train.feature.txt')
df_valid = pd.read_csv('valid.feature.txt')
df_test = pd.read_csv('test.feature.txt')

def tokenize(input):
    return [word_id[word] if word_id.get(word) is not None else 0 for word in input.split()]

class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df.index)
    
    def __getitem__(self, idx):
        return {
            "input": nn.functional.one_hot(torch.tensor(tokenize(self.df["title"][idx]), dtype=torch.int64), num_classes=vocab_size),
            "label": torch.tensor(self.df["category"][idx], dtype=torch.int64)
        }

train_data = CustomDataset(df_train[["title", "category"]])
valid_data = CustomDataset(df_valid[["title", "category"]])
test_data = CustomDataset(df_test[["title", "category"]])
train_dataloader = DataLoader(train_data, batch_size=1, shuffle=True)
valid_dataloader = DataLoader(valid_data)
test_dataloader = DataLoader(test_data)
print(len(train_dataloader), len(valid_dataloader), len(test_dataloader))

10672 1334 1334


In [5]:
accuracy = Accuracy(task='multiclass', num_classes=4)

In [6]:
model = RNN()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)
epochs = 10
for i in range(epochs):
    print("epochs:", i)

    # train
    model.train()
    total_train_loss = 0.0
    preds = []
    labels = []
    for data in train_dataloader:
        x, y = data["input"], data["label"]
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        preds.append(torch.argmax(y_pred, dim=1).numpy())
        labels.append(y.numpy())    
    
    preds = torch.tensor(np.concatenate(preds))
    labels = torch.tensor(np.concatenate(labels))
    print('train loss:', total_train_loss / len(train_dataloader), 'train acc:', float(accuracy(preds, labels)))
    
    # valid
    model.eval()
    total_valid_loss = 0.0
    preds = []
    labels = []
    for data in valid_dataloader:
        x, y = data["input"], data["label"]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        total_valid_loss += loss.item()
        
        preds.append(torch.argmax(y_pred, dim=1).numpy())
        labels.append(y.numpy())
    
    preds = torch.tensor(np.concatenate(preds))
    labels = torch.tensor(np.concatenate(labels))
    print('valid loss:', total_valid_loss / len(valid_dataloader), 'valid acc:', float(accuracy(preds, labels)))
    
# test
preds = []
labels = []
for data in test_dataloader:
    x, y = data["input"], data["label"]
    y_pred = model(x)
    
    preds.append(torch.argmax(y_pred, dim=1).numpy())
    labels.append(y.numpy())

preds = torch.tensor(np.concatenate(preds))
labels = torch.tensor(np.concatenate(labels))
print('test acc:', float(accuracy(preds, labels)))

epochs: 0
train loss: 1.3070982587446962 train acc: 0.42016491293907166
valid loss: 1.2693372653878254 valid acc: 0.4947526156902313
epochs: 1
train loss: 1.238513960166537 train acc: 0.524550199508667
valid loss: 1.2267499137139213 valid acc: 0.5269864797592163
epochs: 2
train loss: 1.2011945720488342 train acc: 0.5493815541267395
valid loss: 1.204099293323471 valid acc: 0.5344827771186829
epochs: 3
train loss: 1.1750376168442989 train acc: 0.5715892314910889
valid loss: 1.1853234325123692 valid acc: 0.552473783493042
epochs: 4
train loss: 1.1518567335335301 train acc: 0.595108687877655
valid loss: 1.1730596088487348 valid acc: 0.5622189044952393
epochs: 5
train loss: 1.1289590145370503 train acc: 0.6212518811225891
valid loss: 1.1451826036601946 valid acc: 0.6019490361213684
epochs: 6
train loss: 1.1044113980843566 train acc: 0.6447713375091553
valid loss: 1.1197424205257438 valid acc: 0.6259370446205139
epochs: 7
train loss: 1.078777679170983 train acc: 0.668571949005127
valid loss: