In [1]:
import torch
import torch.nn as nn
import pandas as pd
import pickle
import numpy as np
import json

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
# Fix the seed of both RNG sources used in this notebook so runs are repeatable.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# Use the first GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [1]:
# Download the pretrained model weights to the same path that is later passed
# to torch.load in this notebook.
# NOTE(review): `-O` writes to exactly this path, so the `glove_data/` directory
# presumably has to exist beforehand — confirm on a fresh checkout.
# NOTE(review): an interrupted download leaves a truncated file at that path;
# let this cell run to completion before loading the weights.
!wget https://storage.googleapis.com/postdata-models/stanzas/eval/saved_weights_glove_model.pt -O glove_data/saved_weights_glove_model.pt

--2021-01-21 16:27:58--  https://storage.googleapis.com/postdata-models/stanzas/eval/saved_weights_glove_model.pt
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.211.240, 216.58.209.80, 142.250.184.176, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.211.240|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 59799315 (57M) [application/octet-stream]
Saving to: ‘glove_data/saved_weights_glove_model.pt’

  glove_data/saved_  20%[===>                ]  11,94M  7,36MB/s               ^C


In [3]:
# Load the evaluation set, the token->index vocabulary and the pretrained
# embedding matrix. Every file is opened in a `with` block so its handle is
# closed as soon as the content has been read.
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open("glove_data/test_public.p", "rb") as test_file:
    test = pickle.load(test_file)
with open('glove_data/vocab2index-glove.json') as json_file:
    vocab2index = json.load(json_file)
with open("glove_data/glove_weights_verses.p", "rb") as weights_file:
    pretrained_weights = pickle.load(weights_file)

In [4]:
# Materialise the 'encoded' inputs and the 'label' targets of the test set as
# plain Python lists, in that order.
X_test, y_test = (list(test[column]) for column in ('encoded', 'label'))

In [5]:
class StanzasDataset(Dataset):
    """Map-style dataset pairing encoded samples with their labels.

    Each entry of ``X`` is indexed with ``[0]`` and ``[1]``; ``__getitem__``
    returns those two components with the matching label in between.
    """

    def __init__(self, X, Y):
        """Keep references to the samples ``X`` and the labels ``Y``."""
        self.X, self.y = X, Y

    def __len__(self):
        """Number of items, taken from the label collection."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(X[idx][0], y[idx], X[idx][1])`` for position ``idx``."""
        sample = self.X[idx]
        first, second = sample[0], sample[1]
        return first, self.y[idx], second

In [6]:
# Wrap the test samples and labels in the dataset defined above.
test_ds = StanzasDataset(X=X_test, Y=y_test)

In [7]:
# DataLoader defaults spelled out: one sample per batch, original order.
test_dl = DataLoader(dataset=test_ds, batch_size=1, shuffle=False)

In [8]:
def test_model(model, test_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in test_dl:
        x = x.long()
        y = y.long()
        x = x.to(device)
        y = y.to(device)
        y_hat = model(x, l.cpu())
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
    return correct/total

In [9]:
class BiLSTM(nn.Module):
    """Bidirectional multi-layer LSTM classifier on frozen pretrained embeddings.

    Args:
        vocab_size: number of rows of the embedding table (index 0 is the
            padding index).
        embedding_dim: width of one embedding vector.
        hidden_dim: LSTM hidden size per direction; also the input width of
            the output layer.
        glove_weights: pretrained embedding matrix copied into the embedding
            table; a numpy array or a tensor.
        num_classes: number of output scores per sample (default 46).
        num_layers: number of stacked LSTM layers (default 3).
        dropout: dropout applied between LSTM layers (default 0.2).

    The three keyword defaults reproduce the previously hard-coded
    architecture, so existing four-argument calls and saved state dicts keep
    working unchanged.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights,
                 num_classes=46, num_layers=3, dropout=0.2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Overwrite the random initialisation with the pretrained vectors;
        # as_tensor accepts numpy arrays as well as tensors.
        with torch.no_grad():
            self.embeddings.weight.copy_(torch.as_tensor(glove_weights))
        # Keep the pretrained embeddings frozen.
        self.embeddings.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,
                            num_layers=num_layers, dropout=dropout, bidirectional=True)
        self.linear = nn.Linear(hidden_dim, num_classes)

    def forward(self, text, text_length):
        """Return class scores for a batch of token-id sequences.

        Args:
            text: integer tensor of token ids, batch first.
            text_length: per-sample sequence lengths used for packing; the
                batch does not need to be sorted by length.

        Returns:
            Tensor with one row of ``num_classes`` scores per sample.
        """
        x = self.embeddings(text)
        x_pack = pack_padded_sequence(x, text_length, batch_first=True, enforce_sorted=False)
        _, (ht, ct) = self.lstm(x_pack)
        # NOTE(review): with bidirectional=True, ht[-1] should be the final
        # state of the last layer's reverse direction only (per the nn.LSTM
        # h_n layout — confirm). Kept as is: the output layer, and any weights
        # saved for it, expect an input of width hidden_dim.
        out = self.linear(ht[-1])
        return out

In [10]:
# Rebuild the architecture the checkpoint was saved from, then restore its
# weights. NOTE(review): the two 300s are the embedding and hidden sizes the
# checkpoint presumably expects, and the +1 on the vocabulary size presumably
# accounts for an index not present in vocab2index — confirm against training.
model = BiLSTM(len(vocab2index)+1, 300, 300, pretrained_weights)
model = model.to(device)
path = 'glove_data/saved_weights_glove_model.pt'
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [11]:
# Evaluate on the test set and report the accuracy as a plain Python float.
accuracy = test_model(model, test_dl)
print("accuracy: ", accuracy.item())

accuracy:  0.6863136887550354
