In [1]:
import numpy as np
import random
import torch
import os
import spacy
from torchtext.vocab import GloVe, FastText
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

dataset_file = open("./Assignment4aDataset.txt", "r").read()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def set_seed(seed = 42):
    '''
        For Reproducibility: Sets the seed of the entire notebook.
    '''

    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
    # Sets a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(1)

In [3]:
dataset_lines = dataset_file.splitlines()

dataset = []
for line in dataset_lines:
    x, y = line.split(',')
    x, y = x.lower().strip()[1:-1], y.lower().strip()[1:-1]
    
    if '/' in x:
        x = x.split('/')
    else:
        x = x.split(' ')
        
    dataset.append((x, y))

In [45]:
dataset_preprocessed = []

months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

def find_abbr(word, sets):
    for idx, item in enumerate(sets):
        if word in item:
            return idx
    return -1

for x, y in dataset:

    x_prepro = []
    for word in x:
        x_prepro.append(word)

    labels = []
    for label in y:
        if label == '-':
            labels.append(10)
        else:
            labels.append(int(label))
    dataset_preprocessed.append((x_prepro, labels))

In [46]:
vocabulary = set()
vocab_to_ids = {}
ids = 1
for x, y in dataset_preprocessed:
    print(x, y)
    for i in x:
        vocabulary.add(i)
        if i not in vocab_to_ids:
            vocab_to_ids[i] = ids
            ids += 1
vocab_to_ids['unk'] = ids

['may', '20', '2034'] [2, 0, 3, 4, 10, 0, 5, 10, 2, 0]
['9', 'may', '1630'] [1, 6, 3, 0, 10, 0, 5, 10, 0, 9]
['15', '03', '2014'] [2, 0, 1, 4, 10, 0, 3, 10, 1, 5]
['mar', '16', '1675'] [1, 6, 7, 5, 10, 0, 3, 10, 1, 6]
['jun', '16', '1640'] [1, 6, 4, 0, 10, 0, 6, 10, 1, 6]
['friday', '1791', '2', '09'] [1, 7, 9, 1, 10, 0, 9, 10, 0, 2]
['wed', '1776', '11', 'september'] [1, 7, 7, 6, 10, 0, 9, 10, 1, 1]
['1833', '9', 'jun'] [1, 8, 3, 3, 10, 0, 6, 10, 0, 9]
['sun', '26', 'oct', '1788'] [1, 7, 8, 8, 10, 1, 0, 10, 2, 6]
['1685', '30', 'oct'] [1, 6, 8, 5, 10, 1, 0, 10, 3, 0]
['12', '17', '51'] [1, 7, 5, 1, 10, 1, 2, 10, 1, 7]
['october', '21', '2051'] [2, 0, 5, 1, 10, 1, 0, 10, 2, 1]
['jul', '11', '1562'] [1, 5, 6, 2, 10, 0, 7, 10, 1, 1]
['29', 'april', '1979'] [1, 9, 7, 9, 10, 0, 4, 10, 2, 9]
['11', '23', '13'] [1, 8, 1, 3, 10, 1, 1, 10, 2, 3]
['6', 'december', '1802'] [1, 8, 0, 2, 10, 1, 2, 10, 0, 6]
['sep', '23', '2066'] [2, 0, 6, 6, 10, 0, 9, 10, 2, 3]
['nov', '6', '1567'] [1, 5, 6, 7, 10

In [47]:
dataset_corpus = []
for x, y in dataset_preprocessed:
    x_ids = []
    for i in x:
        x_ids.append(vocab_to_ids[i])
    dataset_corpus.append((x_ids, y))

In [48]:
print(len(vocabulary))
print(vocabulary)
#print(vocab_to_ids)

697
{'1900', '1829', '05', '2066', '1934', 'february', '1707', '1572', '1675', '83', '1679', '06', '2067', '1604', '1550', '1670', '66', '1816', '1745', '1590', '1697', '1723', '1', '2035', '1944', 'jul', '1615', '1772', '1891', '1953', '90', '2021', '1729', '84', '1702', '2055', '1639', '2014', '1938', '1894', '30', 'august', '1668', '2044', '1730', 'feb', '1920', 'september', '1994', '2039', '1896', '2030', '1845', '1569', '79', '1665', '5', '1940', 'tuesday', '1827', '71', '1978', '1625', '89', '1853', '2046', '1578', '43', '26', '2027', '1677', '1985', '1991', '1523', '1824', '1810', '2060', '1647', '2036', '1649', '1783', '1960', '1737', '2050', '1563', '1609', '1603', '1727', '1594', '16', 'oct', 'mar', '2065', '1946', '1736', '1769', '1567', '1614', '1629', '1997', '91', '1534', '1875', '1710', '19', '1753', '1581', '1577', '1701', '59', '1556', '1794', '1546', '1801', '1886', '2040', '1735', '1952', '1524', '1846', '86', '1643', '1850', 'wednesday', '1787', '1808', '1803', '197

In [49]:
# Downloads GloVe and FastText
global_vectors = GloVe(name='840B', dim=300)

emb_dim = 300
embeds = torch.zeros(len(vocab_to_ids) + 1, emb_dim)

for token, idx in vocab_to_ids.items():
    embeds[idx] = global_vectors[token] 

In [50]:
# Train-Valid split of 80-20
def split_indices(n, val_pct):

    # Determine size of Validation set
    n_val = int(val_pct * n)

    # Create random permutation of 0 to n-1
    idxs = np.random.permutation(n)
    return idxs[n_val:], idxs[:n_val]

train_indices, val_indices = split_indices(len(dataset_corpus), 0.2)

In [58]:
from torch.nn.utils.rnn import pad_sequence

# ----------- Batching the data -----------
def collate_fn(instn):
    sentence = [torch.Tensor(x[0]) for x in instn]
    labels = torch.Tensor([x[1] for x in instn])

    padded_sent = pad_sequence(sentence, batch_first=True, padding_value=0)

    l = torch.cat((labels[:, 0], labels[:, 1],labels[:, 2],labels[:, 3],labels[:, 4],labels[:, 5],labels[:, 6],labels[:, 7],labels[:, 8],labels[:, 9]), dim = 0)

    return (padded_sent.long(), l.long())


batch_size = 64

train_sampler   = SubsetRandomSampler(train_indices)
train_loader    = DataLoader(dataset_corpus, batch_size, sampler=train_sampler, collate_fn=collate_fn)

val_sampler     = SubsetRandomSampler(val_indices)
val_loader      = DataLoader(dataset_corpus, batch_size, sampler=val_sampler, collate_fn=collate_fn)

In [59]:
if torch.cuda.is_available():
    device = torch.device("cuda:1")
else:
    device = torch.device("cpu")

In [60]:
# ----------- Encoder -----------
class Encoder(nn.Module):
    def __init__(self, embeds, rnn_type):
        super().__init__()

        self.embeddings = nn.Embedding.from_pretrained(embeds, padding_idx=0, freeze=False)
        self.dropout = nn.Dropout(0.2)

        if rnn_type == 'gru':
            self.lstm = nn.GRU(input_size = 300, hidden_size = 128, num_layers = 1, batch_first = True, bidirectional = True)
        else:
            self.lstm = nn.LSTM(input_size = 300, hidden_size = 128, num_layers = 1, batch_first = True, bidirectional = True)
    
    def forward(self, Xb):

        x = self.embeddings(Xb)
        x = self.dropout(x)
        out, h = self.lstm(x)

        return out              # B x Seq_len x 128

# ----------- Decoder -----------
class Decoder(nn.Module):
    def __init__(self, rnn_type):
        super().__init__()

        #self.input_combine = nn.Linear(267, 200)

        if rnn_type == 'gru':
            self.gru = nn.GRUCell(input_size = 267, hidden_size = 128)
        else:
            self.gru = nn.LSTMCell(input_size = 256, hidden_size = 128)

        self.rnn_type = rnn_type
            
        self.lin1 = nn.Linear(128, 64)
        self.lin2 = nn.Linear(64, 11)
        
    def forward(self, Cb, sp, y):
        """ Cb - B x 384
            sp - B x seq_len x 128
            y - B x 11
        """

        if self.rnn_type == 'gru':          # TODO: Teacher enforcing
            input = torch.cat((y, Cb), dim = 1) 
            sp = sp[:, 0, :].squeeze(dim=1) 
            #input = self.input_combine(input)
            s = self.gru(input, sp)
        else:
            s = self.gru(Cb)[0]
        x = self.lin1(s)
        x = F.relu(x)
        x = self.lin2(x)

        return x, s

class Translator(nn.Module):
    
    def __init__(self, embeds, rnn_type = 'gru'):
        super().__init__()

        self.encoder = Encoder(embeds, rnn_type)
        self.attention = nn.Linear(384, 1)   # TODO: Make it 32, 384
        self.decoder = Decoder(rnn_type)

        self.initial_hidden = nn.Linear(256, 128)

        self.transform_hidden_dec = nn.Linear(128, 64)
        self.transform_hidden_enc = nn.Linear(256, 128)
        self.energy = nn.Linear(128, 64)

    def forward(self, Xb):
        
        # Batch size, Sequence Length
        B, seq_len = Xb.shape[0], Xb.shape[1]

        # Hidden vectors
        H = self.encoder(Xb)                                      # B x seq_len x (128 * 2)
        s = self.initial_hidden(H)                                # TODO: make it learnable  # B x seq_len x 128
        
        logits = torch.zeros(B, 11).to(device)
        for i in range(10):
            Hs = torch.cat((H, s), dim = 2)                         # B x seq_len x 384

            # Attention
            a = torch.softmax(self.attention(Hs).squeeze(), dim=1)  # B x seq_len         

            # Context Vectors
            c = torch.bmm(a.unsqueeze(dim = 1), H).squeeze(dim=1)   # B x (128*2)

            x, s = self.decoder(c, s, logits[B*i: B*(i+1)])          # B x 11
            logits = torch.cat((logits, x), dim = 0)                                
            s = s.unsqueeze(dim=1)
            s = s.repeat(1, seq_len, 1)
        
        return logits[B:]

In [61]:
def exact_match(y_hat, yb):
    B = yb.shape[0] // 10       # Batch size

    correct = 0
    for i in range(B):
        f = True
        for j in range(10):
            if yb[i+(j*B)] != y_hat[i+(j*B)]:
                f = False
                break 
        if f: correct += 1

    #correct = 0
    #for i in range(B*10):
    #    if yb[i] == y_hat[i]:
    #        correct += 1
    return correct, B

In [62]:
model = Translator(embeds, 'gru')
model.to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = F.cross_entropy

In [63]:
# ----------- Main Training Loop -----------
max_epoch = 10

best_test_acc = 0
for ep in range(max_epoch):

    epoch_loss = 0

    model.train()
    train_labels = []
    train_pred = []
    correct_train, B_train = 0, 0
    for xb, yb in tqdm(train_loader):
        xb = xb.to(device)
        yb = yb.to(device)

        y_hat = model(xb)
        loss = loss_fn(y_hat, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

        epoch_loss += float(loss)

        y_hat = torch.softmax(y_hat, dim = 1).argmax(dim=1)

        correct, B = exact_match(y_hat, yb)
        correct_train += correct
        B_train += B
        train_labels.extend(yb.cpu().detach().numpy())
        train_pred.extend(y_hat.cpu().detach().numpy())

    print("Epoch: ", ep+1, " Training Loss: ", epoch_loss/len(train_loader))
    print("Train accuracy: ", accuracy_score(train_labels, train_pred)*100)
    print("Train EM: ", (correct_train/ B_train)*100)


    #----------- Validation -----------

    val_labels = []
    val_pred = []

    model.eval()
    val_epoch_loss = 0
    correct_val, B_val = 0, 0
    with torch.no_grad():
        for xb, yb in tqdm(val_loader):
            xb = xb.to(device)
            yb = yb.to(device)

            y_hat = model(xb)
            loss = loss_fn(y_hat, yb)

            val_epoch_loss += float(loss)

            #print(torch.softmax(y_hat, dim = 1).argmax(dim=1))
            #print(yb)

            y_hat = torch.softmax(y_hat, dim = 1).argmax(dim=1)

            correct, B = exact_match(y_hat, yb)
            correct_val += correct
            B_val += B

            val_labels.extend(yb.cpu().detach().numpy())
            val_pred.extend(y_hat.cpu().detach().numpy())

    val_epoch_loss = val_epoch_loss / len(val_loader)
    print("Validation loss: ", val_epoch_loss)
    print("Validation accuracy: ", accuracy_score(val_labels, val_pred)*100)
    print("Validation EM: ", (correct_val/ B_val)*100)
    
    if ep == 5:
        min_val_loss = val_epoch_loss

    if ep > 4 and val_epoch_loss <= min_val_loss:
        print("---- Saving Model ----")
        torch.save(model.state_dict(), "best_model.pt")
        min_val_loss = val_epoch_loss

    if ep > 4 and prev_val_loss - val_epoch_loss > 0.05:
        print("---- Early Stopping ----")

    prev_val_loss = val_epoch_loss

100%|██████████| 500/500 [00:23<00:00, 20.91it/s]


Epoch:  1  Training Loss:  0.988590897321701
Train accuracy:  64.63
Train EM:  1.946875


100%|██████████| 125/125 [00:04<00:00, 29.24it/s]


Validation loss:  0.4282810208797455
Validation accuracy:  85.6125
Validation EM:  16.9625


100%|██████████| 500/500 [00:32<00:00, 15.35it/s]


Epoch:  2  Training Loss:  0.17722850377112628
Train accuracy:  95.00375
Train EM:  64.571875


100%|██████████| 125/125 [00:06<00:00, 18.77it/s]


Validation loss:  0.044910231128335
Validation accuracy:  99.20375
Validation EM:  94.7625


100%|██████████| 500/500 [00:35<00:00, 14.22it/s]


Epoch:  3  Training Loss:  0.03802499686926603
Train accuracy:  99.0690625
Train EM:  93.19062500000001


100%|██████████| 125/125 [00:06<00:00, 18.61it/s]


Validation loss:  0.018109417671337724
Validation accuracy:  99.58875
Validation EM:  96.78750000000001


100%|██████████| 500/500 [00:22<00:00, 22.47it/s]


Epoch:  4  Training Loss:  0.022551949093583972
Train accuracy:  99.3953125
Train EM:  95.2375


100%|██████████| 125/125 [00:03<00:00, 40.66it/s]


Validation loss:  0.014018317701295017
Validation accuracy:  99.64125
Validation EM:  96.96249999999999


100%|██████████| 500/500 [00:22<00:00, 21.86it/s]


Epoch:  5  Training Loss:  0.01828084863582626
Train accuracy:  99.4875
Train EM:  95.753125


100%|██████████| 125/125 [00:03<00:00, 39.78it/s]


Validation loss:  0.015107894940301776
Validation accuracy:  99.6175
Validation EM:  96.7


100%|██████████| 500/500 [00:20<00:00, 24.81it/s]


Epoch:  6  Training Loss:  0.01542006146814674
Train accuracy:  99.5421875
Train EM:  96.15625


100%|██████████| 125/125 [00:03<00:00, 39.91it/s]


Validation loss:  0.013731952973641454
Validation accuracy:  99.64625000000001
Validation EM:  97.0
---- Saving Model ----


100%|██████████| 500/500 [00:23<00:00, 21.24it/s]


Epoch:  7  Training Loss:  0.014909514673287048
Train accuracy:  99.5421875
Train EM:  96.11562500000001


100%|██████████| 125/125 [00:03<00:00, 41.49it/s]


Validation loss:  0.014850371601060033
Validation accuracy:  99.60875
Validation EM:  96.625


100%|██████████| 500/500 [00:20<00:00, 23.83it/s]


Epoch:  8  Training Loss:  0.053011570586822925
Train accuracy:  98.786875
Train EM:  92.07499999999999


100%|██████████| 125/125 [00:02<00:00, 44.75it/s]


Validation loss:  0.012340792116709054
Validation accuracy:  99.65625
Validation EM:  97.0125
---- Saving Model ----


100%|██████████| 500/500 [00:21<00:00, 23.32it/s]


Epoch:  9  Training Loss:  0.01279929273482412
Train accuracy:  99.609375
Train EM:  96.65


100%|██████████| 125/125 [00:05<00:00, 23.14it/s]


Validation loss:  0.011584546794183552
Validation accuracy:  99.64874999999999
Validation EM:  96.96249999999999
---- Saving Model ----


100%|██████████| 500/500 [00:21<00:00, 23.14it/s]


Epoch:  10  Training Loss:  0.011806430976488628
Train accuracy:  99.6134375
Train EM:  96.63125000000001


100%|██████████| 125/125 [00:02<00:00, 44.51it/s]


Validation loss:  0.013945432047825307
Validation accuracy:  99.51
Validation EM:  96.925


In [64]:
testset_file = open("./Assignment4aTestDataset.txt", "r").read()

testset_lines = testset_file.splitlines()

testset = []
for line in testset_lines:
    x, y = line.split(',')
    x, y = x.lower().strip()[1:-1], y.lower().strip()[1:-1]
    
    if '/' in x:
        x = x.split('/')
    else:
        x = x.split(' ')
        
    testset.append((x, y))

testset_preprocessed = []
for x, y in testset:

    x_prepro = []
    for word in x:
        x_prepro.append(word)

    labels = []
    for label in y:
        if label == '-':
            labels.append(10)
        else:
            labels.append(int(label))
    testset_preprocessed.append((x_prepro, labels))

testset_corpus = []
for x, y in testset_preprocessed:
    x_ids = []
    for i in x:
        x_ids.append(vocab_to_ids[i])
    testset_corpus.append((x_ids, y))

test_loader   = DataLoader(testset_corpus, batch_size, collate_fn=collate_fn, shuffle=True)

model = Translator(embeds, 'gru')
model.to(device)
model.load_state_dict(torch.load("best_model.pt"))
model.eval()
#----------- Test -----------

def exact_match(y_hat, yb):
    B = yb.shape[0] // 10       # Batch size

    correct = 0
    for i in range(B):
        f = True
        for j in range(10):
            if yb[i+(j*B)] != y_hat[i+(j*B)]:
                f = False
                break 
        if f: correct += 1

    output_correct = [0 for i in range(10)]
    for i in range(B):
        for j in range(10):
            if yb[i+(j*B)] == y_hat[i+(j*B)]:
                output_correct[j] += 1
    return correct, B, output_correct

val_labels = []
val_pred = []

val_epoch_loss = 0
correct_val, B_val = 0, 0
out_correct = [0 for i in range(10)]
with torch.no_grad():
    for xb, yb in tqdm(val_loader):
        xb = xb.to(device)
        yb = yb.to(device)

        y_hat = model(xb)
        loss = loss_fn(y_hat, yb)

        val_epoch_loss += float(loss)

        y_hat = torch.softmax(y_hat, dim = 1).argmax(dim=1)

        correct, B, output_correct = exact_match(y_hat, yb)
        correct_val += correct
        B_val += B
        for i in range(10):
            out_correct[i] += output_correct[i]

        val_labels.extend(yb.cpu().detach().numpy())
        val_pred.extend(y_hat.cpu().detach().numpy())

val_epoch_loss = val_epoch_loss / len(val_loader)
print("Test loss: ", val_epoch_loss)
print("Test accuracy: ", accuracy_score(val_labels, val_pred)*100)
print("Test EM: ", (correct_val/ B_val)*100)
for i in range(10):
    out_correct[i] = (out_correct[i] / B_val) * 100
print("Test accuracy position-wise: ", out_correct)

100%|██████████| 125/125 [00:08<00:00, 15.06it/s]

Test loss:  0.011584546882193536
Test accuracy:  99.64874999999999
Test EM:  96.96249999999999
Test accuracy position-wise:  [99.52499999999999, 97.3125, 99.9875, 99.7375, 100.0, 100.0, 99.9625, 100.0, 100.0, 99.9625]



