# Imports

In [1]:
# !pip install numpy
# !pip install torch
# !pip install sklearn
# !pip install pickle

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, TensorDataset, DataLoader

import numpy as np
import math
import os.path
from collections import defaultdict
import pickle

from sklearn.metrics import classification_report

## Ensure reproducibility
Use a fixed seed such that all steps and results can be reproduced.

In [2]:
SEED = 544

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Credit: From PyTorch's documentation
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Data Loading

In [3]:
def read_data(fname, test_dataset=False):
    sentences = []
    with open(fname, 'r') as f:
        lines = [line.strip() for line in f.readlines()]
        sentence_words = []
        sentence_tags = []
        for line in lines:
            if line:
                # test data has only index and word
                if test_dataset:
                    index, word = line.split()
                    sentence_words.append(word)
                # train/dev data has index, word, and tag
                else:
                    index, word, tag = line.split()
                    sentence_words.append(word)
                    sentence_tags.append(tag)
            else:
                # Create a sentence upon reaching an empty new line
                if test_dataset:
                    sentences.append(sentence_words)
                else:
                    sentences.append((sentence_words, sentence_tags))
                sentence_words = []
                sentence_tags = []
        # Create a sentence for the last sentence in the document
        # incase it missed a newline in the document at the end
        if len(sentence_words) > 0:
            if test_dataset:
                sentences.append(sentence_words)
            else:
                sentences.append((sentence_words, sentence_tags))
    return sentences

## Read all datasets

In [4]:
# Read all datasets given
train_data = read_data('data/train')
dev_data = read_data('data/dev')
test_data = read_data('data/test', test_dataset=True)

## Utility functions to process data

In [5]:
def prepare_sequence(seq, to_ix, use_unk=False):
    if use_unk:
        indices = [to_ix[w] if w in to_ix else to_ix['<UNK>'] for w in seq]
    else:
        indices = [to_ix[w] for w in seq]
    return indices

def get_spelling_feature(sentence):
    result = []
    for word in sentence:
        # PAD = 0
        if word == '<PAD>':
            result.append(0)
        ## ALL LOWER = 1
        elif word.islower():
            result.append(1)
        # ALL UPPER = 2
        elif word.isupper():
            result.append(2)
        # FIRST UPPER = 3
        elif word[0].isupper():
            result.append(3)
        # OTHERS = 4
        else:
            result.append(4)
    return result

## Dataset Implementation

In [6]:
class NERDataset(Dataset):
    def __init__(self, data):
        # Retrieves longest sentence, for padding
        max_sentence_len = max([len(sentence) for sentence, tags in data])
        self.X = []
        self.X_original = []
        self.y = []
        self.X_spelling = []
        
        for sentence, tags in data:
            # Pad the sentences to the same length
            padded_sentence = sentence.copy()
            padded_tags = tags.copy()
            while len(padded_sentence) < max_sentence_len:
                padded_sentence.append('<PAD>')
                padded_tags.append('<PAD>')
            # Convert to indices
            transformed_sentence = prepare_sequence(padded_sentence, word_to_ix, use_unk=True)
            transformed_tags = prepare_sequence(padded_tags, tag_to_ix)
            # Get spelling indices
            spelling_sentence = get_spelling_feature(padded_sentence)
            # Add to dataset
            self.X.append(transformed_sentence)
            self.X_original.append(padded_sentence)
            self.y.append(transformed_tags)
            self.X_spelling.append(spelling_sentence)
            
        self.X = torch.from_numpy(np.array(self.X, dtype=np.int64)).to(device)
        self.y = torch.from_numpy(np.array(self.y, dtype=np.int64)).to(device)
        self.X_spelling = torch.from_numpy(np.array(self.X_spelling, dtype=np.int64)).to(device)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.y[index], self.X_original[index], self.X_spelling[index]

# Task 1: Simple Bidirectional LSTM model

## Hyperparameters

In [7]:
BATCH_SIZE = 1

VOCAB_THRESHOLD = 3

EMBEDDING_DIM = 100
LSTM_HIDDEN_DIM = 256
LSTM_DROPOUT = 0.33
LINEAR_DIM = 128

LEARNING_RATE = 0.2
MOMENTUM = 0.9

ELU_ALPHA = 0.5

SCHEDULER_STEP_SIZE = 5
SCHEDULER_GAMMA = 0.5

NUM_EPOCHS = 20

## Generate vocab and word/tag -> index, and index -> word/tag

In [8]:
# Generate vocab
words_freq = defaultdict(int)
for sentence, tags in train_data:
    for word in sentence:
        words_freq[word] += 1
        
vocab = {key for key, val in words_freq.items() if val >= VOCAB_THRESHOLD}

# Generate word/tag to index mappings
word_to_ix = {'<PAD>': 0, '<UNK>': 1}
tag_to_ix = {'<PAD>': 0}
for sentence, tags in train_data:
    for word in sentence:
        if word not in vocab:
            word = '<UNK>'
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)
            
# Generate index to word/tag mappings
ix_to_word = {v: k for k, v in word_to_ix.items()}
ix_to_tag = {v: k for k, v in tag_to_ix.items()}

# Calculate the size of vocabulary & tags
VOCAB_SIZE = len(word_to_ix)
TAGS_SIZE = len(tag_to_ix)

## Bidirectional LSTM Model with random embeddings

In [9]:
class BLSTM1(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, linear_dim, tags_size, lstm_dropout, elu_alpha):
        super(BLSTM1, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=word_to_ix['<PAD>'])
        self.dropout_pre_lstm = nn.Dropout(lstm_dropout)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout_post_lstm = nn.Dropout(lstm_dropout)
        self.linear = nn.Linear(hidden_dim * 2, linear_dim)
        self.elu = nn.ELU(alpha=elu_alpha)
        self.linear2 = nn.Linear(linear_dim, tags_size)
    
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout_pre_lstm(x)
        
        h0 = torch.zeros(2, x.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(2, x.size(0), self.hidden_dim).to(device)
        out, _ = self.lstm(x, (h0, c0))
        
        out = self.dropout_post_lstm(out)
        out = self.linear(out)
        out = self.elu(out)
        out = self.linear2(out)
    
        return out

## Utility Functions for Prediction

In [10]:
# Used to predict on a development data loader
# Writes the output to a file, i.e. to dev.out
def predict_dev1(model, data_loader, fname):
    outputs = []
    model.eval()
    with torch.no_grad():
        for X, y, X_original, X_spelling in data_loader:
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()

            idx = 1
            output = []
            for i in range(len(y_pred_flat)):
                word = X_original[i][0]
                pred = ix_to_tag[y_pred_flat[i]]
                if word == '<PAD>':
                    break
                output.append((idx, word, pred))
                idx += 1
            outputs.append(output)

    with open(fname, 'w') as f:
        for i in range(len(outputs)):
            for j in range(len(outputs[i])):
                idx, word, pred = outputs[i][j]
                f.write(f'{idx} {word} {pred}\n')
            if i != len(outputs)-1:
                f.write('\n')

# Used to predict on a test data, list of sentences
# Writes the output to a file, i.e. to test.out
def predict_test1(model, sentences, fname):
    outputs = []
    model.eval()
    with torch.no_grad():
        for sentence in sentences:
            spelling_sentence = [get_spelling_feature(sentence)]
            spelling_sentence = torch.from_numpy(np.array(spelling_sentence, dtype=np.int64)).to(device)
            
            transformed_sentence = [prepare_sequence(sentence, word_to_ix, use_unk=True)]
            transformed_sentence = torch.from_numpy(np.array(transformed_sentence, dtype=np.int64)).to(device)
            
            y_pred_scores = model(transformed_sentence)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()

            idx = 1
            output = []
            for i in range(len(y_pred_flat)):
                word = sentence[i]
                pred = ix_to_tag[y_pred_flat[i]]
                if word == '<PAD>':
                    break
                output.append((idx, word, pred))
                idx += 1
            outputs.append(output)

    with open(fname, 'w') as f:
        for i in range(len(outputs)):
            for j in range(len(outputs[i])):
                idx, word, pred = outputs[i][j]
                f.write(f'{idx} {word} {pred}\n')
            if i != len(outputs)-1:
                f.write('\n')
                
# Used to predict on a development data loader
# Writes statistics to console
def predict1(model, data_loader, message):
    all_y = []
    all_y_pred = []
    model.eval()
    with torch.no_grad():
        for X, y, X_original, X_spelling in data_loader:
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()
            y_flat = torch.flatten(y).tolist()
            
            for i in range(len(y_pred_flat)):
                if y_flat[i] == tag_to_ix['<PAD>']:
                    break
                all_y.append(y_flat[i])
                all_y_pred.append(y_pred_flat[i])

    print(message, classification_report(all_y, all_y_pred))
                
# Used to predict on a development data loader
# Writes the output to a file for PERL script, i.e. to prediction.txt
def predict_perl1(model, data_loader, fname):
    outputs = []
    model.eval()
    with torch.no_grad():
        for X, y, X_original, X_spelling in data_loader:
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()
            y_flat = torch.flatten(y).tolist()

            idx = 1
            output = []
            for i in range(len(y_pred_flat)):
                word = X_original[i][0]
                gold = ix_to_tag[y_flat[i]]
                pred = ix_to_tag[y_pred_flat[i]]
                if word == '<PAD>':
                    break
                output.append((idx, word, gold, pred))
                idx += 1
            outputs.append(output)

    with open(fname, 'w') as f:
        for i in range(len(outputs)):
            for j in range(len(outputs[i])):
                idx, word, gold, pred = outputs[i][j]
                f.write(f'{idx} {word} {gold} {pred}\n')
            if i != len(outputs)-1:
                f.write('\n')

## Load data

In [11]:
train_dataset = NERDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

dev_dataset = NERDataset(dev_data)
dev_loader = DataLoader(dev_dataset, batch_size=1, shuffle=False)

## Create model

In [12]:
model = BLSTM1(VOCAB_SIZE, EMBEDDING_DIM, LSTM_HIDDEN_DIM, LINEAR_DIM, TAGS_SIZE, LSTM_DROPOUT, ELU_ALPHA).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE, gamma=SCHEDULER_GAMMA)

In [13]:
model

BLSTM1(
  (embeddings): Embedding(8129, 100, padding_idx=0)
  (dropout_pre_lstm): Dropout(p=0.33, inplace=False)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (dropout_post_lstm): Dropout(p=0.33, inplace=False)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=0.5)
  (linear2): Linear(in_features=128, out_features=10, bias=True)
)

## Train and export model

In [14]:
%%time

if os.path.isfile('blstm1.pt'):
    print('Task 1', 'blstm1.pt exists. Loading existing model...')
    model = torch.load('blstm1.pt')
    model.to(device)
else:
    print('Task 1', 'blstm1.pt does not exist. Training a new model...')
    total_loss = []
    for epoch in range(NUM_EPOCHS):
        model.train()
        for i, (X, y, X_original, X_spelling) in enumerate(train_loader):
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X)
            y_pred = torch.flatten(y_pred_scores, start_dim=0, end_dim=1)
            y = torch.flatten(y)
            
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss.append(loss.item())
        print(f'Epoch {epoch+1} / {NUM_EPOCHS}, training loss: {np.average(total_loss):.5f}, learning rate: {optimizer.param_groups[0]["lr"]:.5f}')
        total_loss = []
        scheduler.step()
        if epoch == 0 or (epoch+1) % 5 == 0:
            predict1(model, dev_loader, f'Epoch {epoch+1} / {NUM_EPOCHS}')
    torch.save(model, 'blstm1.pt')

# Prediction for all cases (dev, test, and dev for perl)
predict_perl1(model, dev_loader, 'prediction1.txt')
predict_dev1(model, dev_loader, 'dev1.out')
predict_test1(model, test_data, 'test1.out')

Task 1 blstm1.pt does not exist. Training a new model...
Epoch 1 / 20, training loss: 0.06235, learning rate: 0.20000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1 / 20               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.85      0.23      0.36      1341
           2       0.91      1.00      0.95     42975
           3       0.85      0.43      0.57       922
           4       0.88      0.41      0.56      1842
           5       0.82      0.46      0.59      1307
           6       0.90      0.52      0.66      1837
           7       0.52      0.34      0.41       751
           8       0.85      0.37      0.52       346
           9       0.70      0.53      0.60       257

    accuracy                           0.90     51578
   macro avg       0.73      0.43      0.52     51578
weighted avg       0.89      0.90      0.88     51578

Epoch 2 / 20, training loss: 0.03990, learning rate: 0.20000
Epoch 3 / 20, training loss: 0.03286, learning rate: 0.20000
Epoch 4 / 20, training loss: 0.02879, learning rate: 0.20000
Epoch 5 / 20, training loss: 0.02618, learnin

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 5 / 20               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.86      0.63      0.73      1341
           2       0.96      0.99      0.98     42975
           3       0.91      0.69      0.79       922
           4       0.88      0.74      0.80      1842
           5       0.74      0.91      0.82      1307
           6       0.92      0.74      0.82      1837
           7       0.86      0.70      0.77       751
           8       0.85      0.59      0.69       346
           9       0.85      0.72      0.78       257

    accuracy                           0.95     51578
   macro avg       0.78      0.67      0.72     51578
weighted avg       0.95      0.95      0.94     51578

Epoch 6 / 20, training loss: 0.02153, learning rate: 0.10000
Epoch 7 / 20, training loss: 0.01961, learning rate: 0.10000
Epoch 8 / 20, training loss: 0.01846, learning rate: 0.10000
Epoch 9 / 20, training loss: 0.01747, learnin

## Exporting structs for prediction

In [15]:
with open('word_to_ix_1.pkl', 'wb') as f:
    pickle.dump(word_to_ix, f)
    
with open('tag_to_ix_1.pkl', 'wb') as f:
    pickle.dump(tag_to_ix, f)

# Task 2: Using GloVe word embeddings

## Hyperparameters

In [16]:
BATCH_SIZE = 1

EMBEDDING_DIM = 100
LSTM_HIDDEN_DIM = 256
LSTM_DROPOUT = 0.33
LINEAR_DIM = 128

LEARNING_RATE = 0.3
MOMENTUM = 0.9

ELU_ALPHA = 0.5

SCHEDULER_STEP_SIZE = 5
SCHEDULER_GAMMA = 0.5

NUM_EPOCHS = 20

SPELLING_EMBEDDING_DIM = 20

## Generate vocab and word/tag -> index, and index -> word/tag, also load GloVe

In [17]:
embeddings_dict = {}
vocab = set(['<PAD>', '<UNK>'])

with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

for sentence, tags in train_data:
    vocab.update(sentence)
for sentence, tags in dev_data:
    vocab.update(sentence)
for sentence in test_data:
    vocab.update(sentence)

word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {v: k for k, v in word_to_ix.items()}

embedding_matrix = np.zeros((len(vocab), EMBEDDING_DIM))
for word in vocab:
    index = word_to_ix[word]
    if word in embeddings_dict:
        vector = embeddings_dict[word]
    elif word.lower() in embeddings_dict:
        vector = embeddings_dict[word.lower()]
    else:
        vector = np.random.rand(EMBEDDING_DIM)
    embedding_matrix[index] = vector

VOCAB_SIZE = len(word_to_ix)

## Bidirectional LSTM Model with GloVe embeddings

In [18]:
class BLSTM2(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, linear_dim, tags_size, lstm_dropout, elu_alpha, embeddings, spelling_embedding_dim):
        super(BLSTM2, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.embeddings_word = nn.Embedding.from_pretrained(torch.from_numpy(embeddings).float(), freeze=False, padding_idx=word_to_ix['<PAD>'])
        self.embeddings_spelling = nn.Embedding(num_embeddings=5, embedding_dim=spelling_embedding_dim, padding_idx=0)
        self.dropout_pre_lstm = nn.Dropout(lstm_dropout)
        self.lstm = nn.LSTM(embedding_dim+spelling_embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout_post_lstm = nn.Dropout(lstm_dropout)
        self.linear = nn.Linear(hidden_dim * 2, linear_dim)
        self.elu = nn.ELU(alpha=elu_alpha)
        self.linear2 = nn.Linear(linear_dim, tags_size)
    
    def forward(self, x_word, x_spelling):
        x1 = self.embeddings_word(x_word)
        x2 = self.embeddings_spelling(x_spelling)
        x = torch.cat((x1, x2), dim=2).to(device)
        x = self.dropout_pre_lstm(x)
        
        h0 = torch.zeros(2, x.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(2, x.size(0), self.hidden_dim).to(device)
        out, _ = self.lstm(x, (h0, c0))
        
        out = self.dropout_post_lstm(out)
        out = self.linear(out)
        out = self.elu(out)
        out = self.linear2(out)
    
        return out

## Utility Functions for Prediction

In [19]:
# Used to predict on a development data loader
# Writes the output to a file, i.e. to dev.out
def predict_dev2(model, data_loader, fname):
    outputs = []
    model.eval()
    with torch.no_grad():
        for X, y, X_original, X_spelling in data_loader:
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X, X_spelling)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()

            idx = 1
            output = []
            for i in range(len(y_pred_flat)):
                word = X_original[i][0]
                pred = ix_to_tag[y_pred_flat[i]]
                if word == '<PAD>':
                    break
                output.append((idx, word, pred))
                idx += 1
            outputs.append(output)

    with open(fname, 'w') as f:
        for i in range(len(outputs)):
            for j in range(len(outputs[i])):
                idx, word, pred = outputs[i][j]
                f.write(f'{idx} {word} {pred}\n')
            if i != len(outputs)-1:
                f.write('\n')

# Used to predict on a test data, list of sentences
# Writes the output to a file, i.e. to test.out
def predict_test2(model, sentences, fname):
    outputs = []
    model.eval()
    with torch.no_grad():
        for sentence in sentences:
            spelling_sentence = [get_spelling_feature(sentence)]
            spelling_sentence = torch.from_numpy(np.array(spelling_sentence, dtype=np.int64)).to(device)
            
            transformed_sentence = [prepare_sequence(sentence, word_to_ix, use_unk=True)]
            transformed_sentence = torch.from_numpy(np.array(transformed_sentence, dtype=np.int64)).to(device)
            
            y_pred_scores = model(transformed_sentence, spelling_sentence)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()

            idx = 1
            output = []
            for i in range(len(y_pred_flat)):
                word = sentence[i]
                pred = ix_to_tag[y_pred_flat[i]]
                if word == '<PAD>':
                    break
                output.append((idx, word, pred))
                idx += 1
            outputs.append(output)

    with open(fname, 'w') as f:
        for i in range(len(outputs)):
            for j in range(len(outputs[i])):
                idx, word, pred = outputs[i][j]
                f.write(f'{idx} {word} {pred}\n')
            if i != len(outputs)-1:
                f.write('\n')
                
# Used to predict on a development data loader
# Writes statistics to console
def predict2(model, data_loader, message):
    all_y = []
    all_y_pred = []
    model.eval()
    with torch.no_grad():
        for X, y, X_original, X_spelling in data_loader:
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X, X_spelling)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()
            y_flat = torch.flatten(y).tolist()
            
            for i in range(len(y_pred_flat)):
                if y_flat[i] == tag_to_ix['<PAD>']:
                    break
                all_y.append(y_flat[i])
                all_y_pred.append(y_pred_flat[i])

    print(message, classification_report(all_y, all_y_pred))
                
# Used to predict on a development data loader
# Writes the output to a file for PERL script, i.e. to prediction.txt
def predict_perl2(model, data_loader, fname):
    outputs = []
    model.eval()
    with torch.no_grad():
        for X, y, X_original, X_spelling in data_loader:
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X, X_spelling)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()
            y_flat = torch.flatten(y).tolist()

            idx = 1
            output = []
            for i in range(len(y_pred_flat)):
                word = X_original[i][0]
                gold = ix_to_tag[y_flat[i]]
                pred = ix_to_tag[y_pred_flat[i]]
                if word == '<PAD>':
                    break
                output.append((idx, word, gold, pred))
                idx += 1
            outputs.append(output)

    with open(fname, 'w') as f:
        for i in range(len(outputs)):
            for j in range(len(outputs[i])):
                idx, word, gold, pred = outputs[i][j]
                f.write(f'{idx} {word} {gold} {pred}\n')
            if i != len(outputs)-1:
                f.write('\n')

## Load data

In [20]:
train_dataset = NERDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

dev_dataset = NERDataset(dev_data)
dev_loader = DataLoader(dev_dataset, batch_size=1, shuffle=False)

## Create model

In [21]:
model = BLSTM2(VOCAB_SIZE, EMBEDDING_DIM, LSTM_HIDDEN_DIM, LINEAR_DIM, TAGS_SIZE, LSTM_DROPOUT, ELU_ALPHA,
               embedding_matrix, SPELLING_EMBEDDING_DIM).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE, gamma=SCHEDULER_GAMMA)

In [22]:
model

BLSTM2(
  (embeddings_word): Embedding(30292, 100, padding_idx=25957)
  (embeddings_spelling): Embedding(5, 20, padding_idx=0)
  (dropout_pre_lstm): Dropout(p=0.33, inplace=False)
  (lstm): LSTM(120, 256, batch_first=True, bidirectional=True)
  (dropout_post_lstm): Dropout(p=0.33, inplace=False)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=0.5)
  (linear2): Linear(in_features=128, out_features=10, bias=True)
)

## Train and export model

In [23]:
%%time

if os.path.isfile('blstm2.pt'):
    print('Task 2', 'blstm2.pt exists. Loading existing model...')
    model = torch.load('blstm2.pt')
    model.to(device)
else:
    print('Task 2', 'blstm2.pt does not exist. Training a new model...')
    total_loss = []
    for epoch in range(NUM_EPOCHS):
        model.train()
        for i, (X, y, X_original, X_spelling) in enumerate(train_loader):
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X, X_spelling)
            y_pred = torch.flatten(y_pred_scores, start_dim=0, end_dim=1)
            y = torch.flatten(y)
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss.append(loss.item())
        print(f'Epoch {epoch+1} / {NUM_EPOCHS}, training loss: {np.average(total_loss):.5f}, learning rate: {optimizer.param_groups[0]["lr"]:.5f}')
        total_loss = []
        scheduler.step()
        if epoch == 0 or (epoch+1) % 5 == 0:
            predict2(model, dev_loader, f'Epoch {epoch+1} / {NUM_EPOCHS}')
    torch.save(model, 'blstm2.pt')

# Prediction for all cases (dev, test, and dev for perl)
predict_perl2(model, dev_loader, 'prediction2.txt')
predict_dev2(model, dev_loader, 'dev2.out')
predict_test2(model, test_data, 'test2.out')

Task 2 blstm2.pt does not exist. Training a new model...
Epoch 1 / 20, training loss: 0.01967, learning rate: 0.30000
Epoch 1 / 20               precision    recall  f1-score   support

           1       0.87      0.86      0.87      1341
           2       0.99      1.00      0.99     42975
           3       0.94      0.78      0.85       922
           4       0.91      0.97      0.94      1842
           5       0.97      0.97      0.97      1307
           6       0.92      0.94      0.93      1837
           7       0.82      0.74      0.78       751
           8       0.93      0.51      0.66       346
           9       0.80      0.82      0.81       257

    accuracy                           0.98     51578
   macro avg       0.91      0.84      0.87     51578
weighted avg       0.98      0.98      0.98     51578

Epoch 2 / 20, training loss: 0.01013, learning rate: 0.30000
Epoch 3 / 20, training loss: 0.00772, learning rate: 0.30000
Epoch 4 / 20, training loss: 0.00624, lear

## Exporting structs for prediction

In [24]:
with open('word_to_ix_2.pkl', 'wb') as f:
    pickle.dump(word_to_ix, f)
    
with open('tag_to_ix_2.pkl', 'wb') as f:
    pickle.dump(tag_to_ix, f)