# **Read Me**

This ipynb has two main section:

1. for submission
2. below is testing logs

Please run the code in first section (for submission) sequently.

# **1. for submission**

In [1]:
# environment set up
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
from collections import Counter
import time
import math
import csv
import numpy as np
import spacy
from spacy.tokens import Doc
from sklearn.metrics import f1_score, precision_score, recall_score
import gensim.downloader as api
import warnings
warnings.filterwarnings('ignore')

torch.manual_seed(1)

<torch._C.Generator at 0x7fbf90d09490>

In [0]:
# download data set
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate
drive = None
def authenticate():
    global drive
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

#Download files
def downloadFiles(fileIds):
    authenticate()
    for fileId in fileIds:    
        downloaded = drive.CreateFile({"id": fileId[1]})
        downloaded.GetContentFile(fileId[0])

#Download file if not existing
try:
    train_data = pd.read_csv('train_upper.csv')
except:
    downloadFiles([['train_upper.csv', '1rFyBQzZK2CSwt0Sm11J2aEsxKh0qB4T6']])
    train_data = pd.read_csv('train_upper.csv')

try:
    val_data = pd.read_csv('val_upper.csv')
except:
    downloadFiles([['val_upper.csv', '1f0OP_gI5jwqDgeFyawZZ1tTJzqCTgo6X']])
    val_data = pd.read_csv('val_upper.csv')

try:
    test_data = pd.read_csv('test_upper.csv')
except:
    downloadFiles([['test_upper.csv', '1CQ3W5q4JhAB3s4j2XC17gVYWsQcAxvb4']])
    test_data = pd.read_csv('test_upper.csv')

In [0]:
# read data into list
sentence_train = train_data['Sentence'].tolist()
ner_train = train_data['NER'].tolist()

sentence_val = val_data['Sentence'].tolist()
ner_val = val_data['NER'].tolist()

sentence_test = test_data['Sentence'].tolist()

In [0]:
# defining nlp model
word_model = api.load("glove-wiki-gigaword-300")

# https://spacy.io/usage/linguistic-features#tokenization define our own spacy tokenizar by split()
class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

In [0]:
# find all possible POS tagging, dependence path tagging, tagging
word_to_ix, pos_table, dep_table, tag_table = {}, {}, {}, {}
for sentence in sentence_train:
    parse = nlp(sentence)
    for token in parse:
        if str(token) not in word_to_ix:
            word_to_ix[str(token)] = len(word_to_ix)
        if token.pos_ not in pos_table:
            pos_table[token.pos_] = len(pos_table)
        if token.dep_ not in dep_table:
            dep_table[token.dep_] = len(dep_table)
        if token.tag_ not in tag_table:
            tag_table[token.tag_] = len(tag_table)

for sentence in sentence_val:
    parse = nlp(sentence)
    for token in parse:
        if str(token) not in word_to_ix:
            word_to_ix[str(token)] = len(word_to_ix)
        if token.pos_ not in pos_table:
            pos_table[token.pos_] = len(pos_table)
        if token.dep_ not in dep_table:
            dep_table[token.dep_] = len(dep_table)
        if token.tag_ not in tag_table:
            tag_table[token.tag_] = len(tag_table)

for sentence in sentence_test:
    parse = nlp(sentence)
    for token in parse:
        if str(token) not in word_to_ix:
            word_to_ix[str(token)] = len(word_to_ix)
        if token.pos_ not in pos_table:
            pos_table[token.pos_] = len(pos_table)
        if token.dep_ not in dep_table:
            dep_table[token.dep_] = len(dep_table)
        if token.tag_ not in tag_table:
            tag_table[token.tag_] = len(tag_table)

pos_onehot = np.eye(len(pos_table))
dep_onehot = np.eye(len(dep_table))
tag_onehot_emb = np.eye(len(tag_table))

In [0]:
# calculate tf-idf by code in lab 1
DF = {}
def get_df(documents):
    for sentence in documents:
        # sentence = [sentence.split()]
        for term in np.unique(sentence.split()):
            try:
                DF[term] += 1
            except:
                DF[term] = 1

get_df(sentence_train)
get_df(sentence_val)
get_df(sentence_test)

def get_tf_idf(documents):
    tf_idf = {}
    # total number of documents
    N = 3000 + 700 + 3684
    doc_id = 0
    # get each tokenised doc
    for sentence in documents:
        # sentence = [sentence.split()]
        
        # initialise counter for the doc
        counter = Counter(sentence.split())
        # calculate total number of words in the doc
        total_num_words = len(sentence.split())    
        # get each unique word in the doc
        for term in np.unique(sentence.split()):
            #calculate Term Frequency 
            tf = counter[term] / total_num_words
            #calculate Document Frequency
            df = DF[term]
            # calculate Inverse Document Frequency
            idf = math.log(N / (df + 1)) + 1
            # calculate TF-IDF
            tf_idf[doc_id, term] = tf * idf
        doc_id += 1
    return tf_idf


tf_idf_train = get_tf_idf(sentence_train)
tf_idf_val = get_tf_idf(sentence_val)
tf_idf_test = get_tf_idf(sentence_test)

In [0]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"

tag_to_ix = {'I-LOC': 0,
             'I-MISC': 1,
             'I-ORG': 2,
             'I-PER': 3,
             'O': 4,
             START_TAG: 5,
             STOP_TAG: 6}
tag_onehot = np.eye(len(tag_to_ix))
tag_lookup_table = {0: 'I-LOC',
                    1: 'I-MISC',
                    2: 'I-ORG',
                    3: 'I-PER',
                    4: 'O',
                    5: START_TAG,
                    6: STOP_TAG}

In [0]:
# prepare a smaller one-hot vector for shape feature
target_set, per_set, loc_set, org_set, misc_set = set(), set(), set(), set(), set()
for sentence, tag in zip(sentence_train, ner_train):
    parse = nlp(sentence)
    tags = [e for e in tag.split()]
    for i, token in enumerate(parse):
        if tags[i] == 'I-PER':
            per_set.add(token.shape_)
            target_set.add(token.shape_)
        elif tags[i] == 'I-LOC':
            loc_set.add(token.shape_)
            target_set.add(token.shape_)
        elif tags[i] == 'I-ORG':
            org_set.add(token.shape_)
            target_set.add(token.shape_)
        elif tags[i] == 'I-MISC':
            misc_set.add(token.shape_)
            target_set.add(token.shape_)

per_set = per_set - loc_set - org_set - misc_set
loc_set = loc_set - per_set - org_set - misc_set
org_set = org_set - per_set - loc_set - misc_set
misc_set = misc_set - per_set - loc_set - org_set

In [0]:
# define get input embedding function
def gensim_emb(word):
    try:
        emb = word_model.wv[word.lower()]
    except:
        emb = np.array([0.] * 300, dtype=float)
    return torch.FloatTensor(emb)

def get_pos(token, parse):
    return torch.FloatTensor(pos_onehot[pos_table[token.pos_]])

def get_dep(token, parse):
    return torch.FloatTensor(dep_onehot[dep_table[token.dep_]])

def get_tag(token, parse):
    return torch.FloatTensor(tag_onehot_emb[tag_table[token.tag_]])

def get_shape(token, parse):
    shape = token.shape_
    if shape in target_set:
        if shape in per_set:
            return torch.FloatTensor([1, 0, 0, 0, 0, 0])
        elif shape in loc_set:
            return torch.FloatTensor([0, 1, 0, 0, 0, 0])
        elif shape in org_set:
            return torch.FloatTensor([0, 0, 1, 0, 0, 0])
        elif shape in misc_set:
            return torch.FloatTensor([0, 0, 0, 1, 0, 0])
        return torch.FloatTensor([0, 0, 0, 0, 1, 0])
    else:
        return torch.FloatTensor([0, 0, 0, 0, 0, 1])

def get_tf_idf(token, parse, index, flag):
    if flag == 'train':
        tf_idf_emb = torch.FloatTensor([tf_idf_train[index, str(token)]])
    elif flag == 'val':
        tf_idf_emb = torch.FloatTensor([tf_idf_val[index, str(token)]])
    elif flag == 'test':
        tf_idf_emb = torch.FloatTensor([tf_idf_test[index, str(token)]])
    return tf_idf_emb

def get_embeddings(sentence, index, flag, emb_dim):
    N = len(sentence.split())
    sentence_emb = torch.empty((0, emb_dim), dtype=torch.float32)
    parse = nlp(sentence)
    for token in parse:
        word_emb = gensim_emb(str(token))
        pos_emb = get_pos(token, parse)
        dep_emb = get_dep(token, parse)
        tag_emb = get_tag(token, parse)
        shape_emb = get_shape(token, parse)
        tf_idf_emb = get_tf_idf(token, parse, index, flag)
        pad_emb = torch.FloatTensor([0., 0., 0.])
        embeddings = torch.cat((word_emb, pos_emb, dep_emb, tag_emb, shape_emb, tf_idf_emb, pad_emb), axis=-1).view(1, -1)
        sentence_emb = torch.cat((sentence_emb, embeddings), axis=0)
    return sentence_emb.view(N, 1, -1)

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        self.hidden = self.init_hidden()

    def argmax(self, vec):
        # return the argmax as a python int
        _, idx = torch.max(vec, 1)
        return idx.item()
    
    # Compute log sum exp in a numerically stable way for the forward algorithm
    def log_sum_exp(self, vec):
        max_score = vec[0, self.argmax(vec)]
        max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
        return max_score + \
            torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

    def init_hidden(self):
        return (torch.randn(2 * self.layers, 1, self.hidden_dim // 2),
                torch.randn(2 * self.layers, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(self.log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = self.log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        lstm_out, self.hidden = self.lstm(sentence, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = self.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = self.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(feats)
        return forward_score - gold_score, tag_seq

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [0]:
# training functions
def calculate_acuracy(labels, model_pred):
    precision = precision_score(labels, model_pred, average='micro')
    recall = recall_score(labels, model_pred, average='micro')
    f1 = f1_score(labels, model_pred, average='micro')
    return precision, recall, f1


def train_iter(model, optimizer, sentence, tags, index, emb_dim):
    '''
    Train the model for a single iteration.
    An iteration is when a single batch of data is passed forward and backward through the neural network.
    '''
    # move input and output to gpu
    # sentence, tags = sentence.to(device), tags.to(device)
    # gradient
    model.zero_grad()
    # forward propagation
    sentence_in = get_embeddings(sentence, index, flag='train', emb_dim=emb_dim)
    targets = torch.tensor([tag_to_ix[t] for t in tags.split()], dtype=torch.long)

    loss, outputs = model.neg_log_likelihood(sentence_in, targets)
    # with torch.no_grad():
    #     _, outputs = model(sentence_in)
    # calculate precision and recall
    # precision, recall, f1 = calculate_acuracy(targets.tolist(), outputs)
    # backward propagation
    loss.backward()
    # Update the parameters. 
    optimizer.step()
    return loss, outputs, targets.tolist()


def train_epoch(model, optimizer, emb_dim):
    '''
    in one epoch, loop through the whole data, keep track and record the precision and recall of each batch
    '''
    # set model to train mode
    time_since = time.time()
    model.train()
    running_loss = []
    truth = []
    prediction = []
    for i, (sentence, tags) in enumerate(zip(sentence_train, ner_train)):
        loss, outputs, targets = train_iter(model, optimizer, sentence, tags, i, emb_dim)
        running_loss.append(loss.item())
        truth += targets
        prediction += outputs
        # display performance every 500 data records
        if i % 500 == 499:
            precision, recall, f1 = calculate_acuracy(truth, prediction)
            print('[{:4}/{:4}]'.format(i + 1, len(sentence_train)), end=': ')
            print('Training Loss: {:>7.4f}'.format(np.mean(running_loss)), end=', ')
            print('Training Precision: {:.4f}'.format(precision), end=', ')
            print('Training Recall: {:.4f}'.format(recall), end=', ')
            print('Training F_1: {:.4f}'.format(f1))
            running_loss = []
            prediction = []
            truth = []
    
    print('Epoch training time: {:.2f} s.'.format(time.time() - time_since))
    return 


def val_epoch(model, emb_dim):
    '''
    valid current model on validation set
    '''
    # set the model to validating mode, without gradient
    model.eval()
    with torch.no_grad():
        loss_list = []
        predictions = []
        true_tags = []
        for i, (sentence, tags) in enumerate(zip(sentence_val, ner_val)):
            sentence_in = get_embeddings(sentence, i, flag='val', emb_dim=emb_dim)
            targets = torch.tensor([tag_to_ix[t] for t in tags.split()], dtype=torch.long)
            for t in targets:
                true_tags.append(t.item())
            loss, outputs = model.neg_log_likelihood(sentence_in, targets)
            loss_list.append(loss.item())
            
            for p in outputs:
                predictions.append(p)
        precision, recall, f1 = calculate_acuracy(true_tags, predictions)
        print('############ Val Loss: {:>7.4f}'.format(np.mean(loss_list)), end=', ')
        print('Val Precision: {:.4f}'.format(precision), end=', ')
        print('Val Recall: {:.4f}'.format(recall), end=', ')
        print('Val F_1: {:.4f}'.format(f1))
    return


def train_model(model, optimizer, emb_dim, num_epochs=25):
    # record starting time and initialize summary writer
    since = time.time()
    # train the model num_epoch times
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        # for every epoch, run training and validating
        for phase in ['train', 'val']:

            if phase == 'train':
                train_epoch(model, optimizer, emb_dim)
            elif phase == 'val':
                val_epoch(model, emb_dim)
        torch.save(model, 'The_'+ str(epoch + 1) + '_epoch_model.pt')
        print('-' * 109)
    
    time_cost = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_cost // 60, time_cost % 60))


def predict(model):
    outputs = []
    model.eval()
    with torch.no_grad():
        for index, sentence in enumerate(sentence_test):
            sentence_in = get_embeddings(sentence, index, flag='test', emb_dim=EMBEDDING_DIM)
            _, predictions = model(sentence_in)
            for p in predictions:
                outputs.append(p)
    with open('Predicted labels.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['ID','Predicted'])
        for index, label in enumerate(outputs):
            writer.writerow([str(index), str(tag_lookup_table[label])])
    return

In [14]:
HIDDEN_DIM = 512
EMBEDDING_DIM = 420

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers=2)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss:  8.2771, Training Precision: 0.8341, Training Recall: 0.8341, Training F_1: 0.8341
[1000/3000]: Training Loss:  3.3540, Training Precision: 0.8858, Training Recall: 0.8858, Training F_1: 0.8858
[1500/3000]: Training Loss:  2.8885, Training Precision: 0.9404, Training Recall: 0.9404, Training F_1: 0.9404
[2000/3000]: Training Loss:  2.6661, Training Precision: 0.9436, Training Recall: 0.9436, Training F_1: 0.9436
[2500/3000]: Training Loss:  1.6430, Training Precision: 0.9465, Training Recall: 0.9465, Training F_1: 0.9465
[3000/3000]: Training Loss:  1.7699, Training Precision: 0.9473, Training Recall: 0.9473, Training F_1: 0.9473
Epoch training time: 271.26 s.
############ Val Loss:  1.7004, Val Precision: 0.9403, Val Recall: 0.9403, Val F_1: 0.9403
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  2.1276, Training Precision: 0.9452, Training Recall

In [0]:
# load trained model from current runtime, or download from Google Drive
# please define the model first, then load
try:
    model = torch.load('The_10_epoch_model.pt')
except:
    id = '1zJx5WHfvz46X9jKbiSvYUOEzqgs4LY87'
    downloaded = drive.CreateFile({'id': id})
    downloaded.GetContentFile('The_10_epoch_model.pt')
    model = torch.load('The_10_epoch_model.pt')

# predict on test set, result files should in the file list
predict(model)

---

---

# **2. below is testing logs**

some logs may be covered.

### **word_emb+pos+dep+tag+shape+tf_idf**

In [0]:
def get_embeddings(sentence, index, flag, emb_dim):
    N = len(sentence.split())
    sentence_emb = torch.empty((0, emb_dim), dtype=torch.float32)
    parse = nlp(sentence)
    for token in parse:
        word_emb = gensim_emb(str(token))
        pos_emb = get_pos(token, parse)
        dep_emb = get_dep(token, parse)
        tag_emb = get_tag(token, parse)
        shape_emb = get_shape(token, parse)
        tf_idf_emb = get_tf_idf(token, parse, index, flag)
        embeddings = torch.cat((word_emb, pos_emb, dep_emb, tag_emb, shape_emb, tf_idf_emb), axis=-1).view(1, -1)
        sentence_emb = torch.cat((sentence_emb, embeddings), axis=0)
    return sentence_emb.view(N, 1, -1)

In [0]:
HIDDEN_DIM = 512
EMBEDDING_DIM = 417

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers=1)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss:  5.9122, Training Precision: 0.8501, Training Recall: 0.8501, Training F_1: 0.8501
[1000/3000]: Training Loss:  1.9064, Training Precision: 0.9223, Training Recall: 0.9223, Training F_1: 0.9223
[1500/3000]: Training Loss:  2.1637, Training Precision: 0.9447, Training Recall: 0.9447, Training F_1: 0.9447
[2000/3000]: Training Loss:  2.1571, Training Precision: 0.9477, Training Recall: 0.9477, Training F_1: 0.9477
[2500/3000]: Training Loss:  1.2708, Training Precision: 0.9414, Training Recall: 0.9414, Training F_1: 0.9414
[3000/3000]: Training Loss:  1.4996, Training Precision: 0.9599, Training Recall: 0.9599, Training F_1: 0.9599
Epoch training time: 218.36 s.
############ Val Loss:  1.3861, Val Precision: 0.9571, Val Recall: 0.9571, Val F_1: 0.9571
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  1.6917, Training Precision: 0.9392, Training Recall

In [0]:
model = torch.load('The_7_epoch_model.pt')
predict(model)

NameError: ignored

### **attention**

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, heads, layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        # self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        self.attn = nn.MultiheadAttention(embedding_dim, num_heads=10)
        self.attn2 = nn.MultiheadAttention(hidden_dim, num_heads=8)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        self.hidden = self.init_hidden()

    def argmax(self, vec):
        # return the argmax as a python int
        _, idx = torch.max(vec, 1)
        return idx.item()
    
    # Compute log sum exp in a numerically stable way for the forward algorithm
    def log_sum_exp(self, vec):
        max_score = vec[0, self.argmax(vec)]
        max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
        return max_score + \
            torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

    def init_hidden(self):
        return (torch.randn(2 * self.layers, 1, self.hidden_dim // 2),
                torch.randn(2 * self.layers, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(self.log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = self.log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        # embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        # lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        attn_output, _ = self.attn(sentence, sentence, sentence)
        lstm_out, self.hidden = self.lstm(attn_output, self.hidden)
        # print(lstm_out.shape)

        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        # attn_output2, _ = self.attn2(lstm_out, cat, cat)

        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = self.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = self.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(feats)
        return forward_score - gold_score, tag_seq

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [0]:
def get_embeddings(sentence, index, flag, emb_dim):
    N = len(sentence.split())
    sentence_emb = torch.empty((0, emb_dim), dtype=torch.float32)
    parse = nlp(sentence)
    for token in parse:
        word_emb = gensim_emb(str(token))
        pos_emb = get_pos(token, parse)
        dep_emb = get_dep(token, parse)
        tag_emb = get_tag(token, parse)
        shape_emb = get_shape(token, parse)
        tf_idf_emb = get_tf_idf(token, parse, index, flag)
        pad_emb = torch.FloatTensor([0., 0., 0.])
        embeddings = torch.cat((word_emb, pos_emb, dep_emb, tag_emb, shape_emb, tf_idf_emb, pad_emb), axis=-1).view(1, -1)
        sentence_emb = torch.cat((sentence_emb, embeddings), axis=0)
    return sentence_emb.view(N, 1, -1)

In [0]:
HIDDEN_DIM = 512
EMBEDDING_DIM = 420

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, heads=10, layers=1)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss:  9.1503, Training Precision: 0.7504, Training Recall: 0.7504, Training F_1: 0.7504
[1000/3000]: Training Loss:  7.9898, Training Precision: 0.6859, Training Recall: 0.6859, Training F_1: 0.6859
[1500/3000]: Training Loss:  7.0877, Training Precision: 0.8760, Training Recall: 0.8760, Training F_1: 0.8760
[2000/3000]: Training Loss:  4.5720, Training Precision: 0.8965, Training Recall: 0.8965, Training F_1: 0.8965
[2500/3000]: Training Loss:  2.8306, Training Precision: 0.8860, Training Recall: 0.8860, Training F_1: 0.8860
[3000/3000]: Training Loss:  2.9944, Training Precision: 0.9173, Training Recall: 0.9173, Training F_1: 0.9173
Epoch training time: 213.10 s.
############ Val Loss:  2.7358, Val Precision: 0.9097, Val Recall: 0.9097, Val F_1: 0.9097
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  3.3028, Training Precision: 0.9018, Training Recall

In [0]:
model = torch.load('The_4_epoch_model.pt')
predict(model)

### **attention lstm**

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, heads, layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        # self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        self.attn = nn.MultiheadAttention(embedding_dim, num_heads=10)
        self.attn2 = nn.MultiheadAttention(hidden_dim, num_heads=8)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        self.hidden = self.init_hidden()

    def argmax(self, vec):
        # return the argmax as a python int
        _, idx = torch.max(vec, 1)
        return idx.item()
    
    # Compute log sum exp in a numerically stable way for the forward algorithm
    def log_sum_exp(self, vec):
        max_score = vec[0, self.argmax(vec)]
        max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
        return max_score + \
            torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

    def init_hidden(self):
        return (torch.randn(2 * self.layers, 1, self.hidden_dim // 2),
                torch.randn(2 * self.layers, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(self.log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = self.log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        # embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        # lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        # attn_output, _ = self.attn(sentence, sentence, sentence)
        lstm_out, self.hidden = self.lstm(sentence, self.hidden)
        # print(lstm_out.shape)
        # print(self.hidden[0].shape)
        cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        # print(cat.shape)
        attn_output, _ = self.attn2(lstm_out, cat, cat)
        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        # attn_output2, _ = self.attn2(lstm_out, cat, cat)
        lstm_out2, _ = self.lstm2(attn_output, self.hidden)
        lstm_out = lstm_out2.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = self.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = self.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(feats)
        return forward_score - gold_score, tag_seq

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [0]:
HIDDEN_DIM = 512
EMBEDDING_DIM = 420

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, heads=10, layers=1)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss:  9.2271, Training Precision: 0.7374, Training Recall: 0.7374, Training F_1: 0.7374
[1000/3000]: Training Loss:  7.9901, Training Precision: 0.6854, Training Recall: 0.6854, Training F_1: 0.6854
[1500/3000]: Training Loss:  7.3823, Training Precision: 0.8562, Training Recall: 0.8562, Training F_1: 0.8562
[2000/3000]: Training Loss:  7.4629, Training Precision: 0.8747, Training Recall: 0.8747, Training F_1: 0.8747
[2500/3000]: Training Loss:  6.9847, Training Precision: 0.7442, Training Recall: 0.7442, Training F_1: 0.7442
[3000/3000]: Training Loss:  7.2828, Training Precision: 0.7200, Training Recall: 0.7200, Training F_1: 0.7200
Epoch training time: 316.17 s.
############ Val Loss:  7.1270, Val Precision: 0.7675, Val Recall: 0.7675, Val F_1: 0.7675
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  7.4035, Training Precision: 0.8166, Training Recall

### **attention after lstm**

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, heads, layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        # self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        self.attn = nn.MultiheadAttention(embedding_dim, num_heads=10)
        self.attn2 = nn.MultiheadAttention(hidden_dim, num_heads=8)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        self.hidden = self.init_hidden()

    def argmax(self, vec):
        # return the argmax as a python int
        _, idx = torch.max(vec, 1)
        return idx.item()
    
    # Compute log sum exp in a numerically stable way for the forward algorithm
    def log_sum_exp(self, vec):
        max_score = vec[0, self.argmax(vec)]
        max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
        return max_score + \
            torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

    def init_hidden(self):
        return (torch.randn(2 * self.layers, 1, self.hidden_dim // 2),
                torch.randn(2 * self.layers, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(self.log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = self.log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        # embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        # lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        # attn_output, _ = self.attn(sentence, sentence, sentence)
        lstm_out, self.hidden = self.lstm(sentence, self.hidden)
        # print(lstm_out.shape)
        # print(self.hidden[0].shape)
        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        # print(cat.shape)
        # attn_output, _ = self.attn2(lstm_out, cat, cat)
        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        # attn_output2, _ = self.attn2(lstm_out, cat, cat)
        # self.hidden = self.init_hidden()
        # lstm_out2, self.hidden = self.lstm2(lstm_out, self.hidden)
        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        attn_output, _ = self.attn2(lstm_out, lstm_out, lstm_out)
        lstm_out = attn_output.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = self.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = self.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(feats)
        return forward_score - gold_score, tag_seq

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [30]:
HIDDEN_DIM = 512
EMBEDDING_DIM = 420

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, heads=10, layers=1)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss:  9.3941, Training Precision: 0.7757, Training Recall: 0.7757, Training F_1: 0.7757
[1000/3000]: Training Loss:  7.9603, Training Precision: 0.6873, Training Recall: 0.6873, Training F_1: 0.6873
[1500/3000]: Training Loss:  7.4042, Training Precision: 0.8791, Training Recall: 0.8791, Training F_1: 0.8791
[2000/3000]: Training Loss:  7.5490, Training Precision: 0.8719, Training Recall: 0.8719, Training F_1: 0.8719
[2500/3000]: Training Loss:  6.7966, Training Precision: 0.7555, Training Recall: 0.7555, Training F_1: 0.7555
[3000/3000]: Training Loss:  7.2752, Training Precision: 0.7148, Training Recall: 0.7148, Training F_1: 0.7148
Epoch training time: 218.04 s.
############ Val Loss:  6.9544, Val Precision: 0.7536, Val Recall: 0.7536, Val F_1: 0.7536
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  7.4084, Training Precision: 0.8158, Training Recall

### **2layer with attention**

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, heads=2, layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        self.heads = heads
        # self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        # self.lstm2 = nn.LSTM(hidden_dim, hidden_dim // 2,
        #                     num_layers=self.layers, bidirectional=True)
        self.attn = nn.MultiheadAttention(embedding_dim, num_heads=self.heads)
        # self.attn2 = nn.MultiheadAttention(hidden_dim, num_heads=8)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        self.hidden = self.init_hidden()

    def argmax(self, vec):
        # return the argmax as a python int
        _, idx = torch.max(vec, 1)
        return idx.item()
    
    # Compute log sum exp in a numerically stable way for the forward algorithm
    def log_sum_exp(self, vec):
        max_score = vec[0, self.argmax(vec)]
        max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
        return max_score + \
            torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

    def init_hidden(self):
        return (torch.randn(2 * self.layers, 1, self.hidden_dim // 2),
                torch.randn(2 * self.layers, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(self.log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = self.log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        # embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        # lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        attn_output, _ = self.attn(sentence, sentence, sentence)
        lstm_out, self.hidden = self.lstm(attn_output, self.hidden)
        # print(lstm_out.shape)
        # print(self.hidden[0].shape)
        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        # print(cat.shape)
        # attn_output, _ = self.attn2(lstm_out, cat, cat)
        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        # attn_output2, _ = self.attn2(lstm_out, cat, cat)
        # self.hidden = self.init_hidden()
        # lstm_out2, self.hidden = self.lstm2(lstm_out, self.hidden)
        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        # attn_output, _ = self.attn2(lstm_out, lstm_out, lstm_out)
        lstm_out1 = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out1)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = self.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = self.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(feats)
        return forward_score - gold_score, tag_seq

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [40]:
HIDDEN_DIM = 512
EMBEDDING_DIM = 420

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, heads=2, layers=2)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss:  9.4092, Training Precision: 0.7636, Training Recall: 0.7636, Training F_1: 0.7636
[1000/3000]: Training Loss:  8.1202, Training Precision: 0.6810, Training Recall: 0.6810, Training F_1: 0.6810
[1500/3000]: Training Loss:  7.4021, Training Precision: 0.8553, Training Recall: 0.8553, Training F_1: 0.8553
[2000/3000]: Training Loss:  7.4184, Training Precision: 0.8619, Training Recall: 0.8619, Training F_1: 0.8619
[2500/3000]: Training Loss:  6.7341, Training Precision: 0.7524, Training Recall: 0.7524, Training F_1: 0.7524
[3000/3000]: Training Loss:  4.4194, Training Precision: 0.8820, Training Recall: 0.8820, Training F_1: 0.8820
Epoch training time: 295.18 s.
############ Val Loss:  3.6652, Val Precision: 0.8516, Val Recall: 0.8516, Val F_1: 0.8516
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  4.0966, Training Precision: 0.8672, Training Recall

### **attention after lstm2**

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, heads=2, layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        self.heads = heads
        # self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        # self.lstm2 = nn.LSTM(hidden_dim, hidden_dim // 2,
        #                     num_layers=self.layers, bidirectional=True)
        # self.attn = nn.MultiheadAttention(embedding_dim, num_heads=self.heads)
        self.attn2 = nn.MultiheadAttention(hidden_dim, num_heads=self.heads)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        self.hidden = self.init_hidden()

    def argmax(self, vec):
        # return the argmax as a python int
        _, idx = torch.max(vec, 1)
        return idx.item()
    
    # Compute log sum exp in a numerically stable way for the forward algorithm
    def log_sum_exp(self, vec):
        max_score = vec[0, self.argmax(vec)]
        max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
        return max_score + \
            torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

    def init_hidden(self):
        return (torch.randn(2 * self.layers, 1, self.hidden_dim // 2),
                torch.randn(2 * self.layers, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(self.log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = self.log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        # embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        # lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        # attn_output, _ = self.attn(sentence, sentence, sentence)
        lstm_out, self.hidden = self.lstm(sentence, self.hidden)
        # print(lstm_out.shape)
        # print(self.hidden[0].shape)
        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        # print(cat.shape)
        # attn_output, _ = self.attn2(lstm_out, cat, cat)
        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        # attn_output2, _ = self.attn2(lstm_out, cat, cat)
        # self.hidden = self.init_hidden()
        # lstm_out2, self.hidden = self.lstm2(lstm_out, self.hidden)
        # cat = torch.cat((self.hidden[0][0,:,:], self.hidden[0][1,:,:]), 1)
        attn_output, _ = self.attn2(lstm_out, lstm_out, lstm_out)
        lstm_out1 = attn_output.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out1)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = self.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = self.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(feats)
        return forward_score - gold_score, tag_seq

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [42]:
HIDDEN_DIM = 512
EMBEDDING_DIM = 420

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, heads=16, layers=2)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss: 10.0502, Training Precision: 0.7572, Training Recall: 0.7572, Training F_1: 0.7572
[1000/3000]: Training Loss:  8.3816, Training Precision: 0.6783, Training Recall: 0.6783, Training F_1: 0.6783
[1500/3000]: Training Loss:  7.4672, Training Precision: 0.8488, Training Recall: 0.8488, Training F_1: 0.8488
[2000/3000]: Training Loss:  7.5670, Training Precision: 0.8652, Training Recall: 0.8652, Training F_1: 0.8652
[2500/3000]: Training Loss:  7.1809, Training Precision: 0.7273, Training Recall: 0.7273, Training F_1: 0.7273
[3000/3000]: Training Loss:  7.4377, Training Precision: 0.7205, Training Recall: 0.7205, Training F_1: 0.7205
Epoch training time: 307.27 s.
############ Val Loss:  7.3327, Val Precision: 0.7245, Val Recall: 0.7245, Val F_1: 0.7245
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  7.6394, Training Precision: 0.7914, Training Recall

### **h=2**

In [43]:
HIDDEN_DIM = 512
EMBEDDING_DIM = 420

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, heads=2, layers=2)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss: 10.1235, Training Precision: 0.7760, Training Recall: 0.7760, Training F_1: 0.7760
[1000/3000]: Training Loss:  8.6124, Training Precision: 0.6814, Training Recall: 0.6814, Training F_1: 0.6814
[1500/3000]: Training Loss:  7.9153, Training Precision: 0.8486, Training Recall: 0.8486, Training F_1: 0.8486
[2000/3000]: Training Loss:  7.9505, Training Precision: 0.8608, Training Recall: 0.8608, Training F_1: 0.8608
[2500/3000]: Training Loss:  7.1804, Training Precision: 0.7284, Training Recall: 0.7284, Training F_1: 0.7284
[3000/3000]: Training Loss:  7.4669, Training Precision: 0.7104, Training Recall: 0.7104, Training F_1: 0.7104
Epoch training time: 300.99 s.
############ Val Loss:  7.2026, Val Precision: 0.7479, Val Recall: 0.7479, Val F_1: 0.7479
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  7.6136, Training Precision: 0.7951, Training Recall

### **without attention**

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        # self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        self.hidden = self.init_hidden()

    def argmax(self, vec):
        # return the argmax as a python int
        _, idx = torch.max(vec, 1)
        return idx.item()
    
    # Compute log sum exp in a numerically stable way for the forward algorithm
    def log_sum_exp(self, vec):
        max_score = vec[0, self.argmax(vec)]
        max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
        return max_score + \
            torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

    def init_hidden(self):
        return (torch.randn(2 * self.layers, 1, self.hidden_dim // 2),
                torch.randn(2 * self.layers, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(self.log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = self.log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        # embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        # lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out, self.hidden = self.lstm(sentence, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = self.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = self.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(feats)
        return forward_score - gold_score, tag_seq

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

### **emb+pos+dep+tag+tf_idf**

In [0]:
def get_embeddings(sentence, index, flag, emb_dim):
    N = len(sentence.split())
    sentence_emb = torch.empty((0, emb_dim), dtype=torch.float32)
    parse = nlp(sentence)
    for token in parse:
        word_emb = gensim_emb(str(token))
        pos_emb = get_pos(token, parse)
        dep_emb = get_dep(token, parse)
        tag_emb = get_tag(token, parse)
        tf_idf_emb = get_tf_idf(token, parse, index, flag)
        embeddings = torch.cat((word_emb, pos_emb, dep_emb, tag_emb, tf_idf_emb), axis=-1).view(1, -1)
        sentence_emb = torch.cat((sentence_emb, embeddings), axis=0)
    return sentence_emb.view(N, 1, -1)

In [0]:
HIDDEN_DIM = 1024
EMBEDDING_DIM = 410

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers=2)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss:  7.7020, Training Precision: 0.7669, Training Recall: 0.7669, Training F_1: 0.7669
[1000/3000]: Training Loss:  3.5784, Training Precision: 0.8530, Training Recall: 0.8530, Training F_1: 0.8530
[1500/3000]: Training Loss:  3.6145, Training Precision: 0.9132, Training Recall: 0.9132, Training F_1: 0.9132
[2000/3000]: Training Loss:  3.5121, Training Precision: 0.9080, Training Recall: 0.9080, Training F_1: 0.9080
[2500/3000]: Training Loss:  2.2104, Training Precision: 0.8817, Training Recall: 0.8817, Training F_1: 0.8817
[3000/3000]: Training Loss:  2.4998, Training Precision: 0.9259, Training Recall: 0.9259, Training F_1: 0.9259
Epoch training time: 738.68 s.
############ Val Loss:  2.4294, Val Precision: 0.9209, Val Recall: 0.9209, Val F_1: 0.9209
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  2.6779, Training Precision: 0.9067, Training Recall

In [0]:
model = torch.load('The_6_epoch_model.pt')
predict(model)

### **two layers**

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # self.hidden_dim_2 = hidden_dim_2
        # self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        # self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        # self.lstm_2 = nn.LSTM(hidden_dim_1, hidden_dim_2 // 2,
        #                     num_layers=1, bidirectional=True)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        # self.hidden_1 = self.init_hidden_1()
        # self.hidden_2 = self.init_hidden_2()
        self.hidden = self.init_hidden()

    def argmax(self, vec):
        # return the argmax as a python int
        _, idx = torch.max(vec, 1)
        return idx.item()
    
    # Compute log sum exp in a numerically stable way for the forward algorithm
    def log_sum_exp(self, vec):
        max_score = vec[0, self.argmax(vec)]
        max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
        return max_score + \
            torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

    def init_hidden(self):
        return (torch.randn(2*self.layers, 1, self.hidden_dim // 2),
                torch.randn(2*self.layers, 1, self.hidden_dim // 2))
        
    # def init_hidden_2(self):
    #     return (torch.randn(2, 1, self.hidden_dim_2 // 2),
    #             torch.randn(2, 1, self.hidden_dim_2 // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(self.log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = self.log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        lstm_out, self.hidden = self.lstm(sentence, self.hidden)

        # self.hidden_2 = self.init_hidden_2()
        # lstm_out_2, self.hidden_2 = self.lstm_2(lstm_out_1, self.hidden_1)

        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = self.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = self.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(feats)
        return forward_score - gold_score, tag_seq

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [33]:
# HIDDEN_DIM_1 = 256
HIDDEN_DIM = 256
EMBEDDING_DIM = 420

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers=2)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss:  8.2449, Training Precision: 0.7749, Training Recall: 0.7749, Training F_1: 0.7749
[1000/3000]: Training Loss:  3.4919, Training Precision: 0.8552, Training Recall: 0.8552, Training F_1: 0.8552
[1500/3000]: Training Loss:  2.7711, Training Precision: 0.9211, Training Recall: 0.9211, Training F_1: 0.9211
[2000/3000]: Training Loss:  2.6259, Training Precision: 0.9339, Training Recall: 0.9339, Training F_1: 0.9339
[2500/3000]: Training Loss:  1.6706, Training Precision: 0.9072, Training Recall: 0.9072, Training F_1: 0.9072
[3000/3000]: Training Loss:  1.7089, Training Precision: 0.9493, Training Recall: 0.9493, Training F_1: 0.9493
Epoch training time: 177.67 s.
############ Val Loss:  1.7244, Val Precision: 0.9407, Val Recall: 0.9407, Val F_1: 0.9407
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  2.0273, Training Precision: 0.9275, Training Recall

In [0]:
model = torch.load('The_2_epoch_model.pt')

predict(model)

### **layer 2**

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # self.hidden_dim_2 = hidden_dim_2
        # self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        # self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        # self.hidden_1 = self.init_hidden_1()
        # self.hidden_2 = self.init_hidden_2()
        self.hidden = self.init_hidden()

    def argmax(self, vec):
        # return the argmax as a python int
        _, idx = torch.max(vec, 1)
        return idx.item()
    
    # Compute log sum exp in a numerically stable way for the forward algorithm
    def log_sum_exp(self, vec):
        max_score = vec[0, self.argmax(vec)]
        max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
        return max_score + \
            torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

    def init_hidden(self):
        return (torch.randn(2*self.layers, 1, self.hidden_dim // 2),
                torch.randn(2*self.layers, 1, self.hidden_dim // 2))
        
    # def init_hidden_2(self):
    #     return (torch.randn(2, 1, self.hidden_dim_2 // 2),
    #             torch.randn(2, 1, self.hidden_dim_2 // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(self.log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = self.log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        lstm_out1, self.hidden = self.lstm(sentence, self.hidden)

        self.hidden = self.init_hidden()
        lstm_out2, self.hidden = self.lstm2(lstm_out1, self.hidden)

        lstm_out = lstm_out2.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = self.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = self.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(feats)
        return forward_score - gold_score, tag_seq

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [36]:
# HIDDEN_DIM_1 = 256
HIDDEN_DIM = 512
EMBEDDING_DIM = 420

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers=2)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss: 10.0823, Training Precision: 0.7729, Training Recall: 0.7729, Training F_1: 0.7729
[1000/3000]: Training Loss:  8.5388, Training Precision: 0.6830, Training Recall: 0.6830, Training F_1: 0.6830
[1500/3000]: Training Loss:  7.0018, Training Precision: 0.8418, Training Recall: 0.8418, Training F_1: 0.8418
[2000/3000]: Training Loss:  6.0765, Training Precision: 0.8595, Training Recall: 0.8595, Training F_1: 0.8595
[2500/3000]: Training Loss:  4.4710, Training Precision: 0.7849, Training Recall: 0.7849, Training F_1: 0.7849
[3000/3000]: Training Loss:  3.4440, Training Precision: 0.9028, Training Recall: 0.9028, Training F_1: 0.9028
Epoch training time: 439.74 s.
############ Val Loss:  3.5560, Val Precision: 0.8702, Val Recall: 0.8702, Val F_1: 0.8702
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  3.7730, Training Precision: 0.8693, Training Recall

### **layer 3**

In [38]:
# HIDDEN_DIM_1 = 256
HIDDEN_DIM = 512
EMBEDDING_DIM = 420

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers=2)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
[ 500/3000]: Training Loss:  7.4430, Training Precision: 0.8019, Training Recall: 0.8019, Training F_1: 0.8019
[1000/3000]: Training Loss:  3.1148, Training Precision: 0.8557, Training Recall: 0.8557, Training F_1: 0.8557
[1500/3000]: Training Loss:  2.7156, Training Precision: 0.9296, Training Recall: 0.9296, Training F_1: 0.9296
[2000/3000]: Training Loss:  2.5548, Training Precision: 0.9342, Training Recall: 0.9342, Training F_1: 0.9342
[2500/3000]: Training Loss:  1.6866, Training Precision: 0.9065, Training Recall: 0.9065, Training F_1: 0.9065
[3000/3000]: Training Loss:  1.7803, Training Precision: 0.9451, Training Recall: 0.9451, Training F_1: 0.9451
Epoch training time: 274.61 s.
############ Val Loss:  1.7316, Val Precision: 0.9379, Val Recall: 0.9379, Val F_1: 0.9379
-------------------------------------------------------------------------------------------------------------
Epoch 2/10
[ 500/3000]: Training Loss:  2.0774, Training Precision: 0.9282, Training Recall

### **one LSTM layer**

In [0]:
HIDDEN_DIM = 128
EMBEDDING_DIM = 157

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001, weight_decay=1e-3)

### **word_emb+pos+dep+tf_idf**

In [0]:

def calculate_acuracy(labels, model_pred):
    precision = precision_score(labels, model_pred, average='micro')
    recall = recall_score(labels, model_pred, average='micro')
    f1 = f1_score(labels, model_pred, average='micro')
    return precision, recall, f1


def train_iter(model, device, optimizer, sentence, tags, index):
    '''
    Train the model for a single iteration.
    An iteration is when a single batch of data is passed forward and backward through the neural network.
    '''
    # move input and output to gpu
    # sentence, tags = sentence.to(device), tags.to(device)
    # gradient
    model.zero_grad()
    # forward propagation
    sentence_in = emb_pos_dep_tf_idf(sentence, index, flag='train')
    targets = torch.tensor([tag_to_ix[t] for t in tags.split()], dtype=torch.long)

    loss, outputs = model.neg_log_likelihood(sentence_in, targets)
    # with torch.no_grad():
    #     _, outputs = model(sentence_in)
    # calculate precision and recall
    precision, recall, f1 = calculate_acuracy(targets.tolist(), outputs)
    # backward propagation
    loss.backward()
    # Update the parameters. 
    optimizer.step()
    return loss, precision, recall, f1


def train_epoch(model, device, optimizer, nlp, emb_dim):
    '''
    in one epoch, loop through the whole data, keep track and record the precision and recall of each batch
    '''
    # set model to train mode
    time_since = time.time()
    model.train()
    running_loss = []
    running_precision = []
    running_recall = []
    running_f1 = []
    for i, (sentence, tags) in enumerate(zip(sentence_train, ner_train)):
        loss, precision, recall, f1 = train_iter(model, device, optimizer, sentence, tags, i)
        running_loss.append(loss.item())
        running_precision.append(precision)
        running_recall.append(recall)
        running_f1.append(f1)
        if i % 500 == 499:
            print('[{:4}/{:4}]'.format(i + 1, len(sentence_train)), end=': ')
            print('Training Loss: {:>7.4f}'.format(np.mean(running_loss)), end=', ')
            print('Training Precision: {:.4f}'.format(np.mean(running_precision)), end=', ')
            print('Training Recall: {:.4f}'.format(np.mean(running_recall)), end=', ')
            print('Training F_1: {:.4f}'.format(np.mean(running_f1)))
            running_loss = []
            running_precision = []
            running_recall = []
            running_f1 = []
    # print('[{:4}/{:4}]'.format(len(sentence_train), len(sentence_train)), end=': ')
    # print('Training Loss: {:2.4f}'.format(np.mean(running_loss)), end=', ')
    # print('Training Precision: {:.4f}'.format(np.mean(running_precision)), end=', ')
    # print('Training Recall: {:.4f}'.format(np.mean(running_recall)), end=', ')
    # print('Training F_1: {:.4f}'.format(np.mean(running_f1)))
    print('Epoch training time: {:.2f} s.'.format(time.time() - time_since))
    return 


def val_epoch(model, device, nlp, emb_dim):
    '''
    valid current model on validation set
    '''
    # set the model to validating mode, without gradient
    model.eval()
    with torch.no_grad():
        loss_list = []
        predictions = []
        true_tags = []
        for i, (sentence, tags) in enumerate(zip(sentence_val, ner_val)):
            sentence_in = emb_pos_dep_tf_idf(sentence, i, flag='val')
            targets = torch.tensor([tag_to_ix[t] for t in tags.split()], dtype=torch.long)
            for t in targets:
                true_tags.append(t.item())
            loss, outputs = model.neg_log_likelihood(sentence_in, targets)
            loss_list.append(loss.item())
            # _, outputs = model(sentence_in)
            for p in outputs:
                predictions.append(p)
        precision, recall, f1 = calculate_acuracy(true_tags, predictions)
        print('############ Val Loss: {:>7.4f}'.format(np.mean(loss_list)), end=', ')
        print('Val Precision: {:.4f}'.format(precision), end=', ')
        print('Val Recall: {:.4f}'.format(recall), end=', ')
        print('Val F_1: {:.4f}'.format(f1))
    return


def train_model(model, device, optimizer, nlp, emb_dim, num_epochs=25):
    # record starting time and initialize summary writer
    since = time.time()
    # train the model num_epoch times
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        # for every epoch, run training and validating
        for phase in ['train', 'val']:

            if phase == 'train':
                train_epoch(model, device, optimizer, nlp, emb_dim)
            elif phase == 'val':
                val_epoch(model, device, nlp, emb_dim)
        torch.save(model, 'The_'+ str(epoch + 1) + '_epoch_model.pt')
        print('-' * 109)
    
    time_cost = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_cost // 60, time_cost % 60))



In [0]:
HIDDEN_DIM = 128
EMBEDDING_DIM = 158

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=5, nlp=nlp, emb_dim=EMBEDDING_DIM)

Epoch 1/5
[ 500/3000]: Training Loss:  5.3003, Training Precision: 0.8611, Training Recall: 0.8611, Training F_1: 0.8611
[1000/3000]: Training Loss:  2.6641, Training Precision: 0.8973, Training Recall: 0.8973, Training F_1: 0.8973
[1500/3000]: Training Loss:  3.3774, Training Precision: 0.9210, Training Recall: 0.9210, Training F_1: 0.9210
[2000/3000]: Training Loss:  3.3047, Training Precision: 0.9244, Training Recall: 0.9244, Training F_1: 0.9244
[2500/3000]: Training Loss:  2.1720, Training Precision: 0.9149, Training Recall: 0.9149, Training F_1: 0.9149
[3000/3000]: Training Loss:  2.4789, Training Precision: 0.9256, Training Recall: 0.9256, Training F_1: 0.9256
Epoch training time: 134.21 s.
############ Val Loss:  2.9632, Val Precision: 0.8937, Val Recall: 0.8937, Val F_1: 0.8937
-------------------------------------------------------------------------------------------------------------
Epoch 2/5
[ 500/3000]: Training Loss:  2.6533, Training Precision: 0.9069, Training Recall: 

In [0]:
predict(model)

### **word_emb+pos+dep**

In [0]:
HIDDEN_DIM = 128
EMBEDDING_DIM = 157

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=5, nlp=nlp, emb_dim=EMBEDDING_DIM)

Epoch 1/5
[ 500/3000]: Training Loss:  5.5581, Training Precision: 0.8568, Training Recall: 0.8568, Training F_1: 0.8568
[1000/3000]: Training Loss:  2.5450, Training Precision: 0.9013, Training Recall: 0.9013, Training F_1: 0.9013
[1500/3000]: Training Loss:  3.3812, Training Precision: 0.9219, Training Recall: 0.9219, Training F_1: 0.9219
[2000/3000]: Training Loss:  3.3821, Training Precision: 0.9211, Training Recall: 0.9211, Training F_1: 0.9211
[2500/3000]: Training Loss:  2.1253, Training Precision: 0.9230, Training Recall: 0.9230, Training F_1: 0.9230
[3000/3000]: Training Loss:  2.4866, Training Precision: 0.9355, Training Recall: 0.9355, Training F_1: 0.9355
Epoch training time: 140.41 s.
############ Val Loss:  3.0109, Val Precision: 0.8933, Val Recall: 0.8933, Val F_1: 0.8933
-------------------------------------------------------------------------------------------------------------
Epoch 2/5
[ 500/3000]: Training Loss:  2.6851, Training Precision: 0.9193, Training Recall: 

### **baseline model**

In [0]:
HIDDEN_DIM = 50
EMBEDDING_DIM = 50

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=5, nlp=None, emb_dim=50)

Epoch 1/5
[ 500/3000]: Training Loss: 10.2738, Training Precision: 0.7489, Training Recall: 0.7489, Training F_1: 0.7489
[1000/3000]: Training Loss:  8.5752, Training Precision: 0.6726, Training Recall: 0.6726, Training F_1: 0.6726
[1500/3000]: Training Loss:  7.7830, Training Precision: 0.8310, Training Recall: 0.8310, Training F_1: 0.8310
[2000/3000]: Training Loss:  7.8989, Training Precision: 0.8516, Training Recall: 0.8516, Training F_1: 0.8516
[2500/3000]: Training Loss:  7.8406, Training Precision: 0.7180, Training Recall: 0.7180, Training F_1: 0.7180
[3000/3000]: Training Loss:  8.0194, Training Precision: 0.7141, Training Recall: 0.7141, Training F_1: 0.7141
Epoch training time: 135.87 s.
############ Val Loss:  8.7840, Val Precision: 0.7663, Val Recall: 0.7663, Val F_1: 0.7663
-------------------------------------------------------------------------------------------------------------
Epoch 2/5
[ 500/3000]: Training Loss:  8.2153, Training Precision: 0.7804, Training Recall: 

In [0]:
a = [1,2,3]
b = [2,3,4]
a.append()

TypeError: ignored

In [0]:
a = torch.tensor([1,2,3], dtype=torch.long)

In [0]:
print('Training Loss: {:>5.2f}'.format(4.3333))

Training Loss:  4.33


In [0]:
5 % 2

1

In [0]:
[e.item() for e in a]

[1, 2, 3]

In [0]:
!pip install -q --upgrade ipython
!pip install -q --upgrade ipykernel

[K     |████████████████████████████████| 788kB 11.9MB/s 
[K     |████████████████████████████████| 358kB 50.2MB/s 
[31mERROR: jupyter-console 5.2.0 has requirement prompt-toolkit<2.0.0,>=1.0.0, but you'll have prompt-toolkit 3.0.5 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement ipython~=5.5.0, but you'll have ipython 7.15.0 which is incompatible.[0m
[K     |████████████████████████████████| 122kB 14.7MB/s 
[31mERROR: jupyter-console 5.2.0 has requirement prompt-toolkit<2.0.0,>=1.0.0, but you'll have prompt-toolkit 3.0.5 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement ipykernel~=4.10, but you'll have ipykernel 5.3.0 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement ipython~=5.5.0, but you'll have ipython 7.15.0 which is incompatible.[0m
[?25h

In [0]:
HIDDEN_DIM = 256
EMBEDDING_DIM = 157


model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# Check predictions before training
with torch.no_grad():
    # precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_sent = prepare_embeddings(sentence_train[1], nlp)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in ner_train[1].split()], dtype=torch.long)
    print(model(precheck_sent))

# Make sure prepare_sequence from earlier in the LSTM section is loaded
for epoch in range(5):  # again, normally you would NOT do 300 epochs, it is toy data
    # for sentence, tags in training_data:

    for sentence, tags in zip(sentence_train, ner_train):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        # sentence_in = prepare_sequence(sentence, word_to_ix)
        sentence_in = prepare_embeddings(sentence, nlp)
        targets = torch.tensor([tag_to_ix[t] for t in tags.split()], dtype=torch.long)
        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)
        print(loss.item())
        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

# Check predictions after training
with torch.no_grad():
    # precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_sent = prepare_embeddings(sentence_train[1], nlp)
    print(model(precheck_sent))
# We got it!

(tensor(15.6371), [0, 2, 1, 2, 1, 2, 1, 4, 3])
0.6828417778015137
17.53197479248047
2.8264882564544678
1.879255771636963
64.332763671875
41.680152893066406
36.992462158203125
18.788185119628906
16.35803985595703
31.099395751953125
14.279205322265625
20.549110412597656
0.16658544540405273
7.825750350952148
6.1936492919921875
6.334251403808594
15.946723937988281
33.514747619628906
6.8483734130859375
4.7913665771484375
4.246574401855469
0.4911832809448242
5.923057556152344
1.557264804840088
6.61895751953125
26.05602264404297
6.949073791503906
5.9677886962890625
7.78851318359375
0.4167518615722656
0.1352682113647461
3.076974868774414
1.1157255172729492
21.365493774414062
16.500274658203125
8.538810729980469
0.817535400390625
13.244148254394531
17.302505493164062
8.660789489746094
0.20551109313964844
1.8585643768310547
0.7793335914611816
1.630828857421875
35.75518798828125
0.3557615280151367
10.623384475708008
2.0036354064941406
20.11725616455078
1.3179550170898438
1.0858230590820312
1.3635

KeyboardInterrupt: ignored

In [0]:
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
print('!')
# Check predictions before training
with torch.no_grad():
    # precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_sent = prepare_embeddings(sentence_train[1], nlp)
    precheck_sent = precheck_sent.to(device)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in ner_train[1].split()], dtype=torch.long)
    print(model(precheck_sent))

# Make sure prepare_sequence from earlier in the LSTM section is loaded
for epoch in range(5):  # again, normally you would NOT do 300 epochs, it is toy data
    # for sentence, tags in training_data:

    for sentence, tags in zip(sentence_train, ner_train):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        # sentence_in = prepare_sequence(sentence, word_to_ix)
        sentence_in = prepare_embeddings(sentence, nlp)
        targets = torch.tensor([tag_to_ix[t] for t in tags.split()], dtype=torch.long)
        sentence_in, targets = sentence_in.to(device), targets.to(device)
        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)
        # print(loss.item())
        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

# Check predictions after training
with torch.no_grad():
    # precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_sent = prepare_embeddings(sentence_train[1], nlp)
    precheck_sent = precheck_sent.to(device)
    print(model(precheck_sent))
# We got it!

In [0]:
with torch.no_grad():
    # precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_sent = prepare_embeddings(sentence_train[1], nlp)
    precheck_sent = precheck_sent.to(device)
    # precheck_tags = torch.tensor([tag_to_ix[t] for t in ner_train[1].split()], dtype=torch.long)
    # print(model(precheck_sent))

In [0]:
import json

with open("/var/log/colab-jupyter.log", "r") as fo:
  for line in fo:
    print(json.loads(line)['msg'])

Writing notebook server cookie secret to /root/.local/share/jupyter/runtime/notebook_cookie_secret
google.colab serverextension initialized.
Serving notebooks from local directory: /
0 active kernels
The Jupyter Notebook is running at:
http://172.28.0.2:9000/
Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
Kernel started: 1c1905de-178e-470b-ac84-c851b1a73f65
Adapting to protocol v5.1 for kernel 1c1905de-178e-470b-ac84-c851b1a73f65
KernelRestarter: restarting kernel (1/5), keep random ports


In [0]:
# find all pos and dep in this document
# nlp = spacy.load('en_core_web_sm')

pos_tag, dep_tag, tag_set = set(), set(), set()
for sentence in sentence_train:
    parse = nlp(sentence)
    for token in parse:
        pos_tag.add(token.pos_)
        dep_tag.add(token.dep_)
        tag_set.add(token.tag_)

for sentence in sentence_val:
    parse = nlp(sentence)
    for token in parse:
        pos_tag.add(token.pos_)
        dep_tag.add(token.dep_)
        tag_set.add(token.tag_)

for sentence in sentence_test:
    parse = nlp(sentence)
    for token in parse:
        pos_tag.add(token.pos_)
        dep_tag.add(token.dep_)
        tag_set.add(token.tag_)

In [0]:
tag_set

{'$',
 "''",
 ',',
 '-LRB-',
 '-RRB-',
 '.',
 ':',
 'ADD',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'HYPH',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NFP',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 'XX',
 '``'}

try attention

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, num_heads, layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        # self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        self.lstm1 = nn.LSTM(embedding_dim*2, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        self.lstm2 = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.layers, bidirectional=True)
        
        self.multiheadAtt = nn.MultiheadAttention(self.embedding_dim, num_heads)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        
        self.hidden = self.init_hidden()

    def argmax(self, vec):
        # return the argmax as a python int
        _, idx = torch.max(vec, 1)
        return idx.item()
    
    # Compute log sum exp in a numerically stable way for the forward algorithm
    def log_sum_exp(self, vec):
        max_score = vec[0, self.argmax(vec)]
        max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
        return max_score + \
            torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

    def init_hidden(self):
        return (torch.randn(2 * self.layers, 1, self.hidden_dim // 2),
                torch.randn(2 * self.layers, 1, self.hidden_dim // 2))

#################################
    def cal_summary(self, sentence):
        self.hidden = self.init_hidden()
        lstm_out, (h_n, c_n) = self.lstm(sentence, self.hidden)
        summary = torch.cat((h_n[0,:,:], h_n[1,:,:]), 1)
        return summary.view(1, 1, -1)

    def cal_attention(self, summary, input_tensor, method):

        if method == 'Dot Product':
            # bmm: https://pytorch.org/docs/master/generated/torch.bmm.html
            attn_weights = F.softmax(torch.bmm(summary, input_tensor.T.unsqueeze(0)), dim=-1)
            attn_output = torch.bmm(attn_weights, input_tensor.unsqueeze(0))
            concat_output = torch.cat((attn_output[0], summary[0]), 1)
        
        elif method == 'Scaled_Dot_Product':
            # COMPLETE THIS PART - Scale Dot Product calculation method
            # attn_weights = F.softmax(1/np.sqrt(self.hidden_dim // 2) * torch.bmm(summary, input_tensor.T.unsqueeze(0)), dim=-1)
            print(input_tensor.shape)
            N = input_tensor.shape[0]
            input_tensor = input_tensor.view(1, N, -1)
            attn_weights = F.softmax(1/np.sqrt(self.hidden_dim // 2) * torch.bmm(summary, input_tensor.T.unsqueeze(0)), dim=-1)
            attn_output = torch.bmm(attn_weights, input_tensor.unsqueeze(0))
            concat_output = torch.cat((attn_output[0], summary[0]), 1)

        return concat_output
##################################

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(self.log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = self.log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        summary = self.cal_summary(sentence)
        # print(sentence.T.shape)
        summary_attn_output1 = self.cal_attention(summary, sentence, method='Scaled_Dot_Product')
        print(summary_attn_output1.unsqueeze(1).shape)
        self.hidden = self.init_hidden()
        lstm_out1, self.hidden = self.lstm1(summary_attn_output1.unsqueeze(1), self.hidden)

        self_attn_output, _ = self.multiheadAtt(lstm_out1, lstm_out1, lstm_out1)

        lstm_out2, self.hidden = self.lstm2(self_attn_output, self.hidden)
        print('lstm2', end=' ')
        print(lstm_out2.shape)
        # summary_attn_output2 = self.cal_attention(summary, lstm_out2, method='Scaled_Dot_Product')

        lstm_out = lstm_out2.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = self.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = self.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(feats)
        return forward_score - gold_score, tag_seq

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [0]:
HIDDEN_DIM = 410
EMBEDDING_DIM = 410

model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers=1, num_heads=10)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

train_model(model, device, optimizer, num_epochs=10, emb_dim=EMBEDDING_DIM)

Epoch 1/10
torch.Size([1, 1, 410])


RuntimeError: ignored

In [0]:
pos_table = {
    'ADJ': 0,
    'ADP': 1,
    'ADV': 2,
    'AUX': 3,
    'CCONJ': 4,
    'DET': 5,
    'INTJ': 6,
    'NOUN': 7,
    'NUM': 8,
    'PART': 9,
    'PRON': 10,
    'PROPN': 11,
    'PUNCT': 12,
    'SCONJ': 13,
    'SYM': 14,
    'VERB': 15,
    'X': 16
}
pos_onehot = np.eye(len(pos_table))

In [0]:
dep_table = {
    'ROOT': 0,
    'acl': 1,
    'acomp': 2,
    'advcl': 3,
    'advmod': 4,
    'agent': 5,
    'amod': 6,
    'appos': 7,
    'attr': 8,
    'aux': 9,
    'auxpass': 10,
    'case': 11,
    'cc': 12,
    'ccomp': 13,
    'compound': 14,
    'conj': 15,
    'csubj': 16,
    'dative': 17,
    'dep': 18,
    'det': 19,
    'dobj': 20,
    'expl': 21,
    'intj': 22,
    'mark': 23,
    'meta': 24,
    'neg': 25,
    'nmod': 26,
    'npadvmod': 27,
    'nsubj': 28,
    'nsubjpass': 29,
    'nummod': 30,
    'oprd': 31,
    'parataxis': 32,
    'pcomp': 33,
    'pobj': 34,
    'poss': 35,
    'preconj': 36,
    'predet': 37,
    'prep': 38,
    'prt': 39,
    'punct': 40,
    'quantmod': 41,
    'relcl': 42,
    'xcomp': 43,
    'csubjpass': 44
}
dep_onehot = np.eye(len(dep_table))

In [0]:
tag_table = {
    '$': 0,
    "''": 1,
    ',': 2,
    '-LRB-': 3,
    '-RRB-': 4,
    '.': 5,
    ':': 6,
    'ADD': 7,
    'CC': 8,
    'CD': 9,
    'DT': 10,
    'EX': 11,
    'FW': 12,
    'HYPH': 13,
    'IN': 14,
    'JJ': 15,
    'JJR': 16,
    'JJS': 17,
    'LS': 18,
    'MD': 19,
    'NFP': 20,
    'NN': 21,
    'NNP': 22,
    'NNPS': 23,
    'NNS':24,
    'PDT':25,
    'POS':26,
    'PRP': 27,
    'PRP$':28,
    'RB': 29,
    'RBR': 30,
    'RBS': 31,
    'RP': 32,
    'SYM': 33,
    'TO': 34,
    'UH': 35,
    'VB': 36,
    'VBD': 37,
    'VBG': 38,
    'VBN': 39,
    'VBP': 40,
    'VBZ': 41,
    'WDT': 42,
    'WP': 43,
    'WP$': 44,
    'WRB': 45,
    'XX': 46,
    '``': 47
}
tag_onehot_emb = np.eye(len(tag_table))

In [11]:

word_model = api.load("glove-wiki-gigaword-300")



In [0]:
# https://spacy.io/usage/linguistic-features#tokenization define our own spacy tokenizar by split()
class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

In [0]:

def prepare_sequence(seq, to_ix):
    N = len(seq.split())
    idxs = [to_ix[w] for w in seq.split()]
    emb = nn.Embedding(len(to_ix), 50)
    return emb(idxs).view(N, 1, -1)


def prepare_embeddings(seq, nlp, to_ix, emb_dim):
    N = len(seq.split())
    if not nlp:
        idxs = torch.LongTensor([to_ix[w] for w in seq.split()])
        emb = nn.Embedding(len(to_ix), emb_dim)
        return emb(idxs).view(N, 1, -1)
    sentence_emb = torch.empty((0, emb_dim), dtype=torch.float32)
    parse = nlp(seq)
    for token in parse:
        word_emb = torch.FloatTensor(token.vector)
        pos_emb = torch.FloatTensor(pos_onehot[pos_table[token.pos_]])
        dep_emb = torch.FloatTensor(dep_onehot[dep_table[token.dep_]])
        embeddings = torch.cat((word_emb, pos_emb, dep_emb), axis=-1).view(1, -1)
        # print(embeddings.shape)
        sentence_emb = torch.cat((sentence_emb, embeddings), axis=0)
    return sentence_emb.view(N, 1, -1)


def emb_pos_dep_tf_idf(seq, index, flag):
    N = len(seq.split())
    sentence_emb = torch.empty((0, 158), dtype=torch.float32)
    parse = nlp(seq)
    for token in parse:
        word_emb = torch.FloatTensor(token.vector)
        pos_emb = torch.FloatTensor(pos_onehot[pos_table[token.pos_]])
        dep_emb = torch.FloatTensor(dep_onehot[dep_table[token.dep_]])
        if flag == 'train':
            tf_idf_emb = torch.FloatTensor([tf_idf_train[index, str(token)]])
        elif flag == 'val':
            tf_idf_emb = torch.FloatTensor([tf_idf_val[index, str(token)]])
        elif flag == 'test':
            tf_idf_emb = torch.FloatTensor([tf_idf_test[index, str(token)]])
        embeddings = torch.cat((word_emb, pos_emb, dep_emb, tf_idf_emb), axis=-1).view(1, -1)
        # print(embeddings.shape)
        sentence_emb = torch.cat((sentence_emb, embeddings), axis=0)
    return sentence_emb.view(N, 1, -1)


In [0]:
# check if cuda is available, if so print the device name
# using_cuda = torch.cuda.is_available()
# device = torch.device('cuda' if using_cuda else 'cpu')
# print('We are using ' + str(device) + '.')
# if using_cuda:
#     print(torch.cuda.get_device_properties(device))

# MAX_LENGTH = 124
# EMBEDDING_DIM = 158                      # word_emb 96 + pos_onehot 17 + dep_onehot 44 + td_idf 1
# BATCH_SIZE = 256
START_TAG = "<START>"
STOP_TAG = "<STOP>"

# kwargs = {'num_workers': 1, 'pin_memory': True} if using_cuda else {}