# 1. Load Dataset

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id_trian = '1ukOHzF-Rpi8_Ig4a40j_tlxscaGvyoHD'
id_val = '1VwEmbqMZznZcuGeXOksXgiIrQ9VUC2dg'
id_test = '17zD8SLoNL7EANiEneNzxB4L4kjqFMUg_'

downloaded = drive.CreateFile({'id':id_trian}) 
downloaded.GetContentFile('train.csv')

downloaded = drive.CreateFile({'id':id_val}) 
downloaded.GetContentFile('val.csv')

downloaded = drive.CreateFile({'id':id_test}) 
downloaded.GetContentFile('test.csv')

In [144]:
import pandas as pd

# read train
df_train = pd.read_csv("train.csv")
train_sent = [i.split() for i in list(df_train.Sentence)]
train_NER = [i.split() for i in list(df_train.NER)]

# read val and add them after train
df_val = pd.read_csv("val.csv")
train_sent += [i.split() for i in list(df_val.Sentence)]
train_NER += [i.split() for i in list(df_val.NER)]

# read test
df_test = pd.read_csv("test.csv")
test_sent = [i.split() for i in list(df_test.Sentence)]

# all sentence for processing
all_sentence = train_sent + test_sent

# 2. Test lemmatisation

In [128]:
# From Lab05
# Lemmatize
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatize(x):
    lemmatizer = WordNetLemmatizer()
    lemma_sentence = [lemmatizer.lemmatize(w) for w in x]

    return lemma_sentence


all_sentence_lemma = [lemmatize(s) for s in all_sentence]

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 3. Test stemming

In [None]:
# From Lab05
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# Transform data lists into tokens
# Stem the words
def word_stemmer(lists, isSentence):
    tokens = []

    for unstemmed_tokens in lists:
        temp = []
        for unstemmed_token in unstemmed_tokens:
            unstemmed_token = stemmer.stem(unstemmed_token)
            temp.append(unstemmed_token)
        tokens.append(temp)

    return tokens

all_sentence_stem = word_stemmer(all_sentence)

# 4. Feature Extraction

## 4.1 Word Embedding

In [145]:
## word dic
word_list = []
for sentence in all_sentence:
  word_list.extend(sentence)
unique_word = list(set(word_list))

ix_2_word = {ix: word for ix, word in enumerate(unique_word)}
word_2_ix = {word: ix for ix, word in ix_2_word.items()}

In [146]:
## From Lab 9
import gensim.downloader as api
import numpy as np

EMBEDDING_DIM = 100
## change to 100 dimension word embedding
word_emb_model = api.load("glove-wiki-gigaword-100") 

embedding_matrix = []
for ix in range(len(ix_2_word)):
    try:
        embedding_matrix.append(word_emb_model.wv[ix_2_word[ix]])
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape

  if sys.path[0] == '':


(13972, 100)

## 4.2 Other features - Part of Speech and Parse Tree

In [147]:
### From Lab 7
import spacy

#load the spacy api with the pre-trained statistical models for English. English multi-task CNN trained on OntoNotes
nlp = spacy.load("en_core_web_sm")

# add up all the sentence for Pos and Parse Tree
all_sentence_string = list(df_train.Sentence) + list(df_val.Sentence) + list(df_test.Sentence)

pos = []
pt = []
for sentence in all_sentence_string:
  # print(sentence)
  parse = nlp(sentence)
  # generate parse tree feature
  pt.append([token.dep_ for token in parse][:len(sentence.split(" "))])
  # generate part of speech feature
  pos.append([token.pos_ for token in parse][:len(sentence.split(" "))])

In [148]:
def unique(lst):
  uniq = []
  for i in lst:
    uniq.extend(i)
  uniq = list(set(uniq))
  label_2_ix = {tag: ix for ix, tag in enumerate(uniq)}
  return uniq, label_2_ix

# for pos and parse tree, because the number of them is short, i decide to use one embedding matrix for them
other_feature_unique, other_feature_2_ix = unique(pt + pos)

# 5. Model Definition - Modified Bi-LSTM with CRF

In [149]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import f1_score
import datetime

torch.manual_seed(1)

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()

# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [150]:
## Model is modified from lab 9 to achieve different features input and evaluation on different layers and attention methods
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, other_feature_dim, hidden_dim, attetion = "Dot Product", other_features = ["pt","pos"], layers = 1):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers = layers
        self.other_feature = other_features

        self.other_feature_embeds = nn.Embedding(len(other_feature_2_ix), other_feature_dim)

        self.attention_method = attetion

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))

        if "pt" in other_features:
            self.embedding_dim += other_feature_dim
        if "pos" in other_features:
            self.embedding_dim += other_feature_dim
        
        self.lstm = nn.LSTM(self.embedding_dim, hidden_dim // 2,
                            num_layers=layers, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        if self.attention_method == "No Attention":
            self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        else:
            self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2 * self.layers, 1, self.hidden_dim // 2).to(device),
                torch.randn(2 * self.layers, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence, other_features = None):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        
        # print("before: ", embeds.shape)
        ## dictionary based other features
        if "pos" in self.other_feature:
            new = self.other_feature_embeds(other_features["pos"]).view(len(sentence), 1, -1)
            # print(other_features["pos"])
            # print(sentence)
            # print("new_pos: ", new.shape)
            embeds = torch.cat((embeds, new), dim = -1)
        if "pt" in self.other_feature:
            new = self.other_feature_embeds(other_features["pt"]).view(len(sentence), 1, -1)
            # print("new_pt: ", new.shape)
            embeds = torch.cat((embeds, new), dim = -1)
          
        # print(embeds.shape)

        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        
        # for self attention
        lstm_out_1 = lstm_out.view(lstm_out.size(1), lstm_out.size(0), lstm_out.size(2))
        lstm_out_2 = lstm_out.view(lstm_out.size(1), lstm_out.size(2), lstm_out.size(0))

        if self.attention_method == "Dot Product":
            attn_weights = F.softmax(torch.bmm(lstm_out_1, lstm_out_2),dim=-1)
            attn_output = torch.bmm(attn_weights, lstm_out_1)
            concat_output = torch.cat((attn_output, lstm_out_1), dim = -1)
            lstm_out = concat_output.view(len(sentence), self.hidden_dim * 2)

        elif self.attention_method == "Scale Dot Product":
            attn_weights = F.softmax(1/np.sqrt(self.hidden_dim)*torch.bmm(lstm_out_1, lstm_out_2),dim=-1)
            attn_output = torch.bmm(attn_weights, lstm_out_1)
            concat_output = torch.cat((attn_output, lstm_out_1), dim = -1)
            lstm_out = concat_output.view(len(sentence), self.hidden_dim * 2)
        else:
            lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
            
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags, other_feature = None):
        feats = self._get_lstm_features(sentence, other_feature)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

# 6. Data Preparation For Training

In [151]:
## make word into ix for training and testing - Lab 9
def to_index(data, to_ix):
    input_index_list = []
    for sent in data:
        input_index_list.append([to_ix[w] for w in sent])
    return input_index_list

all_sentence_index = to_index(all_sentence, word_2_ix)

In [152]:
## make labels into ix
unique_NER = []
for line in train_NER:
  unique_NER.extend(line)
unique_tag= list(set(unique_NER))

START_TAG = "<START>"
STOP_TAG = "<STOP>"

tag_2_ix = {tag: ix + 2 for ix, tag in enumerate(unique_tag)}
tag_2_ix["<START>"] = 0
tag_2_ix["<STOP>"] = 1
ix_2_tag = {ix: tag for tag,ix in tag_2_ix.items()}

# transform tags into idx
train_NER_index = to_index(train_NER, tag_2_ix)

In [153]:
# make other features into ix
all_sentence_pos = to_index(pos, other_feature_2_ix)
all_sentence_pt = to_index(pt, other_feature_2_ix)

In [None]:
# Array for selected model scores
scores = np.zeros(13)

# 7. Different Model Testing

## 7.1 Baseline model

In [138]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 1

model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, attetion = "No Attention", other_features = [], layers = NUM_LAYER)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

model.to(device)

cuda


In [139]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


Epoch:1, Training loss: 9812.25, time: 164.57s
Epoch:2, Training loss: 5165.15, time: 164.08s
Epoch:3, Training loss: 3882.39, time: 164.82s
Epoch:4, Training loss: 3090.34, time: 163.36s
Epoch:5, Training loss: 2531.92, time: 162.72s
Epoch:6, Training loss: 2098.31, time: 167.82s
Epoch:7, Training loss: 1705.86, time: 162.44s
Epoch:8, Training loss: 1372.70, time: 165.50s
Epoch:9, Training loss: 1132.07, time: 168.76s
Epoch:10, Training loss: 958.66, time: 166.57s
Epoch:11, Training loss: 761.55, time: 165.81s
Epoch:12, Training loss: 653.88, time: 167.91s
Epoch:13, Training loss: 522.50, time: 164.12s
Epoch:14, Training loss: 454.56, time: 162.45s
Epoch:15, Training loss: 376.39, time: 166.19s
Epoch:16, Training loss: 347.63, time: 167.54s
Epoch:17, Training loss: 295.13, time: 169.70s
Epoch:18, Training loss: 284.19, time: 159.39s
Epoch:19, Training loss: 254.77, time: 133.62s
Epoch:20, Training loss: 203.32, time: 156.37s


In [140]:
## save model
torch.save(model, "baseline_model.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('baseline_model_result.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 score for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[0] = f1_score(y_true,predicted_val,average='micro')
print(scores[0])

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

  "type " + obj.__name__ + ". It won't be checked "


0.9658549497088407
              precision    recall  f1-score   support

           2     0.8247    0.6791    0.7449       187
           3     0.9772    0.9303    0.9532       875
           4     0.9749    0.9934    0.9841      5790
           5     0.8780    0.7579    0.8136       285
           6     0.9196    0.9284    0.9240       419

    accuracy                         0.9659      7556
   macro avg     0.9149    0.8578    0.8839      7556
weighted avg     0.9647    0.9659    0.9648      7556



## 7.2 two layers + dot product attention + word embedding + pt + pos features

In [141]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 2
ATTENTION = ["Dot Product", "Scale Dot Product"]

# remove string from list to remove features
ADDITION_FEATURE = ["pt","pos"]

model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, attetion = ATTENTION[0], other_features = ["pt","pos"], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [142]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


Epoch:1, Training loss: 12324.24, time: 148.56s
Epoch:2, Training loss: 5796.74, time: 137.74s
Epoch:3, Training loss: 4271.71, time: 137.88s


KeyboardInterrupt: 

In [None]:
## save model
torch.save(model, "pt_pos_word_2_layer_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('pt_pos_word_2_layer_dot_prodcut_result', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[1] = f1_score(y_true,predicted_val,average='micro')

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.3 one layer + dot product + only word embedding 

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 1
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[0], other_features = [], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "word_1_layer_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('word_1_layer_dot_prodcut_result.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)  
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[2] = f1_score(y_true,predicted_val,average='micro')
    
from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.4 one layer + scale dot product + only word embedding 

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 1
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[1], other_features = [], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "word_1_layer_scale_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('word_1_layer_scale_dot_prodcut_result.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[3] = f1_score(y_true,predicted_val,average='micro')

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.5 two layer + scale dot product + only word embedding 

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 2
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[1], other_features = [], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "word_2_layer_scale_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('word_2_layer_scale_dot_prodcut_result.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[4] = f1_score(y_true,predicted_val,average='micro')

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.6 two layers + scale dot product + word embedding + pos feature

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 2
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[1], other_features = ["pos"], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "pos_word_2_layer_scale_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('pos_word_2_layer_scale_dot_prodcut_result.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[5] = f1_score(y_true,predicted_val,average='micro')

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.7 two layers + scale dot product + word embedding + pt + pos features

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 2
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[1], other_features = ["pos","pt"], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "pt_pos_word_2_layer_scale_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('pt_pos_word_2_layer_scale_dot_prodcut_result.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[6] = f1_score(y_true,predicted_val,average='micro')
    
from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.8 two layers + dot production attention + word embedding + pos feature

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 2
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[0], other_features = ["pos"], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "pos_word_2_layer_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('pos_word_2_layer_dot_prodcut.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[7] = f1_score(y_true,predicted_val,average='micro')

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.9 two layers + dot production attention + only word embedding

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 2
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[0], other_features = [], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "word_2_layer_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('word_2_layer_dot_prodcut.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[8] = f1_score(y_true,predicted_val,average='micro')

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.10 one layer + dot production attention + word embedding + pt + pos features

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 1
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[0], other_features = ["pos","pt"], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "pt_pos_word_1_layer_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('pt_pos_word_1_layer_dot_prodcut.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[9] = f1_score(y_true,predicted_val,average='micro')

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.11 one layer + dot production attention + word embedding + pos feature

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 1
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[0], other_features = ["pos"], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "pos_word_1_layer_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('pos_word_1_layer_dot_prodcut.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[10] = f1_score(y_true,predicted_val,average='micro')

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.12 one layer + scale dot production attention + word embedding + pt + pos features

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 1
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[1], other_features = ["pos","pt"], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "pt_pos_word_1_layer_scale_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('pt_pos_word_1_layer_scale_dot_prodcut.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[11] = f1_score(y_true,predicted_val,average='micro')

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

## 7.13 one layer + scale dot production attention + word embedding + pos feature

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20
NUM_LAYER = 1
ATTENTION = ["Dot Product", "Scale Dot Product"]


model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM, 
                   attetion = ATTENTION[1], other_features = ["pos"], layers = NUM_LAYER).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(all_sentence_index[:3000]):
        
        tags_index = train_NER_index[i]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step new. Add other features as input
        other_feature = {}
        other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
        other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    time2 = datetime.datetime.now()

    # print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    print("Epoch:%d, Training loss: %.2f, time: %.2fs" %(epoch+1, train_loss, (time2-time1).total_seconds()))


In [None]:
## save model
torch.save(model, "pos_word_1_layer_scale_dot_prodcut.pt")

## do the prediction - test - all_features
test_index = all_sentence_index[3700:]

model.eval()
ground_truth = []
predicted = []
for i in range(len(test_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3700], dtype=torch.long).to(device)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3700], dtype=torch.long).to(device)
  input = torch.tensor(test_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted.extend(tag_seq)
  #ground_truth.extend(output_index[i])

# output the format for submit
import csv
with open('pos_word_1_layer_scale_dot_prodcut.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Predicted'])
    for i, y in enumerate(predicted):
        writer.writerow([i, ix_2_tag[y]])

## print f1 socre for val
y_true = []
count = 0
for tags in train_NER_index[3000:]:
    y_true.extend(tags)


val_index = all_sentence_index[3000:3700]
model.eval()
predicted_val = []
for i in range(len(val_index)):
  other_feature = {}
  other_feature["pos"] = torch.tensor(all_sentence_pos[i+3000], dtype=torch.long).to(device)
  # print(other_feature["pos"].shape)
  other_feature["pt"] = torch.tensor(all_sentence_pt[i+3000], dtype=torch.long).to(device)
  input = torch.tensor(val_index[i],dtype=torch.long).to(device)
  features = model._get_lstm_features(input, other_feature)
  score, tag_seq = model._viterbi_decode(features)
  predicted_val.extend(tag_seq)

# Record all the model F1 scores
scores[12] = f1_score(y_true,predicted_val,average='micro')

from sklearn.metrics import classification_report
print(classification_report(y_true, predicted_val, digits=4))

# 8. Evaluation

## 8.1 Summarise model scores from selected models in Section 7

In [None]:
# Best performing model from select set
baseline_score = np.round(scores[0],4)
model_scores = np.round(scores[1:],4)
best_params = np.where(model_scores==np.amax(model_scores))

# Print table summarising scores compared to other models
from prettytable import PrettyTable
t = PrettyTable(['Models', 'F1'])
t.add_row(['Baseline', baseline_score])
t.add_row(['Best other model', float(model_scores[best_params])])
print(t)

t = PrettyTable()
t.field_names = ['Selective Search - Models', 'F1']
t.align["Selective Search - Models"] = "l"
t.align["F1"] = "l"

t.add_row(['7.1 Baseline', baseline_score])
t.add_row(['7.2 two layers + dot product attention + word embedding + pt + pos features', float(model_scores[0])])
t.add_row(['7.3 one layer + dot product + only word embedding', float(model_scores[1])])
t.add_row(['7.4 one layer + scale dot product + only word embedding ', float(model_scores[2])])
t.add_row(['7.5 two layers + scale dot product + only word embedding ', float(model_scores[3])])
t.add_row(['7.6 two layers + scale dot product + word embedding + pos feature', float(model_scores[4])])
t.add_row(['7.7 two layers + scale dot product + word embedding + pt + pos features', float(model_scores[5])])
t.add_row(['7.8 two layers + dot production attention + word embedding + pos feature', float(model_scores[6])])
t.add_row(['7.9 two layers + dot production attention + only word embedding', float(model_scores[7])])
t.add_row(['7.10 one layer + dot production attention + word embedding + pt + pos features', float(model_scores[8])])
t.add_row(['7.11 one layer + dot production attention + word embedding + pos feature', float(model_scores[9])])
t.add_row(['7.12 one layer + scale dot production attention + word embedding + pt + pos features', float(model_scores[10])])
t.add_row(['7.13 one layer + scale dot production attention + word embedding + pos feature', float(model_scores[11])])
print(t)

## 8.2 Full ablation study

In [155]:
from sklearn.metrics import f1_score
import datetime
import csv

epochs = 20
HIDDEN_DIM = 50
OTHER_FEATURE_DIM = 20

# Test different embeddings
feature_flags_list = [[],["pos"],["pt"],["pos","pt"]] # 0. word embed flag 1. pos_tag flag 2. parse tree flag 3. all
# Test different number of layers
lstm_layers_list = [1, 2]
# Test different attention strategies
attention_flags = ["No Attention", "Dot Product", "Scale Dot Product"]

# Array for model scores
abl_scores = np.zeros((len(attention_flags),len(feature_flags_list),len(lstm_layers_list)))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for ai, attention in enumerate(attention_flags):

    for fi, feature_flags in enumerate(feature_flags_list):

        for li, lstm_layers in enumerate(lstm_layers_list):

            model = BiLSTM_CRF(len(word_2_ix), tag_2_ix, EMBEDDING_DIM, OTHER_FEATURE_DIM, HIDDEN_DIM,
                               attetion=attention, other_features=feature_flags, layers=lstm_layers).to(device)
            optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

            for epoch in range(epochs):
                time1 = datetime.datetime.now()
                train_loss = 0

                model.train()
                for i, idxs in enumerate(all_sentence_index[:3000]):
                    tags_index = train_NER_index[i]
                    # Step 1. Remember that Pytorch accumulates gradients.
                    # We need to clear them out before each instance
                    model.zero_grad()

                    # Step 2. Get our inputs ready for the network, that is,
                    # turn them into Tensors of word indices.
                    sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
                    targets = torch.tensor(tags_index, dtype=torch.long).to(device)

                    # Step new. Add other features as input
                    other_feature = {}
                    other_feature["pos"] = torch.tensor(all_sentence_pos[i], dtype=torch.long).to(device)
                    other_feature["pt"] = torch.tensor(all_sentence_pt[i], dtype=torch.long).to(device)

                    # Step 3. Run our forward pass.
                    loss = model.neg_log_likelihood(sentence_in, targets, other_feature)

                    # Step 4. Compute the loss, gradients, and update the parameters by
                    # calling optimizer.step()
                    loss.backward()
                    optimizer.step()

                    train_loss += loss.item()

                time2 = datetime.datetime.now()

                #print("Epoch:%d, Training loss: %.2f, time: %.2fs" % (epoch + 1, train_loss, (time2 - time1).total_seconds()))
                
            ## save model
            torch.save(model, 'model_' + str(ai)+str(fi)+str(li) +'.pt')

            ## do the prediction - test - all_features
            test_index = all_sentence_index[3700:]

            model.eval()
            ground_truth = []
            predicted = []
            for i in range(len(test_index)):
                other_feature = {}
                other_feature["pos"] = torch.tensor(all_sentence_pos[i + 3700], dtype=torch.long).to(device)
                other_feature["pt"] = torch.tensor(all_sentence_pt[i + 3700], dtype=torch.long).to(device)
                input = torch.tensor(test_index[i], dtype=torch.long).to(device)
                features = model._get_lstm_features(input, other_feature)
                score, tag_seq = model._viterbi_decode(features)
                predicted.extend(tag_seq)
                # ground_truth.extend(output_index[i])

            with open('model_' + str(ai)+str(fi)+str(li) +'.csv', 'w+', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(['Id', 'Predicted'])
                for i, y in enumerate(predicted):
                    writer.writerow([i, ix_2_tag[y]])

            ## print f1 socre for val
            y_true = []
            count = 0
            for tags in train_NER_index[3000:]:
                y_true.extend(tags)

            val_index = all_sentence_index[3000:3700]
            model.eval()
            predicted_val = []
            for i in range(len(val_index)):
                other_feature = {}
                other_feature["pos"] = torch.tensor(all_sentence_pos[i + 3000], dtype=torch.long).to(device)
                # print(other_feature["pos"].shape)
                other_feature["pt"] = torch.tensor(all_sentence_pt[i + 3000], dtype=torch.long).to(device)
                input = torch.tensor(val_index[i], dtype=torch.long).to(device)
                features = model._get_lstm_features(input, other_feature)
                score, tag_seq = model._viterbi_decode(features)
                predicted_val.extend(tag_seq)

            abl_scores[ai,fi,li] = f1_score(y_true,predicted_val,average='micro')
            print('Model ' + str(ai)+str(fi)+str(li) + ': ' + str(abl_scores[ai,fi,li]))



  "type " + obj.__name__ + ". It won't be checked "


Model 000: 0.9675754367390154
Model 001: 0.9626786659608259
Model 010: 0.9647961884595024
Model 011: 0.9661196400211752
Model 100: 0.9643991529910005
Model 101: 0.964134462678666
Model 110: 0.963208046585495
Model 111: 0.9596347273689783
Model 200: 0.9651932239280042
Model 201: 0.965060878771837
Model 210: 0.9642668078348332
Model 211: 0.9628110111169931


In [None]:
# Best performing model
best_params = np.where(abl_scores==np.amax(abl_scores))

# Print table summarising scores compared to other models
from prettytable import PrettyTable
t = PrettyTable(['Models', 'F1'])
t.add_row(['Baseline', abl_scores[0,0,0]])
t.add_row(['Best other model', float(abl_scores[best_params])])
print(t)

# Print table summarising ablation studies and scores
# Ablation Study
t = PrettyTable(['Model', 'Attention', 'Layers', 'Embedding', 'F1'])
t.add_row(['Baseline model', 'No Attention', '1 layer BiLSTM','Word Embedding', abl_scores[0,0,0]])
t.add_row(['Model 1','','', '+ PoS embedding', abl_scores[0,1,0]])
t.add_row(['Model 2','','', '+ Parse Tree embedding', abl_scores[0,2,0]])
t.add_row(['Model 3','','', '+ ALL', abl_scores[0,3,0]])
t.add_row(['Model 4','','2 layer BiLSTM','Word Embedding', abl_scores[0,0,1]])
t.add_row(['Model 5','','', '+ PoS embedding', abl_scores[0,1,1]])
t.add_row(['Model 6','','', '+ Parse Tree embedding', abl_scores[0,2,1]])
t.add_row(['Model 7','','', '+ ALL', abl_scores[0,3,1]])

t.add_row(['Model 9','Dot Product', '1 layer BiLSTM','Word Embedding', abl_scores[1,0,0]])
t.add_row(['Model 10','','', '+ PoS embedding', abl_scores[1,1,0]])
t.add_row(['Model 11','','', '+ Parse Tree embedding', abl_scores[1,2,0]])
t.add_row(['Model 12','','', '+ ALL', abl_scores[1,3,0]])
t.add_row(['Model 13','','2 layer BiLSTM','Word Embedding', abl_scores[1,0,1]])
t.add_row(['Model 14','','', '+ PoS embedding', abl_scores[1,1,1]])
t.add_row(['Model 15','','', '+ Parse Tree embedding', abl_scores[1,2,1]])
t.add_row(['Model 16','','', '+ ALL', abl_scores[1,3,1]])

t.add_row(['Model 17', 'Scale Dot Product', '1 layer BiLSTM','Word Embedding', abl_scores[2,0,0]])
t.add_row(['Model 18','','', '+ PoS embedding', abl_scores[2,1,0]])
t.add_row(['Model 19','','', '+ Parse Tree embedding', abl_scores[2,2,0]])
t.add_row(['Model 20','','', '+ ALL', abl_scores[2,3,0]])
t.add_row(['Model 21','','2 layer BiLSTM','Word Embedding', abl_scores[2,0,1]])
t.add_row(['Model 22','','', '+ PoS embedding', abl_scores[2,1,1]])
t.add_row(['Model 23','','', '+ Parse Tree embedding', abl_scores[2,2,1]])
t.add_row(['Model 24','','', '+ ALL', abl_scores[2,3,1]])
print(t)
