## Download dataset

In [1]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate
drive = None
def authenticate():
    global drive
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

#Download files
def downloadFiles(fileIds):
    authenticate()
    for fileId in fileIds:    
        downloaded = drive.CreateFile({"id": fileId[1]})
        downloaded.GetContentFile(fileId[0])

In [2]:
# Download file if not existing
try:
  _ = open("test_without_labels.csv", "r")
except:
  downloadFiles([["test_without_labels.csv", "1ScX29-3Z8HuNxtiAkeyT9AZgi6YMI1TJ"]])

try:
  _ = open("train.csv", "r")
except:
  downloadFiles([["train.csv", "1WuuPnijTPXyPSJ5ZJp0Kp2IhA2e6EeJy"]])

try:
  _ = open("val.csv", "r")
except:
  downloadFiles([["val.csv", "1IWcqwdFagOazxJvBR-KJhN4HFllx5Cf5"]])

In [3]:
## Read dataset
import pandas as pd
training_dataset = pd.read_csv("train.csv")
testing_dataset = pd.read_csv("test_without_labels.csv")
validating_dataset = pd.read_csv("val.csv")

In [4]:
# Convert to list
training_labels_list = list(training_dataset.labels)
validating_labels_list = list(validating_dataset.labels)
training_data_list = list(training_dataset.sents)
validating_data_list = list(validating_dataset.sents)
testing_data_list = list(testing_dataset.sents)

In [5]:
# The function to remove punctuation
def remove_punctuation(sent_data):
  final_result = []
  for idx, sentence in enumerate(sent_data):
    temp = []
    sentence = sentence.split(' ')
    for word in sentence:
      if(('.' in word or '\'' in word or '?' in word) and len(word) > 1):
        word = word.replace('?', '')
        word = word.replace('.', '')
        word = word.replace('\'','')
      temp.append(word)
    final_result.append(temp)
  return final_result

In [6]:
# Remove punctuation for training data
training_data = remove_punctuation(training_data_list)
# Remove punctuation for validating data
validating_data = remove_punctuation(validating_data_list)
# Remove punctuation for testing data
testing_data = remove_punctuation(testing_data_list)

In [7]:
## Convert labels to list
training_labels = []
for label in training_labels_list:
    training_labels.append(label.split(" "))

validating_labels = []
for label in validating_labels_list:
    validating_labels.append(label.split(" "))

print(training_labels[0])
print(validating_labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Organisation', 'I-Organisation', 'O', 'B-Temporal', 'I-Temporal', 'I-Temporal', 'O', 'O', 'O', 'O', 'O', 'B-Organisation', 'I-Organisation', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Nationality', 'O', 'O']
['O', 'B-Quantity', 'I-Quantity', 'I-Quantity', 'I-Quantity', 'I-Quantity', 'B-Person', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Person', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [8]:
print(training_data[0])
print(validating_data[0])
print(testing_data[0])

['Operation', 'Steel', 'Curtain', '(', 'Arabic', ':', 'ا', 'ل', 'ح', 'ج', 'ا', 'ب', 'ا', 'ل', 'ف', 'و', 'ل', 'ا', 'ذ', 'ي', 'Al', 'Hejab', 'Elfulathi', ')', 'was', 'a', 'military', 'operation', 'executed', 'by', 'coalition', 'forces', 'in', 'early', 'November', '2005', 'to', 'reduce', 'the', 'flow', 'of', 'foreign', 'insurgents', 'crossing', 'the', 'border', 'and', 'joining', 'the', 'Iraqi', 'insurgency', '.']
['For', '14', '-', 'year', '-', 'old', 'Amjad', 'it', 'is', 'safer', 'there', 'than', 'being', 'above', 'ground', ',', 'and', 'over', 'time', 'his', 'enthusiasm', 'for', 'the', 'place', 'has', 'earned', 'him', 'the', 'role', 'of', '"', '"', 'deputy', 'librarian', '"', '"', '.', '"']
['Carter', 'thanked', 'Abadi', 'for', 'nearly', 'two', 'years', 'of', 'a', 'close', 'personal', 'partnership', ',', 'and', 'noted', 'the', 'continued', 'supporting', 'role', 'the', 'United', 'States', 'and', 'the', 'counter', '-', 'ISIL', 'coalition', 'can', 'play', 'in', 'Iraq', "'", 's', 'efforts', 

In [9]:
## word to index
import numpy as np
word_to_ix = {}
for sentence in training_data+validating_data+testing_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
word_list = list(word_to_ix.keys())

# Add Start and Stop tags
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tags in training_labels+validating_labels:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

In [10]:
print(word_to_ix)
print(tag_to_ix)
print(np.shape(word_list))

{'<START>': 0, '<STOP>': 1, 'O': 2, 'B-Organisation': 3, 'I-Organisation': 4, 'B-Temporal': 5, 'I-Temporal': 6, 'B-Nationality': 7, 'B-Location': 8, 'I-Location': 9, 'B-Person': 10, 'I-Person': 11, 'B-DocumentReference': 12, 'I-DocumentReference': 13, 'B-Money': 14, 'I-Money': 15, 'B-Quantity': 16, 'B-MilitaryPlatform': 17, 'I-MilitaryPlatform': 18, 'B-Weapon': 19, 'I-Weapon': 20, 'I-Quantity': 21, 'I-Nationality': 22}
(4280,)


# Input embedding

### POS tag

In [11]:
## POS tagging
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
def get_pos_tags(all_tokens):
  pos_tags = []
  for sentence in all_tokens:
    temp = []
    # Use the nltk to generate pos tags.
    for word, tag in nltk.pos_tag(sentence):
      temp.append(tag)
    pos_tags.append(temp)

  return pos_tags

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [12]:
# Get all tags for all data
pos_tags = get_pos_tags(training_data + validating_data + testing_data)
all_data = training_data + validating_data + testing_data

In [13]:
print(training_data[0])
print(pos_tags[0])

['Operation', 'Steel', 'Curtain', '(', 'Arabic', ':', 'ا', 'ل', 'ح', 'ج', 'ا', 'ب', 'ا', 'ل', 'ف', 'و', 'ل', 'ا', 'ذ', 'ي', 'Al', 'Hejab', 'Elfulathi', ')', 'was', 'a', 'military', 'operation', 'executed', 'by', 'coalition', 'forces', 'in', 'early', 'November', '2005', 'to', 'reduce', 'the', 'flow', 'of', 'foreign', 'insurgents', 'crossing', 'the', 'border', 'and', 'joining', 'the', 'Iraqi', 'insurgency', '.']
['NN', 'NNP', 'NNP', '(', 'NNP', ':', 'NN', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', ')', 'VBD', 'DT', 'JJ', 'NN', 'VBN', 'IN', 'NN', 'NNS', 'IN', 'JJ', 'NNP', 'CD', 'TO', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'VBG', 'DT', 'NN', 'CC', 'VBG', 'DT', 'NNP', 'NN', '.']


In [14]:
pos_tag_id_map = {}
# Generate id for unique pos tag
pos_tag_id = 1

for tag_list in pos_tags:
  for pos_tag in tag_list:
    if pos_tag not in pos_tag_id_map:
      # if not in the map, add the tag and set the id
      pos_tag_id_map[pos_tag] = pos_tag_id
      # id + 1
      pos_tag_id = pos_tag_id + 1

# Do one hot encoding for pos tag
one_hot_coding = np.zeros(pos_tag_id - 1)

for key in pos_tag_id_map.keys():
  one_hot_coding = np.zeros(pos_tag_id - 1)
  # Setting the position of the id as 1
  one_hot_coding[pos_tag_id_map[key] - 1] = 1
  # Save the one hot coding into the map
  pos_tag_id_map[key] = one_hot_coding

# Generate pos tag map
pos_map = {}
for index in range(0,len(all_data)):
    for index2 in range(0,len(all_data[index])):
        pos_map[all_data[index][index2]] = pos_tag_id_map[pos_tags[index][index2]]

In [15]:
print(pos_map['repeater'])

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


### Dependency

In [16]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Use the spacy to parse the sentence for dependency tree
# Ref lab 7
def get_dependency(data):
  parsed_sentences = []
  for sentence in data:
    joined_sentence = ' '.join(sentence)
    parse = nlp(joined_sentence)
    deps = []
    for item in parse:
      dep = item.dep_
      deps.append(dep)

    parsed_sentences.append(deps[:len(sentence)])
  return parsed_sentences

parsed_sentences = get_dependency(all_data)

In [17]:
print(parsed_sentences)

[['compound', 'compound', 'ROOT', 'punct', 'appos', 'punct', 'punct', 'nsubj', 'punct', 'intj', 'punct', 'nmod', 'intj', 'compound', 'compound', 'compound', 'nmod', 'intj', 'compound', 'compound', 'compound', 'compound', 'appos', 'punct', 'ROOT', 'det', 'amod', 'attr', 'acl', 'agent', 'compound', 'pobj', 'prep', 'amod', 'pobj', 'nummod', 'aux', 'advcl', 'det', 'dobj', 'prep', 'amod', 'pobj', 'acl', 'det', 'dobj', 'cc', 'conj', 'det', 'amod', 'dobj', 'punct'], ['det', 'nsubj', 'ROOT', 'dobj', 'prep', 'pobj', 'cc', 'compound', 'conj', 'punct', 'det', 'nmod', 'nmod', 'cc', 'compound', 'appos', 'punct'], ['det', 'nsubj', 'ROOT', 'acomp', 'prep', 'mark', 'nsubj', 'ccomp', 'det', 'amod', 'amod', 'compound', 'attr', 'prep', 'det', 'compound', 'compound', 'pobj', 'punct'], ['nsubj', 'ROOT', 'poss', 'amod', 'attr', 'prep', 'pobj', 'cc', 'det', 'amod', 'conj', 'aux', 'acl', 'cc', 'conj', 'amod', 'dobj', 'prep', 'det', 'pobj', 'cc', 'conj', 'prep', 'det', 'pobj', 'prep', 'pobj', 'amod', 'prep', '

In [18]:
# Generate one hot encoding for dependency tag

# id map
dependency_id_map = {}

# Set unique id for each tag
dependency_id = 1
for tag_list in parsed_sentences:
  for dependency_tag in tag_list:
    if dependency_tag not in dependency_id_map:
      # if not in the map, add the tag and id
      dependency_id_map[dependency_tag] = dependency_id
      # Id + 1
      dependency_id = dependency_id + 1

# one hot encoding
one_hot_coding = np.zeros(dependency_id - 1)

# Set the id position as 1
for key in dependency_id_map.keys():
  one_hot_coding = np.zeros(dependency_id - 1)
  one_hot_coding[dependency_id_map[key] - 1] = 1
  dependency_id_map[key] = one_hot_coding

# Generate the dependency map
dependency_map = {}
for index in range(0,len(all_data)):
    for index2 in range(0,len(all_data[index])):
        dependency_map[all_data[index][index2]] = dependency_id_map[parsed_sentences[index][index2]]

### Word Embedding

In [19]:
import gensim.downloader as api
import numpy as np
word_emb_model = api.load("glove-twitter-100") 



In [20]:
# We found that the one hot encoding of pos tagging and dependency tagging have negative impacts for the prediction, so comment out.
#EMBEDDING_DIM = 100 + (pos_tag_id - 1) + (dependency_id - 1)
EMBEDDING_DIM = 100
embedding_matrix = []
for word in word_list:
    try:
      temp = word_emb_model.wv[word]
      # temp.extend(pos_map[word])
      # temp.extend(dependency_map[word])
      embedding_matrix.append(temp)
    except:
      embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape

  


(4280, 100)

In [21]:
# Install flair to prepare the Bert Embedding
!pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/f0/3a/1b46a0220d6176b22bcb9336619d1731301bc2c75fa926a9ef953e6e4d58/flair-0.8.0.post1-py3-none-any.whl (284kB)
[K     |█▏                              | 10kB 12.5MB/s eta 0:00:01[K     |██▎                             | 20kB 17.4MB/s eta 0:00:01[K     |███▌                            | 30kB 11.7MB/s eta 0:00:01[K     |████▋                           | 40kB 8.9MB/s eta 0:00:01[K     |█████▊                          | 51kB 4.9MB/s eta 0:00:01[K     |███████                         | 61kB 5.1MB/s eta 0:00:01[K     |████████                        | 71kB 5.8MB/s eta 0:00:01[K     |█████████▏                      | 81kB 6.5MB/s eta 0:00:01[K     |██████████▍                     | 92kB 6.6MB/s eta 0:00:01[K     |███████████▌                    | 102kB 5.1MB/s eta 0:00:01[K     |████████████▋                   | 112kB 5.1MB/s eta 0:00:01[K     |█████████████▉                  | 122kB 5.1MB/s et

In [22]:
# Use the bert-large-cased
from flair.embeddings import TransformerWordEmbeddings
bert_embedding = TransformerWordEmbeddings('bert-large-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=762.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1338740706.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Generate Bert embedding for words in the list
from flair.data import Sentence
bert_matrix = []

for word in word_list:
  bert_embedding.embed(Sentence(word))
  for token in Sentence(word):
    bert_matrix.append(token.embedding.detach().cpu().numpy())

bert_matrix = np.array(bert_matrix)


In [25]:
print(np.shape(embedding_matrix))
print(np.shape(bert_matrix))
print(np.shape(word_list))

(4280, 100)
(4280, 1024)
(4280,)


In [26]:
# concatenate the bert embedding with the embedding matrix
# if use the pos tag and dependency, the embedding_matrix = word_embedding + pos_tag + dependency
# if not, the embedding_matrix = word_embedding
embedding_matrix = np.concatenate((bert_matrix, embedding_matrix), axis = 1)
EMBEDDING_DIM = embedding_matrix.shape[1]

In [27]:
EMBEDDING_DIM = embedding_matrix.shape[1]

In [28]:
def to_index(data, to_ix):
    input_index_list = []
    for sent in data:
        input_index_list.append([to_ix[w] for w in sent])
    return input_index_list

# Convert training data to idx
train_input_index =  to_index(training_data,word_to_ix)
# Convert training labels to idx
train_output_index = to_index(training_labels,tag_to_ix)
# Convert validating data to idx
val_input_index = to_index(validating_data,word_to_ix)
# Convert validating labels to idx
val_output_index = to_index(validating_labels,tag_to_ix)
# Convert testing labels to idx
test_input_index = to_index(testing_data,word_to_ix)

In [29]:
print(train_input_index[15])
print(train_output_index[15])
print(val_input_index[15])
print(val_output_index[15])
print(test_input_index[15])

[45, 183, 160, 33, 184, 185, 186, 33, 187, 188, 27, 33, 189, 22, 53, 190, 33, 191, 31, 192, 33, 193, 194, 77, 33, 42, 134, 40, 33, 195, 196, 134, 44]
[2, 2, 2, 3, 4, 4, 2, 2, 2, 2, 2, 2, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 2, 3, 4, 4, 4, 2]
[340, 154, 155, 3227, 1811, 3228, 20, 2545, 1345, 1564, 44]
[3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[152, 119, 391, 177, 2191, 340, 31, 3786, 3765, 59, 3787, 329, 215, 3788, 26, 44, 159]


## Model

### Create Model

In [30]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
 
class BiLSTM_CRF(nn.Module):

  def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, layers_number, is_use_crf, attention_type = None):
      super(BiLSTM_CRF, self).__init__()
      self.embedding_dim = embedding_dim
      self.hidden_dim = hidden_dim
      self.vocab_size = vocab_size
      self.layers_number = layers_number
      # use crf or not
      self.is_use_crf = is_use_crf
      # attention type(scale, dot, general)
      self.attention_type = attention_type
      self.tag_to_ix = tag_to_ix
      self.tagset_size = len(tag_to_ix)

      self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
      self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
      # Get stacked lstm by setting the layer number
      self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=self.layers_number, bidirectional=True)

      # Dropout layer before the last layout
      self.dropout = nn.Dropout(0.2)
      
      # The attention_weight which will be used for general attention
      self.attention_weight = nn.parameter.Parameter(torch.Tensor(1, self.hidden_dim, self.hidden_dim), requires_grad = True)

      # Maps the output of the LSTM into tag space.

      if not attention_type:
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
      else:
        self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)

      # Matrix of transition parameters.  Entry i,j is the score of
      # transitioning *to* i *from* j.
      self.transitions = nn.Parameter(
          torch.randn(self.tagset_size, self.tagset_size))

      # These two statements enforce the constraint that we never transfer
      # to the start tag and we never transfer from the stop tag
      self.transitions.data[tag_to_ix[START_TAG], :] = -10000
      self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

      self.hidden = self.init_hidden()

  def init_hidden(self):
      return (torch.randn(2 * self.layers_number, 1, self.hidden_dim // 2).to(device),
              torch.randn(2 * self.layers_number, 1, self.hidden_dim // 2).to(device))

  def _forward_alg(self, feats):
      # Do the forward algorithm to compute the partition function
      init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
      # START_TAG has all of the score.
      init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

      # Wrap in a variable so that we will get automatic backprop
      forward_var = init_alphas

      # Iterate through the sentence
      for feat in feats:
          alphas_t = []  # The forward tensors at this timestep
          for next_tag in range(self.tagset_size):
              # broadcast the emission score: it is the same regardless of
              # the previous tag
              emit_score = feat[next_tag].view(
                  1, -1).expand(1, self.tagset_size)
              # the ith entry of trans_score is the score of transitioning to
              # next_tag from i
              trans_score = self.transitions[next_tag].view(1, -1)
              # The ith entry of next_tag_var is the value for the
              # edge (i -> next_tag) before we do log-sum-exp
              next_tag_var = forward_var + trans_score + emit_score
              # The forward variable for this tag is log-sum-exp of all the
              # scores.
              alphas_t.append(log_sum_exp(next_tag_var).view(1))
          forward_var = torch.cat(alphas_t).view(1, -1)
      terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
      alpha = log_sum_exp(terminal_var)
      return alpha

  def _get_lstm_features(self, sentence):
      self.hidden = self.init_hidden()
      embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
      lstm_out, self.hidden = self.lstm(embeds, self.hidden)
      if not self.attention_type:
        # If don't use the attention, get the result directly
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
      else:
        # If use the attention, get the result after applying attention
        lstm_out = self._get_attention_output(lstm_out, sentence)
      
      # Dropout 0.2
      lstm_out = self.dropout(lstm_out)
      lstm_feats = self.hidden2tag(lstm_out)
      return lstm_feats
  
  def _get_attention_output(self, lstm_out, sentence):
      lstm_out = torch.squeeze(lstm_out, 1)
      # get the s from the lstm out
      s = lstm_out.view(1, lstm_out.size(0), lstm_out.size(1))
      # get the h by reshape s
      h = s.view(s.size(0), s.size(2), s.size(1))

      # Dot-product attention
      if self.attention_type.lower() == "dot":
        weight_att = nn.functional.softmax(torch.bmm(s, h),dim=-1)
      elif self.attention_type.lower() == "scale":
        # Scaled dot-product attention
        weight_att = nn.functional.softmax(torch.bmm(s, h) / np.sqrt(self.hidden_dim),dim=-1)
      else:
        # general attention
        temp = torch.bmm(torch.bmm(s, self.attention_weight), h)
        weight_att = nn.functional.softmax(temp, dim=-1)

      output = torch.bmm(weight_att, s)
      concat_output = torch.cat((output, s), dim = -1)
      lstm_out = concat_output.view(len(sentence), self.hidden_dim * 2)
      return lstm_out

  def _score_sentence(self, feats, tags):
      # Gives the score of a provided tag sequence
      score = torch.zeros(1).to(device)
      tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
      for i, feat in enumerate(feats):
          score = score + \
              self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
      score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
      return score

  def _viterbi_decode(self, feats):
      backpointers = []

      # Initialize the viterbi variables in log space
      init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
      init_vvars[0][self.tag_to_ix[START_TAG]] = 0

      # forward_var at step i holds the viterbi variables for step i-1
      forward_var = init_vvars
      for feat in feats:
          bptrs_t = []  # holds the backpointers for this step
          viterbivars_t = []  # holds the viterbi variables for this step

          for next_tag in range(self.tagset_size):
              # next_tag_var[i] holds the viterbi variable for tag i at the
              # previous step, plus the score of transitioning
              # from tag i to next_tag.
              # We don't include the emission scores here because the max
              # does not depend on them (we add them in below)
              next_tag_var = forward_var + self.transitions[next_tag]
              best_tag_id = argmax(next_tag_var)
              bptrs_t.append(best_tag_id)
              viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
          # Now add in the emission scores, and assign forward_var to the set
          # of viterbi variables we just computed
          forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
          backpointers.append(bptrs_t)

      # Transition to STOP_TAG
      terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
      best_tag_id = argmax(terminal_var)
      path_score = terminal_var[0][best_tag_id]

      # Follow the back pointers to decode the best path.
      best_path = [best_tag_id]
      for bptrs_t in reversed(backpointers):
          best_tag_id = bptrs_t[best_tag_id]
          best_path.append(best_tag_id)
      # Pop off the start tag (we dont want to return that to the caller)
      start = best_path.pop()
      assert start == self.tag_to_ix[START_TAG]  # Sanity check
      best_path.reverse()
      return path_score, best_path

  def neg_log_likelihood(self, sentence, tags):
      feats = self._get_lstm_features(sentence)
      forward_score = self._forward_alg(feats)
      gold_score = self._score_sentence(feats, tags)
      return forward_score - gold_score

  def forward(self, sentence):  # dont confuse this with _forward_alg above.
      # Get the emission scores from the BiLSTM
      if self.is_use_crf:
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq
      else:
        lstm_feats = self._get_lstm_features(sentence)
        return lstm_feats, torch.argmax(lstm_feats, -1)

## Init model

In [31]:
HIDDEN_DIM = 200
is_use_crf = True
loss_func = nn.CrossEntropyLoss()

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers_number = 3, is_use_crf = is_use_crf, attention_type = "scale").to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

### Train the model

In [32]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def cal_acc(model, input_index, output_index, is_use_crf):
  predicted=[]
  ground_truth=[]
  for i, idxs in enumerate(input_index):
        tags_index = output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        pred=model(sentence_in)
        #pre_decode=[ix_to_tag[x] for x in pred[1]]
        #tags_decode=[ix_to_tag[x] for x in tags_index]
        if is_use_crf:
          predicted.extend(pred[1])
        else:
          predicted.extend(list(pred[1].cpu().numpy()))
        ground_truth.extend(tags_index)
  accuracy=accuracy_score(predicted,ground_truth)

  return predicted, ground_truth, accuracy

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(30):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        if is_use_crf:
          loss = model.neg_log_likelihood(sentence_in, targets)
        else:
          lstm_feats, tags = model(sentence_in)
          loss = loss_func(lstm_feats, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index, is_use_crf = is_use_crf)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index, is_use_crf = is_use_crf)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

# Evaluation

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
#ix_to_tag={v: k for k, v in tag_to_ix.items()}
def cal_acc(model, input_index, output_index, is_use_crf):
  predicted=[]
  ground_truth=[]
  for i, idxs in enumerate(input_index):
        tags_index = output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        pred=model(sentence_in)
        #pre_decode=[ix_to_tag[x] for x in pred[1]]
        #tags_decode=[ix_to_tag[x] for x in tags_index]
        if is_use_crf:
          predicted.extend(pred[1])
        else:
          predicted.extend(list(pred[1].cpu().numpy()))
        ground_truth.extend(tags_index)
  accuracy=accuracy_score(predicted,ground_truth)

  return predicted, ground_truth, accuracy

In [None]:
len(traning_data)

In [None]:
tags=[]
for labels in training_labels:
  tags.extend(labels)

import numpy as np
labels=np.unique(tags)
print(labels)

['B-DocumentReference' 'B-Location' 'B-MilitaryPlatform' 'B-Money'
 'B-Nationality' 'B-Organisation' 'B-Person' 'B-Quantity' 'B-Temporal'
 'B-Weapon' 'I-DocumentReference' 'I-Location' 'I-MilitaryPlatform'
 'I-Money' 'I-Nationality' 'I-Organisation' 'I-Person' 'I-Quantity'
 'I-Temporal' 'I-Weapon' 'O']


## Ablation Study - different input embedding model

In [None]:


def build_embedding(emtype):
  embedding_matrix = []
  if emtype ==1:
    EMBEDDING_DIM = 100
  elif emtype ==2:
    EMBEDDING_DIM = 100 + (pos_tag_id - 1)
  elif emtype ==3:
    EMBEDDING_DIM = 100 +  (pos_tag_id - 1) + (dependency_id - 1)
  for word in word_list:
      try:
        if emtype ==1:
          temp = word_emb_model.wv[word]
        elif emtype ==2:
          temp = word_emb_model.wv[word]
          temp.extend(pos_map[word])
        elif emtype ==3:
          temp = word_emb_model.wv[word]
          temp.extend(pos_map[word])
          temp.extend(dependency_map[word])
        embedding_matrix.append(temp)
      except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
  embedding_matrix = np.array(embedding_matrix)
  
  embedding_matrix = np.concatenate((bert_matrix, embedding_matrix), axis = 1)
  EMBEDDING_DIM = embedding_matrix.shape[1]
  return embedding_matrix


In [None]:
embedding_matrix=build_embedding(2)
EMBEDDING_DIM=embedding_matrix.shape[1]

  app.launch_new_instance()


In [None]:
embedding_matrix=build_embedding(4)
EMBEDDING_DIM=embedding_matrix.shape[1]

In [None]:
embedding_matrix.shape


(4280, 1165)

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
 
class BiLSTM_CRF(nn.Module):

  def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, layers_number, is_use_crf, attention_type):
      super(BiLSTM_CRF, self).__init__()
      self.embedding_dim = embedding_dim
      self.hidden_dim = hidden_dim
      self.vocab_size = vocab_size
      self.layers_number = layers_number
      self.is_use_crf = is_use_crf
      self.attention_type = attention_type
      self.tag_to_ix = tag_to_ix
      self.tagset_size = len(tag_to_ix)

      self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
      self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
      self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                          num_layers=self.layers_number, bidirectional=True)
      self.dropout = nn.Dropout(0.2)
      
      self.attention_weight = nn.parameter.Parameter(torch.Tensor(1, self.hidden_dim, self.hidden_dim), requires_grad = True)

      # Maps the output of the LSTM into tag space.

      if not attention_type:
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
      else:
        self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)

      # Matrix of transition parameters.  Entry i,j is the score of
      # transitioning *to* i *from* j.
      self.transitions = nn.Parameter(
          torch.randn(self.tagset_size, self.tagset_size))

      # These two statements enforce the constraint that we never transfer
      # to the start tag and we never transfer from the stop tag
      self.transitions.data[tag_to_ix[START_TAG], :] = -10000
      self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

      self.hidden = self.init_hidden()

  def init_hidden(self):
      return (torch.randn(2 * self.layers_number, 1, self.hidden_dim // 2).to(device),
              torch.randn(2 * self.layers_number, 1, self.hidden_dim // 2).to(device))

  def _forward_alg(self, feats):
      # Do the forward algorithm to compute the partition function
      init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
      # START_TAG has all of the score.
      init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

      # Wrap in a variable so that we will get automatic backprop
      forward_var = init_alphas

      # Iterate through the sentence
      for feat in feats:
          alphas_t = []  # The forward tensors at this timestep
          for next_tag in range(self.tagset_size):
              # broadcast the emission score: it is the same regardless of
              # the previous tag
              emit_score = feat[next_tag].view(
                  1, -1).expand(1, self.tagset_size)
              # the ith entry of trans_score is the score of transitioning to
              # next_tag from i
              trans_score = self.transitions[next_tag].view(1, -1)
              # The ith entry of next_tag_var is the value for the
              # edge (i -> next_tag) before we do log-sum-exp
              next_tag_var = forward_var + trans_score + emit_score
              # The forward variable for this tag is log-sum-exp of all the
              # scores.
              alphas_t.append(log_sum_exp(next_tag_var).view(1))
          forward_var = torch.cat(alphas_t).view(1, -1)
      terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
      alpha = log_sum_exp(terminal_var)
      return alpha

  def _get_lstm_features(self, sentence):
      self.hidden = self.init_hidden()
      embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
      lstm_out, self.hidden = self.lstm(embeds, self.hidden)
      if not self.attention_type:
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
      else:
        lstm_out = self._get_attention_output(lstm_out, sentence)
      
      lstm_out = self.dropout(lstm_out)
      lstm_feats = self.hidden2tag(lstm_out)
      return lstm_feats
  
  def _get_attention_output(self, lstm_out, sentence):
      lstm_out = torch.squeeze(lstm_out, 1)

      s = lstm_out.view(1, lstm_out.size(0), lstm_out.size(1))
      h = s.view(s.size(0), s.size(2), s.size(1))

      if self.attention_type.lower() == "dot":
        weight_att = nn.functional.softmax(torch.bmm(s, h),dim=-1)
      elif self.attention_type.lower() == "scale":
        weight_att = nn.functional.softmax(torch.bmm(s, h) / np.sqrt(self.hidden_dim),dim=-1)
      else:
        temp = torch.bmm(torch.bmm(s, self.attention_weight), h)
        weight_att = nn.functional.softmax(temp, dim=-1)

      output = torch.bmm(weight_att, s)
      concat_output = torch.cat((output, s), dim = -1)
      lstm_out = concat_output.view(len(sentence), self.hidden_dim * 2)
      return lstm_out

  def _score_sentence(self, feats, tags):
      # Gives the score of a provided tag sequence
      score = torch.zeros(1).to(device)
      tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
      for i, feat in enumerate(feats):
          score = score + \
              self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
      score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
      return score

  def _viterbi_decode(self, feats):
      backpointers = []

      # Initialize the viterbi variables in log space
      init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
      init_vvars[0][self.tag_to_ix[START_TAG]] = 0

      # forward_var at step i holds the viterbi variables for step i-1
      forward_var = init_vvars
      for feat in feats:
          bptrs_t = []  # holds the backpointers for this step
          viterbivars_t = []  # holds the viterbi variables for this step

          for next_tag in range(self.tagset_size):
              # next_tag_var[i] holds the viterbi variable for tag i at the
              # previous step, plus the score of transitioning
              # from tag i to next_tag.
              # We don't include the emission scores here because the max
              # does not depend on them (we add them in below)
              next_tag_var = forward_var + self.transitions[next_tag]
              best_tag_id = argmax(next_tag_var)
              bptrs_t.append(best_tag_id)
              viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
          # Now add in the emission scores, and assign forward_var to the set
          # of viterbi variables we just computed
          forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
          backpointers.append(bptrs_t)

      # Transition to STOP_TAG
      terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
      best_tag_id = argmax(terminal_var)
      path_score = terminal_var[0][best_tag_id]

      # Follow the back pointers to decode the best path.
      best_path = [best_tag_id]
      for bptrs_t in reversed(backpointers):
          best_tag_id = bptrs_t[best_tag_id]
          best_path.append(best_tag_id)
      # Pop off the start tag (we dont want to return that to the caller)
      start = best_path.pop()
      assert start == self.tag_to_ix[START_TAG]  # Sanity check
      best_path.reverse()
      return path_score, best_path

  def neg_log_likelihood(self, sentence, tags):
      feats = self._get_lstm_features(sentence)
      forward_score = self._forward_alg(feats)
      gold_score = self._score_sentence(feats, tags)
      return forward_score - gold_score

  def forward(self, sentence):  # dont confuse this with _forward_alg above.
      # Get the emission scores from the BiLSTM
      if self.is_use_crf:
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq
      else:
        lstm_feats = self._get_lstm_features(sentence)
        return lstm_feats, torch.argmax(lstm_feats, -1)

In [None]:
HIDDEN_DIM = 200
is_use_crf = True
loss_func = nn.CrossEntropyLoss()

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers_number = 3, is_use_crf = is_use_crf, attention_type = "scale").to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime
best_val_loss = float("inf")

best_model = None

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        if is_use_crf:
          loss = model.neg_log_likelihood(sentence_in, targets)
        else:
          lstm_feats, tags = model(sentence_in)
          loss = loss_func(lstm_feats, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index,is_use_crf)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index,is_use_crf)

    

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 10340.88, train acc: 0.7734, val loss: 3717.30, val acc: 0.7268, time: 240.85s
Epoch:2, Training loss: 7600.83, train acc: 0.8199, val loss: 3088.37, val acc: 0.7673, time: 246.47s
Epoch:3, Training loss: 5943.85, train acc: 0.8571, val loss: 2682.07, val acc: 0.8013, time: 246.53s
Epoch:4, Training loss: 4812.65, train acc: 0.8673, val loss: 2507.86, val acc: 0.8055, time: 247.28s
Epoch:5, Training loss: 4031.78, train acc: 0.8947, val loss: 2307.54, val acc: 0.8201, time: 250.08s
Epoch:6, Training loss: 3466.28, train acc: 0.8779, val loss: 2461.33, val acc: 0.8055, time: 254.08s
Epoch:7, Training loss: 2987.85, train acc: 0.9085, val loss: 2210.05, val acc: 0.8178, time: 252.32s
Epoch:8, Training loss: 2471.55, train acc: 0.9152, val loss: 2278.44, val acc: 0.8227, time: 251.46s
Epoch:9, Training loss: 2402.56, train acc: 0.9205, val loss: 2249.65, val acc: 0.8195, time: 252.16s
Epoch:10, Training loss: 2008.76, train acc: 0.9211, val loss: 2359.68, val acc: 

In [None]:
torch.save(best_model, "BiLSTM+CRF+wordEembedding+bert+post.pth")

## Attention

gege

In [None]:
HIDDEN_DIM = 200
is_use_crf = True
loss_func = nn.CrossEntropyLoss()

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers_number = 3, is_use_crf = is_use_crf, attention_type = "").to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

dot

In [None]:
HIDDEN_DIM = 200
is_use_crf = True
loss_func = nn.CrossEntropyLoss()

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers_number = 3, is_use_crf = is_use_crf, attention_type = "dot").to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime
best_val_loss = float("inf")

best_model = None

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        if is_use_crf:
          loss = model.neg_log_likelihood(sentence_in, targets)
        else:
          lstm_feats, tags = model(sentence_in)
          loss = loss_func(lstm_feats, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index, is_use_crf)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index, is_use_crf)

    

    if is_use_crf:
      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, targets)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    
    else:
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 16286.89, train acc: 0.7024, val loss: 5395.16, val acc: 0.6534, time: 215.53s
Epoch:2, Training loss: 10607.97, train acc: 0.7725, val loss: 3934.90, val acc: 0.7152, time: 215.50s
Epoch:3, Training loss: 7651.03, train acc: 0.8262, val loss: 3119.85, val acc: 0.7656, time: 215.48s
Epoch:4, Training loss: 5880.11, train acc: 0.8536, val loss: 2717.64, val acc: 0.7920, time: 214.94s
Epoch:5, Training loss: 4675.50, train acc: 0.8738, val loss: 2442.97, val acc: 0.8077, time: 213.91s
Epoch:6, Training loss: 3855.90, train acc: 0.8819, val loss: 2344.83, val acc: 0.8125, time: 213.91s
Epoch:7, Training loss: 3239.00, train acc: 0.9023, val loss: 2272.07, val acc: 0.8172, time: 214.79s
Epoch:8, Training loss: 2972.50, train acc: 0.9062, val loss: 2310.29, val acc: 0.8157, time: 212.36s
Epoch:9, Training loss: 2439.61, train acc: 0.9257, val loss: 2271.11, val acc: 0.8307, time: 212.13s
Epoch:10, Training loss: 2170.67, train acc: 0.9249, val loss: 2410.79, val acc:

In [None]:
torch.save(best_model, "BiLSTM+CRF+genaral.pth")

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime
best_val_loss = float("inf")

best_model = None

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        if is_use_crf:
          loss = model.neg_log_likelihood(sentence_in, targets)
        else:
          lstm_feats, tags = model(sentence_in)
          loss = loss_func(lstm_feats, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index, is_use_crf)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index, is_use_crf)

    

    if is_use_crf:
      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, targets)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    
    else:
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 16797.88, train acc: 0.7151, val loss: 5258.45, val acc: 0.6576, time: 215.76s
Epoch:2, Training loss: 10244.09, train acc: 0.7708, val loss: 3869.60, val acc: 0.7148, time: 213.87s
Epoch:3, Training loss: 7791.70, train acc: 0.8188, val loss: 3194.46, val acc: 0.7626, time: 213.95s
Epoch:4, Training loss: 6075.83, train acc: 0.8520, val loss: 2770.62, val acc: 0.7914, time: 214.25s
Epoch:5, Training loss: 4728.46, train acc: 0.8699, val loss: 2592.57, val acc: 0.8064, time: 214.25s
Epoch:6, Training loss: 3916.73, train acc: 0.8861, val loss: 2472.08, val acc: 0.8159, time: 213.29s
Epoch:7, Training loss: 3228.12, train acc: 0.9001, val loss: 2444.03, val acc: 0.8170, time: 212.90s
Epoch:8, Training loss: 2732.94, train acc: 0.9119, val loss: 2471.33, val acc: 0.8140, time: 213.60s
Epoch:9, Training loss: 2545.33, train acc: 0.9070, val loss: 2649.56, val acc: 0.8074, time: 212.22s
Epoch:10, Training loss: 2430.77, train acc: 0.9083, val loss: 2796.28, val acc:

In [None]:
torch.save(best_model, "BiLSTM+CRF+dot.pth")

## Not use crf

In [None]:
HIDDEN_DIM = 200
is_use_crf = False
loss_func = nn.CrossEntropyLoss()

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers_number = 3, is_use_crf = is_use_crf, attention_type = "scale").to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime
best_val_loss = float("inf")

best_model = None

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        if is_use_crf:
          loss = model.neg_log_likelihood(sentence_in, targets)
        else:
          lstm_feats, tags = model(sentence_in)
          loss = loss_func(lstm_feats, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index, is_use_crf)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index, is_use_crf)

    

    if is_use_crf:
      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, targets)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    
    else:
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_acc, (time2-time1).total_seconds()))

In [None]:
torch.save(model, 'not_use_crf.pth')

### Different stacked layers

In [33]:
HIDDEN_DIM = 200
is_use_crf = True
loss_func = nn.CrossEntropyLoss()

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers_number = 1, is_use_crf = is_use_crf, attention_type = "scale").to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [34]:
"""Each epoch will take about 1-2 minutes"""

import datetime
best_val_loss = float("inf")

best_model = None

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        if is_use_crf:
          loss = model.neg_log_likelihood(sentence_in, targets)
        else:
          lstm_feats, tags = model(sentence_in)
          loss = loss_func(lstm_feats, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index, is_use_crf)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index, is_use_crf)

    

    if is_use_crf:
      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, targets)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    
    else:
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 13746.34, train acc: 0.8231, val loss: 3601.84, val acc: 0.7677, time: 201.23s
Epoch:2, Training loss: 6654.32, train acc: 0.8636, val loss: 2720.14, val acc: 0.7956, time: 204.25s
Epoch:3, Training loss: 4807.17, train acc: 0.8863, val loss: 2399.18, val acc: 0.8174, time: 202.87s
Epoch:4, Training loss: 3817.70, train acc: 0.8986, val loss: 2241.91, val acc: 0.8242, time: 205.38s
Epoch:5, Training loss: 3243.32, train acc: 0.9119, val loss: 2195.98, val acc: 0.8203, time: 207.68s
Epoch:6, Training loss: 2744.11, train acc: 0.9128, val loss: 2283.44, val acc: 0.8227, time: 207.20s
Epoch:7, Training loss: 2467.45, train acc: 0.9209, val loss: 2264.81, val acc: 0.8221, time: 245.16s
Epoch:8, Training loss: 1974.80, train acc: 0.9350, val loss: 2227.30, val acc: 0.8237, time: 199.03s
Epoch:9, Training loss: 1811.53, train acc: 0.9416, val loss: 2309.67, val acc: 0.8119, time: 190.73s
Epoch:10, Training loss: 1639.85, train acc: 0.9488, val loss: 2381.54, val acc: 

In [35]:
torch.save(model, 'one_layer.pth')

In [36]:
HIDDEN_DIM = 200
is_use_crf = True
loss_func = nn.CrossEntropyLoss()

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers_number = 2, is_use_crf = is_use_crf, attention_type = "scale").to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [37]:
"""Each epoch will take about 1-2 minutes"""

import datetime
best_val_loss = float("inf")

best_model = None

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        if is_use_crf:
          loss = model.neg_log_likelihood(sentence_in, targets)
        else:
          lstm_feats, tags = model(sentence_in)
          loss = loss_func(lstm_feats, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index, is_use_crf)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index, is_use_crf)

    

    if is_use_crf:
      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, targets)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    
    else:
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 14731.08, train acc: 0.7789, val loss: 4256.91, val acc: 0.7272, time: 211.32s
Epoch:2, Training loss: 8079.21, train acc: 0.8398, val loss: 3121.25, val acc: 0.7838, time: 224.75s
Epoch:3, Training loss: 5609.54, train acc: 0.8665, val loss: 2642.58, val acc: 0.8003, time: 225.12s
Epoch:4, Training loss: 4337.71, train acc: 0.8881, val loss: 2446.86, val acc: 0.8144, time: 225.44s
Epoch:5, Training loss: 3613.59, train acc: 0.8928, val loss: 2354.02, val acc: 0.8163, time: 225.24s
Epoch:6, Training loss: 2869.54, train acc: 0.8903, val loss: 2525.26, val acc: 0.8140, time: 222.38s
Epoch:7, Training loss: 2405.86, train acc: 0.9121, val loss: 2468.06, val acc: 0.8161, time: 222.97s
Epoch:8, Training loss: 2063.20, train acc: 0.9330, val loss: 2260.57, val acc: 0.8286, time: 223.49s
Epoch:9, Training loss: 1782.69, train acc: 0.9364, val loss: 2412.17, val acc: 0.8176, time: 221.80s
Epoch:10, Training loss: 1565.86, train acc: 0.9432, val loss: 2359.27, val acc: 

In [38]:
torch.save(model, 'two_layer.pth')

In [39]:
HIDDEN_DIM = 200
is_use_crf = True
loss_func = nn.CrossEntropyLoss()

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, layers_number = 3, is_use_crf = is_use_crf, attention_type = "scale").to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [40]:
"""Each epoch will take about 1-2 minutes"""

import datetime
best_val_loss = float("inf")

best_model = None

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        if is_use_crf:
          loss = model.neg_log_likelihood(sentence_in, targets)
        else:
          lstm_feats, tags = model(sentence_in)
          loss = loss_func(lstm_feats, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index, is_use_crf)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index, is_use_crf)

    

    if is_use_crf:
      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, targets)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    
    else:
      time2 = datetime.datetime.now()
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss, train_acc, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 16439.10, train acc: 0.7284, val loss: 4933.59, val acc: 0.6797, time: 221.49s
Epoch:2, Training loss: 9843.40, train acc: 0.7748, val loss: 3838.67, val acc: 0.7215, time: 221.06s
Epoch:3, Training loss: 7504.44, train acc: 0.8224, val loss: 3183.86, val acc: 0.7636, time: 223.28s
Epoch:4, Training loss: 5798.15, train acc: 0.8503, val loss: 2812.92, val acc: 0.7907, time: 222.18s
Epoch:5, Training loss: 4612.48, train acc: 0.8768, val loss: 2452.93, val acc: 0.8087, time: 223.84s
Epoch:6, Training loss: 3764.78, train acc: 0.8829, val loss: 2468.14, val acc: 0.8185, time: 223.06s
Epoch:7, Training loss: 3218.77, train acc: 0.9025, val loss: 2252.24, val acc: 0.8231, time: 222.67s
Epoch:8, Training loss: 2618.98, train acc: 0.9093, val loss: 2266.91, val acc: 0.8235, time: 222.11s
Epoch:9, Training loss: 2315.80, train acc: 0.9097, val loss: 2431.76, val acc: 0.8231, time: 225.38s
Epoch:10, Training loss: 2015.45, train acc: 0.9294, val loss: 2317.64, val acc: 

In [41]:
torch.save(model, 'three_layer.pth')