# PyTorch setup, imports, and constants initialization

In [2]:
# Colab bootstrap: probe the accelerator, install PyTorch, and print the
# device that torch reports.
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Wheel-style tag assembled from the interpreter implementation, version and ABI.
# NOTE(review): `platform` is not referenced again in the visible part of the notebook.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# IPython shell capture: each cudart.so entry listed by ldconfig is rewritten
# by sed into the form `cu<major><minor>`.
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# 'cpu' unless the /dev/nvidia0 device node exists.
# NOTE(review): `accelerator` is computed but the install commands below do not use it.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'
# Installs the torch 1.0.1 wheel from the cu100 / cp36 download URL.
!pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1-cp36-cp36m-linux_x86_64.whl
# NOTE(review): unpinned install; the recorded output of this cell shows pip
# warning that torchvision 0.6.0 requires torch==1.5.0 — confirm the intended versions.
!pip3 install torch torchvision
  
import torch
# First CUDA device when torch can see one, otherwise the CPU.
device =  torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

Collecting torch==1.0.1
[?25l  Downloading https://download.pytorch.org/whl/cu100/torch-1.0.1-cp36-cp36m-linux_x86_64.whl (614.8MB)
[K     |████████████████████████████████| 614.8MB 28kB/s 
[31mERROR: torchvision 0.6.0+cu101 has requirement torch==1.5.0, but you'll have torch 1.0.1 which is incompatible.[0m
[?25hInstalling collected packages: torch
  Found existing installation: torch 1.5.0+cu101
    Uninstalling torch-1.5.0+cu101:
      Successfully uninstalled torch-1.5.0+cu101
Successfully installed torch-1.0.1
cuda:0


In [3]:
!pip3 install pymagnitude
from pymagnitude import *

import time
import argparse
import numpy as np
import time
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from sklearn.metrics import f1_score
from pdb import set_trace as debug

# Flag: set to True to also pretrain on the `ned.*` files (referred to as
# Dutch by the label-column comments in data_to_words_sentences) in addition
# to the English data.
use_de = True

# Integer ids reserved for the special tokens.
PAD = 0
UNK = 1
BOS = 2
EOS = 3

# String forms of the special tokens.
PAD_WORD = '<blank>'
UNK_WORD = '<unk>'
BOS_WORD = '<s>'
EOS_WORD = '</s>'

# Entity categories and tag prefixes used to build the label inventory.
CAT = ['PER', 'ORG', 'LOC', 'MISC']
POSITION = ['I', 'B']
# Label list whose positions serve as label ids: index 0 is the padding label,
# index 1 is 'O', followed by I-PER, B-PER, I-ORG, B-ORG, I-LOC, B-LOC, I-MISC, B-MISC.
LABEL_INDEX = [PAD_WORD] + ['O'] + ["{}-{}".format(position, cat) for cat in CAT for position in POSITION]

# Source-language files are used for training; Spanish files for validation
# and testing. Paths are relative to the current working directory.
train_paths = ['Data/eng.train', 'Data/eng.testa', 'Data/eng.testb']
if use_de: train_paths.extend(['Data/ned.train', 'Data/ned.testa', 'Data/ned.testb'])
val_paths = ['Data/esp.testa']
test_paths = ['Data/esp.train', 'Data/esp.testb']

Collecting pymagnitude
[?25l  Downloading https://files.pythonhosted.org/packages/0a/a3/b9a34d22ed8c0ed59b00ff55092129641cdfa09d82f9abdc5088051a5b0c/pymagnitude-0.1.120.tar.gz (5.4MB)
[K     |████████████████████████████████| 5.4MB 2.8MB/s 
[?25hBuilding wheels for collected packages: pymagnitude
  Building wheel for pymagnitude (setup.py) ... [?25l[?25hdone
  Created wheel for pymagnitude: filename=pymagnitude-0.1.120-cp36-cp36m-linux_x86_64.whl size=135918206 sha256=5a475fc9e35999bd120b478d83e1d194cb129f8327181fccdd4d32f9e9fa3b43
  Stored in directory: /root/.cache/pip/wheels/a2/c7/98/cb48b9db35f8d1a7827b764dc36c5515179dc116448a47c8a1
Successfully built pymagnitude
Installing collected packages: pymagnitude
Successfully installed pymagnitude-0.1.120


# Connect to Google Drive

In [1]:
# Mount Google Drive inside the Colab VM (the recorded output shows an
# interactive authorization prompt).
from google.colab import drive
drive.mount('/content/gdrive')

# Change the working directory to the shared project folder.
# NOTE(review): the relative 'Data/...' paths defined in the constants cell
# presumably resolve against this directory, so this cell has to run before
# any file is read — confirm the intended cell order.
%cd "/content/gdrive/Shared drives/CIS 530 Project/"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
/content/gdrive/Shared drives/CIS 530 Project


# Utility Functions

In [0]:
# Returns the dictionary from the file with translations
def language_to_spanish_dict(path):
  """Load a tab-separated translation lexicon into a dict.

  Each usable line holds a source-language word in the first column and its
  Spanish translation in the second. The casing of the source word is
  projected onto the translation:

  * source word entirely upper case -> translation upper-cased
  * source word starting with an upper-case letter -> translation passed
    through ``str.capitalize()`` (first letter upper, the rest lower)
  * otherwise -> translation stored unchanged

  Lines that do not contain at least two tab-separated columns (for example
  blank lines) are skipped instead of raising ``IndexError``.

  Args:
    path: path of the lexicon file.

  Returns:
    dict mapping source word -> Spanish word. Later lines overwrite earlier
    ones for the same source word.
  """
  l_to_spanish = dict()

  with open(path, 'r') as f:
    for line in f:
      columns = line.rstrip('\n').split('\t')
      # No translation column on this line: nothing to store.
      if len(columns) < 2: continue

      source_word, es_word = columns[0], columns[1]
      if source_word.isupper():
        l_to_spanish[source_word] = es_word.upper()
      elif len(source_word) > 0 and source_word[0].isupper():
        l_to_spanish[source_word] = es_word.capitalize()
      else:
        l_to_spanish[source_word] = es_word

  return l_to_spanish

# Returns a matrix of category distributions of the translation word, seen in
# the training set, or for all test words the closest translation word
def previous_ner_distributions(embedding_matrix, word_to_index, en_to_spanish, de_to_spanish=None):
  """Build per-word entity-category distributions from the training tags.

  For every token of the training files (those in the module-level
  ``train_paths`` whose name contains 'eng', plus those containing 'ned' when
  ``de_to_spanish`` is given) the tag of the token is counted on the row of
  its Spanish translation. Columns are O, PER, ORG, LOC, MISC. Rows are then
  normalised to sum to one (rows without counts stay all-zero).

  Afterwards every word of the Spanish files (``val_paths + test_paths``)
  whose row is still all-zero copies the distribution of the training
  translation ranked highest by
  ``dot(translation_vector, word_vector) / norm(translation_vector)``.

  Args:
    embedding_matrix: 2-D array indexed by the values of ``word_to_index``.
    word_to_index: dict word -> row index.
    en_to_spanish: dict used to translate tokens of the 'eng' files; a
      missing token raises ``KeyError``.
    de_to_spanish: optional dict for the 'ned' files; skipped when falsy.

  Returns:
    Array of shape ``(len(word_to_index), 5)``.
  """
  embedding_size = embedding_matrix.shape[1]
  translation_words = []
  # Set mirror of `translation_words`: membership tests on the list were
  # linear in the vocabulary size for every token.
  seen_translations = set()
  translation_words_matrix = []
  category_distribution_matrix = np.zeros((len(word_to_index), 5))

  # (file-name marker, translation dict, index of the tag column)
  training_sources = [('eng', en_to_spanish, 3)]
  if de_to_spanish:
    training_sources.append(('ned', de_to_spanish, 2))

  # Columns = O, PER, ORG, LOC, MISC
  for marker, to_spanish, tag_column in training_sources:
    for file in [f for f in train_paths if marker in f]:
      with open(file, 'r') as f:
        for line in f:
          fields = line.split()
          # Blank (or whitespace-only) separator lines carry no token.
          if not fields: continue

          trans = to_spanish[fields[0]]
          tag = fields[tag_column]
          row = word_to_index[trans]
          if trans not in seen_translations:
            seen_translations.add(trans)
            translation_words.append(trans)
            translation_words_matrix.append(embedding_matrix[row, :])
          col = 0
          if 'PER' in tag: col = 1
          elif 'ORG' in tag: col = 2
          elif 'LOC' in tag: col = 3
          elif 'MISC' in tag: col = 4
          category_distribution_matrix[row, col] += 1

  translation_words_matrix = np.array(translation_words_matrix)
  norms = np.linalg.norm(translation_words_matrix, axis=1)
  # ratios on each row; the maximum with 1 keeps all-zero rows at zero
  sums = np.maximum(category_distribution_matrix.sum(axis=1), 1)
  category_distribution_matrix = category_distribution_matrix / sums[:, None]

  for file in val_paths + test_paths:
    with open(file, 'r') as f:
      for line in f:
        fields = line.split()
        if not fields: continue

        row = word_to_index[fields[0]]
        if category_distribution_matrix[row, :].sum() == 0:
          embedding = np.reshape(embedding_matrix[row, :], (embedding_size, 1))
          closeness = np.matmul(translation_words_matrix, embedding)/norms[:, None]
          nearest = translation_words[np.argmax(closeness)]
          category_distribution_matrix[row, :] = category_distribution_matrix[word_to_index[nearest], :]

  return category_distribution_matrix

def get_category_average_word_embeddings(matrix, en_to_spanish, word_to_index, de_to_spanish=None):
  """Average the embeddings of the translated training tokens per category.

  Reads the files of the module-level ``train_paths`` whose name contains
  'eng' (tag in the 4th column) and, when ``de_to_spanish`` is truthy, those
  containing 'ned' (tag in the 3rd column). The ``I-``/``B-`` prefix is
  removed from each tag, and the embedding of the token's translation is
  added to the running sum of that category.

  Args:
    matrix: 2-D embedding array indexed by the values of ``word_to_index``.
    en_to_spanish: dict translating tokens of the 'eng' files.
    word_to_index: dict word -> row of ``matrix``.
    de_to_spanish: optional dict translating tokens of the 'ned' files.

  Returns:
    Array of shape ``(5, matrix.shape[1])`` with rows ordered
    PER, ORG, LOC, MISC, O: the per-category sum divided by the token count.
  """
  category_row = {'PER':0, 'ORG':1, 'LOC':2, 'MISC':3, 'O':4}
  embedding_sums = np.zeros((len(category_row), matrix.shape[1]), dtype=np.float64)
  token_counts = [0]*5

  # (file-name marker, index of the tag column, translation dict)
  corpora = [('eng', 3, en_to_spanish)]
  if de_to_spanish:
    corpora.append(('ned', 2, de_to_spanish))

  for marker, tag_column, to_spanish in corpora:
    for path in [p for p in train_paths if marker in p]:
      with open(path, 'r') as f:
        for raw in f:
          raw = raw.rstrip('\n')
          if raw == "":
            continue

          columns = raw.split()
          category = columns[tag_column].replace('I-','').replace('B-','')
          translated = to_spanish[columns[0]]
          vector = np.squeeze(matrix[word_to_index[translated], :])

          row = category_row[category]
          embedding_sums[row] += vector
          token_counts[row] += 1

  # Column vector of counts so the division broadcasts across each row.
  return np.divide(embedding_sums, np.array(token_counts)[:, np.newaxis])

def find_distance(dim, cat_emb, emb):
  """Cosine similarity between ``emb`` and every row of ``cat_emb``.

  Args:
    dim: not used by the computation; kept for the callers' signature.
    cat_emb: sequence of category vectors.
    emb: a single vector.

  Returns:
    List with one value per row of ``cat_emb``: the inner product of that
    row with ``emb`` divided by the product of their Euclidean norms.
  """
  return [
    np.inner(category_vector, emb) / (np.linalg.norm(category_vector) * np.linalg.norm(emb))
    for category_vector in cat_emb
  ]

def all_distances(matrix, word_to_index, en_to_spanish, de_to_spanish=None):
  """Compare every row of ``matrix`` with the category average embeddings.

  The category averages come from ``get_category_average_word_embeddings``;
  each row of ``matrix`` is then scored against them with ``find_distance``.

  Returns:
    Array with one row per row of ``matrix`` and one column per category.
  """
  dim = matrix.shape[1]
  category_centroids = get_category_average_word_embeddings(matrix, en_to_spanish, word_to_index, de_to_spanish)
  per_word_scores = [
    find_distance(dim, category_centroids, matrix[row, :])
    for row in range(matrix.shape[0])
  ]
  return np.array(per_word_scores)

def get_distributional_info(embedding_matrix, word_to_index, en_to_spanish, distance=True, de_to_spanish=None):
  """Dispatch to one of the two per-word category feature builders.

  ``distance=True`` selects ``all_distances``; otherwise
  ``previous_ner_distributions`` is used. Both receive the same arguments.
  """
  feature_builder = all_distances if distance else previous_ner_distributions
  return feature_builder(embedding_matrix, word_to_index, en_to_spanish, de_to_spanish)

# Returns a tuple (matrix, word_to_index) where the matrix contains all the
# X-dimensional word embeddings, and word_to_index is a dictionary from the
# word into the index in the matrix. For Out-of-vocabulary words it creates
# a random embedding
def get_indexed_word_embeddings(en_to_spanish, embedding_path, de_to_spanish=None):
  """Collect one embedding row per distinct word of the corpora.

  Words are registered in this order: translations of the tokens of the
  'eng' files in ``train_paths``, translations of the tokens of the 'ned'
  files (only when ``de_to_spanish`` is truthy), then the raw tokens of
  ``val_paths + test_paths``. A word found in the Magnitude vectors gets its
  queried vector; any other word gets a vector drawn uniformly from
  ``[-sqrt(3/dim), sqrt(3/dim)]``.

  NOTE(review): the first registered word receives index 0, which is also
  the value of PAD that get_input uses to pad ``word_input`` — confirm that
  this overlap is intended.

  Returns:
    ``(matrix, word_to_index)`` — array with one row per word, and the dict
    mapping each word to its row.
  """
  vectors = Magnitude(embedding_path)
  dim = vectors.dim
  bound = (3/dim)**0.5

  matrix = [] # words by embedding-dimension
  word_to_index = dict()

  def register(token):
    # The next free row always equals the number of rows stored so far.
    if token in word_to_index:
      return
    word_to_index[token] = len(matrix)
    if token in vectors:
      matrix.append(vectors.query(token))
    else:
      matrix.append(np.random.uniform(-bound, bound, dim))

  # (files, translation dict, whether tokens are translated before lookup)
  corpora = [([p for p in train_paths if 'eng' in p], en_to_spanish, True)]
  if de_to_spanish:
    corpora.append(([p for p in train_paths if 'ned' in p], de_to_spanish, True))
  corpora.append((val_paths + test_paths, None, False))

  for paths, to_spanish, translate in corpora:
    for path in paths:
      with open(path, 'r') as f:
        for raw in f:
          raw = raw.rstrip('\n')
          if raw == "":
            continue

          token = raw.split()[0]
          register(to_spanish[token] if translate else token)

  return np.array(matrix), word_to_index

#returns char dictionary created from all path in paths
def create_char_index(paths, en_to_spanish, pad=False, de_to_spanish=None):
    """Build a character -> id dictionary from the words of the given files.

    The first whitespace-separated column of every non-blank line is taken as
    the word. Words from files whose path contains 'eng' are replaced by
    their ``en_to_spanish`` entry when one exists; likewise for 'ned' files
    and ``de_to_spanish`` (when that dict is provided). Every character not
    seen before receives the next free id, ``len(char_dict)``.

    Args:
        paths: iterable of file paths.
        en_to_spanish: dict used for 'eng' files.
        pad: when True the dict starts with PAD_WORD -> PAD and
            UNK_WORD -> UNK; otherwise it starts with UNK_WORD -> 0 only.
        de_to_spanish: optional dict used for 'ned' files.

    Returns:
        dict mapping characters (plus the special tokens) to integer ids.
    """
    char_dict = {}
    if pad:
        char_dict[PAD_WORD] = PAD
        char_dict[UNK_WORD] = UNK
    else:
        char_dict[UNK_WORD] = 0

    for path in paths:
        # Context manager so every corpus file is closed after reading.
        with open(path) as f:
            for line in f:
                columns = line.strip().split()
                if len(columns) == 0:
                    continue

                word = columns[0]
                es_word = word
                if 'eng' in path and word in en_to_spanish:
                    es_word = en_to_spanish.get(word)
                if de_to_spanish:
                    if 'ned' in path and word in de_to_spanish:
                        es_word = de_to_spanish.get(word)
                for char in es_word:
                    if char not in char_dict:
                        char_dict[char] = len(char_dict)

    return char_dict

#   Returns
#1. all the spanish 'sentences' in 2D array
#2. 2D array indicating if word in given sentence is OOV word (True if the word is used as-is, translation not found) or not
#3. 2D array returning labels.
#   in the file(s) at paths in an array.
def data_to_words_sentences(paths, en_to_spanish, test=False, de_to_spanish=None):
  """Read column-formatted files into parallel sentence-level lists.

  Blank lines separate sentences. For every token line the first column is
  the word. The label is the 3rd column when ``test`` is True; otherwise it
  is the 4th column for files whose path contains 'eng' and the 3rd column
  for all other files.

  With ``test=False`` a word is replaced by its ``en_to_spanish`` entry when
  the path contains 'eng' and the entry exists; failing that, by its
  ``de_to_spanish`` entry when that dict is given and contains the word. The
  OOV flag is False when such a replacement happened and True otherwise.
  With ``test=True`` words are kept as they are and the flag is always False.

  A sentence still open at the end of a file is emitted (a file does not
  need to end with a blank line, and sentences never span two files), and
  runs of consecutive blank lines do not produce empty sentences.

  Args:
    paths: iterable of file paths.
    en_to_spanish: dict used for 'eng' files.
    test: True for files that are read without translation.
    de_to_spanish: optional second translation dict.

  Returns:
    ``(sentences, OOV, labels)`` — three lists of equal length whose items
    are per-sentence lists of words, booleans and label strings.
  """
  sentences = []
  OOV = []
  labels = []
  curr_sentence = []
  curr_OOV_sentence = []
  curr_label_sentence = []

  for path in paths:
    is_english = 'eng' in path
    with open(path) as f:
      for line in f:
        line = line.strip().split()

        if len(line) == 0:
          # Sentence boundary: emit the open sentence, if there is one.
          if curr_sentence:
            sentences.append(curr_sentence)
            OOV.append(curr_OOV_sentence)
            labels.append(curr_label_sentence)
            curr_sentence = []
            curr_OOV_sentence = []
            curr_label_sentence = []
          continue

        word = line[0]
        es_word = word

        if test:
          curr_label_sentence.append(line[2]) #spanish has 3rd column label
          is_oov = False
        else:
          #english has 4th column label, dutch has 3rd column label
          curr_label_sentence.append(line[3] if is_english else line[2])
          is_oov = True
          if is_english and word in en_to_spanish:
            es_word = en_to_spanish.get(word)
            is_oov = False
          elif de_to_spanish:
            if word in de_to_spanish:
              es_word = de_to_spanish.get(word)
              is_oov = False

        curr_sentence.append(es_word)
        curr_OOV_sentence.append(is_oov)

    # End of file without a trailing blank line: emit what is still open so
    # it is neither lost nor merged with the first sentence of the next file.
    if curr_sentence:
      sentences.append(curr_sentence)
      OOV.append(curr_OOV_sentence)
      labels.append(curr_label_sentence)
      curr_sentence = []
      curr_OOV_sentence = []
      curr_label_sentence = []

  return sentences, OOV, labels

#ADD all methods to get initial input to neural network here. Returns char, labels and word input
#OOV[i] indicates if word[i] is OOV (translation not found)
def get_input(word_vocab_dict, sentences, OOV, char_vocab_dict, token_labels, label_to_index):
  """Turn a batch of sentences into padded index tensors.

  Args:
    word_vocab_dict: dict word -> word id; every word must be present.
    sentences: non-empty list of sentences, each a list of word strings.
    OOV: per-sentence lists of booleans aligned with ``sentences``.
    char_vocab_dict: dict character -> character id.
    token_labels: per-sentence lists of label strings.
    label_to_index: list of labels; a label's id is its position in the list.

  Character encoding: a character missing from ``char_vocab_dict`` is encoded
  as ``UNK``. A known character is encoded with the id of ``'0'`` when it is
  a digit or when its word is flagged OOV (so ``'0'`` must then be present in
  ``char_vocab_dict``), and with its own id otherwise. All padding positions
  are 0.

  Returns:
    Tuple ``(word_input, word_input_length, char_input, char_input_length,
    label_input)`` of int64 tensors with shapes ``(batch, max_sentence_len)``,
    ``(batch,)``, ``(batch, max_sentence_len, max_word_len)``,
    ``(batch, max_sentence_len)`` and ``(batch, max_sentence_len)``. Tensors
    are placed on the CUDA device when one is available and on the CPU
    otherwise (the previous unconditional ``.cuda()`` failed without a GPU).

  Raises:
    ValueError: if ``sentences`` is empty or a label is not in
      ``label_to_index``.
    KeyError: if a word is missing from ``word_vocab_dict``.
  """
  max_word_len = max(len(word) for sentence in sentences for word in sentence)
  max_sentence_length = max(len(sentence) for sentence in sentences)

  word_input = np.zeros((len(sentences), max_sentence_length), dtype='int64')
  word_input_length = [len(sentence) for sentence in sentences]

  char_input = np.zeros((len(sentences), max_sentence_length, max_word_len), dtype='int64')
  #2D array of length of word in each sentence in sentences
  char_input_length = np.zeros((len(sentences), max_sentence_length), dtype='int64')

  #2D array of label of word in each sentence in sentences
  label_input = np.zeros((len(sentences), max_sentence_length), dtype='int64')

  for i, sentence in enumerate(sentences):
    for j, word in enumerate(sentence):
      word_input[i][j] = word_vocab_dict[word]
      char_input_length[i][j] = len(word)
      label_input[i][j] = label_to_index.index(token_labels[i][j])
      for k, c in enumerate(word):
        if c in char_vocab_dict:
          input_zero = c.isdigit() or OOV[i][j]
          char_input[i][j][k] = char_vocab_dict['0' if input_zero else c]
        else:
          char_input[i][j][k] = UNK

  # Same placement as `.cuda()` on a GPU machine, CPU fallback elsewhere.
  target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  # Plain tensors: since the Tensor/Variable merge a Variable wrapper with
  # requires_grad=False adds nothing.
  word_input_var = torch.from_numpy(word_input).to(target_device)
  word_input_length_var = torch.LongTensor(word_input_length).to(target_device)
  char_input_var = torch.from_numpy(char_input).to(target_device)
  char_input_length_var = torch.from_numpy(char_input_length).to(target_device)
  label_var = torch.from_numpy(label_input).to(target_device)
  return word_input_var, word_input_length_var, char_input_var, char_input_length_var, label_var

def batch_from_data(X, X1, y, batch_size, random=True):
    """Yield aligned mini-batches drawn from three parallel sequences.

    Args:
        X, X1, y: sequences indexed with the same positions; ``len(y)``
            determines how many items there are.
        batch_size: number of items per batch (the last batch may be smaller).
        random: when True the items are visited in the order given by
            ``np.random.permutation``; otherwise in their original order.

    Yields:
        Tuples ``(X_batch, X1_batch, y_batch)`` of lists.
    """
    total = len(y)
    n_batches = int(np.ceil(total / float(batch_size)))
    order = np.arange(total)
    if random:
        order = np.random.permutation(total)

    for b in range(n_batches):
        start = b * batch_size
        # Every batch is full except possibly the last, which takes the rest.
        stop = start + batch_size if b < n_batches - 1 else total
        picked = order[start:stop]
        yield [X[i] for i in picked], [X1[i] for i in picked], [y[i] for i in picked]

# Model

## Char embedding

In [0]:
class char_model(nn.Module):
    """Character-level encoder: embedding followed by a bidirectional LSTM.

    Every word is encoded as the concatenation of the final hidden states of
    the two LSTM directions, i.e. a vector of size ``2 * char_lstm_hidden_size``.
    Zero-length entries (padding positions in a sentence) get an all-zero vector.
    """

    def __init__(self, char_vocab_size, char_embed_size, char_lstm_hidden_size=50):
        """
        Args:
            char_vocab_size: number of rows of the character embedding table.
            char_embed_size: size of each character embedding.
            char_lstm_hidden_size: hidden size of each LSTM direction.
        """
        super(char_model, self).__init__()
        self.char_embed = nn.Embedding(char_vocab_size, char_embed_size, padding_idx=PAD)
        self.char_lstm = nn.LSTM(char_embed_size, char_lstm_hidden_size, bidirectional=True, batch_first=True)
        self.char_lstm_hidden_size = char_lstm_hidden_size

    def forward(self, char_inp, char_input_length):
        """Encode every word of a batch of sentences.

        Args:
            char_inp: 3-D tensor of character ids; it is flattened to
                ``(-1, char_inp.size(2))`` so each row is one word.
            char_input_length: tensor holding the length of every word,
                flattened in the same order as ``char_inp``.

        Returns:
            Tensor of shape ``(char_inp.size(0), -1, 2 * char_lstm_hidden_size)``
            in the original word order.
        """
        char_input = char_inp.view(-1, char_inp.size(2))
        # pack_padded_sequence is fed sequences sorted by decreasing length.
        char_input_length_sorted, char_original_idx = char_input_length.view(-1).sort(0, descending=True)
        char_embedded = self.char_embed(char_input)
        char_embedded_sorted = char_embedded[char_original_idx]

        total_words = char_input_length_sorted.size(0)
        # After the descending sort all zero-length entries sit at the tail;
        # `last_index` is the position of the first of them (a plain int).
        last_index = total_words
        zero_positions = char_input_length_sorted.data.eq(0).nonzero()
        if zero_positions.numel() != 0:
            last_index = int(zero_positions[0][0])
        char_embedded_sorted = char_embedded_sorted[:last_index]
        char_input_length_sorted = char_input_length_sorted[:last_index]

        char_input_packed_padded = pack_padded_sequence(char_embedded_sorted, char_input_length_sorted.cpu().data.numpy(), batch_first=True)
        _, (h_n, c_n) = self.char_lstm(char_input_packed_padded)
        # Final hidden state of the forward and of the backward direction.
        char_hidden_state = torch.cat([h_n[0], h_n[1]], 1)

        if last_index != total_words:
            # Re-append one zero row per zero-length entry, created on the
            # device/dtype of the LSTM output instead of a hard-coded .cuda().
            zero_padding = torch.zeros(
                (total_words - last_index, 2 * self.char_lstm_hidden_size),
                dtype=char_hidden_state.dtype,
                device=char_hidden_state.device,
            )
            char_hidden_state = torch.cat([char_hidden_state, zero_padding], 0)

        # Undo the length sort.
        char_hidden_state = char_hidden_state[torch.argsort(char_original_idx)]
        char_hidden_state = char_hidden_state.view(char_inp.size(0), -1, char_hidden_state.size(1))
        return char_hidden_state

    def reset_parameters(self):
        """Re-initialise the embedding and LSTM parameters.

        Embedding weights ~ N(0, 0.01); LSTM weights ~ N(0, 0.1); LSTM biases 0.
        This method is not invoked by ``__init__``.
        """
        for param in self.char_embed.parameters():
            nn.init.normal_(param, mean=0, std=0.01)

        for name, param in self.char_lstm.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.)
            elif 'weight' in name:
                nn.init.normal_(param, mean=0, std=0.1)

## Self Attention

In [0]:
class selfAttention(nn.Module):
  """Word-level BiLSTM followed by a self-attention layer.

  The word embedding and the character-level word encoding are concatenated,
  run through a bidirectional LSTM, and each position then attends over the
  other positions of its sentence. The output concatenates the LSTM output
  with the attention context, giving ``4 * word_lstm_hiddden_size`` features.
  """

  def __init__(self, char_lstm_hidden_size, word_vocab_size, word_embedding_size, word_lstm_hiddden_size, word_vector):
    """
    Args:
      char_lstm_hidden_size: hidden size of one direction of the char LSTM
        (the char encoding has twice this size).
      word_vocab_size: number of rows of the word embedding table.
      word_embedding_size: size of each word embedding.
      word_lstm_hiddden_size: hidden size of one direction of the word LSTM.
      word_vector: array-like copied into the word embedding weights.
    """
    super(selfAttention, self).__init__()
    self.word_embed = nn.Embedding(word_vocab_size, word_embedding_size, padding_idx=PAD)
    self.word_lstm = nn.LSTM(2*char_lstm_hidden_size + word_embedding_size, word_lstm_hiddden_size, batch_first=True, bidirectional=True)

    self.word_linear = nn.Linear(word_lstm_hiddden_size * 2, word_lstm_hiddden_size * 2)

    self.tanh = nn.Tanh()
    self.softmax = nn.Softmax(dim=2)
    self.embedding_dropout = nn.Dropout(0.5)
    self.word_dropout = nn.Dropout(0.5)
    self.att_sm_dropout = nn.Dropout(0.5)
    self.word_embed.weight.data.copy_(torch.from_numpy(np.asarray(word_vector)))

  def forward(self, words, char_hidden_state, word_length):
    """
    Args:
      words: tensor of word ids, one row per sentence; PAD marks padding.
      char_hidden_state: per-word character encodings, concatenated with the
        word embeddings along dimension 2.
      word_length: tensor with the number of words of each sentence.

    Returns:
      Tensor whose last dimension concatenates the LSTM output and the
      attention context vector.
    """
    word_embedding = self.word_embed(words)
    word_lstm_input = torch.cat([word_embedding, char_hidden_state], 2)
    word_lstm_input = self.embedding_dropout(word_lstm_input)

    # Sort sentences by decreasing length for packing.
    word_length, word_idx = word_length.sort(0, descending=True)
    word_lstm_input = word_lstm_input[word_idx]

    word_packed_input = pack_padded_sequence(word_lstm_input, word_length.cpu().data.numpy(), batch_first=True)
    word_packed_output, _ = self.word_lstm(word_packed_input)
    word_output, _ = pad_packed_sequence(word_packed_output, batch_first=True)
    # Undo the sort; argsort of the permutation is its inverse (computed on
    # the tensor's own device, no NumPy round-trip).
    word_output = word_output[torch.argsort(word_idx)]

    word_output = self.word_dropout(word_output)
    attn_input = self.tanh(self.word_linear(word_output))

    # 1.0 at real words, 0.0 at padding; lives on the same device as `words`.
    att_padding_mask = words.data.ne(PAD).float()
    context = attn_input * att_padding_mask.unsqueeze(2)
    attn_out = context.bmm(context.transpose(1, 2))

    # The diagonal (a word scored against itself) is multiplied by 0 before
    # the softmax.
    attention_self_mask = 1 - torch.eye(words.size(1), words.size(1), dtype=attn_out.dtype, device=attn_out.device)
    attn_out = attn_out * attention_self_mask.unsqueeze(0)

    out = self.softmax(attn_out)
    # Zero the attention weights from and to padding positions.
    out = out * att_padding_mask.unsqueeze(2)
    out = out * att_padding_mask.unsqueeze(1)
    out = self.att_sm_dropout(out)
    context_v = out.bmm(word_output)
    word_output = torch.cat([word_output, context_v], 2)

    return word_output

  def reset_parameters(self):
    """Re-initialise the word LSTM: biases to 0, weights ~ N(0, 0.1).

    This method is not invoked by ``__init__``.
    """
    for name, param in self.word_lstm.named_parameters():
      if 'bias' in name:
        nn.init.constant_(param, 0.)
      elif 'weight' in name:
        nn.init.normal_(param, mean=0, std=0.1)

## CRF

In [0]:
def logsumexp(x, dim=None): #AS IS
    """Numerically stable ``log(sum(exp(x)))``.

    The maximum is subtracted before exponentiating and added back afterwards.

    Args:
        x: input tensor.
        dim: dimension to reduce; ``None`` reduces over all elements.

    Returns:
        A scalar tensor when ``dim`` is None, otherwise ``x`` with ``dim``
        removed.
    """
    if dim is None:
        peak = x.max()
        return peak + torch.log(torch.exp(x - peak).sum())

    # The keepdim variant broadcasts against x; the reduced one matches the
    # shape of the summed result.
    peak_broadcast, _ = x.max(dim, keepdim=True)
    peak_reduced, _ = x.max(dim)
    shifted_sum = torch.exp(x - peak_broadcast).sum(dim)
    return peak_reduced + torch.log(shifted_sum)


In [0]:
class CRF_Module(nn.Module):
    """Linear-chain CRF layer with input-dependent (bigram) or global transitions.

    One extra label (index ``num_labels``) is appended to the label set; it is
    used as the "previous label" of the first position.
    """

    def __init__(self, input_size, num_labels, bigram=True):
        """
        Args:
            input_size: size of the feature vector of every position.
            num_labels: number of real labels (the extra label is added here).
            bigram: when True the transition scores are predicted from the
                input by a linear layer; otherwise a single learned
                transition matrix is shared by all positions.
        """
        super(CRF_Module, self).__init__()
        self.pad_label_id = num_labels
        self.bigram = bigram
        self.input_size = input_size
        self.num_labels = num_labels + 1
        self.state_layer = nn.Linear(input_size, self.num_labels)

        if bigram:
            # transition weights are learned from the input (costs of moving from one tag to next)
            self.transition_layer = nn.Linear(input_size, self.num_labels * self.num_labels)
            self.register_parameter('transition_matrix', None)
        else:
            self.transition_layer = None
            # `Parameter` was an undefined name here, so bigram=False raised NameError.
            self.transition_matrix = nn.Parameter(torch.Tensor(self.num_labels, self.num_labels))

        self.reset_parameters()

    def forward(self, input, mask=None):
        """Compute the energy tensor.

        Args:
            input: tensor of shape ``(batch, length, input_size)``.
            mask: optional ``(batch, length)`` tensor multiplied into the
                energies of each position.

        Returns:
            Tensor of shape ``(batch, length, num_labels, num_labels)``: the
            state scores (broadcast over dimension 2) plus the transition scores.
        """
        batch, length, _ = input.size()
        out_state = self.state_layer(input).unsqueeze(2)

        if self.bigram:
            out_transition = self.transition_layer(input).view(batch, length, self.num_labels, self.num_labels)
            net_output = out_transition + out_state
        else:
            net_output = self.transition_matrix + out_state

        if mask is not None:
            net_output = net_output * mask.unsqueeze(2).unsqueeze(3)
        return net_output

    def reset_parameters(self):
        """Initialise biases to 0, the transition layer with Xavier-uniform
        weights, or the global transition matrix from a standard normal."""
        nn.init.constant_(self.state_layer.bias, 0.)
        if self.bigram:
            nn.init.xavier_uniform_(self.transition_layer.weight)
            nn.init.constant_(self.transition_layer.bias, 0.)
        else:
            nn.init.normal_(self.transition_matrix)

    def _viterbi_decode(self, input, mask, leading_symbolic=0):
        """Viterbi decoding of the best label sequence.

        Args:
            input: tensor of shape ``(batch, length, input_size)``.
            mask: ``(batch, length)`` mask passed to ``forward``.
            leading_symbolic: number of leading labels excluded from decoding;
                it is added back to the returned indices.

        Returns:
            Long tensor of shape ``(batch, length)`` with the decoded labels.
        """
        energy = self.forward(input, mask=mask).data
        energyTrans = energy.transpose(0, 1)
        # Drop the excluded leading labels and the extra final label.
        energyTrans = energyTrans[:, :, leading_symbolic:-1, leading_symbolic:-1]

        w_len, batch_size, num_label, _ = energyTrans.size()
        # Work buffers live on the device of the energies (no hard-coded CUDA).
        work_device = energy.device
        batch_index = torch.arange(0, batch_size, device=work_device).long()

        curr_mat = torch.zeros([w_len, batch_size, num_label, 1], dtype=energy.dtype, device=work_device)
        pointer = torch.zeros(w_len, batch_size, num_label, dtype=torch.long, device=work_device)
        back_pointer = torch.zeros(w_len, batch_size, dtype=torch.long, device=work_device)

        # The first position transitions from the extra label (last row).
        curr_mat[0] = energy[:, 0, -1, leading_symbolic:-1].unsqueeze(2)
        pointer[0] = -1
        for t in range(1, w_len):
            prev_mat = curr_mat[t - 1]
            temp_mat, pointer[t] = torch.max(energyTrans[t] + prev_mat, dim=1)
            curr_mat[t] = temp_mat.unsqueeze(2)

        _, back_pointer[-1] = torch.max(curr_mat[-1].squeeze(2), dim=1)
        for t in reversed(range(w_len - 1)):
            pointer_last = pointer[t + 1]
            back_pointer[t] = pointer_last[batch_index, back_pointer[t + 1]]

        return back_pointer.transpose(0, 1) + leading_symbolic

    def loss(self, input, target, mask=None):
        """Negative log-likelihood of ``target`` for every sequence.

        Args:
            input: tensor of shape ``(batch, length, input_size)``.
            target: ``(batch, length)`` tensor of gold label ids.
            mask: optional ``(batch, length)`` mask; where it is 0 the
                partition of the previous step is carried over unchanged.

        Returns:
            Tensor of shape ``(batch,)``: log-partition minus the energy of
            the gold path.
        """
        batch, length, _ = input.size()
        energy = self.forward(input, mask=mask)
        energy_transpose = energy.transpose(0, 1)
        target_transpose = target.transpose(0, 1)
        mask_transpose = None
        if mask is not None:
            mask_transpose = mask.unsqueeze(2).transpose(0, 1)

        partition = None

        work_device = energy.device
        batch_index = torch.arange(0, batch, device=work_device).long()
        # Every sequence starts from the extra label.
        prev_label = torch.zeros(batch, dtype=torch.long, device=work_device).fill_(self.num_labels - 1)
        tgt_energy = torch.zeros(batch, dtype=energy.dtype, device=work_device)

        for t in range(length):
            curr_energy = energy_transpose[t]
            if t == 0:
                partition = curr_energy[:, -1, :]
            else:
                partition_new = logsumexp(curr_energy + partition.unsqueeze(2), dim=1)
                if mask_transpose is None:
                    partition = partition_new
                else:
                    mask_t = mask_transpose[t]
                    partition = partition + (partition_new - partition) * mask_t
            tgt_energy += curr_energy[batch_index, prev_label, target_transpose[t].data]
            prev_label = target_transpose[t].data

        return logsumexp(partition, dim=1) - tgt_energy



## wrapper code

In [0]:
class Attention_LSTM_CRF(nn.Module):
    """Full tagger: char encoder -> word BiLSTM with self-attention -> CRF.

    Optionally concatenates a fixed-size per-word category feature vector
    (looked up by word id) to the attention output before the CRF.
    """

    def __init__(self, char_vocab_size, char_embed_size, word_vocab_size, word_embedding_size, word_lstm_hiddden_size, word_vector, num_labels, cat_vector=None, cat=0, bigram=True, char_lstm_hidden_size=50):
        """
        Args:
            char_vocab_size, char_embed_size, char_lstm_hidden_size:
                forwarded to ``char_model``.
            word_vocab_size, word_embedding_size, word_lstm_hiddden_size,
            word_vector: forwarded to ``selfAttention``.
            num_labels: number of labels given to ``CRF_Module``.
            cat_vector: optional array-like of per-word category features,
                copied into an embedding table of width ``cat``.
            cat: width of the category features; only takes effect when
                ``cat_vector`` is given.
            bigram: stored on the instance.
                NOTE(review): the value is not forwarded to ``CRF_Module``,
                which therefore always uses its own default — confirm
                whether that is intended.
        """
        super(Attention_LSTM_CRF, self).__init__()
        self.char_vocab_size = char_vocab_size
        self.char_embed_size = char_embed_size
        self.word_vocab_size = word_vocab_size
        self.word_embedding_size = word_embedding_size
        self.word_lstm_hiddden_size = word_lstm_hiddden_size
        self.char_lstm_hidden_size = char_lstm_hidden_size
        self.word_vector = word_vector

        # Width of the category features actually concatenated in forward().
        self.cat = 0
        if cat_vector is not None:
            self.cat_vector = cat_vector
            self.cat_embed = nn.Embedding(word_vocab_size, cat, padding_idx=PAD)
            self.cat_embed.weight.data.copy_(torch.from_numpy(np.asarray(cat_vector)))
            self.cat = cat

        # crf vars
        self.num_labels = num_labels
        self.bigram = bigram

        self.charModel = char_model(self.char_vocab_size, self.char_embed_size, self.char_lstm_hidden_size)
        self.Attention = selfAttention(self.char_lstm_hidden_size, self.word_vocab_size, self.word_embedding_size, self.word_lstm_hiddden_size, self.word_vector)
        # Size the CRF from self.cat (not the raw `cat` argument): forward()
        # only appends category features when self.cat != 0, so using `cat`
        # mis-sized the CRF whenever `cat` was passed without `cat_vector`.
        self.CRF = CRF_Module(self.word_lstm_hiddden_size*4 + self.cat, num_labels)

    def forward(self, words, input, word_length, char_input_length, target, hidden=None):
        """Run the tagger on one batch.

        Args:
            words: tensor of word ids (PAD at padding positions).
            input: tensor of character ids for every word.
            word_length: number of words of each sentence.
            char_input_length: number of characters of each word.
            target: gold label ids.
            hidden: unused.

        Returns:
            ``(loss, prediction)`` — the CRF loss averaged over the batch and
            the Viterbi-decoded labels (decoded with ``leading_symbolic=1``).
        """
        charOut = self.charModel(input, char_input_length) #basically char_hidden_state
        AttentionOut = self.Attention(words, charOut, word_length)
        #Concat with category embedding
        if self.cat != 0:
            cat_embedding = self.cat_embed(words)
            AttentionOut = torch.cat([AttentionOut, cat_embedding], 2)

        padding_mask = words.ne(PAD).float()
        CRFLossOut = self.CRF.loss(AttentionOut, target, padding_mask).mean()
        CRFPredict = self.CRF._viterbi_decode(AttentionOut, padding_mask, 1)

        return CRFLossOut, CRFPredict

## Training and Validation

In [0]:
def training(en_to_spanish , char_vocab_dict, model, optimizer, lr, epochs, de_to_spanish=None):
  """Train ``model`` on the translated training corpus.

  Every epoch iterates over shuffled mini-batches of 16 sentences, clips the
  gradient norm to 5, prints the running loss every 200 batches and the
  epoch summary, updates the learning rate, and evaluates on the validation
  set. The model weights are written to "reimplemented_baseline_bestf1.pt"
  whenever the validation F1 improves.

  Relies on the module-level names ``train_paths``, ``LABEL_INDEX``, ``PAD``
  and ``word_to_index``.
  NOTE(review): ``word_to_index`` is not defined in the visible part of the
  notebook; it has to exist as a global before this function is called.

  Args:
    en_to_spanish: translation dict for the English training files.
    char_vocab_dict: dict character -> character id.
    model: module called as ``model(words, chars, word_lengths, char_lengths,
      labels)`` and returning ``(loss, prediction)``.
    optimizer: optimizer over the model parameters.
    lr: initial learning rate used for the decay schedule.
    epochs: number of epochs.
    de_to_spanish: optional translation dict for the second source language.
  """
  best_f1_score = 0.0

  # The training corpus is the same in every epoch, so it is read once
  # instead of being re-parsed from disk at the start of each epoch.
  sentences, OOV, labels = data_to_words_sentences(train_paths, en_to_spanish, False, de_to_spanish) #whole data

  for epoch in range(epochs):
      labels_global = []
      pred_global = []

      model.train()

      epoch_loss = 0
      total = 0
      correct = 0
      batch = 0

      for bsentences, bOOV, y in batch_from_data(sentences, OOV, labels, 16):
          word_input, word_length_input, char_input, char_length_input, label_input = get_input(word_to_index, bsentences, bOOV, char_vocab_dict, y, LABEL_INDEX)
          optimizer.zero_grad()
          true_labels = label_input.contiguous().view(-1)

          loss, predict = model(word_input, char_input, word_length_input, char_length_input, label_input)
          predict = predict.contiguous().view(-1)

          # Padding positions are excluded from the accuracy and F1 statistics.
          total +=  true_labels.data.ne(PAD).float().sum()
          pred_correct = predict.eq(true_labels.data).masked_select(true_labels.ne(PAD).data).float().sum()

          loss.backward()
          # In-place variant; `clip_grad_norm` (no underscore) is deprecated.
          torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
          optimizer.step()

          epoch_loss += loss.item()
          correct += pred_correct
          batch+=1

          labels_global.extend(true_labels.masked_select(true_labels.ne(PAD).data).cpu().data.numpy())
          pred_global.extend(predict.masked_select(true_labels.ne(PAD).data).cpu().data.numpy())

          if batch%200==0:
            print("Batch {} loss: {:.2f}".format(batch, epoch_loss/batch))

      f1 = f1_score(labels_global, pred_global, average='macro')

      print("Epoch {} training loss: {:.4f}, training accuracy: {:.4f}, f1 score {:.2f}".format(epoch, epoch_loss/batch, correct * 100.0/total, f1))
      # NOTE(review): the already-decayed lr is divided again each epoch, so
      # the decay compounds — confirm this is the intended schedule.
      lr = lr / (1.0 + epoch * 0.05) #decay=0.05
      for param_group in optimizer.param_groups:
          param_group['lr'] = lr
      val_f1 = evaluate(en_to_spanish, char_vocab_dict, model, de_to_spanish)

      # store at best f1
      if val_f1 > best_f1_score :
        best_f1_score = val_f1
        print("Saving model on best val F1 so far " + str(val_f1))
        torch.save(model.state_dict(), "reimplemented_baseline_bestf1.pt")

def evaluate(en_to_spanish, char_vocab_dict, model, de_to_spanish = None):
    """Score `model` on the validation files and return the macro-averaged F1.

    Reads the module-level names `val_paths`, `word_to_index`, `LABEL_INDEX`
    and `PAD`. Switches the model to eval mode and does not switch it back.
    Prints the mean batch loss, token accuracy (in percent) and F1.

    Args:
        en_to_spanish: forwarded to `data_to_words_sentences`.
        char_vocab_dict: forwarded to `get_input`.
        model: called as `model(word_input, char_input, word_length_input,
            char_length_input, label_input)`; must return `(loss, predict)`.
        de_to_spanish: optional, forwarded to `data_to_words_sentences`.

    Returns:
        Macro-averaged F1 (`sklearn.metrics.f1_score`) computed over every
        position whose gold label is not PAD.
    """
    model.eval()

    correct = 0.0
    total = 0.0
    test_loss = 0.0
    batch = 0

    labels_global = []
    pred_global = []

    sentences, OOV, labels = data_to_words_sentences(val_paths, en_to_spanish, True, de_to_spanish) #whole data

    # No backward pass happens during evaluation, so disable autograd to avoid
    # building (and holding on to) a computation graph for every batch.
    with torch.no_grad():
        for bsentences, bOOV, y in batch_from_data(sentences, OOV, labels, 16):
            word_input, word_length_input, char_input, char_length_input, label_input = get_input(word_to_index, bsentences, bOOV, char_vocab_dict, y, LABEL_INDEX)
            loss, predict = model(word_input, char_input, word_length_input, char_length_input, label_input)
            test_loss += loss.item()
            batch += 1

            # Flatten predictions and gold labels to 1-D so they can be compared position by position.
            predict = predict.contiguous().view(-1)
            true_labels = label_input.contiguous().view(-1)

            # Padding positions are excluded from accuracy and F1.
            not_pad = true_labels.ne(PAD)

            total += not_pad.float().sum().item()
            correct += predict.eq(true_labels).masked_select(not_pad).float().sum().item()

            labels_global.extend(true_labels.masked_select(not_pad).cpu().numpy())
            pred_global.extend(predict.masked_select(not_pad).cpu().numpy())

    # Report nan instead of dividing by zero when no non-PAD label was seen.
    test_acc = correct * 100.0 / total if total else float("nan")

    f1 = f1_score(labels_global, pred_global, average='macro')

    print("loss: {:.4f} eval acc: {:.4f} | f1 {:.4f}".format(test_loss/batch, test_acc, f1))
    return f1

def write_to_results(en_to_spanish, char_vocab_dict, model, de_to_spanish=None):
    """Run `model` over the test files and write token / gold / predicted lines.

    Reads the module-level names `test_paths`, `word_to_index`, `LABEL_INDEX`
    and `PAD`. Switches the model to eval mode and does not switch it back.
    Output goes to "results_new_edit.txt", one tab-separated line per token:
    token, gold label, predicted label.

    Args:
        en_to_spanish: forwarded to `data_to_words_sentences`.
        char_vocab_dict: forwarded to `get_input`.
        model: called as `model(word_input, char_input, word_length_input,
            char_length_input, label_input)`; must return `(loss, predict)`.
        de_to_spanish: optional, forwarded to `data_to_words_sentences`.
    """
    model.eval()

    labels_global = []
    pred_global = []

    sentences, OOV, labels = data_to_words_sentences(test_paths, en_to_spanish, True, de_to_spanish) #whole data

    # Inference only: disable autograd so no computation graph is built per batch.
    with torch.no_grad():
        # NOTE(review): random=False presumably keeps batches in input order so the
        # flattened tokens below line up with the collected labels - confirm
        # against batch_from_data.
        for bsentences, bOOV, y in batch_from_data(sentences, OOV, labels, 16, random=False):
            word_input, word_length_input, char_input, char_length_input, label_input = get_input(word_to_index, bsentences, bOOV, char_vocab_dict, y, LABEL_INDEX)
            # The loss is not needed when only writing predictions.
            _, predict = model(word_input, char_input, word_length_input, char_length_input, label_input)

            predict = predict.contiguous().view(-1)
            true_labels = label_input.contiguous().view(-1)

            # Drop padding positions so only real tokens are written out.
            not_pad = true_labels.ne(PAD)

            labels_global.extend(true_labels.masked_select(not_pad).cpu().numpy())
            pred_global.extend(predict.masked_select(not_pad).cpu().numpy())

    sentence_tokens = [token for sentence in sentences for token in sentence]

    with open("results_new_edit.txt", "w") as f:
        # Index-based lookup is kept on purpose: if fewer labels than tokens were
        # collected this raises IndexError instead of silently truncating.
        for i, token in enumerate(sentence_tokens):
            f.write(token + "\t" + LABEL_INDEX[labels_global[i]] + "\t" + LABEL_INDEX[pred_global[i]] + "\n")


# Training and Evaluating

## Loading data

In [0]:
# Choose the embedding file and the translation dictionaries.
# Defaults cover the English-only setup; `use_de` switches to the UMWE files
# and additionally loads the "ned" translation dictionary.
embedding_path = "spanish.glove.gigaword_wiki.100d.magnitude"
path = "translations_bi.txt"
de_es_translation_dict = None
if use_de:
  embedding_path = "umwe-esp-smallish.magnitude"
  de_es_translation_dict = language_to_spanish_dict("translations_umwe_ned.txt")
  path = "translations_umwe_eng.txt"
en_es_translation_dict = language_to_spanish_dict(path)

# The character vocabulary is built over the train, validation and test files together.
all_paths = [*train_paths, *val_paths, *test_paths]
char_vocab_dict = create_char_index(all_paths, en_es_translation_dict, False, de_es_translation_dict)
matrix, word_to_index = get_indexed_word_embeddings(en_es_translation_dict, embedding_path, de_es_translation_dict) #word_vector, word_

use_distances = True

extraInfo = None
numCat = 0
if use_distances:
  # change to True to use distances to category means, False to use ratio of previous distributions
  extraInfo = get_distributional_info(matrix, word_to_index, en_es_translation_dict, True, de_es_translation_dict)
  numCat = len(CAT) + 1
  # Append the extra columns only when they were computed: with
  # use_distances = False, extraInfo stays None and np.concatenate would fail.
  matrix = np.concatenate((matrix, extraInfo), axis = 1)

## Training

In [12]:
# Single source for the learning rate: it is given to the optimizer and also
# passed to `training`, so the two values can no longer drift apart.
learning_rate = 0.015

# NOTE(review): 305 presumably has to equal the column count of `matrix`
# (embedding columns plus the columns concatenated in the loading cell), and
# `None, 0` presumably disables the separate extra-info input because those
# columns are already part of `matrix` - confirm against Attention_LSTM_CRF.
# `.to(device)` reuses the device chosen in the setup cell (cuda:0 when
# available, otherwise cpu) instead of hard-coding CUDA.
model = Attention_LSTM_CRF(len(char_vocab_dict), 25, len(matrix), 305, 200, matrix, len(LABEL_INDEX), None, 0).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

training(en_es_translation_dict, char_vocab_dict, model, optimizer, learning_rate, 30, de_es_translation_dict)




Batch 200 loss: 5.64
Batch 400 loss: 4.07
Batch 600 loss: 3.45
Batch 800 loss: 3.07
Batch 1000 loss: 2.81
Batch 1200 loss: 2.64
Batch 1400 loss: 2.50
Batch 1600 loss: 2.37
Batch 1800 loss: 2.28
Batch 2000 loss: 2.19
Batch 2200 loss: 2.12
Batch 2400 loss: 2.05
Batch 2600 loss: 1.99
Epoch 0 training loss: 1.9394, training accuracy: 94.0562, f1 score 0.60
loss: 5.5841 eval acc: 89.7814 | f1 0.4742
Saving model on best val F1 so far 0.4741690841688502
Batch 200 loss: 0.96
Batch 400 loss: 0.98
Batch 600 loss: 0.96
Batch 800 loss: 0.95
Batch 1000 loss: 0.94
Batch 1200 loss: 0.94
Batch 1400 loss: 0.93
Batch 1600 loss: 0.91
Batch 1800 loss: 0.89
Batch 2000 loss: 0.89
Batch 2200 loss: 0.88
Batch 2400 loss: 0.87
Batch 2600 loss: 0.86
Epoch 1 training loss: 0.8606, training accuracy: 97.2047, f1 score 0.80
loss: 4.7005 eval acc: 92.6062 | f1 0.5618
Saving model on best val F1 so far 0.5618189065907966
Batch 200 loss: 0.55
Batch 400 loss: 0.58
Batch 600 loss: 0.57
Batch 800 loss: 0.57
Batch 1000 l

## Saving model

In [0]:
## saving code integrated above in training loop
# torch.save(model.state_dict(), "reimplemented_baseline_batch16.pt")

## Open model and write to results

In [0]:
# As written, this cell writes results with the in-memory `model`; no checkpoint is loaded.
# To use a saved checkpoint instead, rebuild the model and load
# "reimplemented_baseline_bestf1.pt" (the file the training loop saves whenever the
# validation F1 improves) by enabling the two lines below.
# NOTE(review): the constructor arguments below (100, extraInfo, numCat) differ from the
# ones in the training cell (305, None, 0); they presumably have to match the checkpoint
# for load_state_dict to succeed - confirm before enabling.
# model = Attention_LSTM_CRF(len(char_vocab_dict), 25, len(matrix), 100, 200, matrix, len(LABEL_INDEX), extraInfo, numCat).cuda()
# model.load_state_dict(torch.load("reimplemented_baseline_bestf1.pt"))

write_to_results(en_es_translation_dict, char_vocab_dict, model, de_es_translation_dict)