In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pickle
import os
import sys
import time
from pprint import pprint
from collections import defaultdict
import math
import configparser
from itertools import chain
from subprocess import Popen, PIPE
from sklearn import metrics

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import random
import numpy as np
# import torch.autograd as autograd
import torch.optim as optim
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

from utils_data import remap_labels
from utils_tensor import sort_variables_lengths, sort_variables_back

### Object for input (words) and output (labels) sequence 
We'll need a unique index per word to use as the inputs and targets of the networks later. To keep track of all this we will use a helper class called Sequence which has word → index (word2index) and index → word (index2word) dictionaries, as well as a count of each word (word2count).

In [2]:
class Sequence:
    def __init__(self, name, is_output):
        self.name = name
        self.is_output = is_output
        if is_output:            
            self.word2idx = {}
            self.idx2word = {}
            self.words = []
            self.n_words = 0

        else:
            self.word2idx = {'<UNK>':1}
            self.idx2word = {1:'<UNK>'}
            self.words = ['<UNK>']
            self.n_words = 1
            
            # creating alphabet dict for character embedding
            self.char2idx = {}
            self.idx2char = {}
            self.chars = []
            self.n_chars = 0
            self.max_word_len = 0
            self.max_word = ''
        
        self.word_count = {}
        self.unk_list = []
        self.char_count = {}
        
    def add_sentence(self, sentence):
        for word in sentence.split(' '):
                self.add_word(word)

    def add_word(self, word):
        if word not in self.words:
            self.words.append(word)
            self.word_count[word] = 1
            self.n_words += 1

        else:
            self.word_count[word] += 1
        
        if not self.is_output:
            word_len = 0
            for char in word:
                self.add_char(char)
                word_len+=1
                if word_len > self.max_word_len:
                    self.max_word_len = word_len
                    self.max_word = word
                    
    def add_char(self, char):
        if char not in self.chars:
            self.chars.append(char)
            self.char_count[char] = 1
            self.n_chars += 1
        else:
            self.char_count[char] += 1
            
    def make_char_dict(self):
        for idx, char in enumerate(self.chars):
            self.char2idx[char] = idx + 1
            self.idx2char[idx+1] = char
            
    def make_word_dict(self, is_output = False, shuffle = True):
        
        if shuffle:
            if is_output:
                preserved_tags = self.words[:1]
                actual_words = self.words[1:]
                random.shuffle(actual_words)
                self.words = preserved_tags + actual_words

            else:
                preserved_tags = self.words[:3]
                actual_words = self.words[3:]
                random.shuffle(actual_words)
                self.words = preserved_tags + actual_words
                
        for idx, word in enumerate(self.words):
            self.word2idx[word] = idx + 1
            self.idx2word[idx+1] = word

### Simple tokenization functions

In [3]:
# turn Unicode characters to ASCII
def unicode2ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalize_string(s):
    s = unicode2ascii(s.strip())
    # substituting digits with 0, but keep the format
    s = re.sub(r"\d", r"0", s)
    # removing non-letter chars:
    # s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [4]:
def read_files(dataset_filepath, data_obj_file, resume_training = False):
    """
    reading data (train, valid and test) file, spliting to words and labels pairs
    return:
    all_pairs: dictionary ['train']['valid']['test'] containing lists of sentence and label sequence pairs.
               e.g.: [u'SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .',
                      u'O O O O O O O B-PER O O O O']        
    """
    # if train from scratch, read sequences from dataset files
    if resume_training == False:
        print('Reading files...\nInitiating Sequence objects...')
        input_seq = Sequence('sentences', is_output=False)
        output_seq = Sequence('labels', is_output=True)

    #resume training from a checkpoint, read sequences from saved object
    else:
        print('Resume training...\nLoading Sequence objects... ')
        (input_seq, output_seq, _) = load_data_from_file(data_obj_file)

    all_pairs = {}
    for dataset_type in ['train', 'valid', 'test']:
        data_filepath = os.path.join(dataset_filepath, dataset_type + '.txt')
        print('reading lines in {:s} file (path: {:s})'.format(dataset_type, data_filepath))
        pairs = []
        for line in open(data_filepath):
            pairs.append([normalize_string(s) for s in line.strip().split('\t')])
        all_pairs[dataset_type] = pairs
        print('Read {:d} pairs in {:s} set'.format(len(all_pairs[dataset_type]), dataset_type))

    return input_seq, output_seq, all_pairs


The full process for preparing the data is:

- Read text file and split into lines  
- Split lines into pairs and normalize  
- Make word (or label, char) lists from sentences in pairs 


In [5]:
#Prepare Data
def prepare_data(dataset_filepath, data_obj_file, resume_training = False):
    '''
    Prepare data for train, valid and test
    :param dataset_filepath: str 
    :param dataset_type: str, train/valid/test set
    :param data_obj_file: str, save path for the prepared data object
    :param resume_training: bool, if resume training from a check point
    :return:
    '''
    senteces, labels, all_pairs = read_files(dataset_filepath, data_obj_file, resume_training)
    # train_pairs, val_pair, test_pairs = pairs
    all_lengths = {}
    all_max_len = {} # for the rnn.pack_padded_sequence
    
    for dataset_type, pairs in all_pairs.iteritems():
        all_lengths[dataset_type] = [len(pairs[x][0].strip().split(' ')) for x in range(len(pairs))]
        all_max_len[dataset_type] = max(all_lengths[dataset_type])
        print('Counting sentence length in {:s} set...'.format(dataset_type))
        print("For {:s} set, maximum length of sentence sentence : {:d}".format(dataset_type, all_max_len[dataset_type]))

    for dataset_type in all_pairs.keys():
        if not resume_training:
            print('Processing {:s} set, adding words to the dictionary...'.format(dataset_type))
            for pair in all_pairs[dataset_type]:
                senteces.add_sentence(pair[0])
                labels.add_sentence(pair[1])
            # break here to see the senteces.chars char2idx idx2char
            print('shuffle word dictionary...')
            senteces.make_word_dict(is_output=False)
            senteces.make_char_dict()
            labels.make_word_dict(is_output=True)
            print('shuffle char dictionary...')
            senteces.make_char_dict()
            print('Done!')
            print('Saving Sequence objects to file {:s}'.format(data_obj_file))
            save_data_to_file(senteces, labels, all_pairs, data_obj_file)
        else:
            print('Resume training, not adding any words to Lang objects.')
            
    print("Counted vocab size:")
    print(senteces.name, senteces.n_words)
    print(labels.name, labels.n_words)
    print(labels.idx2word)
    print('idx2word for senteces:')
    print([senteces.idx2word[i] for i in range(1, 31)])
    print('idx2char for senteces:')
    print([senteces.idx2char[i] for i in range(1,31)])
    return senteces, labels, all_pairs, all_max_len, all_lengths


In [6]:
def save_data_to_file(input_seq, output_seq, pairs, fname):
    with open(fname, 'wb') as f:
        pickle.dump((input_seq,output_seq, pairs), f)
    print('saved Sequence object to {:s}'.format(fname))

def load_data_from_file(fname):
    with open(fname, 'rb') as f:
        data= pickle.load(f)
    return data

### Turning training data into Tensors
To train we need to turn the sentences into something the neural network can understand, which of course means numbers. Each sentence will be split into words and turned into a LongTensor which represents the index (from the Sequence indexes made earlier) of each word.

In [7]:
def get_batch(input_seq, output_seq, pairs, batch_size, max_len, lengths):
    data_size = len(pairs)
    num_batches = int((data_size - 1)/batch_size) + 1
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        yield variables_from_pairs(input_seq, output_seq, pairs[start_index:end_index], max_len), lengths[start_index:end_index]

def variables_from_pairs(input_seq, output_seq, pairs, max_len):
    x_variables = []
    y_variables = []
    x_variables_char = []
    for pair in pairs:
        # get varaible representing sentence sequence, label sequence and character sequence
        # from an input/output pair
        x_variable, y_variable, x_variable_char = variables_from_pair(input_seq, output_seq,
                                                                      pair, max_len)
        x_variables.append(x_variable)
        y_variables.append(y_variable)
        x_variables_char.append(x_variable_char)
    return x_variables, y_variables , x_variables_char

def variables_from_pair(input_seq, output_seq, pair, max_len):
    '''
    Given an input/output pair, turn them into pytorch Variable
    :param input_seq: Sequence object
    :param output_seq: Sequence object
    :param pair: list of str
    :param max_len: int
    :return: varaible representing sentence sequence, label sequence and character sequence
    '''    
    input_variable = variable_from_sentence(input_seq, pair[0], max_len)
    target_variable = variable_from_sentence(output_seq, pair[1], max_len)
    input_variable_char = []
    cnt = 0
    #getting char variable:
    for word in pair[0].split(' '):
        input_variable_char.append(variable_from_word(input_seq, word, input_seq.max_word_len))
        cnt += 1
    new_var = [0] * input_seq.max_word_len
    for i in range(max_len - cnt): #make input_variable_char the length of max_len
        input_variable_char.append(new_var)
        # input_variable_char should be list of list
    return (input_variable, target_variable, input_variable_char)
    
def variable_from_sentence(sequence, sentence, max_len):
    '''
    :param sequence: Sequence object
    :param sentence: str, a sentence, each word seperated by ' '
    :param max_len: int, max len of the sentence (i.e. max len of the returned list),
                    sentences which has less words than max len are padded with 0 in the end.
    :return result: list of int, length: max_len
    '''
    indexes = indexes_from_sentence(sequence, sentence) # a sentence as indices.
    if len(indexes) < max_len:
        indexes = indexes + [0] * (max_len - len(indexes))
    return indexes

def indexes_from_sentence(sequence, sentence):
    '''
    :param sequence: Sequence object
    :param sentence: str, a sentence, each word seperated by ' '
    :return idxes: int list, words in sentence mapped to indexes
    '''
    idxes = []
    for word in sentence.split(' '):
        try:
            idx = sequence.word2idx[word]
        except KeyError:
            try:
                idx = sequence.word2idx['<UNK>']
                sequence.unk_list.append(word)
            except KeyError:
                print('keyerror: {:s}'.format(word))
        idxes.append(idx)
    return idxes

def variable_from_word(sequence, word, max_word_len):
    '''
    :param sequence: Sequence object
    :param word: str, a word
    :param max_word_len: int, max len of a word
    :return indexes: list of int, length: max_len
    '''
    indexes = indexes_from_word(sequence, word)
    if len(indexes) < max_word_len:
        indexes = indexes + [0] * (max_word_len - len(indexes))
    return indexes

def indexes_from_word(sequence, word):
    '''
    :param sequence: Sequence object
    :param word: str, a word
    :return idxes: list of int, each char in the word mapped to indexes
    '''
    idxes = []
    for char in word:
        idx = sequence.char2idx[char]
        idxes.append(idx)
    return idxes


### Use pretrained embedding

In [8]:
def load_pretrained_word_embeddings(parameters, input_seq):
    emb_file = parameters['embedding_filepath']
    f = open(emb_file, 'r')
    line = f.readline()
    emb_size = len(line.strip().split(' ')) - 1
    f.close()
    print('loading word embeddings from file {:s}...\n Embedding size: {:d}'.format(
        emb_file, emb_size
    ))
    
    embbedding_weights = np.zeros((input_seq.n_words + 1, emb_size))
    pretrained_embeddings = {}
    with open(emb_file, 'r') as f:
        for line in f:
            splited = line.strip().split(' ')
            if len(splited) == 0:
                continue
            else:
                pretrained_embeddings[splited[0]] = splited[1:]
    direct_map = 0
    lowercase_map = 0
    random_init = 0
    map_to_unk = 0
    low_frequency_word = []
    others = []
    words_without_pretrained_embeddings = []
    for word in input_seq.words:
        if word in ['<SOS>', '<EOS>', '<UNK>']:
            continue
        elif word in pretrained_embeddings:
            vector = np.array(pretrained_embeddings[word], dtype=float)
            embbedding_weights[input_seq.word2idx[word]] = vector
            direct_map += 1
        elif word.lower() in pretrained_embeddings:
            vector = np.array(pretrained_embeddings[word.lower()], dtype=float)
            embbedding_weights[input_seq.word2idx[word]] = vector
            lowercase_map += 1
        elif input_seq.word_count[word] > 1:
            # not low frequency word, but in
            #random init
            vector = np.random.uniform(-0.25, 0.25, emb_size)
            embbedding_weights[input_seq.word2idx[word]] = vector
            random_init += 1
        elif input_seq.word_count[word] <= 1:
            low_frequency_word.append(word)
        else:
            others.append(word)
    
    print('Map {:d} tokens with pretrained embeddings.'.format(direct_map+lowercase_map))
    print('direct map: {:d}\nlower-case map: {:d}\n'.format(direct_map, lowercase_map))
    print('Randomly initialized {:d} token embeddings.'.format(random_init))
    print('{:d} low_frequency_word: '.format(len(low_frequency_word)))
    
    return embbedding_weights, emb_size

### Define some related parameters / hyperparameters

In [9]:
parameters = {
    'n_epochs': 200,
    'patience': 20,
    'save_best_epoch': True,
    'resume': False,
    'batch_size': 128,
    'clip': 5,
    'use_pretrained_word_embedding': True,
    'use_char_embedding': True,
    'char_emb_size': 25,
    'hidden_size': 128,
    'n_layers': 1,
    'dropout_p': 0.5,
    'learning_rate': 0.05,
    'lang_in': 'sent',
    'lang_out': 'ner',
    'output_model_dir': '/home/liah/ner/seq2seq_for_ner/src/model/tutorial', 
    'pred_output_dir': '/home/liah/ner/seq2seq_for_ner/src/result/tutorial',
    'dataset_filepath' : '/home/liah/ner/seq2seq_for_ner/src/data/conll03-ner-org-bioes-5k/train1',
    'baseline_folder': 'none',
    'data_obj_file_name': 'obj_sent_ner_step1',
    'embedding_filepath': '/home/liah/word_vectors/eng/glove.6B.100d.txt',
    'model_path': ''
    
}

In [10]:
#load data
data_obj_file = os.path.join(parameters['output_model_dir'], parameters['data_obj_file_name'])

input_seq, output_seq, \
all_pairs, all_max_len, all_lengths = prepare_data(dataset_filepath =parameters['dataset_filepath'],
                                               data_obj_file=data_obj_file)
print('Example pairs:')
for data_type in all_max_len.keys():
    print('\tExample {:s} sentence sequence: '.format(data_type), all_pairs[data_type][0][0])
    print('\tExample {:s} label sequence: '.format(data_type), all_pairs[data_type][0][1])

Reading files...
Initiating Sequence objects...
reading lines in train file (path: /home/liah/ner/seq2seq_for_ner/src/data/conll03-ner-org-bioes-5k/train1/train.txt)
Read 5000 pairs in train set
reading lines in valid file (path: /home/liah/ner/seq2seq_for_ner/src/data/conll03-ner-org-bioes-5k/train1/valid.txt)
Read 3250 pairs in valid set
reading lines in test file (path: /home/liah/ner/seq2seq_for_ner/src/data/conll03-ner-org-bioes-5k/train1/test.txt)
Read 3453 pairs in test set
Counting sentence length in test set...
For test set, maximum length of sentence sentence : 124
Counting sentence length in train set...
For train set, maximum length of sentence sentence : 78
Counting sentence length in valid set...
For valid set, maximum length of sentence sentence : 109
Processing test set, adding words to the dictionary...
shuffle word dictionary...
shuffle char dictionary...
Done!
Saving Sequence objects to file /home/liah/ner/seq2seq_for_ner/src/model/tutorial/obj_sent_ner_step1
saved S

### Bidirectioanl LSTM model for Named Entity Recognition

In [11]:
class BiLSTM(nn.Module):
    def __init__(self, input_size, emb_size, hidden_size,
                 n_layers=1, dropout=0.1, pretrained_weights=None, output_size = 10,
                 use_char_embedding = False, char_alphabet_size = 0, char_emb_size = 25):
        super(BiLSTM, self).__init__()

        self.input_size = input_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout_p = dropout
        self.use_char_embedding = use_char_embedding

        self.embedding = nn.Embedding(input_size, emb_size)
        if pretrained_weights is not None:
            weights = torch.from_numpy(pretrained_weights).type(torch.FloatTensor)
            assert weights.size(0) == input_size and weights.size(1) == emb_size
            weights = weights.cuda() if use_cuda else weights
            self.embedding.weight = nn.Parameter(weights)
        
        # use character embedding
        if self.use_char_embedding and char_alphabet_size != 0:
            self.char_alphabet_size = char_alphabet_size
            self.char_emb_size = char_emb_size
            self.char_embedding = nn.Embedding(self.char_alphabet_size, self.char_emb_size)
            
            self.char_lstm = nn.LSTM(char_emb_size, char_emb_size,
                                     n_layers, dropout = 0.1,
                                     bidirectional=True)
            
        self.dropout = nn.Dropout(p=self.dropout_p)
        if not self.use_char_embedding:
            self.lstm = nn.LSTM(emb_size, hidden_size,
                            n_layers, dropout=0.1,
                            bidirectional=True)
        else:
            self.lstm = nn.LSTM(emb_size + char_emb_size * 2, hidden_size,
                                n_layers, dropout=0.1,
                                bidirectional=True)

        self.output_size = output_size
        # self.fc = nn.Linear(2 * self.hidden_size, self.hidden_size)
        self.out1 = nn.Linear(2 * self.hidden_size, self.hidden_size)
        self.out2 = nn.Linear(self.hidden_size, self.output_size)
        # self.hidden = self.init_hidden(100)
        self.softmax = nn.LogSoftmax()

    def forward(self, input_seqs, input_lengths, hidden=None, input_seqs_char = None):
        # Note: we run this all at once (over multiple batches of multiple sequences)
        embedded = self.embedding(input_seqs)
        embedded = self.dropout(embedded)
        
        if self.use_char_embedding and input_seqs_char is not None:
            char_batch_size = input_seqs_char.size(1)
            char_seq_len = input_seqs_char.size(0)
            input_seqs_char = input_seqs_char.view(char_seq_len * char_batch_size, -1)
            char_embedded = self.char_embedding(input_seqs_char).transpose(1, 0)
            char_lstm_output, (char_lstm_hidden, char_lstm_cell) = self.char_lstm(char_embedded)
            char_lstm_output_sum = torch.cat((char_lstm_hidden[0], char_lstm_hidden[1]), -1)
            char_embedded_seq = char_lstm_output_sum.view(char_seq_len, char_batch_size, -1)
            embedded = torch.cat((embedded, char_embedded_seq), -1)
                
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.lstm(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)  # unpack (back to padded)

        seq_len = outputs.size(0)
        batch_size = outputs.size(1)
        raw_outputs1 = self.out1(outputs.view(seq_len * batch_size, -1))
        raw_outputs2 = self.out2(raw_outputs1).view(seq_len, batch_size, -1)
        scores = self.softmax(raw_outputs2.transpose(2, 0))
        scores = scores.transpose(2, 0)

        return scores, hidden, output_lengths

    def init_hidden(self, batch_size):
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        c_state = Variable(torch.randn(self.n_layers * 2, batch_size, self.hidden_size))
        h_state = Variable(torch.randn(self.n_layers * 2, batch_size, self.hidden_size))
        if use_cuda:
            c_state = c_state.cuda()
            h_state = h_state.cuda()
        return (c_state, h_state)

### A training step (for a batch of examples)

In [12]:
def train_step(parameters, input_seq, input_variable, target_variable, lengths_sorted,
               model, optimizer, criterion, eval_during_train =True,
               input_variable_char=None):
    optimizer.zero_grad()
    loss = 0
    input_batch_size, input_seq_len = input_variable.size()[1], input_variable.size()[0]
    target_batch_size, target_seq_len = target_variable.size()[1], target_variable.size()[0]
    assert input_batch_size == target_batch_size
    assert input_seq_len == target_seq_len

    model.train()
    model_hidden = model.init_hidden(input_batch_size)

    model_outputs, model_hidden, model_output_lengths = model(input_variable,
                                                            input_lengths=lengths_sorted,
                                                            hidden = model_hidden,
                                                            input_seqs_char = input_variable_char)
    
    if eval_during_train :
        output_word_idxes = np.zeros((input_seq_len, input_batch_size), dtype=int)
        
    for i in range(model_outputs.size(0)): #loop over seq_len
        loss += criterion(model_outputs[i, :, :], target_variable[i, :])
        if eval_during_train:
            _, topi = model_outputs.data.topk(1, dim=-1)
            decoded_word = topi[i].cpu().numpy()
            output_word_idxes[i,:] = decoded_word.squeeze(-1)

    loss.backward()
    ec = torch.nn.utils.clip_grad_norm(model.parameters(), parameters['clip'])

    optimizer.step()
    output_word_idxes = np.transpose(output_word_idxes, (1, 0))
    return output_word_idxes, loss.data[0], ec

### Evaluating the network

Evaluation is mostly the same as training, but there are no gold label sequences fed into the network.

In [13]:
# eval
def evaluate(dataset_type,parameters, input_seq, output_seq, pairs, max_len, lengths, model, epoch):
    parameters['dataset_type'] = dataset_type
    if parameters['pred_output_dir'] is None:
        print('has to provide a output folder for prediction results')
        sys.exit(0)
    
    model = torch.load(parameters['model_path']) if model is None else model

    start = time.time()
    plot_losses = []
    print_loss_total = 0 # Reset every print_every
    plot_loss_total = 0 # Reset every plot_every
    
    all_preds = np.zeros((len(pairs), max_len), dtype=int)
    # all_golds = np.zeros((len(pairs), max_len))
    all_preds_list = []
    all_golds_list = []
    all_preds_words = np.full((len(pairs), max_len), 's', dtype = object)
    
    for i_batch, ((input_variables, target_variables, input_variables_char), batch_lengths) in \
     enumerate(get_batch(input_seq, output_seq, pairs, \
     batch_size = parameters['batch_size'], max_len =max_len, lengths = lengths)):
    
        input_variables_sorted, input_variables_char_sorted, target_variables_sorted, \
        lengths_sorted, lengths_argsort \
            = sort_variables_lengths(input_variables, target_variables,
                                                  batch_lengths, needs_argsort=True,
                                                  input_variables_char=input_variables_char)

        preds, attns = evaluate_step(model, input_seq, output_seq,
                              input_variables_sorted, lengths_sorted,
                              input_variable_char=input_variables_char_sorted)
        # here the preds in each batch are sorted accroding to sequence length
        new_preds = sort_variables_back(preds, lengths_argsort)
        all_preds[i_batch * parameters['batch_size'] : \
                    min((i_batch + 1) * parameters['batch_size'],len(pairs)), :] = new_preds
        all_preds_list.extend([item for sublist in new_preds.tolist() for item in sublist])
        all_golds_list.extend([item for sublist in target_variables for item in sublist])
    # calculate token F1 by sklearn
    # all preds (N x seq_len) contains the predicted indices (as int)
    
    ### calc sklearn F1
    new_y_pred, new_y_true, \
    new_label_indices, new_label_names, _, _ = remap_labels(all_preds_list,
                                                                       all_golds_list,
                                                                       output_seq,
                                                                       'token')
    ix = new_label_names.index('<SOS>')
    new_label_names.remove('<SOS>')
    new_label_indices.remove(new_label_indices[ix])
    current_f1_report = metrics.classification_report(y_pred=new_y_pred, y_true=new_y_true,
                                                    digits=4,
                                                    labels=new_label_indices,
                                                    target_names=new_label_names)
    
    current_f1_sklearn = get_sklearn_eval(current_f1_report,dataset_type)

    return current_f1_sklearn


### An evaluation step (for a batch of examples)

In [14]:
# eval step
def evaluate_step(model, input_seq, output_seq, input_variable, lengths_sorted,
                  input_variable_char=None):
    # this one is for testing BiLSTM
    model.eval()
    input_batch_size, input_seq_len = input_variable.size()[1], input_variable.size()[0]
    decoded_words = np.zeros((input_seq_len, input_batch_size), dtype=int)
    model_outputs, model_hidden, model_output_lengths = model(input_variable,
                                                                  input_lengths = lengths_sorted,
                                                                  input_seqs_char=input_variable_char)
    #loop over lengths_sorted
    for i_batch in range(input_batch_size):
        seq_len = lengths_sorted[i_batch]
        for i_seq in range(seq_len):
            topv, topi = model_outputs.data.topk(1, dim=-1)
            decoded_word = topi[i_seq, i_batch].cpu().numpy()
            decoded_words[i_seq, i_batch] = decoded_word

    decoded_words = np.transpose(decoded_words, (1, 0))
    return decoded_words, None

In [15]:
def write_pred(preds, predfile, input_seq, output_seq, pairs):
    '''
    inputs and targets are torch tensors, need to be converted back to words
    preds are already words
    '''
    # input_sentences = sentences_from_variables(input_lang, inputs)
    # output_sentences = sentences_from_variables(output_lang, targets)
    # print(len(pairs))
    try:
        assert len(pairs) == preds.shape[0]
    except AssertionError:
        print('evaluation pairs shape and preds shape are not the same!!! got:')
        print('len(pairs)', len(pairs), 'preds.shape[0]',preds.shape[0])
    with open(predfile, 'w') as f:
        preds = preds.tolist()
        
        for i in range(len(pairs)):
            input_seq = pairs[i][0].split(' ')
            target_seq = pairs[i][1].split(' ')
            pred_seq = preds[i][:len(target_seq)]
            assert len(input_seq) == len(target_seq) and len(target_seq) == len(pred_seq)
            for i in range(len(input_seq)):
                line = input_seq[i] + ' ' + target_seq[i] + ' ' + pred_seq[i] + '\n'
                f.writelines(line)
            f.writelines(u'\n')

In [16]:
def get_conll_eval(pred_output_filepath):
    # subprocess.call('perl conlleval <{}'.format(pred_output_filepath), shell = True)
    with open(pred_output_filepath, 'r') as pred_output_file:
        p = Popen(['perl', 'conlleval'], stdin=pred_output_file, stdout=PIPE, stderr=PIPE)
    output, err = p.communicate()
    # rc = p.returncode
    print(output)
    output_lines = output.split('\n')
    pattern = r'(\w+).*FB1:\s*(\d+.\d+)'
    f1_scores = {}
    for line in output_lines:
        match = re.search(pattern=pattern, string=line)
        if match:
            if match.group(1) == 'accuracy':
                f1_scores['overall'] = float(match.group(2))
            else:
                f1_scores[match.group(1)] = float(match.group(2))
    pprint(f1_scores)
    return f1_scores

def get_sklearn_eval(current_f1_report, dataset_type):
    print('F1 (sklearn) on {:s} set:'.format(dataset_type))
    print(current_f1_report)
    f1_scores = {}
    pattern = r'(\w+)\s+\d\.\d+\s+\d\.\d+\s+(\d\.\d+)'
    for line in current_f1_report.strip().split('\n'):
        match = re.search(pattern, line)
        if match:
            if match.group(1) == 'total':
                f1_scores['overall'] = float(match.group(2)) * 100.
            else:
                f1_scores[match.group(1)] = float(match.group(2)) * 100.
    return f1_scores

### Putting them all together

In [17]:
use_cuda = torch.cuda.is_available()
def train(parameters, input_seq, output_seq, all_pairs, all_max_len, all_lengths):
    if parameters['use_pretrained_word_embedding']:
        pretraind_embed, emb_size = load_pretrained_word_embeddings(parameters, input_seq)
    else:
        pretraind_embed = None
        emb_size = parameters['hidden_size']
    
    # +1 for padding
    blstm = BiLSTM(input_seq.n_words + 1, emb_size,
                 parameters['hidden_size'], parameters['n_layers'],
                 dropout=parameters['dropout_p'],
                 pretrained_weights=pretraind_embed,
                 output_size=len(output_seq.words) + 1,
                 use_char_embedding=parameters['use_char_embedding'],
                 char_alphabet_size=input_seq.n_chars + 1,
                 char_emb_size=parameters['char_emb_size'])
    if use_cuda:
        blstm.cuda()
    print(blstm)

    optimizer = optim.SGD(blstm.parameters(), lr=parameters['learning_rate'], momentum=0.8)
#     criterion = nn.NLLLoss(ignore_index=0)
    criterion = nn.NLLLoss()
    
    # Keep track of time elapsed and running averages
    start = time.time()
    plot_losses = []
    ecs, dcs = [], []
    eca, dca = 0, 0
    print_loss_total = 0
    all_output_word_idxes = np.zeros((len(all_pairs['train']), all_max_len['train']))
    
    f1s = defaultdict(dict)
    for dataset_type in ['train', 'valid', 'test']:
        f1s[dataset_type]['overall'] = []
        
    best_valid_f1 = 0.0
    patience_cnt = 0
    for epoch in range(1, parameters['n_epochs'] + 1):
        print('\nStarting epoch {:d}...'.format(epoch))
        epoch_start_time = time.time()
        for i_batch, ((input_variables, target_variables, input_variables_char), batch_lengths) in \
                enumerate(get_batch(input_seq, output_seq, all_pairs['train'], \
                                    batch_size=parameters['batch_size'],
                                    max_len=all_max_len['train'], lengths=all_lengths['train'])):

            input_variables_sorted, input_variables_char_sorted, target_variables_sorted, \
            lengths_sorted, lengths_argsort = sort_variables_lengths(input_variables, target_variables,
                                                      batch_lengths, needs_argsort = True,
                                                      input_variables_char = input_variables_char)

            output_word_idxes, loss, ec = train_step(parameters, output_seq, input_variables_sorted,
                                                          target_variables_sorted, lengths_sorted,
                                                          blstm, optimizer, criterion,
                                                          input_variable_char = input_variables_char_sorted)

            output_word_idxes_sorted = sort_variables_back(output_word_idxes, lengths_argsort)
            all_output_word_idxes[i_batch * parameters['batch_size']: \
                min((i_batch + 1) * parameters['batch_size'], len(all_pairs['train'])), :] = output_word_idxes_sorted
            print_loss_total += loss

            eca += ec

        if epoch == 0: continue
        else:
#             print_loss_avg = print_loss_total / parameters['print_every']
            print_summary = '%s (%d %d%%) %.4f' % (time_since(start,
                                                              epoch / float(parameters['n_epochs'])),
                                                   epoch, epoch / parameters['n_epochs'] * 100,
                                                   print_loss_total)
            print_loss_total = 0
            print(print_summary)

            for dataset_type in ['train', 'valid', 'test']:
                output_word_idxes = all_output_word_idxes if dataset_type == 'train' else None
                print('evaluating on {:s} set ...'.format(dataset_type))
                
                current_f1 = evaluate_during_train(dataset_type, parameters,
                                                   input_seq, output_seq,
                                                   all_pairs[dataset_type],
                                                   all_max_len[dataset_type],
                                                   all_lengths[dataset_type],
                                                   model=blstm,
                                                   epoch=epoch,
                                                   output_word_idxes = output_word_idxes)
                f1s[dataset_type]['overall'].append(current_f1['overall'])

            current_valid_f1 = f1s['valid']['overall'][epoch-1]

            if current_valid_f1 > best_valid_f1:
                best_valid_f1 = current_valid_f1
                model_path = save_best(blstm, parameters['output_model_dir'])
                print("Saved model in {:s}".format(parameters['output_model_dir']))
                parameters['model_path'] = model_path
                patience_cnt = 0
            else:
                print('The valid F1 does not improve in the last {:d} epochs.'.format(patience_cnt+1))
                patience_cnt += 1

        epoch_elapsed_training_time = time.time() - epoch_start_time

        print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time))
        if patience_cnt >= parameters['patience']:
            print('Early stopping!')
            break

    # finishing up the experiments
    f1s_for_plot = {}
    for dataset_type in ['train', 'valid', 'test']:
        f1s_for_plot[dataset_type] = f1s[dataset_type]['overall']
    graph_path = os.path.join(parameters['output_model_dir'], 'F1-plot.svg')
    plot_f1(f1s_for_plot, graph_path, 'step'+str(parameters['step']))
    print('F1 (sklearn):')
    pprint(f1s)
#     return best_valid_f1, f1s


def evaluate_during_train(dataset_type, parameters, input_seq, output_seq,
                          pairs, max_len, lengths, model=None, epoch=0, output_word_idxes = None):

    current_f1 = evaluate(dataset_type, parameters, input_seq, output_seq,
                 pairs, max_len, lengths, model, epoch)

    return current_f1

In [18]:
def save_best(model, save_path):
    model_path = os.path.join(save_path, 'best_model.pt')
    save_model(model, model_path)
    return model_path

def save_model(model, filename):
    if not os.path.isfile(filename):
        open(filename, 'w').close()
    torch.save(model, filename)
    print('Saved %s as %s' % (model.__class__.__name__, filename))


In [None]:
def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [None]:
train(parameters, input_seq, output_seq, all_pairs, all_max_len, all_lengths)
#an epoch (train+eval) takes about 3 minutes

loading word embeddings from file /home/liah/word_vectors/eng/glove.6B.100d.txt...
 Embedding size: 100
Map 17616 tokens with pretrained embeddings.
direct map: 8840
lower-case map: 8776

Randomly initialized 323 token embeddings.
1151 low_frequency_word: 
BiLSTM (
  (embedding): Embedding(19092, 100)
  (char_embedding): Embedding(77, 25)
  (char_lstm): LSTM(25, 25, dropout=0.1, bidirectional=True)
  (dropout): Dropout (p = 0.5)
  (lstm): LSTM(150, 128, dropout=0.1, bidirectional=True)
  (out1): Linear (256 -> 128)
  (out2): Linear (128 -> 14)
  (softmax): LogSoftmax ()
)

Starting epoch 1...
0m 42s (- 139m 39s) (1 0%) 806.6310
evaluating on train set ...


  'precision', 'predicted', average, warn_for)


F1 (sklearn) on train set:
             precision    recall  f1-score   support

        LOC     0.9444    0.0345    0.0666      2954
       MISC     0.0000    0.0000    0.0000      1633
        PER     0.7833    0.6968    0.7375      3928

avg / total     0.6890    0.3334    0.3633      8515

evaluating on valid set ...
F1 (sklearn) on valid set:
             precision    recall  f1-score   support

        LOC     0.9029    0.0444    0.0847      2094
       MISC     0.0000    0.0000    0.0000      1268
        PER     0.8196    0.7272    0.7707      3149

avg / total     0.6868    0.3660    0.3999      6511

evaluating on test set ...
F1 (sklearn) on test set:
             precision    recall  f1-score   support

        LOC     0.9254    0.0322    0.0622      1925
       MISC     0.0000    0.0000    0.0000       918
        PER     0.8447    0.6946    0.7623      2773

avg / total     0.7343    0.3540    0.3977      5616

Saved BiLSTM as /home/liah/ner/seq2seq_for_ner/src/model/tuto

  "type " + obj.__name__ + ". It won't be checked "


4m 42s (- 466m 2s) (2 1%) 204.7604
evaluating on train set ...
