# Delivery 02

# Structure:
+ Read data
+ Spell checker
+ HMM
+ ID features
+ Structured Perceptron (Notebook: Structured_Perceptron_Validation)
+ BERT

In [131]:
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange

import editdistance
import itertools
import re


In [2]:
from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

In [3]:
# Forward-fill missing values with the previous non-NaN value (vectorised, no
# Python loop). `DataFrame.ffill()` replaces `fillna(method="ffill")`, whose
# `method` argument is deprecated in recent pandas releases.
data = pd.read_csv("data/kaggle_ner/ner_dataset.csv",
                   encoding="latin1").ffill()

In [4]:
data.head()


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [5]:
print("Available TAGS:")
print(set(data["Tag"]))

Available TAGS:
{'B-org', 'B-per', 'O', 'I-art', 'I-eve', 'I-geo', 'B-tim', 'I-org', 'B-gpe', 'I-gpe', 'I-tim', 'B-nat', 'B-geo', 'B-eve', 'I-per', 'B-art', 'I-nat'}


In [6]:
print("{:10s} | {:20s}".format("TAG", "Examples"))
print("-"*60)
for tag in set(data["Tag"]):
    print("{:10s} | {:20s} {:20s} {:20s}".format(tag, *data[data["Tag"] == tag]["Word"][0:3].values))

TAG        | Examples            
------------------------------------------------------------
B-org      | Labor                International        IAEA                
B-per      | Bush                 President            Thomas              
O          | Thousands            of                   demonstrators       
I-art      | Non-Proliferation    V-6                  Simple              
I-eve      | Summer               Olympics             Olympic             
I-geo      | Park                 State                State               
B-tim      | Wednesday            Wednesday            Tuesday             
I-org      | Party                Atomic               Energy              
B-gpe      | British              English              Britain             
I-gpe      | States               Korea                Binh                
I-tim      | 8                    1                    2                   
B-nat      | H5N1                 H5N1                 Jing          

In [7]:
n_sentences = len(data['Sentence #'].unique())

In [8]:
n_sentences

47959

In [9]:
# Using groupby and apply rather than a for loop the computation is reduced from +2h to 4s
# One list of words (X_txt) and one list of tags (Y_txt) per sentence; both use
# the same grouping, so X_txt[i] and Y_txt[i] describe the same sentence.
# NOTE(review): groupby sorts group keys by default, so sentences are presumably
# ordered by the *string* value of "Sentence #" rather than by sentence number —
# confirm this is acceptable for the positional train/test split used later.
%time X_txt = list(data.groupby("Sentence #")['Word'].apply(list))
%time Y_txt = list(data.groupby("Sentence #")['Tag'].apply(list))

Wall time: 5.37 s
Wall time: 5.24 s


In [10]:
i = 0
xy = ["{}/{}".format(x,y) for x,y in zip(X_txt[i],Y_txt[i])]
" ".join(xy)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O'

## Word to pos and tag to pos

In [11]:
def build_word_to_pos(X):
    """Build the word <-> integer-id lookup tables.

    X: DataFrame with a 'Word' column.

    Returns (word_to_pos, pos_to_word); ids follow the order in which each
    distinct word first appears in the column.
    """
    vocabulary = X['Word'].unique()
    pos_to_word = dict(enumerate(vocabulary))
    word_to_pos = {word: idx for idx, word in pos_to_word.items()}
    return word_to_pos, pos_to_word
            
def build_tag_to_pos(Y):
    """Build the tag <-> integer-id lookup tables.

    Y: DataFrame with a 'Tag' column.

    Returns (tag_to_pos, pos_to_tag); ids follow the order in which each
    distinct tag first appears in the column.
    """
    tags = Y['Tag'].unique()
    tag_to_pos = dict(zip(tags, range(len(tags))))
    pos_to_tag = dict(enumerate(tags))
    return tag_to_pos, pos_to_tag

In [12]:
word_to_pos, pos_to_word = build_word_to_pos(data)
tag_to_pos, pos_to_tag  = build_tag_to_pos(data)

len(word_to_pos), len(tag_to_pos)

(35178, 17)

In [13]:
print(list(word_to_pos.items())[:10])
print(tag_to_pos)

[('Thousands', 0), ('of', 1), ('demonstrators', 2), ('have', 3), ('marched', 4), ('through', 5), ('London', 6), ('to', 7), ('protest', 8), ('the', 9)]
{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}


In [14]:
X = [[word_to_pos[w] for w in s] for s in X_txt]
Y = [[tag_to_pos[t] for t in s] for s in Y_txt]

# HMM 

In [25]:
import scipy
import numpy as np

# From .py
from HiddenMarkovModel import *

import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

import skseq

In [26]:
import skseq
import skseq.sequences
import skseq.readers

from skseq.sequences import sequence
from skseq.sequences import sequence_list
from skseq.sequences import label_dictionary

In [27]:
n_split = 35971
X_train_txt, X_test_txt, Y_train_txt, Y_test_txt = X_txt[:n_split], X_txt[n_split:], Y_txt[:n_split], Y_txt[n_split:]

## Log trick

In [28]:
def logzero():
    """Return log(0), represented as negative infinity."""
    return float("-inf")


def safe_log(x):
    """Return log(x), mapping x == 0 to -inf without a NumPy warning.

    x: non-negative scalar.

    The original version printed `x` on every call (leftover debugging output);
    that print has been removed.
    """
    if x == 0:
        # log(0) is defined here as -inf (same value logzero() returns).
        return -np.inf
    return np.log(x)


def logsum_pair(logx, logy):
    """
    Compute log(x + y) from log(x) and log(y) without leaving log-space.

    logx: log(x)
    logy: log(y)

    Derivation:

    x + y    = e^logx + e^logy
             = e^hi * (1 + e^(lo - hi))        with hi = max, lo = min
    log(x+y) = hi + log(1 + e^(lo - hi))

    Factoring out the larger term keeps the exponent (lo - hi) non-positive,
    so the exponential cannot overflow and loses the least precision.
    When logx is log(0) the sum is simply y, so logy is returned unchanged.
    """
    if logx == logzero():
        return logy
    larger, smaller = (logx, logy) if logx > logy else (logy, logx)
    return larger + np.log1p(np.exp(smaller - larger))


def logsum(logv):
    """
    Fold an iterable of log-values into log(v[0] + v[1] + ...).

    Starts from log(0) and accumulates with logsum_pair, so an empty
    iterable yields logzero().
    """
    total = logzero()
    for term in logv:
        total = logsum_pair(total, term)
    return total

In [41]:
class HMM(object):
    """First-order hidden Markov model estimated by counting.

    Conventions used by every matrix in this class:

    * ``transition[s, s_prev]`` refers to moving from ``s_prev`` to ``s``.
    * ``emission[s, w]`` refers to state ``s`` emitting word id ``w``.
    * ``scores`` are the natural logs of ``probs`` (log(0) = -inf).

    No smoothing is applied: a (state, word) pair never counted in ``fit``
    has probability 0, so a sequence containing a word that no state emitted
    during training has log-likelihood -inf.
    """

    def __init__(self, word_to_pos=None, state_to_pos=None):
        """
        word_to_pos:  dict mapping each word to its column id.
        state_to_pos: dict mapping each state label to its row id.

        Both default to an empty dict. ``None`` is used as the default instead
        of ``{}`` so that instances never share one mutable default object.
        """
        word_to_pos = {} if word_to_pos is None else word_to_pos
        state_to_pos = {} if state_to_pos is None else state_to_pos

        self.fitted = False
        self.counts = {"emission": None, "transition": None, "final": None, "initial": None}
        self.probs  = {"emission": None, "transition": None, "final": None, "initial": None}
        self.scores = {"emission": None, "transition": None, "final": None, "initial": None}
        # Decoding strategies accepted by predict_labels.
        self.decode = set(["posterior", "viterbi"])
        self.word_to_pos  = word_to_pos
        self.state_to_pos = state_to_pos
        self.pos_to_word  = {v: k for k, v in word_to_pos.items()}
        self.pos_to_state = {v: k for k, v in state_to_pos.items()}

        self.n_states     = len(state_to_pos)
        self.n_words      = len(word_to_pos)

    def fit(self, observation_lables: list, state_labels: list):
        """
        Computes and saves: counts, probs, scores.

        observation_lables: list of word sequences.
        state_labels:       list of state-label sequences, aligned with the words.

        Raises ValueError when the word or state vocabulary is missing/empty
        (the previous ``is None`` check could never trigger with dict defaults).
        """
        if not self.state_to_pos or not self.word_to_pos:
            raise ValueError("Error state_to_pos or word_to_pos needed to be defined")

        self.counts = self.sufficient_statistics_hmm(observation_lables, state_labels)
        self.probs  = self.compute_probs(self.counts)
        self.scores = self.compute_scores(self.probs)
        self.fitted = True

    def sufficient_statistics_hmm(self, observation_lables, state_labels):
        """Count initial, transition, final and emission events.

        Returns a dict with keys "emission" (n_states x n_words),
        "transition" (n_states x n_states, indexed [state, previous_state]),
        "final" and "initial" (both of length n_states).
        Empty sequences are skipped because they have no first/last state.
        """
        state_to_pos, word_to_pos = self.state_to_pos, self.word_to_pos

        n_states = len(state_to_pos)
        n_words  = len(word_to_pos)
        initial_counts      = np.zeros((n_states))
        transition_counts   = np.zeros((n_states, n_states))
        final_counts        = np.zeros((n_states))
        emission_counts     = np.zeros((n_states, n_words))

        for seq_x, seq_y in zip(observation_lables, state_labels):
            if len(seq_y) == 0:
                continue
            states = [state_to_pos[t] for t in seq_y]

            initial_counts[states[0]] += 1
            final_counts[states[-1]] += 1
            for s_prev, s in zip(states[:-1], states[1:]):
                transition_counts[s, s_prev] += 1
            for s, word in zip(states, seq_x):
                emission_counts[s, word_to_pos[word]] += 1

        return {"emission":   emission_counts,
                "transition": transition_counts,
                "final":      final_counts,
                "initial":    initial_counts}

    @staticmethod
    def _normalize(counts, totals):
        """Return counts / totals, using 0 (instead of NaN) where totals == 0."""
        totals = np.asarray(totals, dtype=float)
        probs = np.zeros(counts.shape, dtype=float)
        np.divide(counts, totals, out=probs, where=totals > 0)
        return probs

    def compute_probs(self, counts):
        """Turn the count tables into probability tables.

        For every previous state, its outgoing transition probabilities plus
        its final probability share one denominator (outgoing transitions +
        times the state ended a sequence). A state that was never observed
        gets probability 0 everywhere rather than NaN from 0/0.
        """
        initial_counts    = counts['initial']
        transition_counts = counts['transition']
        emission_counts   = counts['emission']
        final_counts      = counts['final']

        # Per previous state: number of times it was left or ended a sequence.
        outgoing = np.sum(transition_counts, 0) + final_counts

        initial_probs    = self._normalize(initial_counts, np.sum(initial_counts))
        transition_probs = self._normalize(transition_counts, outgoing)
        final_probs      = self._normalize(final_counts, outgoing)
        emission_probs   = self._normalize(emission_counts,
                                           np.sum(emission_counts, 1)[:, None])

        return {"emission":   emission_probs,
                "transition": transition_probs,
                "final":      final_probs,
                "initial":    initial_probs}

    def compute_scores(self, probs):
        """Return the element-wise log of every probability table."""
        # log(0) = -inf is the intended value here, so silence the warning.
        with np.errstate(divide="ignore"):
            return {"emission":   np.log(probs["emission"]),
                    "transition": np.log(probs["transition"]),
                    "final":      np.log(probs["final"]),
                    "initial":    np.log(probs["initial"])}

    def _emission_scores(self, x):
        """Return the (n_states, len(x)) emission log-scores of the words in x.

        Raises KeyError for a word that is not in word_to_pos.
        """
        columns = np.array([self.word_to_pos[w] for w in x], dtype=int)
        return self.scores['emission'][:, columns]

    def forward_computations(self, x: list):
        """Placeholder for the probability-space forward pass; returns None.

        Use log_forward_computations instead.
        """
        forward_x = None
        return forward_x

    def backward_computations(self, x:list):
        """Placeholder for the probability-space backward pass; returns None.

        Use log_backward_computations instead.
        """
        backward_x = None
        return backward_x

    def log_forward_computations(self, x: list):
        """
        Compute the log_forward computations

        Assume there are S possible states and a sequence of length N (N >= 1).
        This method will compute iteratively the log_forward quantities.

        * log_f_x is a S x N Array.
        * log_f_x[:,i] will contain the forward quantities at position i.
        * log_f_x[:,i] is a vector of size S.

        Returns
        - log_f_x: Array of size S x N
        - log_likelihood: log(P(X=x))
        """
        n_x = len(x)

        # log_f_x initialized to -Inf because log(0) = -Inf
        log_f_x = np.full((self.n_states, n_x), -np.inf)
        x_emission_scores = self._emission_scores(x)
        transition = self.scores['transition']

        log_f_x[:, 0] = x_emission_scores[:, 0] + self.scores['initial']
        for n in range(1, n_x):
            # candidates[s, s_prev] = log_f[s_prev, n-1] + log P(s | s_prev);
            # log-sum-exp over s_prev gives all S states of column n at once.
            candidates = log_f_x[:, n-1] + transition
            log_f_x[:, n] = np.logaddexp.reduce(candidates, axis=1) + x_emission_scores[:, n]

        log_likelihood = np.logaddexp.reduce(log_f_x[:, n_x-1] + self.scores['final'])
        return log_f_x, log_likelihood # log(P(X=x))

    def log_backward_computations(self, x: list):
        """
        Compute the log_backward quantities for a sequence of length N (N >= 1).

        Returns
        - log_b_x: Array of size S x N; log_b_x[:,i] holds the backward
          quantities at position i.
        - log_likelihood: log(P(X=x))
        """
        n_x = len(x)

        # log_b_x initialized to -Inf because log(0) = -Inf
        log_b_x = np.full((self.n_states, n_x), -np.inf)
        x_emission_scores = self._emission_scores(x)
        transition = self.scores['transition']
        log_b_x[:, -1] = self.scores['final']

        for n in range(n_x-2, -1, -1):
            # candidates[s_next, s] = log_b[s_next, n+1] + log P(x[n+1] | s_next)
            #                         + log P(s_next | s); reduce over s_next.
            next_scores = log_b_x[:, n+1] + x_emission_scores[:, n+1]
            log_b_x[:, n] = np.logaddexp.reduce(next_scores[:, None] + transition, axis=0)

        log_likelihood = np.logaddexp.reduce(
            log_b_x[:, 0] + self.scores['initial'] + x_emission_scores[:, 0])
        return log_b_x, log_likelihood  # log(P(X=x))

    def predict_labels(self, x: list, decode="posterior"):
        """
        Returns a sequence of states for each word in **x**.
        The output depends on the **decode** method chosen
        ("posterior" or "viterbi").
        """
        assert decode in self.decode, "decode `{}` is not valid".format(decode)

        # Compare by value: `is` on strings only works by interning accident.
        if decode == 'posterior':
            return self.posterior_decode(x)

        if decode == 'viterbi':
            return self.viterbi_decode(x)

    def compute_state_posteriors(self, x:list):
        """Return the S x N array of *log* state posteriors log P(Y_i = s | x)."""
        log_f_x, log_likelihood = self.log_forward_computations(x)
        log_b_x, log_likelihood = self.log_backward_computations(x)
        return log_f_x + log_b_x - log_likelihood

    def posterior_decode(self, x: list, decode_states=True):
        """Pick, independently at every position, the state with the highest posterior.

        Returns state labels when decode_states is True, otherwise the array
        of state ids. An empty x yields an empty list.
        """
        if len(x) == 0:
            return []

        state_posteriors = self.compute_state_posteriors(x)
        y_hat = state_posteriors.argmax(axis=0)

        if decode_states:
            y_hat = [self.pos_to_state[y] for y in y_hat]

        return y_hat

    def viterbi_decode(self, x: list, decode_states=True):
        """Return the single most probable state sequence for x (Viterbi).

        Returns state labels when decode_states is True, otherwise the array
        of state ids. An empty x yields an empty list.
        """
        n_x = len(x)
        if n_x == 0:
            return []

        x_emission_scores = self._emission_scores(x)
        transition = self.scores['transition']

        # best_score[s, n]: log-score of the best path ending in state s at n.
        # backpointer[s, n]: previous state on that path.
        best_score = np.full((self.n_states, n_x), -np.inf)
        backpointer = np.zeros((self.n_states, n_x), dtype=int)

        best_score[:, 0] = x_emission_scores[:, 0] + self.scores['initial']
        for n in range(1, n_x):
            candidates = best_score[:, n-1] + transition  # [s, s_prev]
            backpointer[:, n] = candidates.argmax(axis=1)
            best_score[:, n] = candidates.max(axis=1) + x_emission_scores[:, n]

        y_hat = np.zeros(n_x, dtype=int)
        y_hat[-1] = (best_score[:, -1] + self.scores['final']).argmax()
        for n in range(n_x-1, 0, -1):
            y_hat[n-1] = backpointer[y_hat[n], n]

        if decode_states:
            y_hat = [self.pos_to_state[y] for y in y_hat]

        return y_hat

In [42]:
hmm = HMM(word_to_pos, tag_to_pos)

In [43]:
hmm.fit(X_train_txt, Y_train_txt)



In [44]:
print(X_test_txt[0])

['A', 'recent', 'performance', 'by', 'Seun', 'Kuti', ',', 'son', 'of', 'Afro-beat', 'legend', 'Fela', 'Kuti', ',', 'kicked', 'off', 'this', 'year', "'s", 'concert', 'series', ',', 'bringing', 'the', 'sounds', 'of', 'Lagos', ',', 'Nigeria', 'to', 'downtown', 'Los', 'Angeles', '.']


In [45]:
def evaluate_hmm():
    """Tag every test sentence with the global `hmm` and track token accuracy.

    Reads the module-level X_test_txt / Y_test_txt. The running accuracy is
    shown in the progress-bar description.

    Returns (correct, mstks): two lists of DataFrames (rows: words, predicted
    tags, gold tags) for sentences tagged without / with at least one error.
    """
    n_tokens = 0
    n_errors = 0

    wrong_sentences = []
    right_sentences = []

    progress = tqdm(X_test_txt)
    for idx, words in enumerate(progress):
        predicted = hmm.predict_labels(words)
        gold = Y_test_txt[idx]

        n_wrong = sum(p != g for p, g in zip(predicted, gold))
        n_errors += n_wrong
        n_tokens += len(gold)

        frame = pd.DataFrame([words, predicted, gold])
        if n_wrong != 0:
            wrong_sentences.append(frame)
        else:
            right_sentences.append(frame)
        progress.set_description("Accuracy: {:6.4f}".format(1-n_errors/n_tokens))
    return right_sentences, wrong_sentences

In [46]:
correct, mstks = evaluate_hmm()

HBox(children=(FloatProgress(value=0.0, max=11988.0), HTML(value='')))






In [47]:
correct[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,Police,in,Zimbabwe,have,arrested,a,nephew,of,President,Robert,Mugabe,on,suspicion,of,smuggling,30,tons,of,scarce,flour,to,neighboring,Mozambique,.
1,O,O,B-geo,O,O,O,O,O,B-per,I-per,I-per,O,O,O,O,O,O,O,O,O,O,O,B-geo,O
2,O,O,B-geo,O,O,O,O,O,B-per,I-per,I-per,O,O,O,O,O,O,O,O,O,O,O,B-geo,O


In [48]:
mstks[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
0,A,recent,performance,by,Seun,Kuti,",",son,of,Afro-beat,legend,Fela,Kuti,",",kicked,off,this,year,'s,concert,series,",",bringing,the,sounds,of,Lagos,",",Nigeria,to,downtown,Los,Angeles,.
1,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O
2,O,O,O,O,B-per,I-per,O,O,O,O,O,B-per,I-per,O,O,O,O,O,O,O,O,O,O,O,O,O,B-geo,O,B-geo,O,O,B-geo,I-geo,O


# Structured perceptron

In [49]:
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.sequence import Sequence
from skseq.sequences.label_dictionary import LabelDictionary

x_dict = LabelDictionary(label_names=data['Word'].unique())
y_dict = LabelDictionary(label_names=data['Tag'].unique())

# train_sequences = [Sequence(x,y) for x,y in zip(X_train_txt, Y_train_txt)]
# test_sequences = [Sequence(x,y) for x,y in zip(X_test_txt, Y_test_txt)]

train_seq_list = SequenceList(x_dict, y_dict)
for x,y in zip(X_train_txt, Y_train_txt):
    train_seq_list.add_sequence(x, y, x_dict, y_dict)

test_seq_list = SequenceList(x_dict, y_dict)
for x,y in zip(X_test_txt, Y_test_txt):
    test_seq_list.add_sequence(x, y, x_dict, y_dict)

In [50]:
class BK_Tree():
    """Tree of words keyed by edit distance, rooted at the first word of the first document."""

    def __init__(self, documents):
        """documents: list of word sequences; needs a non-empty first sequence."""
        self.root = Node(documents[0][0])
        for word in (w for sequence in documents for w in sequence):
            self.root.append(word)

    def append(self, word):
        """Insert `word` into the tree (delegates to the root node)."""
        self.root.append(word)

    def is_in_corpus(self, new_word):
        """Return True when `new_word` was inserted into the tree."""
        return self.root.is_in_corpus(new_word)
        
        
class Node():
    """One word of the tree; children are stored under their edit distance to this word."""

    def __init__(self, word):
        self.word = word
        # edit distance to self.word -> child Node
        self.dict_dist = {}

    def is_in_corpus(self, new_word):
        """Walk down the tree and report whether `new_word` is stored in it."""
        node = self
        while True:
            if node.word == new_word:
                return True
            dist = editdistance.eval(node.word, new_word)
            child = node.dict_dist.get(dist)
            if child is None:
                return False
            node = child

    def append(self, new_word):
        """Insert `new_word` below this node; a word already present is ignored."""
        node = self
        while True:
            dist = editdistance.eval(node.word, new_word)
            if dist in node.dict_dist:
                # Slot taken: keep descending along the same distance.
                node = node.dict_dist[dist]
                continue
            if node.word != new_word:
                node.dict_dist[dist] = Node(new_word)
            return
            
            
def edit_ditance_word(mistake, X_train_txt):
    """Find the training word closest to `mistake` by edit distance.

    mistake:     the word to look up, e.g. "Barchelona".
    X_train_txt: list of word sequences forming the corpus.

    Returns (closest_word, distance); ties go to the earliest corpus word.
    """
    corpus = []
    for sequence in X_train_txt:
        corpus.extend(sequence)
    distances = [editdistance.eval(mistake, word) for word in corpus]
    closest = np.argmin(distances)
    return corpus[closest], min(distances)

In [51]:
# You can improve the results adding more features

# from skseq.sequences import extended_feature
# feature_mapper = skseq.sequences.extended_feature.ExtendedFeatures(train_seq_list)

In [52]:
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq_list)
feature_mapper.build_features()

print('Feature dict length:', len(feature_mapper.feature_dict))
print('Feature list length:', len(feature_mapper.feature_list))

import skseq.sequences.structured_perceptron_validation as spc
# sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
sp = spc.StructuredPerceptronValidation(x_dict, y_dict, feature_mapper, averaged=False)


def train_perceptron(load_no_fit = True, num_epochs = 50, epochs_before_stopping = 5, dir_to_params = './'):
    if not load_no_fit:
        print('Training for %i epochs with early stopping after %i epochs of no improvement' % (num_epochs, epochs_before_stopping))
#         %time sp.fit(dummy_seq_list, val_seq_list, num_epochs, epochs_before_stopping, dir_to_params)
        %time sp.fit(train_seq_list, test_seq_list, num_epochs, epochs_before_stopping, dir_to_params)
    else:
        sp.load_model(dir_to_params)
        
        
import pandas as pd
from IPython.core import display as ICD


Feature dict length: 38298
Feature list length: 35971


In [136]:
train_seq_list

[0/0 1/0 2/0 3/0 4/0 5/0 6/1 7/0 8/0 9/0 10/0 11/0 12/1 13/0 14/0 9/0 15/0 1/0 16/2 17/0 18/0 19/0 20/0 21/0 , 124/2 125/0 126/0 127/0 128/0 7/0 129/0 130/0 7/0 131/0 132/0 118/0 1/0 9/0 123/0 107/7 93/0 133/0 134/0 135/5 136/0 137/0 138/0 139/0 21/0 , 942/0 943/0 363/7 944/0 344/0 945/0 11/0 9/0 946/1 248/0 200/0 93/0 251/0 317/0 947/5 193/0 180/0 948/0 7/0 3/0 332/0 7/0 949/0 134/0 376/0 305/0 311/0 11/0 950/0 543/1 544/4 21/0 , 42/0 846/0 133/0 45/0 4645/0 4646/0 4647/0 31/0 1878/0 352/0 21/0 , 154/1 836/0 1188/0 7563/3 9398/10 172/0 350/7 93/0 149/1 93/0 840/2 13/0 705/2 305/0 828/0 180/0 16289/0 243/0 1540/0 13/0 2475/0 7/0 2333/0 3778/0 1/0 813/0 4068/1 337/0 19/0 3541/0 12201/0 517/0 273/0 2602/0 21/0 , 816/3 9398/10 172/0 9/0 1077/0 7441/0 1209/0 3738/0 254/0 324/0 180/0 11/0 758/0 1/0 1540/0 3787/0 250/0 31/0 9/0 758/0 16290/0 11/0 862/7 93/0 1544/3 1545/2 93/0 9/0 1543/1 13/0 2568/1 21/0 , 595/0 172/0 322/0 116/0 68/0 1523/0 13/0 9/0 3605/0 8062/0 3897/0 19/0 2433/0 58/0 151/

In [132]:
def evaluate_perceptron():
    """Print the accuracy of the global perceptron `sp` on the train and test sequence lists."""
    # Make predictions for the various sequences using the trained model.
    pred_train = sp.viterbi_decode_corpus(train_seq_list)
    pred_test  = sp.viterbi_decode_corpus(test_seq_list)

    # Evaluate and print accuracies.
    # `evaluate_corpus` is a method of the perceptron (it is called as
    # sp.evaluate_corpus elsewhere in this notebook); the bare name used
    # before was undefined and raised NameError.
    eval_train = sp.evaluate_corpus(train_seq_list.seq_list, pred_train)
    eval_test = sp.evaluate_corpus(test_seq_list.seq_list, pred_test)

    print("SP -  Accuracy Train: %.3f Test: %.3f"%(eval_train, eval_test))
    
def predict_text_tags(seq, nice_format=False, change_word=False, sensibility=2):
    """Tag one sentence with the global perceptron `sp` and show the result.

    seq:         sentence as a string (split on whitespace) or a list of words.
    nice_format: True -> display a styled DataFrame (no spell correction);
                 False -> print "word/TAG" pairs, non-'O' tags in blue.
    change_word: when a word is spell-corrected, print the corrected word
                 instead of the original one.
    sensibility: maximum edit distance at which an out-of-vocabulary word is
                 replaced by its closest training word before re-tagging.

    Uses the module-level X_train_txt, edit_ditance_word and ICD.
    The caller's list is never modified.
    """
    assert isinstance(seq, str) or isinstance(seq, list), "The input must be a sentence (string format or a list of words)"

    # Work on a private list: spell correction rewrites entries in place.
    words = seq.split() if isinstance(seq, str) else list(seq)

    num2lab = {v:k for k,v in sp.state_labels.items()}
    if nice_format:
        grp = pd.DataFrame([words, [num2lab[w] for w in sp.predict_tags_given_words(words)]],
                           index=["Words", "Tags"], columns=["W_{:02d}".format(i) for i in range(len(words))])
        grp = grp.style.applymap(lambda x: 'color: blue' if x != 'O' and x in sp.state_labels else 'color: black')
        ICD.display(grp)
        return

    # A set gives O(1) membership tests (the list scanned the corpus per word).
    vocabulary = set(itertools.chain.from_iterable(X_train_txt))

    res = ""
    for pos, (word, tag) in enumerate(zip(list(words), sp.predict_tags_given_words(words))):
        shown = word
        if word not in vocabulary:
            candidate, dist = edit_ditance_word(word, X_train_txt)
            if dist <= sensibility:
                # Index by position: list.index() returns the first occurrence,
                # which is the wrong slot when a word appears more than once.
                words[pos] = candidate
                tag = sp.predict_tags_given_words(words)[pos]
                if change_word:
                    shown = candidate

        ft = " {}/{}" if num2lab[tag]=='O' else " {}/\x1b[34m{}\x1b[0m"
        res += ft.format(shown, num2lab[tag])
    print(res)
    
    
def predict_batch_text_tags(batch, nice_format=False, change_word=False, sensibility=2):
    """Run predict_text_tags on every sentence of `batch`.

    The batch must not mix string sentences with word-list sentences;
    the remaining arguments are forwarded unchanged.
    """
    n_strings = len([s for s in batch if isinstance(s, str)])
    n_lists = len([s for s in batch if isinstance(s, list)])
    assert n_strings==0 or n_lists==0, "The inputs must be sentences (string format or lists of words)"

    for sentence in batch:
        predict_text_tags(sentence, nice_format=nice_format,
                          change_word=change_word, sensibility=sensibility)
        
        
def word_in_corpus(phrase):
    """Replace out-of-vocabulary words that are one edit away from a training word.

    phrase: list of words; it is modified in place and also returned.

    Every word missing from the training vocabulary (module-level X_train_txt)
    is printed; it is replaced only when its closest training word is at edit
    distance exactly 1.
    """
    # Use train words; a set makes each membership test O(1).
    vocabulary = set(itertools.chain.from_iterable(X_train_txt))
    for i, word in enumerate(phrase):
        if word not in vocabulary:
            print(word)
            # edit_ditance_word needs the corpus as second argument; it was
            # missing before, so this call raised TypeError.
            closest, dist = edit_ditance_word(word, X_train_txt)
            if dist==1:
                phrase[i] = closest
    return phrase


def edit_ditance_word(mistake, X_train_txt):
    """Return (closest_word, distance) for `mistake` against all training words.

    NOTE(review): this repeats an identical definition from an earlier cell;
    whichever cell runs last is the one in effect.

    mistake:     the word to look up, e.g. "Barchelona".
    X_train_txt: list of word sequences forming the corpus.
    """
    corpus = list(itertools.chain(*X_train_txt))
    distances = []
    for word in corpus:
        distances.append(editdistance.eval(mistake, word))
    return corpus[np.argmin(distances)], min(distances)

In [54]:
num_epochs = 100
epochs_before_stopping = 5
train_perceptron(load_no_fit=True, num_epochs = num_epochs, epochs_before_stopping = epochs_before_stopping)

# evaluate_perceptron()

In [55]:
val_acc = sp.evaluate_corpus(test_seq_list.seq_list, sp.viterbi_decode_corpus(test_seq_list))
print('Checking same Validation Accuracy: %f' % (val_acc))
# Checking same Validation Accuracy: 0.560440 with ExtendedFeatures
# Checking same Validation Accuracy: 0.955210

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Checking same Validation Accuracy: 0.955210


In [135]:
predict_batch_text_tags(["The programmers from Barcelona might write a sentence without a spell checker .",
"The programmers from Barchelona cannot write a sentence without a spell checker .",
"Jack London went to Parrris .",
"Jack London went to Paris .",
"We never though Microsoft would become such a big company .",
"We never though Microsof would become such a big company .",
"The president of U.S.A though they could win the war",
"The president of the United States of America though they could win the war",
"The king of Saudi Arabia wanted total control .",
"Robin does not want to go to Saudi Arabia .",
"Apple is a great company .",
"I really love apples and oranges ."], nice_format=False, change_word=False, sensibility=2)

 The/O programmers/O from/O Barcelona/[34mB-geo[0m might/O write/O a/O sentence/O without/O a/O spell/O checker/O ./O
 The/O programmers/O from/O Barchelona/[34mB-geo[0m cannot/O write/O a/O sentence/O without/O a/O spell/O checker/O ./O
 Jack/[34mB-per[0m London/[34mI-per[0m went/O to/O Parrris/[34mB-geo[0m ./O
 Jack/[34mB-per[0m London/[34mI-per[0m went/O to/O Paris/[34mB-geo[0m ./O
 We/O never/O though/O Microsoft/[34mB-org[0m would/O become/O such/O a/O big/O company/O ./O
 We/O never/O though/O Microsof/[34mB-org[0m would/O become/O such/O a/O big/O company/O ./O
 The/O president/O of/O U.S.A/[34mB-org[0m though/O they/O could/O win/O the/O war/O
 The/O president/O of/O the/O United/[34mB-geo[0m States/[34mI-geo[0m of/O America/[34mB-geo[0m though/O they/O could/O win/O the/O war/O
 The/O king/O of/O Saudi/[34mB-org[0m Arabia/[34mI-org[0m wanted/O total/O control/O ./O
 Robin/O does/O not/O want/O to/O go/O to/O Saudi/[34mB-org[0m Arabia/[34mI-or

In [130]:
a = ["The programmers from Barcelona might write a sentence without a spell checker .",
"The programmers from Barchelona cannot write a sentence without a spell checker ."]
type(a[1])

str

In [127]:
s = "The programmers from U.S.A. might spell a sentence without a spell checker ."
last_bit = re.sub('(?<! )(?=[.,!?()])|(?<=[.,!?()])(?! )', r' ', s.split()[-1]).split()

print(s.split()[:-1] + last_bit)


['The', 'programmers', 'from', 'U.S.A.', 'might', 'spell', 'a', 'sentence', 'without', 'a', 'spell', 'checker', '.']


# BERT
+ Bidirectional Encoder Representations from Transformers

In [None]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, POS, tag) tuples.

    data: DataFrame with the columns "Sentence #", "Word", "POS" and "Tag".

    Attributes
    - grouped:   Series indexed by "Sentence #"; each value is the list of
                 (word, pos, tag) tuples of that sentence.
    - sentences: the values of `grouped` as a plain list.
    - n_sent:    number of the sentence get_next() will return next (starts at 1).
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        # Select only the columns agg_func reads so apply() does not operate
        # on the grouping column itself.
        self.grouped = self.data.groupby("Sentence #")[["Word", "POS", "Tag"]].apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled "Sentence: <n_sent>" and advance, or None when it does not exist."""
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing sentence means "no more data"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [None]:
getter = SentenceGetter(data)

In [None]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
print(sentences[0])

In [None]:
labels = [[s[2] for s in sentence] for sentence in getter.sentences]
print(labels[0])

In [None]:
# Sort the tags: the iteration order of a set of strings can change between
# interpreter runs, which would silently change the tag -> index mapping and
# make saved models / predictions incomparable across sessions.
tag_values = sorted(set(data["Tag"].values))
# "PAD" is appended last and labels the padded positions.
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
MAX_LEN = 75
bs = 32

In [None]:
# GPU
gpu_id = 3
n_gpu = torch.cuda.device_count()

# The original cell built a CUDA device and then unconditionally overwrote it
# with the CPU. That effective behaviour is kept, but made explicit:
# set FORCE_CPU = False to run on cuda:<gpu_id> when that device exists.
FORCE_CPU = True
gpu_present = torch.cuda.is_available() and gpu_id < n_gpu

device = torch.device("cuda:{}".format(gpu_id) if (gpu_present and not FORCE_CPU) else "cpu")

# get_device_name raises when the requested GPU does not exist, so only
# query it when it is really there.
torch.cuda.get_device_name(gpu_id) if gpu_present else "cpu"

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Split every word into sub-word tokens and repeat its label for each token.

    sentence:    list of words.
    text_labels: list of labels aligned with `sentence`.

    Uses the module-level `tokenizer`. Returns (tokens, labels), two flat
    lists of equal length.
    """
    tokenized_sentence, labels = [], []

    for word, label in zip(sentence, text_labels):
        pieces = tokenizer.tokenize(word)
        tokenized_sentence += pieces
        # One copy of the word's label per sub-word piece.
        labels += [label] * len(pieces)

    return tokenized_sentence, labels


In [None]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in tqdm(zip(sentences, labels), total=len(sentences))]

In [None]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [None]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")


In [None]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [None]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [None]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [None]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [None]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)


In [None]:
import transformers
from transformers import BertForTokenClassification, AdamW

In [None]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
model.to(device=device)

In [None]:
# FULL_FINETUNING = True  -> update every weight of the model.
# FULL_FINETUNING = False -> train only the classification head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters excluded from weight decay: biases and layer-norm weights.
    # 'gamma'/'beta' are kept for checkpoints using the old layer-norm names;
    # 'LayerNorm.weight' matches the current parameter naming.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # The per-group option read by AdamW is `weight_decay`; the previous
        # key `weight_decay_rate` was ignored, so no decay was ever applied.
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
# !pip3 install seqeval

In [None]:
from seqeval.metrics import f1_score, accuracy_score

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for _ in tqdm(range(epochs), desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for batch in tqdm(train_dataloader, leave=False, total=len(train_dataloader), desc="Training"):
        # Move the batch to the selected device.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # Forward pass. Because `labels` is provided, the first output is the loss.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Track train loss.
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # Update parameters, then the learning rate.
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss = 0
    predictions, true_labels = [], []

    for batch in tqdm(valid_dataloader, desc="Validation", leave=False):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass. Labels are provided, so the outputs are
            # (loss, logits): index 0 is the loss, index 1 the logits.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the loss and the per-token predictions of this batch.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))

    # Build one tag list per sentence, dropping the padded positions.
    # seqeval expects lists of lists (one inner list per sentence) in
    # (y_true, y_pred) order; keeping sentences separate also stops an entity
    # at the end of one sentence from merging with the start of the next.
    pred_tags, valid_tags = [], []
    for p, l in zip(predictions, true_labels):
        kept = [(tag_values[p_i], tag_values[l_i])
                for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
        pred_tags.append([pred for pred, _ in kept])
        valid_tags.append([gold for _, gold in kept])

    print("Validation Accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
    print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
    print()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(loss_values, 'b-o', label="training loss")
plt.plot(validation_loss_values, 'r-o', label="validation loss")

# Label the plot.
plt.title("Learning curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.show()