In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange

# `display` and `HTML` belong to the public IPython.display API; importing them
# from IPython.core.display is a deprecated path.
from IPython.display import display, HTML
# Inject CSS that stretches the `.container` element to the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Limits applied to every rich DataFrame rendering in this notebook.
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

In [2]:
# Load the token-level NER table and forward-fill it: every missing value (in any
# column) is replaced by the previous non-NaN value of that column.
# NOTE(review): presumably this is what propagates the "Sentence #" label to all
# words of a sentence -- confirm against the raw CSV.
# `.ffill()` is used instead of `fillna(method="ffill")`, which is deprecated in
# recent pandas releases; the result is the same.
data = pd.read_csv("data/kaggle_ner/ner_dataset.csv",
                   encoding="latin1").ffill()

data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [3]:
# Show the distinct values found in the "Tag" column.
print("Available TAGS:")
available_tags = set(data["Tag"])
print(available_tags)

Available TAGS:
{'I-per', 'B-gpe', 'I-tim', 'I-org', 'I-nat', 'I-art', 'B-nat', 'I-eve', 'B-art', 'B-org', 'I-gpe', 'B-per', 'I-geo', 'B-eve', 'O', 'B-geo', 'B-tim'}


In [50]:
def sortbystrng(val):
    """Sort key that places the B-/I- variants of the same tag next to each other.

    The key is built from the characters at positions 2 and 3 of ``val`` (the
    first two characters after a two-character prefix such as ``"B-"``).
    Strings whose first character is ``'B'`` get the even value and everything
    else gets that value plus one, so ``"B-xyz"`` sorts immediately before
    ``"I-xyz"``.

    Parameters
    ----------
    val : str
        Tag string, e.g. ``"O"``, ``"B-geo"`` or ``"I-geo"``.

    Returns
    -------
    int
        ``0`` for strings too short to carry the two scored characters,
        otherwise ``2 * ord(val[2]) * ord(val[3])`` (plus one when the string
        does not start with ``'B'``).
    """
    # Positions 2 and 3 are read below, so anything shorter than four
    # characters cannot be scored; the previous `< 2` guard let 2- and
    # 3-character strings through and they raised IndexError.
    if len(val) < 4:
        return 0
    v = 2*ord(val[2])*ord(val[3])
    return v if val[0]=='B' else v+1

# Print a small table: one row per tag with the first three words carrying it.
header = "{:10s} | {:20s}".format("TAG", "Examples")
print(header)
print("-" * 60)

# Order the tags with the custom key so B-/I- pairs end up adjacent.
ordered_tags = sorted(set(data["Tag"]), key=sortbystrng)
for tag in ordered_tags:
    # First three words labelled with this tag, in file order.
    examples = data.loc[data["Tag"] == tag, "Word"].iloc[:3].values
    print("{:10s} | {:20s} {:20s} {:20s}".format(tag, *examples))

TAG        | Examples            
------------------------------------------------------------
O          | Thousands            of                   demonstrators       
B-geo      | London               Iraq                 Hyde                
I-geo      | Park                 State                State               
B-nat      | H5N1                 H5N1                 Jing                
I-nat      | Jing                 Jing                 Acute               
B-art      | Nuclear              Saltillo             Pentastar           
I-art      | Non-Proliferation    V-6                  Simple              
B-per      | Bush                 President            Thomas              
I-per      | Mahmoud              Ahmadinejad          Horbach             
B-gpe      | British              English              Britain             
I-gpe      | States               Korea                Binh                
B-eve      | 2012                 Games                Games         

In [5]:
# Number of distinct sentence identifiers; the bare name on the last line
# makes the notebook display the value.
n_sentences = data['Sentence #'].unique().size
n_sentences

47959

In [6]:
# Using groupby and apply rather than a for loop the computation is reduced from +2h to 4s
%time X_txt = list(data.groupby("Sentence #")['Word'].apply(list))
%time Y_txt = list(data.groupby("Sentence #")['Tag'].apply(list))

from skseq.sequences.label_dictionary import LabelDictionary
x_dict = LabelDictionary(label_names=data['Word'].unique())
y_dict = LabelDictionary(label_names=data['Tag'].unique())

Wall time: 3.09 s
Wall time: 3.04 s


In [7]:
import scipy
import numpy as np

import os,sys,inspect
# Put the parent of the notebook's working directory on sys.path so the `skseq`
# imports in this notebook can be resolved.
# The working directory is used directly: inspect.getfile(inspect.currentframe())
# returns the kernel's synthetic filename for the cell, and on kernels that
# compile cells under a temporary directory its dirname is not the notebook
# folder.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

import skseq
import skseq.sequences
import skseq.readers
from skseq.sequences import sequence
from skseq.sequences import sequence_list
from skseq.sequences import label_dictionary

In [8]:
# The first `n_split` sentences form the training set, the remainder the test set.
n_split = 35971
X_train_txt, X_test_txt = X_txt[:n_split], X_txt[n_split:]
Y_train_txt, Y_test_txt = Y_txt[:n_split], Y_txt[n_split:]

# val_split = n_split - 100
# X_train_txt, X_val_txt, Y_train_txt, Y_val_txt = X_train_txt[:val_split], X_train_txt[val_split:], Y_train_txt[:val_split], Y_train_txt[val_split:]

# dummy_split = 100
# X_dummy_txt, Y_dummy_txt = X_train_txt[:dummy_split], Y_train_txt[:dummy_split]

# len(X_train_txt), len(Y_train_txt), len(X_val_txt), len(Y_val_txt), len(X_test_txt), len(Y_test_txt), len(X_dummy_txt), len(Y_dummy_txt)

In [9]:
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.sequence import Sequence

# train_sequences = [Sequence(x,y) for x,y in zip(X_train_txt, Y_train_txt)]
# test_sequences = [Sequence(x,y) for x,y in zip(X_test_txt, Y_test_txt)]

# Training corpus: one sequence per (words, tags) pair of the training split.
train_seq_list = SequenceList(x_dict, y_dict)
for words, tags in zip(X_train_txt, Y_train_txt):
    train_seq_list.add_sequence(words, tags, x_dict, y_dict)

# val_seq_list = SequenceList(x_dict, y_dict)
# for x,y in zip(X_val_txt, Y_val_txt):
#     val_seq_list.add_sequence(x, y, x_dict, y_dict)

# Test corpus, built the same way and sharing the same label dictionaries.
test_seq_list = SequenceList(x_dict, y_dict)
for words, tags in zip(X_test_txt, Y_test_txt):
    test_seq_list.add_sequence(words, tags, x_dict, y_dict)

# dummy_seq_list = SequenceList(x_dict, y_dict)
# for x,y in zip(X_dummy_txt, Y_dummy_txt):
#     dummy_seq_list.add_sequence(x, y, x_dict, y_dict)

In [10]:
# Build the ID-feature mapper from the training sequences only.
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq_list)
feature_mapper.build_features()

# Report the sizes of the two feature containers held by the mapper.
n_dict_features = len(feature_mapper.feature_dict)
n_list_features = len(feature_mapper.feature_list)
print('Feature dict length:', n_dict_features)
print('Feature list length:', n_list_features)

import skseq.sequences.structured_perceptron_validation as spc
# sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
# Non-averaged structured perceptron; `sp` is the global model used by the helper
# functions defined later in the notebook.
sp = spc.StructuredPerceptronValidation(
    x_dict,
    y_dict,
    feature_mapper,
    averaged=False,
)

Feature dict length: 38298
Feature list length: 35971


In [16]:
def train_perceptron(load_no_fit = True, num_epochs = 50, epochs_before_stopping = 5, dir_to_params = './'):
    if not load_no_fit:
        print('Training for %i epochs with early stopping after %i epochs of no improvement' % (num_epochs, epochs_before_stopping))
#         %time sp.fit(dummy_seq_list, val_seq_list, num_epochs, epochs_before_stopping, dir_to_params)
        %time sp.fit(train_seq_list, test_seq_list, num_epochs, epochs_before_stopping, dir_to_params)
    else:
        sp.load_model(dir_to_params)

In [17]:
import pandas as pd
from IPython.core import display as ICD

In [25]:
def evaluate_perceptron():
    """Decode the train and test corpora with the global model `sp` and print both accuracies.

    Reads the globals `sp`, `train_seq_list` and `test_seq_list`; returns
    nothing. The scores are printed with three decimals.
    """
    # Make predictions for the various sequences using the trained model.
    pred_train = sp.viterbi_decode_corpus(train_seq_list)
    pred_test  = sp.viterbi_decode_corpus(test_seq_list)

    # Evaluate and print accuracies. `evaluate_corpus` is a method of the model;
    # no free function of that name exists in this notebook, so the previous
    # bare call raised NameError.
    eval_train = sp.evaluate_corpus(train_seq_list.seq_list, pred_train)
    eval_test = sp.evaluate_corpus(test_seq_list.seq_list, pred_test)

    print("SP -  Accuracy Train: %.3f Test: %.3f"%(eval_train, eval_test))
    
def predict_text_tags(seq, nice_format=False):
    """Tag one sentence with the global model `sp` and show the result.

    Parameters
    ----------
    seq : str or list
        A sentence. A string is split on whitespace; a list is used as-is as
        the word sequence.
    nice_format : bool
        When true, display a two-row styled DataFrame ("Words" / "Tags") in
        which cells holding a non-'O' tag label are coloured blue. When false,
        print a single line of ``word/tag`` pairs with non-'O' tags wrapped in
        the ANSI blue escape sequence.

    Raises
    ------
    AssertionError
        If `seq` is neither a string nor a list.
    """
    is_sentence = isinstance(seq, (str, list))
    assert is_sentence, "The input must be a sentence (string format or a list of words)"

    words = seq.split() if isinstance(seq, str) else seq

    # `sp.state_labels` maps label -> id; invert it to turn predictions into labels.
    id_to_tag = {tag_id: tag for tag, tag_id in sp.state_labels.items()}
    predicted_ids = sp.predict_tags_given_words(words)

    if not nice_format:
        pieces = []
        for word, tag_id in zip(words, predicted_ids):
            tag = id_to_tag[tag_id]
            template = " {}/{}" if tag == 'O' else " {}/\x1b[34m{}\x1b[0m"
            pieces.append(template.format(word, tag))
        print("".join(pieces))
        return

    def _cell_colour(value):
        # Blue for any cell whose content is a known label other than 'O'.
        if value != 'O' and value in sp.state_labels:
            return 'color: blue'
        return 'color: black'

    column_names = ["W_{:02d}".format(i) for i in range(len(words))]
    tag_row = [id_to_tag[tag_id] for tag_id in predicted_ids]
    table = pd.DataFrame([words, tag_row], index=["Words", "Tags"], columns=column_names)
    ICD.display(table.style.applymap(_cell_colour))
    
    
def predict_batch_text_tags(batch, nice_format=False):
    """Run `predict_text_tags` on every sentence of `batch`, in order.

    Parameters
    ----------
    batch : iterable
        Sentences, each either a string or a list of words. A batch that
        contains both strings and lists is rejected.
    nice_format : bool
        Forwarded unchanged to `predict_text_tags` for every sentence.

    Raises
    ------
    AssertionError
        If the batch mixes string sentences with list sentences.
    """
    n_strings = len([item for item in batch if isinstance(item, str)])
    n_lists = len([item for item in batch if isinstance(item, list)])
    # Only one of the two representations may be present in a batch.
    assert not (n_strings != 0 and n_lists != 0), "The inputs must be sentences (string format or lists of words)"

    for sentence in batch:
        predict_text_tags(sentence, nice_format)

In [20]:
# Training budget: stop after `epochs_before_stopping` epochs without improvement,
# or after `num_epochs` epochs at the latest.
epochs_before_stopping = 5
num_epochs = 100
train_perceptron(
    load_no_fit=False,
    num_epochs=num_epochs,
    epochs_before_stopping=epochs_before_stopping,
)

# evaluate_perceptron()

Training for 100 epochs with early stopping after 5 epochs of no improvement


HBox(children=(FloatProgress(value=0.0, description='Fitting', style=ProgressStyle(description_width='initial'…

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 0 Train Accuracy: 0.907593 Validation Accuracy: 0.922106 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 1 Train Accuracy: 0.933843 Validation Accuracy: 0.953355 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 2 Train Accuracy: 0.942461 Validation Accuracy: 0.953055
No increase in validation for 1 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 3 Train Accuracy: 0.947719 Validation Accuracy: 0.946627
No increase in validation for 2 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 4 Train Accuracy: 0.950978 Validation Accuracy: 0.944200
No increase in validation for 3 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 5 Train Accuracy: 0.953296 Validation Accuracy: 0.954661 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 6 Train Accuracy: 0.955265 Validation Accuracy: 0.946108
No increase in validation for 1 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 7 Train Accuracy: 0.956558 Validation Accuracy: 0.945643
No increase in validation for 2 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 8 Train Accuracy: 0.958187 Validation Accuracy: 0.914571
No increase in validation for 3 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 9 Train Accuracy: 0.959286 Validation Accuracy: 0.955129 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 10 Train Accuracy: 0.959754 Validation Accuracy: 0.953843
No increase in validation for 1 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 11 Train Accuracy: 0.960082 Validation Accuracy: 0.951485
No increase in validation for 2 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 12 Train Accuracy: 0.961041 Validation Accuracy: 0.955210 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 13 Train Accuracy: 0.962554 Validation Accuracy: 0.949257
No increase in validation for 1 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 14 Train Accuracy: 0.962509 Validation Accuracy: 0.947349
No increase in validation for 2 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 15 Train Accuracy: 0.963056 Validation Accuracy: 0.954787
No increase in validation for 3 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 16 Train Accuracy: 0.963280 Validation Accuracy: 0.953432
No increase in validation for 4 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=35971.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Decoding', max=11988.0, style=ProgressStyle(description_w…

Epoch: 17 Train Accuracy: 0.963600 Validation Accuracy: 0.951462
No increase in validation for 5 consecutive epochs

Early stopping performed at epoch 17, saved model at epoch 12
Train Accuracy for saved model: 0.961041
Validation Accuracy for saved model: 0.955210

Best model successfully loaded
Wall time: 1h 46min 45s


In [23]:
# val_acc = sp.evaluate_corpus(test_seq_list.seq_list, sp.viterbi_decode_corpus(test_seq_list))
# print('Checking same Validation Accuracy: %f' % (val_acc))

In [26]:
# Same pre-split sentence, rendered first as plain text and then as a styled table.
for as_table in (False, True):
    predict_text_tags("Jack London went to Paris".split(), nice_format=as_table)

 Jack/[34mB-per[0m London/[34mI-per[0m went/O to/O Paris/[34mB-geo[0m


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Jack,London,went,to,Paris
Tags,B-per,I-per,O,O,B-geo


In [52]:
# Probe sentences: pairs that differ by a misspelling or by phrasing, printed in
# the plain-text format.
probe_sentences = [
    "The programmers from Barcelona might write a sentence without a spell checker.",
    "The programmers from Barchelona cannot write a sentence without a spell checker.",
    "Jack London went to Parris.",
    "Jack London went to Paris.",
    "We never though Microsoft would become such a big company.",
    "We never though Microsof would become such a big company.",
    "The president of U.S.A though they could win the war",
    "The president of the United States of America though they could win the war",
    "The king of Saudi Arabia wanted total control.",
    "Robin does not want to go to Saudi Arabia.",
    "Apple is a great company.",
    "I really love apples and oranges.",
]
predict_batch_text_tags(probe_sentences)

 The/O programmers/O from/O Barcelona/[34mB-geo[0m might/O write/O a/O sentence/O without/O a/O spell/[34mB-art[0m checker./[34mI-art[0m
 The/O programmers/O from/O Barchelona/O cannot/O write/O a/O sentence/O without/O a/O spell/[34mB-art[0m checker./[34mI-art[0m
 Jack/[34mB-per[0m London/[34mI-per[0m went/O to/O Parris./O
 Jack/[34mB-per[0m London/[34mI-per[0m went/O to/O Paris./O
 We/O never/O though/O Microsoft/[34mB-org[0m would/O become/O such/O a/O big/O company./O
 We/O never/O though/O Microsof/O would/O become/O such/O a/O big/O company./O
 The/O president/O of/O U.S.A/[34mB-org[0m though/O they/O could/O win/O the/O war/O
 The/O president/O of/O the/O United/[34mB-geo[0m States/[34mI-geo[0m of/O America/[34mB-geo[0m though/O they/O could/O win/O the/O war/O
 The/O king/O of/O Saudi/[34mB-org[0m Arabia/[34mI-org[0m wanted/O total/O control./O
 Robin/O does/O not/O want/O to/O go/O to/O Saudi/O Arabia./O
 Apple/[34mB-org[0m is/O a/O great/O co

In [28]:
# The same probe sentences, this time rendered as one styled table per sentence.
probe_sentences = [
    "The programmers from Barcelona might write a sentence without a spell checker.",
    "The programmers from Barchelona cannot write a sentence without a spell checker.",
    "Jack London went to Parris.",
    "Jack London went to Paris.",
    "We never though Microsoft would become such a big company.",
    "We never though Microsof would become such a big company.",
    "The president of U.S.A though they could win the war",
    "The president of the United States of America though they could win the war",
    "The king of Saudi Arabia wanted total control.",
    "Robin does not want to go to Saudi Arabia.",
    "Apple is a great company.",
    "I really love apples and oranges.",
]
predict_batch_text_tags(probe_sentences, nice_format=True)

Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09,W_10,W_11
Words,The,programmers,from,Barcelona,might,write,a,sentence,without,a,spell,checker.
Tags,O,O,O,B-geo,O,O,O,O,O,O,B-art,I-art


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09,W_10,W_11
Words,The,programmers,from,Barchelona,cannot,write,a,sentence,without,a,spell,checker.
Tags,O,O,O,O,O,O,O,O,O,O,B-art,I-art


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Jack,London,went,to,Parris.
Tags,B-per,I-per,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Jack,London,went,to,Paris.
Tags,B-per,I-per,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09
Words,We,never,though,Microsoft,would,become,such,a,big,company.
Tags,O,O,O,B-org,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09
Words,We,never,though,Microsof,would,become,such,a,big,company.
Tags,O,O,O,O,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09
Words,The,president,of,U.S.A,though,they,could,win,the,war
Tags,O,O,O,B-org,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09,W_10,W_11,W_12,W_13
Words,The,president,of,the,United,States,of,America,though,they,could,win,the,war
Tags,O,O,O,O,B-geo,I-geo,O,B-geo,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07
Words,The,king,of,Saudi,Arabia,wanted,total,control.
Tags,O,O,O,B-org,I-org,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08
Words,Robin,does,not,want,to,go,to,Saudi,Arabia.
Tags,O,O,O,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Apple,is,a,great,company.
Tags,B-org,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05
Words,I,really,love,apples,and,oranges.
Tags,O,O,O,O,O,O
