In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange

# Use the public IPython.display module: importing `display` from
# IPython.core.display is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Keep rendered DataFrames compact: at most 25 rows, up to 500 columns,
# and cell text truncated at 50 characters.
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Forward-fill missing values with the previous non-NaN value in the same
# column, so that every row carries a "Sentence #" label (see the preview
# below). `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
# NOTE(review): the fill is applied to every column, not only "Sentence #";
# a token that read_csv parses as NaN would be replaced by the previous
# token - confirm that this is intended.
data = pd.read_csv("data/kaggle_ner/ner_dataset.csv",
                   encoding="latin1").ffill()

data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [4]:
# Show the distinct labels found in the "Tag" column (a set, so unordered).
print("Available TAGS:", set(data["Tag"]), sep="\n")

Available TAGS:
{'B-nat', 'I-per', 'I-tim', 'I-nat', 'B-art', 'B-geo', 'B-org', 'I-org', 'I-art', 'I-geo', 'B-per', 'I-eve', 'B-eve', 'I-gpe', 'O', 'B-gpe', 'B-tim'}


In [5]:
# Print up to three example words for every tag.
print("{:10s} | {:20s}".format("TAG", "Examples"))
print("-"*60)
# `unique()` keeps first-appearance order, so the table is identical on every
# run (iterating over a set of strings is not order-stable between sessions).
for tag in data["Tag"].unique():
    # `.loc` avoids chained indexing; `head(3)` returns fewer rows when a tag
    # is rare instead of forcing exactly three values.
    examples = data.loc[data["Tag"] == tag, "Word"].head(3)
    # Pad each example to 20 characters and join them, so a tag with fewer
    # than three examples no longer raises IndexError in the format call.
    print("{:10s} | {}".format(tag, " ".join("{:20s}".format(word) for word in examples)))

TAG        | Examples            
------------------------------------------------------------
B-nat      | H5N1                 H5N1                 Jing                
I-per      | Mahmoud              Ahmadinejad          Horbach             
I-tim      | 8                    1                    2                   
I-nat      | Jing                 Jing                 Acute               
B-art      | Nuclear              Saltillo             Pentastar           
B-geo      | London               Iraq                 Hyde                
B-org      | Labor                International        IAEA                
I-org      | Party                Atomic               Energy              
I-art      | Non-Proliferation    V-6                  Simple              
I-geo      | Park                 State                State               
B-per      | Bush                 President            Thomas              
I-eve      | Summer               Olympics             Olympic       

In [6]:
# Number of distinct sentence identifiers in the corpus.
n_sentences = data['Sentence #'].unique().shape[0]
n_sentences

47959

In [7]:
# Using groupby and apply rather than a for loop the computation is reduced from +2h to 4s
%time X_txt = list(data.groupby("Sentence #")['Word'].apply(list))
%time Y_txt = list(data.groupby("Sentence #")['Tag'].apply(list))

from skseq.sequences.label_dictionary import LabelDictionary
x_dict = LabelDictionary(label_names=data['Word'].unique())
y_dict = LabelDictionary(label_names=data['Tag'].unique())

Wall time: 4.8 s
Wall time: 4.63 s


In [8]:
import scipy
import numpy as np

import os,sys,inspect
# Make the parent of the working directory importable.
# The previous approach derived the directory from
# inspect.getfile(inspect.currentframe()); inside a notebook that is the
# kernel's synthetic cell filename, which newer ipykernel versions place in a
# temporary directory, so the wrong parent would be added. The process working
# directory does not depend on how the kernel names cells.
# NOTE(review): assumes the kernel's working directory is the notebook's
# directory (Jupyter's default) - confirm if the kernel is started elsewhere.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guard against piling up duplicate entries when the cell is re-run.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

import skseq
import skseq.sequences
import skseq.readers
from skseq.sequences import sequence
from skseq.sequences import sequence_list
from skseq.sequences import label_dictionary

In [9]:
# Split boundaries: the first `n_split` sentences form the train+validation
# pool and the remainder is the test set; the last 100 sentences of the pool
# are held out for validation; the first `dummy_split` training sentences form
# a small "dummy" subset.
n_split = 35971
val_split = n_split - 100
dummy_split = 100

X_test_txt, Y_test_txt = X_txt[n_split:], Y_txt[n_split:]
X_val_txt, Y_val_txt = X_txt[val_split:n_split], Y_txt[val_split:n_split]
X_train_txt, Y_train_txt = X_txt[:val_split], Y_txt[:val_split]
X_dummy_txt, Y_dummy_txt = X_train_txt[:dummy_split], Y_train_txt[:dummy_split]

# Sizes of every split, as (train X, train Y, val X, val Y, test X, test Y, dummy X, dummy Y).
tuple(len(part) for part in (X_train_txt, Y_train_txt, X_val_txt, Y_val_txt,
                             X_test_txt, Y_test_txt, X_dummy_txt, Y_dummy_txt))

(35871, 35871, 100, 100, 11988, 11988, 100, 100)

In [10]:
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.sequence import Sequence

def _build_sequence_list(word_seqs, tag_seqs):
    """Return a SequenceList holding one sequence per (words, tags) pair.

    Args:
        word_seqs: iterable of sentences, each a list of words.
        tag_seqs: iterable of tag lists, aligned with ``word_seqs``.

    Returns:
        A ``SequenceList`` created with the notebook-level ``x_dict`` and
        ``y_dict`` and filled through ``add_sequence``.
    """
    seq_list = SequenceList(x_dict, y_dict)
    for words, tags in zip(word_seqs, tag_seqs):
        seq_list.add_sequence(words, tags, x_dict, y_dict)
    return seq_list


# One SequenceList per data split; all of them share the same dictionaries.
train_seq_list = _build_sequence_list(X_train_txt, Y_train_txt)
val_seq_list = _build_sequence_list(X_val_txt, Y_val_txt)
test_seq_list = _build_sequence_list(X_test_txt, Y_test_txt)
dummy_seq_list = _build_sequence_list(X_dummy_txt, Y_dummy_txt)

In [49]:
# Import the submodules used below explicitly. `import skseq.sequences` alone
# does not guarantee that `skseq.sequences.id_feature` is available as an
# attribute; it only is if some other import happened to load it first.
import skseq.sequences.id_feature
import skseq.sequences.structured_perceptron_validation as spc

# Build the feature mapper from the training sequences.
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq_list)
feature_mapper.build_features()

print('Feature dict length:', len(feature_mapper.feature_dict))
print('Feature list length:', len(feature_mapper.feature_list))

# Structured perceptron with validation support, created with averaged=False.
sp = spc.StructuredPerceptronValidation(x_dict, y_dict, feature_mapper, averaged=False)

Feature dict length: 38250
Feature list length: 35871


In [50]:
def train_perceptron(load_no_fit = True, num_epochs = 50, epochs_before_stopping = 5, dir_to_params = './'):
    if not load_no_fit:
        print('Training for %i epochs with early stopping after %i epochs of no improvement' % (num_epochs, epochs_before_stopping))
        %time sp.fit(dummy_seq_list, val_seq_list, num_epochs, epochs_before_stopping, dir_to_params)

#         def evaluate_corpus(sequences, sequences_predictions):
#             """Evaluate classification accuracy at corpus level, comparing with
#             gold standard."""
#             total = 0.0
#             correct = 0.0
#             for i, sequence in enumerate(tqdm(sequences, leave=False)):
#                 pred = sequences_predictions[i]
#                 for j, y_hat in enumerate(pred.y):
#                     if sequence.y[j] == y_hat:
#                         correct += 1
#                     total += 1
#             return correct / total
#         sp.save_model(path_to_model)
    else:
        sp.load_model(dir_to_params)

In [51]:
import pandas as pd
from IPython.core import display as ICD

In [52]:
def evaluate_perceptron():
    """Print the accuracy of the notebook-level model ``sp`` on the train and test sets.

    Decodes ``train_seq_list`` and ``test_seq_list`` with
    ``sp.viterbi_decode_corpus`` and scores the predictions against the gold
    sequences with ``sp.evaluate_corpus``.

    Returns:
        None. The two accuracies are printed.
    """
    # Make predictions for the various sequences using the trained model.
    pred_train = sp.viterbi_decode_corpus(train_seq_list)
    pred_test  = sp.viterbi_decode_corpus(test_seq_list)

    # Evaluate and print accuracies. `evaluate_corpus` is a method of the
    # model; there is no module-level function of that name, so calling it
    # unqualified raised NameError.
    eval_train = sp.evaluate_corpus(train_seq_list.seq_list, pred_train)
    eval_test = sp.evaluate_corpus(test_seq_list.seq_list, pred_test)

    print("SP -  Accuracy Train: %.3f Test: %.3f"%(eval_train, eval_test))
    
def predict_text_tags(seq, nice_format=False):
    """Tag one sentence with the notebook-level model ``sp`` and show the result.

    Args:
        seq: the sentence, either a string (split on whitespace, so
            punctuation stays attached to its word) or a list of words.
        nice_format: when True the words and tags are displayed as a styled
            two-row DataFrame; otherwise a single ``word/TAG`` line is printed.

    Returns:
        None. The result is printed or displayed.
    """
    assert isinstance(seq, (str, list)), "The input must be a sentence (string format or a list of words)"

    words = seq.split() if isinstance(seq, str) else seq

    # sp.state_labels maps label name -> id; invert it to decode predicted ids.
    id_to_label = {idx: label for label, idx in sp.state_labels.items()}
    tags = [id_to_label[tag_id] for tag_id in sp.predict_tags_given_words(words)]

    if not nice_format:
        print(" ".join("{}/{}".format(word, tag) for word, tag in zip(words, tags)))
        return

    # One column per word position: W_00, W_01, ...
    column_names = ["W_{:02d}".format(pos) for pos in range(len(words))]
    table = pd.DataFrame([words, tags], index=["Words", "Tags"], columns=column_names)
    ICD.display(table.style)
    
    
def predict_batch_text_tags(batch, nice_format=False):
    """Run ``predict_text_tags`` on every sentence of ``batch``.

    Args:
        batch: sentences, either all strings or all lists of words; mixing
            both kinds in one batch is rejected by an assertion.
        nice_format: forwarded unchanged to ``predict_text_tags``.

    Returns:
        None. Each sentence is printed or displayed by ``predict_text_tags``.
    """
    n_strings = len([item for item in batch if isinstance(item, str)])
    n_lists = len([item for item in batch if isinstance(item, list)])
    # Valid only if at least one of the two kinds is absent from the batch.
    assert not (n_strings and n_lists), "The inputs must be sentences (string format or lists of words)"

    for sentence in batch:
        predict_text_tags(sentence, nice_format)

In [53]:
# Training budget: at most `num_epochs` epochs, stopping early after
# `epochs_before_stopping` epochs without improvement.
num_epochs = 50
epochs_before_stopping = 5

# load_no_fit=False forces a fresh fit instead of loading saved parameters.
train_perceptron(
    load_no_fit=False,
    num_epochs=num_epochs,
    epochs_before_stopping=epochs_before_stopping,
)

# evaluate_perceptron()

Training for 50 epochs with early stopping after 5 epochs of no improvement


HBox(children=(FloatProgress(value=0.0, description='Fitting', max=50.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 0 Train Accuracy: 0.768110 Validation Accuracy: 0.836431 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 1 Train Accuracy: 0.799912 Validation Accuracy: 0.860130 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 2 Train Accuracy: 0.803004 Validation Accuracy: 0.860130
No increase in validation for 1 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 3 Train Accuracy: 0.813604 Validation Accuracy: 0.866636 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 4 Train Accuracy: 0.863516 Validation Accuracy: 0.867100 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 5 Train Accuracy: 0.853357 Validation Accuracy: 0.868959 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 6 Train Accuracy: 0.905477 Validation Accuracy: 0.860130
No increase in validation for 1 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 7 Train Accuracy: 0.887367 Validation Accuracy: 0.872677 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 8 Train Accuracy: 0.927120 Validation Accuracy: 0.867565
No increase in validation for 1 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 9 Train Accuracy: 0.938604 Validation Accuracy: 0.868030
No increase in validation for 2 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 10 Train Accuracy: 0.920495 Validation Accuracy: 0.872212
No increase in validation for 3 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 11 Train Accuracy: 0.966431 Validation Accuracy: 0.875929 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 12 Train Accuracy: 0.969965 Validation Accuracy: 0.881970 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 13 Train Accuracy: 0.987191 Validation Accuracy: 0.747677
No increase in validation for 1 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 14 Train Accuracy: 0.939488 Validation Accuracy: 0.885223 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 15 Train Accuracy: 0.984982 Validation Accuracy: 0.882900
No increase in validation for 1 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 16 Train Accuracy: 0.978799 Validation Accuracy: 0.886152 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 17 Train Accuracy: 0.968640 Validation Accuracy: 0.888011 (new best)
Model saved


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 18 Train Accuracy: 0.973498 Validation Accuracy: 0.884758
No increase in validation for 1 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 19 Train Accuracy: 0.989841 Validation Accuracy: 0.883364
No increase in validation for 2 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 20 Train Accuracy: 0.986749 Validation Accuracy: 0.875465
No increase in validation for 3 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 21 Train Accuracy: 0.992049 Validation Accuracy: 0.887546
No increase in validation for 4 consecutive epochs


HBox(children=(FloatProgress(value=0.0, description='Epoch', style=ProgressStyle(description_width='initial'))…

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

Epoch: 22 Train Accuracy: 0.985866 Validation Accuracy: 0.881970
No increase in validation for 5 consecutive epochs

Early stopping performed at epoch 22, saved model at epoch 17
Train Accuracy for saved model: 0.968640
Validation Accuracy for saved model: 0.888011

Best model successfully loaded
Wall time: 1min 6s


In [54]:
# Re-score the validation set with the current model, to compare with the
# validation accuracy reported at the end of training.
val_predictions = sp.viterbi_decode_corpus(val_seq_list)
val_acc = sp.evaluate_corpus(val_seq_list.seq_list, val_predictions)
print('Checking same Validation Accuracy: %f' % val_acc)

HBox(children=(FloatProgress(value=0.0, description='Decoding', style=ProgressStyle(description_width='initial…

0.888011


In [108]:
# Tag one pre-tokenised sentence, first as a plain line and then as a table.
predict_text_tags(["Jack", "London", "went", "to", "Paris"])
predict_text_tags(["Jack", "London", "went", "to", "Paris"], nice_format=True)

Jack/B-per London/I-per went/O to/O Paris/B-geo


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Jack,London,went,to,Paris
Tags,B-per,I-per,O,O,B-geo


In [49]:
# Sentences probing the tagger: spelling variants of an entity, trailing
# punctuation, and the same word in different contexts.
predict_batch_text_tags(
    [
        "The programmers from Barcelona might write a sentence without a spell checker.",
        "The programmers from Barchelona cannot write a sentence without a spell checker.",
        "Jack London went to Parris.",
        "Jack London went to Paris.",
        "We never though Microsoft would become such a big company.",
        "We never though Microsof would become such a big company.",
        "The president of U.S.A though they could win the war",
        "The president of the United States of America though they could win the war",
        "The king of Saudi Arabia wanted total control.",
        "Robin does not want to go to Saudi Arabia.",
        "Apple is a great company.",
        "I really love apples and oranges.",
    ]
)

The/O programmers/O from/O Barcelona/B-geo might/O write/O a/O sentence/O without/O a/O spell/B-per checker./I-per
The/O programmers/O from/O Barchelona/B-org cannot/I-org write/I-org a/O sentence/O without/O a/O spell/B-per checker./I-per
Jack/B-per London/I-per went/O to/O Parris./O
Jack/B-per London/I-per went/O to/O Paris./O
We/O never/O though/O Microsoft/B-org would/O become/O such/O a/O big/O company./O
We/O never/O though/O Microsof/O would/O become/O such/O a/O big/O company./O
The/O president/O of/O U.S.A/B-org though/O they/O could/O win/O the/O war/O
The/O president/O of/O the/O United/B-geo States/I-geo of/I-geo America/I-geo though/O they/O could/O win/O the/O war/O
The/O king/O of/O Saudi/B-geo Arabia/I-geo wanted/O total/O control./O
Robin/O does/O not/O want/O to/O go/O to/O Saudi/B-per Arabia./I-per
Apple/B-org is/O a/O great/O company./O
I/O really/O love/O apples/O and/O oranges./O


In [109]:
# The same probe sentences as a plain-text run would use, rendered here as one
# words/tags table per sentence.
predict_batch_text_tags(
    [
        "The programmers from Barcelona might write a sentence without a spell checker.",
        "The programmers from Barchelona cannot write a sentence without a spell checker.",
        "Jack London went to Parris.",
        "Jack London went to Paris.",
        "We never though Microsoft would become such a big company.",
        "We never though Microsof would become such a big company.",
        "The president of U.S.A though they could win the war",
        "The president of the United States of America though they could win the war",
        "The king of Saudi Arabia wanted total control.",
        "Robin does not want to go to Saudi Arabia.",
        "Apple is a great company.",
        "I really love apples and oranges.",
    ],
    nice_format=True,
)

Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09,W_10,W_11
Words,The,programmers,from,Barcelona,might,write,a,sentence,without,a,spell,checker.
Tags,O,O,O,B-geo,O,O,O,O,O,O,B-per,I-per


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09,W_10,W_11
Words,The,programmers,from,Barchelona,cannot,write,a,sentence,without,a,spell,checker.
Tags,O,O,O,B-org,I-org,I-org,O,O,O,O,B-per,I-per


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Jack,London,went,to,Parris.
Tags,B-per,I-per,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Jack,London,went,to,Paris.
Tags,B-per,I-per,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09
Words,We,never,though,Microsoft,would,become,such,a,big,company.
Tags,O,O,O,B-org,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09
Words,We,never,though,Microsof,would,become,such,a,big,company.
Tags,O,O,O,O,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09
Words,The,president,of,U.S.A,though,they,could,win,the,war
Tags,O,O,O,B-org,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09,W_10,W_11,W_12,W_13
Words,The,president,of,the,United,States,of,America,though,they,could,win,the,war
Tags,O,O,O,O,B-geo,I-geo,I-geo,I-geo,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07
Words,The,king,of,Saudi,Arabia,wanted,total,control.
Tags,O,O,O,B-geo,I-geo,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08
Words,Robin,does,not,want,to,go,to,Saudi,Arabia.
Tags,O,O,O,O,O,O,O,B-per,I-per


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Apple,is,a,great,company.
Tags,B-org,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05
Words,I,really,love,apples,and,oranges.
Tags,O,O,O,O,O,O
