In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

In [2]:
# Forward-fill missing values with the previous non-NaN value in each column,
# so every token row ends up carrying a "Sentence #" label (presumably the raw
# CSV only labels the first token of each sentence -- confirm against the file).
# `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
data = pd.read_csv("data/kaggle_ner/ner_dataset.csv",
                   encoding="latin1").ffill()

data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [3]:
# Show the distinct NER tags that occur in the "Tag" column.
print("Available TAGS:\n{}".format(set(data["Tag"])))

Available TAGS:
{'I-geo', 'I-art', 'I-org', 'I-eve', 'I-tim', 'B-geo', 'B-org', 'O', 'B-art', 'B-tim', 'I-gpe', 'B-gpe', 'B-per', 'I-per', 'B-nat', 'B-eve', 'I-nat'}


In [4]:
# Print up to three example words for every tag, one tag per row.
print("{:10s} | {:20s}".format("TAG", "Examples"))
print("-"*60)
for tag in set(data["Tag"]):
    examples = data.loc[data["Tag"] == tag, "Word"].head(3)
    # Join however many examples exist; a fixed three-slot format string would
    # raise IndexError for a tag that has fewer than three words.
    print("{:10s} | {}".format(tag, " ".join("{:20s}".format(word) for word in examples)))

TAG        | Examples            
------------------------------------------------------------
I-geo      | Park                 State                State               
I-art      | Non-Proliferation    V-6                  Simple              
I-org      | Party                Atomic               Energy              
I-eve      | Summer               Olympics             Olympic             
I-tim      | 8                    1                    2                   
B-geo      | London               Iraq                 Hyde                
B-org      | Labor                International        IAEA                
O          | Thousands            of                   demonstrators       
B-art      | Nuclear              Saltillo             Pentastar           
B-tim      | Wednesday            Wednesday            Tuesday             
I-gpe      | States               Korea                Binh                
B-gpe      | British              English              Britain       

In [5]:
# Number of distinct sentence labels (NaN, if any, counted as its own label).
n_sentences = data['Sentence #'].nunique(dropna=False)
n_sentences

47959

In [6]:
# Using groupby and apply rather than a for loop the computation is reduced from +2h to 4s
%time X_txt = list(data.groupby("Sentence #")['Word'].apply(list))
%time Y_txt = list(data.groupby("Sentence #")['Tag'].apply(list))

from skseq.sequences.label_dictionary import LabelDictionary
x_dict = LabelDictionary(label_names=data['Word'].unique())
y_dict = LabelDictionary(label_names=data['Tag'].unique())

Wall time: 3.39 s
Wall time: 3.28 s


In [7]:
import scipy
import numpy as np

import os,sys,inspect
# Make the parent of the notebook's working directory importable.
# The working directory is taken from os.getcwd(): the file name reported for
# the current frame depends on how the kernel names cell code (a pseudo-name
# resolved against the cwd, or a temporary file elsewhere), so deriving the
# directory from it is not reliable.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
# Guarded so that re-running the cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

import skseq
import skseq.sequences
import skseq.readers
from skseq.sequences import sequence
from skseq.sequences import sequence_list
from skseq.sequences import label_dictionary

In [8]:
# Sentences before index n_split form the training set, the rest the test set
# (35971 of the 47959 sentences, roughly 75%, go to training).
n_split = 35971
X_train_txt, Y_train_txt = X_txt[:n_split], Y_txt[:n_split]
X_test_txt, Y_test_txt = X_txt[n_split:], Y_txt[n_split:]

In [9]:
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.sequence import Sequence

# One SequenceList per split, both built on the same word and tag dictionaries.
train_seq_list = SequenceList(x_dict, y_dict)
test_seq_list = SequenceList(x_dict, y_dict)

# Add the sentences one by one: the training split first, then the test split.
for seq_list, sentences, tag_lists in (
    (train_seq_list, X_train_txt, Y_train_txt),
    (test_seq_list, X_test_txt, Y_test_txt),
):
    for words, tags in zip(sentences, tag_lists):
        seq_list.add_sequence(words, tags, x_dict, y_dict)

In [10]:
# Import the submodules explicitly: attribute access such as
# `skseq.sequences.id_feature` only works if something has already imported
# that submodule, which this notebook never does directly.
import skseq.sequences.id_feature
import skseq.sequences.structured_perceptron as spc

# Build the feature set from the training sequences only.
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq_list)
feature_mapper.build_features()

print('Feature dict length:', len(feature_mapper.feature_dict))
print('Feature list length:', len(feature_mapper.feature_list))

# Untrained structured perceptron over the word/tag dictionaries and features.
sp = spc.StructuredPerceptron(x_dict, y_dict, feature_mapper)

Feature dict length: 38298
Feature list length: 35971


In [11]:
def evaluate_corpus(sequences, sequences_predictions):
    """Compute token-level accuracy of predictions against the gold standard.

    Args:
        sequences: Gold sequences; each item exposes its labels as ``.y``.
        sequences_predictions: Predicted sequences, indexable in the same
            order as ``sequences``; each item exposes its labels as ``.y``.

    Returns:
        float: Fraction of predicted labels that equal the gold label at the
        same position, or ``0.0`` when there is no label to compare (an empty
        corpus used to raise ``ZeroDivisionError``).
    """
    total = 0
    correct = 0
    for i, sequence in enumerate(tqdm(sequences, leave=False)):
        pred = sequences_predictions[i]
        # Score exactly the positions that were predicted.
        for j, y_hat in enumerate(pred.y):
            if sequence.y[j] == y_hat:
                correct += 1
            total += 1
    if total == 0:
        # Nothing was scored; report zero accuracy instead of dividing by zero.
        return 0.0
    return correct / total

def train_perceptron(load_no_fit = True, num_epochs = 50, model_dir = ""):
    """Restore the global perceptron ``sp`` from disk, or train and save it.

    Args:
        load_no_fit: When True (default) no training happens and the
            parameters are read with ``sp.load_model(model_dir)``. When False
            the model is fitted on ``train_seq_list`` and then written with
            ``sp.save_model(model_dir)``.
        num_epochs: Number of epochs passed to ``sp.fit``. Defaults to 50,
            the value that used to be hard-coded.
        model_dir: Location passed to ``save_model`` / ``load_model``.
            Defaults to ``""``, the value that used to be hard-coded.

    Returns:
        None
    """
    if load_no_fit:
        sp.load_model(model_dir)
        return

    # Plain-Python timing instead of the `%time` magic, so the function is
    # valid Python and can be moved into a module unchanged.
    import time

    started = time.perf_counter()
    sp.fit(train_seq_list, num_epochs)
    print("Wall time: {:.2f} s".format(time.perf_counter() - started))
    sp.save_model(model_dir)

In [12]:
import pandas as pd
# `IPython.display` is the public module that provides `display`; reaching it
# through `IPython.core.display` is deprecated.
import IPython.display as ICD

In [13]:
def evaluate_perceptron():
    """Print train and test token accuracy of the global perceptron ``sp``.

    Both ``train_seq_list`` and ``test_seq_list`` are decoded with
    ``sp.viterbi_decode_corpus``; each decoding is then scored against its
    gold sequences by ``evaluate_corpus`` and the two accuracies are printed
    on one line. Nothing is returned.
    """
    # Decode both corpora before scoring either of them.
    train_decoded = sp.viterbi_decode_corpus(train_seq_list)
    test_decoded = sp.viterbi_decode_corpus(test_seq_list)

    accuracies = (
        evaluate_corpus(train_seq_list.seq_list, train_decoded),
        evaluate_corpus(test_seq_list.seq_list, test_decoded),
    )
    print("SP -  Accuracy Train: %.3f Test: %.3f" % accuracies)
    
def predict_text_tags(seq, nice_format=False):
    """Tag one sentence with the global perceptron ``sp`` and show the result.

    Args:
        seq: The sentence, either a string (split on whitespace, so
            punctuation stays attached to its word) or a list of words that
            is used as given.
        nice_format: When False (default) the sentence is printed on one line
            as `` word/TAG`` pairs, with every tag other than ``'O'`` in ANSI
            blue. When True a two-row table (Words / Tags) is displayed with
            the non-``'O'`` tags coloured blue.

    Returns:
        None

    Raises:
        AssertionError: If ``seq`` is neither a string nor a list.
    """
    assert isinstance(seq, str) or isinstance(seq, list), "The input must be a sentence (string format or a list of words)"

    if isinstance(seq, str):
        seq = seq.split()

    if not seq:
        # Nothing to tag: skip the decoder and emit the same empty line the
        # text format produces for zero words.
        if not nice_format:
            print("")
        return

    # sp.state_labels maps tag name -> numeric id; invert it to read the ids
    # returned by the decoder. The sentence is decoded once for both formats.
    num2lab = {v:k for k,v in sp.state_labels.items()}
    tags = [num2lab[w] for w in sp.predict_tags_given_words(seq)]

    if nice_format:
        # Public location of `display`; imported here so this function does
        # not depend on how another cell bound its display helper.
        from IPython.display import display

        grp = pd.DataFrame([seq, tags],
                           index=["Words", "Tags"], columns=["W_{:02d}".format(i) for i in range(len(seq))])
        styler = grp.style
        # Newer pandas names the element-wise styler `map`; older releases
        # only have `applymap`.
        colorize = getattr(styler, "map", None) or styler.applymap
        display(colorize(lambda x: 'color: blue' if x != 'O' and x in sp.state_labels else 'color: black'))
    else:
        print("".join(
            (" {}/{}" if tag == 'O' else " {}/\x1b[34m{}\x1b[0m").format(word, tag)
            for word, tag in zip(seq, tags)
        ))
            
#         print(" ".join(["{}/{}".format(o, num2lab[w]) for o, w in zip(seq, sp.predict_tags_given_words(seq))]))
    
    
def predict_batch_text_tags(batch, nice_format=False):
    """Run ``predict_text_tags`` on every sentence of ``batch``, in order.

    The batch may contain strings or lists of words, but not both kinds at
    once; a mixed batch fails the assertion before anything is predicted.
    ``nice_format`` is forwarded unchanged to ``predict_text_tags``.
    """
    n_strings = len([item for item in batch if isinstance(item, str)])
    n_lists = len([item for item in batch if isinstance(item, list)])
    assert not (n_strings and n_lists), "The inputs must be sentences (string format or lists of words)"

    for sentence in batch:
        predict_text_tags(sentence, nice_format)

In [14]:
# Restore previously saved perceptron parameters instead of retraining:
# with load_no_fit=True, train_perceptron skips sp.fit and calls sp.load_model.
train_perceptron(load_no_fit=True)
# evaluate_perceptron()

In [26]:
# Same sentence with and without a trailing period; whitespace splitting keeps
# the period attached to the last word.
for sentence in ("Jack London went to Paris", "Jack London went to Paris."):
    predict_text_tags(sentence.split())

# predict_text_tags("Jack London went to Paris".split(), nice_format=True)

 Jack/[34mB-per[0m London/[34mI-per[0m went/O to/O Paris/[34mB-geo[0m
 Jack/[34mB-per[0m London/[34mI-per[0m went/O to/O Paris./O


In [25]:
# Raw decoder output: one numeric tag id per word.
sp.predict_tags_given_words("Jack London went to Paris".split())

array([ 3, 10,  0,  0,  1])

In [19]:
# Probe sentences: correctly spelled and misspelled names side by side, plus
# ambiguous words, printed in the one-line text format.
demo_sentences = [
    "The programmers from Barcelona might write a sentence without a spell checker.",
    "The programmers from Barchelona cannot write a sentence without a spell checker.",
    "Jack London went to Parris.",
    "Jack London went to Paris.",
    "We never though Microsoft would become such a big company.",
    "We never though Microsof would become such a big company.",
    "The president of U.S.A though they could win the war",
    "The president of the United States of America though they could win the war",
    "The king of Saudi Arabia wanted total control.",
    "Robin does not want to go to Saudi Arabia.",
    "Apple is a great company.",
    "I really love apples and oranges.",
]
predict_batch_text_tags(demo_sentences)

 The/O programmers/O from/O Barcelona/[34mB-geo[0m might/O write/O a/O sentence/O without/O a/O spell/[34mB-per[0m checker./[34mI-per[0m
 The/O programmers/O from/O Barchelona/[34mB-org[0m cannot/[34mI-org[0m write/[34mI-org[0m a/O sentence/O without/O a/O spell/[34mB-per[0m checker./[34mI-per[0m
 Jack/[34mB-per[0m London/[34mI-per[0m went/O to/O Parris./O
 Jack/[34mB-per[0m London/[34mI-per[0m went/O to/O Paris./O
 We/O never/O though/O Microsoft/[34mB-org[0m would/O become/O such/O a/O big/O company./O
 We/O never/O though/O Microsof/O would/O become/O such/O a/O big/O company./O
 The/O president/O of/O U.S.A/[34mB-org[0m though/O they/O could/O win/O the/O war/O
 The/O president/O of/O the/O United/[34mB-geo[0m States/[34mI-geo[0m of/[34mI-geo[0m America/[34mI-geo[0m though/O they/O could/O win/O the/O war/O
 The/O king/O of/O Saudi/[34mB-geo[0m Arabia/[34mI-geo[0m wanted/O total/O control./O
 Robin/O does/O not/O want/O to/O go/O to/O Saudi/

In [20]:
# The same probe sentences, rendered as one styled table per sentence.
predict_batch_text_tags(
    [
        "The programmers from Barcelona might write a sentence without a spell checker.",
        "The programmers from Barchelona cannot write a sentence without a spell checker.",
        "Jack London went to Parris.",
        "Jack London went to Paris.",
        "We never though Microsoft would become such a big company.",
        "We never though Microsof would become such a big company.",
        "The president of U.S.A though they could win the war",
        "The president of the United States of America though they could win the war",
        "The king of Saudi Arabia wanted total control.",
        "Robin does not want to go to Saudi Arabia.",
        "Apple is a great company.",
        "I really love apples and oranges.",
    ],
    nice_format=True,
)

Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09,W_10,W_11
Words,The,programmers,from,Barcelona,might,write,a,sentence,without,a,spell,checker.
Tags,O,O,O,B-geo,O,O,O,O,O,O,B-per,I-per


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09,W_10,W_11
Words,The,programmers,from,Barchelona,cannot,write,a,sentence,without,a,spell,checker.
Tags,O,O,O,B-org,I-org,I-org,O,O,O,O,B-per,I-per


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Jack,London,went,to,Parris.
Tags,B-per,I-per,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Jack,London,went,to,Paris.
Tags,B-per,I-per,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09
Words,We,never,though,Microsoft,would,become,such,a,big,company.
Tags,O,O,O,B-org,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09
Words,We,never,though,Microsof,would,become,such,a,big,company.
Tags,O,O,O,O,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09
Words,The,president,of,U.S.A,though,they,could,win,the,war
Tags,O,O,O,B-org,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08,W_09,W_10,W_11,W_12,W_13
Words,The,president,of,the,United,States,of,America,though,they,could,win,the,war
Tags,O,O,O,O,B-geo,I-geo,I-geo,I-geo,O,O,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07
Words,The,king,of,Saudi,Arabia,wanted,total,control.
Tags,O,O,O,B-geo,I-geo,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05,W_06,W_07,W_08
Words,Robin,does,not,want,to,go,to,Saudi,Arabia.
Tags,O,O,O,O,O,O,O,B-per,I-per


Unnamed: 0,W_00,W_01,W_02,W_03,W_04
Words,Apple,is,a,great,company.
Tags,B-org,O,O,O,O


Unnamed: 0,W_00,W_01,W_02,W_03,W_04,W_05
Words,I,really,love,apples,and,oranges.
Tags,O,O,O,O,O,O
