In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange



# `display` and `HTML` are part of the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated (IPython >= 7.14 warns about it)
# and may be unavailable in newer IPython releases.
from IPython.display import display, HTML
# Uncomment to let the notebook container use the full browser width:
# display(HTML("<style>.container { width:100% !important; }</style>"))

# Notebook-wide pandas rendering limits: at most 25 rows, 500 columns and
# 50 characters per cell when a frame is displayed.
pd.options.display.max_rows = 25
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 50

In [None]:
# Load the NER corpus and forward-fill missing cells: every NaN (in any column)
# is replaced by the previous non-NaN value of the same column.
# Later cells group rows by "Sentence #", so that column has to be populated on
# every row (presumably the CSV only sets it on the first token of each
# sentence -- verify against the file).
# `DataFrame.ffill()` is used instead of `fillna(method="ffill")`, whose
# `method` argument is deprecated in pandas 2.x.
data = pd.read_csv("data/kaggle_ner/ner_dataset.csv", encoding="latin1").ffill()

data.head()

In [3]:
# Show every distinct label in the "Tag" column (a set, so the order is arbitrary).
print("Available TAGS:", set(data["Tag"]), sep="\n")

Available TAGS:
{'I-eve', 'I-art', 'I-gpe', 'I-tim', 'B-eve', 'I-per', 'O', 'B-art', 'B-nat', 'B-geo', 'B-per', 'B-gpe', 'I-geo', 'B-org', 'I-org', 'I-nat', 'B-tim'}


In [4]:
# Print a small table with up to three example words per tag.
# The example columns are built from however many words exist for the tag, so a
# tag with fewer than three occurrences prints a shorter row instead of raising
# IndexError from a fixed three-placeholder format string.
print("{:10s} | {:20s}".format("TAG", "Examples"))
print("-"*60)
for tag in set(data["Tag"]):
    # First (up to) three words labelled with this tag, in row order.
    examples = data.loc[data["Tag"] == tag, "Word"].head(3)
    example_columns = " ".join("{:20s}".format(word) for word in examples)
    print("{:10s} | {}".format(tag, example_columns))

TAG        | Examples            
------------------------------------------------------------
I-eve      | Summer               Olympics             Olympic             
I-art      | Non-Proliferation    V-6                  Simple              
I-gpe      | States               Korea                Binh                
I-tim      | 8                    1                    2                   
B-eve      | 2012                 Games                Games               
I-per      | Mahmoud              Ahmadinejad          Horbach             
O          | Thousands            of                   demonstrators       
B-art      | Nuclear              Saltillo             Pentastar           
B-nat      | H5N1                 H5N1                 Jing                
B-geo      | London               Iraq                 Hyde                
B-per      | Bush                 President            Thomas              
B-gpe      | British              English              Britain       

In [5]:
# Number of distinct sentence identifiers (a NaN, if present, counts as one value).
n_sentences = data["Sentence #"].nunique(dropna=False)
n_sentences

47959

In [7]:
# Using groupby and apply rather than a for loop the computation is reduced from +2h to 4s
%time X_txt = list(data.groupby("Sentence #")['Word'].apply(list))
%time Y_txt = list(data.groupby("Sentence #")['Tag'].apply(list))

from skseq.sequences.label_dictionary import LabelDictionary
x_dict = LabelDictionary(label_names=data['Word'].unique())
y_dict = LabelDictionary(label_names=data['Tag'].unique())

Wall time: 4.56 s
Wall time: 4.53 s


In [8]:
import scipy
import numpy as np

import os
import sys
import inspect

# Put the parent of the working directory on the import path (presumably where
# the `skseq` package lives -- confirm against the repository layout).
# The working directory is read directly: inspect.getfile(inspect.currentframe())
# returns whatever pseudo-filename the kernel assigns to a cell, which on some
# kernels is a path inside a temporary directory rather than the notebook folder.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
if parentdir not in sys.path:  # re-running the cell must not stack duplicates
    sys.path.insert(0, parentdir)

import skseq
import skseq.sequences
import skseq.readers
from skseq.sequences import sequence
from skseq.sequences import sequence_list
from skseq.sequences import label_dictionary

In [9]:
# The first 35971 sentences (about 75% of the 47959 reported above) are used
# for training; everything after that index is held out for testing.
n_split = 35971
X_train_txt, Y_train_txt = X_txt[:n_split], Y_txt[:n_split]
X_test_txt, Y_test_txt = X_txt[n_split:], Y_txt[n_split:]

In [14]:
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.sequence import Sequence

# train_sequences = [Sequence(x,y) for x,y in zip(X_train_txt, Y_train_txt)]
# test_sequences = [Sequence(x,y) for x,y in zip(X_test_txt, Y_test_txt)]

# Wrap each split in its own SequenceList; both share the word and tag
# dictionaries. The train list is created and filled before the test list.
split_sources = (
    ("train", X_train_txt, Y_train_txt),
    ("test", X_test_txt, Y_test_txt),
)
seq_lists = {}
for split_name, sentences, tag_sequences in split_sources:
    seq_list = SequenceList(x_dict, y_dict)
    for words, tags in zip(sentences, tag_sequences):
        seq_list.add_sequence(words, tags, x_dict, y_dict)
    seq_lists[split_name] = seq_list

train_seq_list = seq_lists["train"]
test_seq_list = seq_lists["test"]

In [19]:
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq_list)
feature_mapper.build_features()

print('Feature dict length:', len(feature_mapper.feature_dict))
print('Feature list length:', len(feature_mapper.feature_list))

import skseq.sequences.structured_perceptron as spc
# sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
sp = spc.StructuredPerceptron(x_dict, y_dict, feature_mapper)

num_epochs = 5
%time sp.fit(train_seq_list, num_epochs)

def evaluate_corpus(sequences, sequences_predictions):
    """Compute token-level accuracy of predictions against the gold standard.

    Args:
        sequences: iterable of gold sequences; each item exposes its labels
            through an indexable ``.y`` attribute.
        sequences_predictions: indexable collection of predicted sequences,
            aligned by position with ``sequences``; each item exposes its
            predicted labels through an iterable ``.y`` attribute.

    Returns:
        float: number of predicted labels equal to the gold label at the same
        position, divided by the total number of predicted labels. Returns
        0.0 when there are no labels to compare (e.g. an empty corpus) instead
        of raising ZeroDivisionError.
    """
    total = 0
    correct = 0
    for i, gold in enumerate(sequences):
        predicted = sequences_predictions[i]
        gold_labels = gold.y
        for j, y_hat in enumerate(predicted.y):
            if gold_labels[j] == y_hat:
                correct += 1
            total += 1
    if total == 0:
        # Nothing was compared; avoid dividing by zero.
        return 0.0
    return correct / total

# Decode both splits with the perceptron, then score each set of predictions
# against the gold labels stored in the corresponding sequence list.
pred_train = sp.viterbi_decode_corpus(train_seq_list)
pred_test = sp.viterbi_decode_corpus(test_seq_list)

# Train accuracy is computed first, then test accuracy.
eval_train, eval_test = (
    evaluate_corpus(gold.seq_list, predicted)
    for gold, predicted in ((train_seq_list, pred_train), (test_seq_list, pred_test))
)

print("SP -  Accuracy Train: %.3f Test: %.3f"%(eval_train, eval_test))

Feature dict length: 38298
Feature list length: 35971


KeyboardInterrupt: 

In [23]:
# Take the first training sequence as a small example to inspect; its repr
# appears to list each token as "<word id>/<tag id>" (see the cell output).
seq = train_seq_list[0]
seq

0/0 1/0 2/0 3/0 4/0 5/0 6/1 7/0 8/0 9/0 10/0 11/0 12/1 13/0 14/0 9/0 15/0 1/0 16/2 17/0 18/0 19/0 20/0 21/0 

In [24]:
# Decode the example sequence; per the recorded output the call returns a
# (predicted sequence, score) pair.
sp.viterbi_decode(seq)

(0/0 1/0 2/0 3/0 4/0 5/0 6/0 7/0 8/0 9/0 10/0 11/0 12/0 13/0 14/0 9/0 15/0 1/0 16/0 17/0 18/0 19/0 20/0 21/0 ,
 0.0)

In [28]:
%%time 
# Re-run decoding and scoring on the test split only; the %%time cell magic
# reports how long the whole cell takes.
# NOTE(review): the recorded training cell ended in KeyboardInterrupt and the
# sample decode shows every tag as 0 with score 0.0, so this accuracy may come
# from a model that was not fully trained -- confirm before quoting it.
pred_test  = sp.viterbi_decode_corpus(test_seq_list)
eval_test = evaluate_corpus(test_seq_list.seq_list, pred_test)
eval_test

Wall time: 2min 28s


0.8462011121693447