#Preparations

In [1]:
! pip install transformers seqeval

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 8.0MB/s 
[?25hCollecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 8.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 51.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.4

In [2]:
import pandas as pd
import numpy as np

In [3]:
#for later comparison
results = {}

In [4]:
def load():
  # dataset source: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
  data_dir = "/content/drive/MyDrive/Data/ner/kaggle-ner/"
  return pd.read_csv(data_dir + "ner_dataset.csv", encoding="latin1").fillna(method = "ffill")

In [5]:
class SentencesLoader:

  def __init__(self, data):
    self.data = data
    self.grouped = self.get_sentences()
    self.sentences = [[s[0] for s in sentence]  for sentence in self.grouped]
    self.labels = [[s[2] for s in sentence]  for sentence in self.grouped]
    self.pos_tags = [[s[1] for s in sentence] for sentence in self.grouped]
    self.tags = data["Tag"].unique().tolist()
    self.tags.append("PAD")  #add general padding token
    self.tag2idx = {tag: idx for idx, tag in enumerate(self.tags)} #encode tags with numeric values

  def get_sentences(self):
    aggr_fun = lambda sentence: [(word, pos, tag) for word, pos, tag 
                                 in zip(sentence["Word"].values.tolist(),
                                        sentence["POS"].values.tolist(),
                                        sentence["Tag"].values.tolist())]
    return self.data.groupby("Sentence #").apply(aggr_fun)


In [6]:
data = load()
sentences_loader = SentencesLoader(data)
sentences_loader.sentences[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [7]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [8]:
results = {}

#Bert


##Preprocessing


In [9]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tqdm import tqdm, trange

torch.__version__

'1.8.1+cu101'

In [10]:
#torch config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

#set up BERT config as suggested by https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/
MAX_LEN = 75
bs = 32

In [11]:
class BertPreprocesser():
  def __init__(self, sentences, labels, tag2idx):
    self.sentences = sentences
    self.labels = labels
    self.tag2idx = tag2idx
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

    tokenized_texts_and_labels = [self.tokenize_and_preserve_labels(s, l) 
                                  for s, l in zip(self.sentences, self.labels)]

    tokenized_texts = [pair[0] for pair in tokenized_texts_and_labels] 
    labels = [pair[1] for pair in tokenized_texts_and_labels]

    pad_i, pad_t = self.add_padding(tokenized_texts, labels)

    self.valid_dataloader = None
    self.train_dataloader = None
    self.to_data_loaders(pad_i, pad_t)
  
  def tokenize_and_preserve_labels(self, sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        tokenized_word = self.tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

  def add_padding(self, tokenized_texts, tokenized_labels):
    input_ids = [self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]
    input_padded = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

    tag_idxs = [[self.tag2idx.get(l) for l in lab] for lab in tokenized_labels]
    tag_padded = pad_sequences(tag_idxs, maxlen=MAX_LEN, value=self.tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")
    return input_padded, tag_padded

  def to_data_loaders(self, input_padded, tags_padded):
    attention_masks = [[float(i != 0.0) for i in ii] for ii in input_padded]
    tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_padded, tags_padded, random_state=118, test_size=0.1)
    tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_padded, random_state=118, test_size=0.1)

    tr_inputs = torch.tensor(tr_inputs)
    val_inputs = torch.tensor(val_inputs)
    tr_tags = torch.tensor(tr_tags)
    val_tags = torch.tensor(val_tags)
    tr_masks = torch.tensor(tr_masks)
    val_masks = torch.tensor(val_masks)

    train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
    train_sampler = RandomSampler(train_data)
    self.train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

    valid_data = TensorDataset(val_inputs, val_masks, val_tags)
    valid_sampler = SequentialSampler(valid_data)
    self.valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)


In [12]:
bert_preprocesser = BertPreprocesser(sentences_loader.sentences, sentences_loader.labels, sentences_loader.tag2idx)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [13]:
valid_dataloader = bert_preprocesser.valid_dataloader
train_dataloader = bert_preprocesser.train_dataloader

## Training and evaluation

In [14]:
import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from seqeval.metrics import f1_score, accuracy_score

In [15]:
tag2idx = sentences_loader.tag2idx
tag_list = sentences_loader.tags

In [16]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
# move to GPU -> uncomment only for GPU runtimes
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [17]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay_rate': 0.0}
]


optimizer = AdamW (optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

epochs = 3
max_grad_norm = 1.0

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [18]:
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    #------------- Training -------------
    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch (good practice!).
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        batch_input_ids, batch_input_mask, batch_labels = batch
        
        model.zero_grad()

        # forward pass
        outputs = model(batch_input_ids, token_type_ids=None,
                        attention_mask=batch_input_mask, labels=batch_labels)
        loss = outputs[0]

        # backward pass
        loss.backward()

        # track train loss
        total_loss += loss.item()
        # clip gradient to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    loss_values.append(avg_train_loss)

    # --------------- Validation ----------------
    # change mode
    model.eval()

    # reset
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        batch_input_ids, batch_input_mask, batch_labels = batch

        with torch.no_grad():
            # forward pass, get logits
            outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)

        # move to cpu
        logits = outputs[1].detach().cpu().numpy()
        label_ids = batch_labels.to('cpu').numpy()

        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print(f"Validation loss: {eval_loss}")

    pred_tags = [[tag_list[p_i] for p_i, l_i in zip(p, l) if tag_list[l_i] != "PAD" ]
                                 for p, l in zip(predictions, true_labels)]
    valid_tags = [[tag_list[l_i] for p_i, l_i in zip(p, l) if tag_list[l_i] != "PAD" ]
                                 for p, l in zip(predictions, true_labels)]
    f1=0
    acc = 0
    try:
      acc = accuracy_score(pred_tags, valid_tags)
      f1 = f1_score(pred_tags, valid_tags)
    except:
      print(pred_tags)
      print(valid_tags)
    print(f"Validation Accuracy: {acc}")
    print(f"Validation F1-Score: {f1}")
    print()

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Average train loss: 0.1905821890931292
Validation loss: 0.1319598850607872


Epoch:  33%|███▎      | 1/3 [10:00<20:01, 600.61s/it]

Validation Accuracy: 0.9590428169945445
Validation F1-Score: 0.8247585016503649

Average train loss: 0.10961607094989431
Validation loss: 0.12100724913179875


Epoch:  67%|██████▋   | 2/3 [20:02<10:01, 601.01s/it]

Validation Accuracy: 0.9630600099189949
Validation F1-Score: 0.8406417757403944

Average train loss: 0.08159509383755052
Validation loss: 0.12224106021225452


Epoch: 100%|██████████| 3/3 [30:05<00:00, 601.70s/it]

Validation Accuracy: 0.9641015043808894
Validation F1-Score: 0.8433509433962264






In [19]:
results["BERT"] = {}
results["BERT"]["f1"] = f1
results["BERT"]["acc"] = acc

# Spacy


##Preprocessing

In [20]:
! pip install -U spacy
! python -m spacy download en_core_web_sm

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/1b/d8/0361bbaf7a1ff56b44dca04dace54c82d63dad7475b7d25ea1baefafafb2/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 240kB/s 
Collecting spacy-legacy<3.1.0,>=3.0.4
  Downloading https://files.pythonhosted.org/packages/8d/67/d4002a18e26bf29b17ab563ddb55232b445ab6a02f97bf17d1345ff34d3f/spacy_legacy-3.0.5-py2.py3-none-any.whl
Collecting thinc<8.1.0,>=8.0.3
[?25l  Downloading https://files.pythonhosted.org/packages/61/87/decceba68a0c6ca356ddcb6aea8b2500e71d9bc187f148aae19b747b7d3c/thinc-8.0.3-cp37-cp37m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 40.6MB/s 
Collecting pathy>=0.3.5
[?25l  Downloading https://files.pythonhosted.org/packages/13/87/5991d87be8ed60beb172b4062dbafef18b32fa559635a8e2b633c2974f85/pathy-0.5.2-py3-none-any.whl (42kB)
[K     |████████████████████████████████| 51kB 9.2MB/s 
Collecting typer<0.4.0,>

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
sentences = sentences_loader.sentences
labels = sentences_loader.labels

In [23]:
train_s, test_s, train_l, test_l = train_test_split(sentences, labels, random_state=118, test_size=0.25)

print(len(train_s), len(train_l), len(test_s), len(test_l))

35969 35969 11990 11990


In [24]:
test_data = [[(ent, l) for ent, l in zip(sentence, labels) if l!='O'] for sentence, labels in zip(test_s, test_l)]

In [25]:
def to_spacy_format(sentences, labels):
  train_data = []

  for sentence, label in zip(sentences, labels):
    entity_lst = []
    idx = 0
    for s, l in zip(sentence, label):
      if l != 'O':
        entity_lst.append( (idx, idx + len(s), l))
      idx += len(s) +1

    text = " ".join(sentence[:-1])
    train_data.append((text, {"entities": entity_lst}))

  return train_data

In [26]:
json_train = to_spacy_format(train_s, train_l)
json_test = to_spacy_format(test_s, test_l)


##Training and evaluation

In [27]:
import spacy
from spacy.util import minibatch, compounding
from tqdm import tqdm, trange
from spacy.training import Example

In [28]:
# nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank('en')

nlp.add_pipe('ner')
ner = nlp.get_pipe('ner')

In [29]:
for l in sentences_loader.tags:
  ner.add_label(l)

optimizer = nlp.begin_training()


In [30]:
from spacy.util import minibatch, compounding
from spacy.training import Example
import random

pipes = [p for p in nlp.pipe_names if p != 'ner']
n_iter = 10

with nlp.disable_pipes(*pipes):
  examples = []
  for text, annotation in json_train:
    doc = nlp.make_doc(text)
    examples.append(Example.from_dict(doc, annotation))

  nlp.initialize(lambda: examples)
  losses = []
  for n in tqdm(range(n_iter)):
    random.shuffle(examples)
    for batch in minibatch(examples, size=8):
      loss = {}
      nlp.update(batch,sgd = optimizer, losses = loss)
      losses.append(loss['ner'])
    print(losses)

 10%|█         | 1/10 [04:27<40:05, 267.31s/it]

[113.48650431632996, 155.4552390575409, 118.62874114513397, 194.1432626247406, 187.50994050502777, 149.65019273757935, 118.73559504747391, 153.62681740522385, 94.49580186605453, 81.44404798746109, 72.90638267993927, 39.41099325940013, 37.42086456657853, 59.386255743243964, 52.43513315689779, 57.86003029304993, 48.28738355595851, 39.44803114410024, 45.47480588534381, 41.46658247313462, 56.01517000235617, 38.99827999807894, 29.572634560987353, 28.27255625464022, 31.686758555471897, 47.05686429515481, 21.518243001773953, 28.454468481068034, 42.01672312592564, 23.317661208428035, 47.443064914741626, 35.39549869966868, 32.49572573801561, 43.31732623843709, 18.241781943303067, 33.7059194277972, 25.63108233900857, 38.781996541656554, 39.01675563008757, 27.044119471887825, 30.120811759014032, 39.63596860274629, 10.441630307643209, 34.92154841715819, 17.984093076905992, 48.932391791036935, 31.178023993834813, 42.472074948382215, 26.48100954871552, 21.020110897661652, 28.495763001467367, 20.2458

 20%|██        | 2/10 [08:50<35:29, 266.17s/it]

[113.48650431632996, 155.4552390575409, 118.62874114513397, 194.1432626247406, 187.50994050502777, 149.65019273757935, 118.73559504747391, 153.62681740522385, 94.49580186605453, 81.44404798746109, 72.90638267993927, 39.41099325940013, 37.42086456657853, 59.386255743243964, 52.43513315689779, 57.86003029304993, 48.28738355595851, 39.44803114410024, 45.47480588534381, 41.46658247313462, 56.01517000235617, 38.99827999807894, 29.572634560987353, 28.27255625464022, 31.686758555471897, 47.05686429515481, 21.518243001773953, 28.454468481068034, 42.01672312592564, 23.317661208428035, 47.443064914741626, 35.39549869966868, 32.49572573801561, 43.31732623843709, 18.241781943303067, 33.7059194277972, 25.63108233900857, 38.781996541656554, 39.01675563008757, 27.044119471887825, 30.120811759014032, 39.63596860274629, 10.441630307643209, 34.92154841715819, 17.984093076905992, 48.932391791036935, 31.178023993834813, 42.472074948382215, 26.48100954871552, 21.020110897661652, 28.495763001467367, 20.2458

 30%|███       | 3/10 [13:05<30:38, 262.71s/it]

[113.48650431632996, 155.4552390575409, 118.62874114513397, 194.1432626247406, 187.50994050502777, 149.65019273757935, 118.73559504747391, 153.62681740522385, 94.49580186605453, 81.44404798746109, 72.90638267993927, 39.41099325940013, 37.42086456657853, 59.386255743243964, 52.43513315689779, 57.86003029304993, 48.28738355595851, 39.44803114410024, 45.47480588534381, 41.46658247313462, 56.01517000235617, 38.99827999807894, 29.572634560987353, 28.27255625464022, 31.686758555471897, 47.05686429515481, 21.518243001773953, 28.454468481068034, 42.01672312592564, 23.317661208428035, 47.443064914741626, 35.39549869966868, 32.49572573801561, 43.31732623843709, 18.241781943303067, 33.7059194277972, 25.63108233900857, 38.781996541656554, 39.01675563008757, 27.044119471887825, 30.120811759014032, 39.63596860274629, 10.441630307643209, 34.92154841715819, 17.984093076905992, 48.932391791036935, 31.178023993834813, 42.472074948382215, 26.48100954871552, 21.020110897661652, 28.495763001467367, 20.2458

 40%|████      | 4/10 [17:27<26:14, 262.48s/it]

[113.48650431632996, 155.4552390575409, 118.62874114513397, 194.1432626247406, 187.50994050502777, 149.65019273757935, 118.73559504747391, 153.62681740522385, 94.49580186605453, 81.44404798746109, 72.90638267993927, 39.41099325940013, 37.42086456657853, 59.386255743243964, 52.43513315689779, 57.86003029304993, 48.28738355595851, 39.44803114410024, 45.47480588534381, 41.46658247313462, 56.01517000235617, 38.99827999807894, 29.572634560987353, 28.27255625464022, 31.686758555471897, 47.05686429515481, 21.518243001773953, 28.454468481068034, 42.01672312592564, 23.317661208428035, 47.443064914741626, 35.39549869966868, 32.49572573801561, 43.31732623843709, 18.241781943303067, 33.7059194277972, 25.63108233900857, 38.781996541656554, 39.01675563008757, 27.044119471887825, 30.120811759014032, 39.63596860274629, 10.441630307643209, 34.92154841715819, 17.984093076905992, 48.932391791036935, 31.178023993834813, 42.472074948382215, 26.48100954871552, 21.020110897661652, 28.495763001467367, 20.2458

 50%|█████     | 5/10 [21:33<21:28, 257.62s/it]

[113.48650431632996, 155.4552390575409, 118.62874114513397, 194.1432626247406, 187.50994050502777, 149.65019273757935, 118.73559504747391, 153.62681740522385, 94.49580186605453, 81.44404798746109, 72.90638267993927, 39.41099325940013, 37.42086456657853, 59.386255743243964, 52.43513315689779, 57.86003029304993, 48.28738355595851, 39.44803114410024, 45.47480588534381, 41.46658247313462, 56.01517000235617, 38.99827999807894, 29.572634560987353, 28.27255625464022, 31.686758555471897, 47.05686429515481, 21.518243001773953, 28.454468481068034, 42.01672312592564, 23.317661208428035, 47.443064914741626, 35.39549869966868, 32.49572573801561, 43.31732623843709, 18.241781943303067, 33.7059194277972, 25.63108233900857, 38.781996541656554, 39.01675563008757, 27.044119471887825, 30.120811759014032, 39.63596860274629, 10.441630307643209, 34.92154841715819, 17.984093076905992, 48.932391791036935, 31.178023993834813, 42.472074948382215, 26.48100954871552, 21.020110897661652, 28.495763001467367, 20.2458

 60%|██████    | 6/10 [25:49<17:08, 257.02s/it]

[113.48650431632996, 155.4552390575409, 118.62874114513397, 194.1432626247406, 187.50994050502777, 149.65019273757935, 118.73559504747391, 153.62681740522385, 94.49580186605453, 81.44404798746109, 72.90638267993927, 39.41099325940013, 37.42086456657853, 59.386255743243964, 52.43513315689779, 57.86003029304993, 48.28738355595851, 39.44803114410024, 45.47480588534381, 41.46658247313462, 56.01517000235617, 38.99827999807894, 29.572634560987353, 28.27255625464022, 31.686758555471897, 47.05686429515481, 21.518243001773953, 28.454468481068034, 42.01672312592564, 23.317661208428035, 47.443064914741626, 35.39549869966868, 32.49572573801561, 43.31732623843709, 18.241781943303067, 33.7059194277972, 25.63108233900857, 38.781996541656554, 39.01675563008757, 27.044119471887825, 30.120811759014032, 39.63596860274629, 10.441630307643209, 34.92154841715819, 17.984093076905992, 48.932391791036935, 31.178023993834813, 42.472074948382215, 26.48100954871552, 21.020110897661652, 28.495763001467367, 20.2458

 70%|███████   | 7/10 [29:52<12:38, 252.87s/it]

[113.48650431632996, 155.4552390575409, 118.62874114513397, 194.1432626247406, 187.50994050502777, 149.65019273757935, 118.73559504747391, 153.62681740522385, 94.49580186605453, 81.44404798746109, 72.90638267993927, 39.41099325940013, 37.42086456657853, 59.386255743243964, 52.43513315689779, 57.86003029304993, 48.28738355595851, 39.44803114410024, 45.47480588534381, 41.46658247313462, 56.01517000235617, 38.99827999807894, 29.572634560987353, 28.27255625464022, 31.686758555471897, 47.05686429515481, 21.518243001773953, 28.454468481068034, 42.01672312592564, 23.317661208428035, 47.443064914741626, 35.39549869966868, 32.49572573801561, 43.31732623843709, 18.241781943303067, 33.7059194277972, 25.63108233900857, 38.781996541656554, 39.01675563008757, 27.044119471887825, 30.120811759014032, 39.63596860274629, 10.441630307643209, 34.92154841715819, 17.984093076905992, 48.932391791036935, 31.178023993834813, 42.472074948382215, 26.48100954871552, 21.020110897661652, 28.495763001467367, 20.2458

 80%|████████  | 8/10 [34:22<08:36, 258.16s/it]

[113.48650431632996, 155.4552390575409, 118.62874114513397, 194.1432626247406, 187.50994050502777, 149.65019273757935, 118.73559504747391, 153.62681740522385, 94.49580186605453, 81.44404798746109, 72.90638267993927, 39.41099325940013, 37.42086456657853, 59.386255743243964, 52.43513315689779, 57.86003029304993, 48.28738355595851, 39.44803114410024, 45.47480588534381, 41.46658247313462, 56.01517000235617, 38.99827999807894, 29.572634560987353, 28.27255625464022, 31.686758555471897, 47.05686429515481, 21.518243001773953, 28.454468481068034, 42.01672312592564, 23.317661208428035, 47.443064914741626, 35.39549869966868, 32.49572573801561, 43.31732623843709, 18.241781943303067, 33.7059194277972, 25.63108233900857, 38.781996541656554, 39.01675563008757, 27.044119471887825, 30.120811759014032, 39.63596860274629, 10.441630307643209, 34.92154841715819, 17.984093076905992, 48.932391791036935, 31.178023993834813, 42.472074948382215, 26.48100954871552, 21.020110897661652, 28.495763001467367, 20.2458

 90%|█████████ | 9/10 [38:58<04:23, 263.30s/it]

[113.48650431632996, 155.4552390575409, 118.62874114513397, 194.1432626247406, 187.50994050502777, 149.65019273757935, 118.73559504747391, 153.62681740522385, 94.49580186605453, 81.44404798746109, 72.90638267993927, 39.41099325940013, 37.42086456657853, 59.386255743243964, 52.43513315689779, 57.86003029304993, 48.28738355595851, 39.44803114410024, 45.47480588534381, 41.46658247313462, 56.01517000235617, 38.99827999807894, 29.572634560987353, 28.27255625464022, 31.686758555471897, 47.05686429515481, 21.518243001773953, 28.454468481068034, 42.01672312592564, 23.317661208428035, 47.443064914741626, 35.39549869966868, 32.49572573801561, 43.31732623843709, 18.241781943303067, 33.7059194277972, 25.63108233900857, 38.781996541656554, 39.01675563008757, 27.044119471887825, 30.120811759014032, 39.63596860274629, 10.441630307643209, 34.92154841715819, 17.984093076905992, 48.932391791036935, 31.178023993834813, 42.472074948382215, 26.48100954871552, 21.020110897661652, 28.495763001467367, 20.2458

100%|██████████| 10/10 [43:22<00:00, 260.25s/it]

[113.48650431632996, 155.4552390575409, 118.62874114513397, 194.1432626247406, 187.50994050502777, 149.65019273757935, 118.73559504747391, 153.62681740522385, 94.49580186605453, 81.44404798746109, 72.90638267993927, 39.41099325940013, 37.42086456657853, 59.386255743243964, 52.43513315689779, 57.86003029304993, 48.28738355595851, 39.44803114410024, 45.47480588534381, 41.46658247313462, 56.01517000235617, 38.99827999807894, 29.572634560987353, 28.27255625464022, 31.686758555471897, 47.05686429515481, 21.518243001773953, 28.454468481068034, 42.01672312592564, 23.317661208428035, 47.443064914741626, 35.39549869966868, 32.49572573801561, 43.31732623843709, 18.241781943303067, 33.7059194277972, 25.63108233900857, 38.781996541656554, 39.01675563008757, 27.044119471887825, 30.120811759014032, 39.63596860274629, 10.441630307643209, 34.92154841715819, 17.984093076905992, 48.932391791036935, 31.178023993834813, 42.472074948382215, 26.48100954871552, 21.020110897661652, 28.495763001467367, 20.2458




In [31]:
test_examples = []
for text, annotation in json_test:
    doc = nlp.make_doc(text)
    test_examples.append(Example.from_dict(doc, annotation))

scores = nlp.evaluate(test_examples)
print(scores)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.8427967045201514, 'ents_r': 0.8435689168441385, 'ents_f': 0.8431826338782936, 'ents_per_type': {'B-geo': {'p': 0.8370215595023269, 'r': 0.9249580184718724, 'f': 0.8787954330159047}, 'I-geo': {'p': 0.7826993225638353, 'r': 0.7985114300903775, 'f': 0.7905263157894737}, 'B-org': {'p': 0.7869318181818182, 'r': 0.7212096935709994, 'f': 0.7526387292298046}, 'I-org': {'p': 0.8421349901157865, 'r': 0.713738630923887, 'f': 0.7726389428682472}, 'B-art': {'p': 0.2857142857142857, 'r': 0.037383177570093455, 'f': 0.06611570247933883}, 'B-gpe': {'p': 0.965259907359753, 'r': 0.9342465753424658, 'f': 0.9495000632831287}, 'B-tim': {'p': 0.918973124749298, 'r': 0.8991365777080063, 'f': 0.9089466375719104}, 'B-per': {'p': 0.8212157330154947, 'r': 0.811925524393118, 'f': 0.8165442047878643}, 'I-per': {'p': 0.8108771212430995, 'r': 0.9017735334242838, 'f': 0.853913230702982}, 'I-tim': {'p': 0.7927553444180523, 'r': 0.81352833638

In [32]:
results["Spacy"] ={}
results["Spacy"]["precision"] = scores["ents_p"]
results["Spacy"]["recall"] = scores["ents_r"]
results["Spacy"]["fscore"] = scores["ents_f"]

#NLTK

##Preparation


In [33]:
from nltk.chunk import tree2conlltags
from pprint import pprint
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [34]:
from sklearn.model_selection import train_test_split

sentences = sentences_loader.sentences
labels = sentences_loader.labels
pos_tags = sentences_loader.pos_tags

train_s, test_s, train_l, test_l = train_test_split(sentences, labels, random_state=118, test_size=0.25)
train_p, test_p, _ , _ = train_test_split(pos_tags, labels, random_state=118, test_size=0.25)

print(len(train_s), len(train_l), len(test_s), len(test_l), len(train_p), len(test_p))

35969 35969 11990 11990 35969 11990


In [35]:
def train_reader_generator ():
  for sent, pos, lab in zip(train_s, train_p, train_l):
    yield [((w, t), iob) for w, t, iob in zip(sent, pos, lab)]

def test_reader_generator ():
  for sent, pos, lab in zip(test_s, test_p, test_l):
    yield [((w, t), iob) for w, t, iob in zip(sent, pos, lab)]

In [36]:
train_reader = train_reader_generator()
test_reader = test_reader_generator()

In [37]:
from nltk.stem.snowball import SnowballStemmer
import string 

def features(tokens, index, history):
  stemmer = SnowballStemmer('english')
  tokens = [('[S2]', '[S2]'), ('[S1]', '[S1]')] + list(tokens) + [('[E1]', '[E1]'), ('[E2]', '[E2]')]
  history = ['S2', 'S1'] + list(history)

  #accomodate the padding
  index +=2

  word, pos = tokens[index]
  prev_w, prev_p = tokens[index-1]
  prev_prev_w, prev_prev_p = tokens[index-2]
  prev_iob = history[index - 1]
  next_w, next_p = tokens[index+1]
  next_next_w, next_next_p = tokens[index+2]

  contains_dash = '-' in word
  contains_dot  = '.' in word
  allascii = all([True for c in word if c in string.ascii_lowercase])
  allcaps = word == word.capitalize()
  capitalized = word[0] in string.ascii_uppercase

  prev_allcaps = prev_w == prev_w.capitalize()
  prev_capitalized = prev_w in string.ascii_uppercase

  next_allcaps = next_w == next_w.capitalize()
  next_capitalized = next_w in string.ascii_uppercase

  return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,
 
        'next-word': next_w,
        'next-lemma': stemmer.stem(next_w),
        'next-pos': next_p,
 
        'next-next-word': next_next_w,
        'nextnextpos': next_next_p,
 
        'prev-word': prev_w,
        'prev-lemma': stemmer.stem(prev_w),
        'prev-pos': prev_p,
 
        'prev-prev-word': prev_prev_w,
        'prev-prev-pos': prev_prev_p,
 
        'prev-iob': prev_iob,
 
        'contains-dash': contains_dash,
        'contains-dot': contains_dot,
 
        'all-caps': allcaps,
        'capitalized': capitalized,
 
        'prev-all-caps': prev_allcaps,
        'prev-capitalized': prev_capitalized,
 
        'next-all-caps': next_allcaps,
        'next-capitalized': next_capitalized,
    }

##Train and evaluate

In [38]:
import collections
from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree

class NERChunker(ChunkParserI):
  def __init__(self, train, **kwargs):
    assert isinstance(train, collections.Iterable)

    self.feature_detector = features
    self.tagger = ClassifierBasedTagger(feature_detector = self.feature_detector, train=train, **kwargs)

  def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)

    triplet_chunks = [(w, t, l) for ((w, t), l) in chunks]

    return conlltags2tree(triplet_chunks)

In [39]:
#### training 
train_data = list(train_reader_generator())
test_data = list(test_reader_generator())

chunker = NERChunker(train_data)

score = chunker.evaluate([conlltags2tree([(w, t, iob) for (w, t), iob in iobs]) for iobs in test_data])
print(score)

  


ChunkParse score:
    IOB Accuracy:  93.9%%
    Precision:     65.7%%
    Recall:        76.8%%
    F-Measure:     70.8%%


In [40]:
results["NLTK"] = {}
results["NLTK"]["acc"] = score.accuracy()
results["NLTK"]["precision"] = score.precision()
results["NLTK"]["F-measure"] = score.f_measure()
results["NLTK"]["recall"] = score.recall()

#Results


In [41]:
from pprint import pprint
pprint(results)

{'BERT': {'acc': 0.9641015043808894, 'f1': 0.8433509433962264},
 'NLTK': {'F-measure': 0.7079149438865918,
          'acc': 0.9392790534136424,
          'precision': 0.6566923381327814,
          'recall': 0.7678043919279638},
 'Spacy': {'fscore': 0.8431826338782936,
           'precision': 0.8427967045201514,
           'recall': 0.8435689168441385}}
