<a href="https://colab.research.google.com/github/jsokolowska/text-classification/blob/practical-excercise/practicalNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Preparations

In [1]:
! pip install transformers seqeval

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 7.5MB/s 
[?25hCollecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 7.8MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 34.8MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading htt

In [2]:
import pandas as pd
import numpy as np

In [3]:
#for later comparison
results = {}

In [4]:
def load():
  # dataset source: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
  data_dir = "/content/drive/MyDrive/Data/ner/kaggle-ner/"
  return pd.read_csv(data_dir + "ner_dataset.csv", encoding="latin1").fillna(method = "ffill")

In [5]:
class SentencesLoader:

  def __init__(self, data):
    self.data = data
    self.grouped = self.get_sentences()
    self.sentences = [[s[0] for s in sentence]  for sentence in self.grouped]
    self.labels = [[s[2] for s in sentence]  for sentence in self.grouped]
    self.pos_tags = [[s[1] for s in sentence] for sentence in self.grouped]
    self.tags = data["Tag"].unique().tolist()
    self.tags.append("PAD")  #add general padding token
    self.tag2idx = {tag: idx for idx, tag in enumerate(self.tags)} #encode tags with numeric values

  def get_sentences(self):
    aggr_fun = lambda sentence: [(word, pos, tag) for word, pos, tag 
                                 in zip(sentence["Word"].values.tolist(),
                                        sentence["POS"].values.tolist(),
                                        sentence["Tag"].values.tolist())]
    return self.data.groupby("Sentence #").apply(aggr_fun)


In [6]:
data = load()
sentences_loader = SentencesLoader(data)
sentences_loader.sentences[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [7]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [8]:
results = {}

#Bert


##Preprocessing


In [9]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tqdm import tqdm, trange

torch.__version__

'1.8.1+cu101'

In [10]:
#torch config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

#set up BERT config as suggested by https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/
MAX_LEN = 75
bs = 32

In [11]:
class BertPreprocesser():
  def __init__(self, sentences, labels, tag2idx):
    self.sentences = sentences
    self.labels = labels
    self.tag2idx = tag2idx
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

    tokenized_texts_and_labels = [self.tokenize_and_preserve_labels(s, l) 
                                  for s, l in zip(self.sentences, self.labels)]

    tokenized_texts = [pair[0] for pair in tokenized_texts_and_labels] 
    labels = [pair[1] for pair in tokenized_texts_and_labels]

    pad_i, pad_t = self.add_padding(tokenized_texts, labels)

    self.valid_dataloader = None
    self.train_dataloader = None
    self.to_data_loaders(pad_i, pad_t)
  
  def tokenize_and_preserve_labels(self, sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        tokenized_word = self.tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

  def add_padding(self, tokenized_texts, tokenized_labels):
    input_ids = [self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]
    input_padded = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

    tag_idxs = [[self.tag2idx.get(l) for l in lab] for lab in tokenized_labels]
    tag_padded = pad_sequences(tag_idxs, maxlen=MAX_LEN, value=self.tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")
    return input_padded, tag_padded

  def to_data_loaders(self, input_padded, tags_padded):
    attention_masks = [[float(i != 0.0) for i in ii] for ii in input_padded]
    tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_padded, tags_padded, random_state=118, test_size=0.1)
    tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_padded, random_state=118, test_size=0.1)

    tr_inputs = torch.tensor(tr_inputs)
    val_inputs = torch.tensor(val_inputs)
    tr_tags = torch.tensor(tr_tags)
    val_tags = torch.tensor(val_tags)
    tr_masks = torch.tensor(tr_masks)
    val_masks = torch.tensor(val_masks)

    train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
    train_sampler = RandomSampler(train_data)
    self.train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

    valid_data = TensorDataset(val_inputs, val_masks, val_tags)
    valid_sampler = SequentialSampler(valid_data)
    self.valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)


In [12]:
bert_preprocesser = BertPreprocesser(sentences_loader.sentences, sentences_loader.labels, sentences_loader.tag2idx)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [13]:
valid_dataloader = bert_preprocesser.valid_dataloader
train_dataloader = bert_preprocesser.train_dataloader

## Training and evaluation

In [14]:
import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from seqeval.metrics import f1_score, accuracy_score

In [15]:
tag2idx = sentences_loader.tag2idx
tag_list = sentences_loader.tags

In [16]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
# move to GPU -> uncomment only for GPU runtimes
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [17]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay_rate': 0.0}
]


optimizer = AdamW (optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

epochs = 3
max_grad_norm = 1.0

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [18]:
# measure performance without fine tuning
model.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions , true_labels = [], []


for batch in valid_dataloader:
  batch = tuple(t.to(device) for t in batch)
  batch_input_ids, batch_input_mask, batch_labels = batch

  with torch.no_grad():
    # forward pass, get logits
    outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)

    # move to cpu
    logits = outputs[1].detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()

    eval_loss += outputs[0].mean().item()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(label_ids)

  eval_loss = eval_loss / len(valid_dataloader)
  print(f"Validation loss: {eval_loss}")

  pred_tags = [[tag_list[p_i] for p_i, l_i in zip(p, l) if tag_list[l_i] != "PAD" ]
                                 for p, l in zip(predictions, true_labels)]
  valid_tags = [[tag_list[l_i] for p_i, l_i in zip(p, l) if tag_list[l_i] != "PAD" ]
                                 for p, l in zip(predictions, true_labels)]
  f1=0
  acc = 0
  try:
    acc = accuracy_score(pred_tags, valid_tags)
    f1 = f1_score(pred_tags, valid_tags)
  except:
    print(pred_tags)
    print(valid_tags)

Validation loss: 0.021602907180786134
Validation loss: 0.02193119551340739




Validation loss: 0.021876729490068227
Validation loss: 0.0219084130191916
Validation loss: 0.022038850060081557
Validation loss: 0.02189691485875829
Validation loss: 0.021772308854388793
Validation loss: 0.021795296163745242
Validation loss: 0.021994273023415854
Validation loss: 0.022103798655381528
Validation loss: 0.02193837173957103
Validation loss: 0.022170283318107557
Validation loss: 0.021933768444064892
Validation loss: 0.02211155556475324
Validation loss: 0.022075054782154505
Validation loss: 0.02198379912467481
Validation loss: 0.021938811519310986
Validation loss: 0.021904139590121417
Validation loss: 0.02196193157879417
Validation loss: 0.021953363446344795
Validation loss: 0.022018480428245002
Validation loss: 0.021950648944432116
Validation loss: 0.021909617892376946
Validation loss: 0.021822431243204224
Validation loss: 0.02176850622598822
Validation loss: 0.021988200716555415
Validation loss: 0.021860885935176613
Validation loss: 0.02215204270890785
Validation loss: 0.02

In [19]:
print(acc, f1)

0.0082079682592164 0.018258505532371926


In [20]:
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    #------------- Training -------------
    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch (good practice!).
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        batch_input_ids, batch_input_mask, batch_labels = batch
        
        model.zero_grad()

        # forward pass
        outputs = model(batch_input_ids, token_type_ids=None,
                        attention_mask=batch_input_mask, labels=batch_labels)
        loss = outputs[0]

        # backward pass
        loss.backward()

        # track train loss
        total_loss += loss.item()
        # clip gradient to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    loss_values.append(avg_train_loss)

    # --------------- Validation ----------------
    # change mode
    model.eval()

    # reset
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        batch_input_ids, batch_input_mask, batch_labels = batch

        with torch.no_grad():
            # forward pass, get logits
            outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)

        # move to cpu
        logits = outputs[1].detach().cpu().numpy()
        label_ids = batch_labels.to('cpu').numpy()

        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print(f"Validation loss: {eval_loss}")

    pred_tags = [[tag_list[p_i] for p_i, l_i in zip(p, l) if tag_list[l_i] != "PAD" ]
                                 for p, l in zip(predictions, true_labels)]
    valid_tags = [[tag_list[l_i] for p_i, l_i in zip(p, l) if tag_list[l_i] != "PAD" ]
                                 for p, l in zip(predictions, true_labels)]
    f1=0
    acc = 0
    try:
      acc = accuracy_score(pred_tags, valid_tags)
      f1 = f1_score(pred_tags, valid_tags)
    except:
      print(pred_tags)
      print(valid_tags)
    print(f"Validation Accuracy: {acc}")
    print(f"Validation F1-Score: {f1}")
    print()

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Average train loss: 0.19080437020052796
Validation loss: 0.12643766870101292


Epoch:  33%|███▎      | 1/3 [09:47<19:35, 587.88s/it]

Validation Accuracy: 0.9593734501570508
Validation F1-Score: 0.8207197912241306

Average train loss: 0.10973901993935421
Validation loss: 0.12175005121777455


Epoch:  67%|██████▋   | 2/3 [19:40<09:49, 589.42s/it]

Validation Accuracy: 0.9625723260042982
Validation F1-Score: 0.8358379762084025

Average train loss: 0.08197993792184774
Validation loss: 0.12381150174885988


Epoch: 100%|██████████| 3/3 [29:34<00:00, 591.45s/it]

Validation Accuracy: 0.9635559596627542
Validation F1-Score: 0.8425234770045751






In [21]:
results["BERT"] = {}
results["BERT"]["f1"] = f1
results["BERT"]["acc"] = acc

# Spacy


##Preprocessing

In [22]:
! pip install -U spacy
! python -m spacy download en_core_web_sm

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/1b/d8/0361bbaf7a1ff56b44dca04dace54c82d63dad7475b7d25ea1baefafafb2/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 240kB/s 
[?25hCollecting pydantic<1.8.0,>=1.7.1
[?25l  Downloading https://files.pythonhosted.org/packages/ca/fa/d43f31874e1f2a9633e4c025be310f2ce7a8350017579e9e837a62630a7e/pydantic-1.7.4-cp37-cp37m-manylinux2014_x86_64.whl (9.1MB)
[K     |████████████████████████████████| 9.1MB 48.5MB/s 
[?25hCollecting srsly<3.0.0,>=2.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/c3/84/dfdfc9f6f04f6b88207d96d9520b911e5fec0c67ff47a0dea31ab5429a1e/srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl (456kB)
[K     |████████████████████████████████| 460kB 48.5MB/s 
Collecting catalogue<2.1.0,>=2.0.3
  Downloading https://files.pythonhosted.org/packages/9c/10/dbc1203a4b1367c7b02fddf08cb2981d9aa3e688d398f587cea0ab9e3bec/catalogue-2.0.4-py3-

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
sentences = sentences_loader.sentences
labels = sentences_loader.labels

In [25]:
train_s, test_s, train_l, test_l = train_test_split(sentences, labels, random_state=118, test_size=0.25)

print(len(train_s), len(train_l), len(test_s), len(test_l))

35969 35969 11990 11990


In [26]:
test_data = [[(ent, l) for ent, l in zip(sentence, labels) if l!='O'] for sentence, labels in zip(test_s, test_l)]

In [27]:
def to_spacy_format(sentences, labels):
  train_data = []

  for sentence, label in zip(sentences, labels):
    entity_lst = []
    idx = 0
    for s, l in zip(sentence, label):
      if l != 'O':
        entity_lst.append( (idx, idx + len(s), l))
      idx += len(s) +1

    text = " ".join(sentence[:-1])
    train_data.append((text, {"entities": entity_lst}))

  return train_data

In [28]:
json_train = to_spacy_format(train_s, train_l)
json_test = to_spacy_format(test_s, test_l)


##Training and evaluation

In [29]:
import spacy
from spacy.util import minibatch, compounding
from tqdm import tqdm, trange
from spacy.training import Example

In [30]:
# nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank('en')

nlp.add_pipe('ner')
ner = nlp.get_pipe('ner')

In [31]:
from spacy.util import minibatch, compounding
from spacy.training import Example
import random

for l in sentences_loader.tags:
  ner.add_label(l)

optimizer = nlp.begin_training()

pipes = [p for p in nlp.pipe_names if p != 'ner']
n_iter = 1
with nlp.disable_pipes(*pipes):
  examples = []
  for text, annotation in json_train[:10]:
    doc = nlp.make_doc(text)
    examples.append(Example.from_dict(doc, annotation))

  nlp.initialize(lambda: examples)
  losses = []
  for n in tqdm(range(n_iter)):
    random.shuffle(examples)
    for batch in minibatch(examples, size=8):
      loss = {}
      nlp.update(batch,sgd = optimizer, losses = loss)
      losses.append(loss['ner'])
    print(losses)

100%|██████████| 1/1 [00:00<00:00, 13.03it/s]

[97.86487260460854, 34.434512972831726]





In [32]:
# with minimal initial training
for l in sentences_loader.tags:
  ner.add_label(l)
  test_examples = []

for text, annotation in json_test:
    doc = nlp.make_doc(text)
    test_examples.append(Example.from_dict(doc, annotation))

scores = nlp.evaluate(test_examples)
print(scores)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 1.0, 'ents_r': 2.4763508493883413e-05, 'ents_f': 4.9525790555431744e-05, 'ents_per_type': {'B-geo': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'B-org': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'B-art': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'I-org': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'B-gpe': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'B-tim': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'I-geo': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'B-per': {'p': 1.0, 'r': 0.00023568230025925054, 'f': 0.00047125353440150805}, 'I-per': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'I-tim': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'B-nat': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'I-nat': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'I-eve': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'I-art': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'B-eve': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'I-gpe': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'speed': 28990.125598799004}


In [33]:
from spacy.util import minibatch, compounding
from spacy.training import Example
import random

pipes = [p for p in nlp.pipe_names if p != 'ner']
n_iter = 10

with nlp.disable_pipes(*pipes):
  examples = []
  for text, annotation in json_train:
    doc = nlp.make_doc(text)
    examples.append(Example.from_dict(doc, annotation))

  nlp.initialize(lambda: examples)
  losses = []
  for n in tqdm(range(n_iter)):
    random.shuffle(examples)
    for batch in minibatch(examples, size=8):
      loss = {}
      nlp.update(batch,sgd = optimizer, losses = loss)
      losses.append(loss['ner'])
    print(losses)

 10%|█         | 1/10 [03:59<35:59, 239.99s/it]

[83.17569065093994, 79.6760364472866, 126.9400363266468, 123.4854262471199, 150.73915219306946, 152.931607067585, 137.05412435531616, 170.10490530729294, 130.37322688102722, 145.60925686359406, 109.33590173721313, 99.14357316493988, 97.95660361647606, 82.63438504934311, 21.46052821725607, 29.574679605662823, 46.83929770498071, 33.541933764267014, 32.01611076399422, 41.6576060038351, 37.983536292260396, 56.47323562880047, 22.02849796426017, 32.82008169288747, 36.1174738695845, 50.202385464683175, 38.39657839015126, 35.14475874276832, 23.538004816509783, 44.514926143572666, 37.13873267243616, 33.17395962806768, 41.41177752491785, 44.22356061451865, 26.16226004416785, 34.698701813093976, 23.3179221919454, 20.08650177941672, 19.730159480837756, 29.289054874301655, 17.54033882827207, 43.64226967394643, 24.062498959245204, 37.59782286299742, 25.41727211408579, 31.277114343072753, 26.573629980892292, 24.916366884717718, 25.32195799177134, 34.58566181453352, 32.56944449027651, 31.2326732860583

 20%|██        | 2/10 [07:51<31:39, 237.46s/it]

[83.17569065093994, 79.6760364472866, 126.9400363266468, 123.4854262471199, 150.73915219306946, 152.931607067585, 137.05412435531616, 170.10490530729294, 130.37322688102722, 145.60925686359406, 109.33590173721313, 99.14357316493988, 97.95660361647606, 82.63438504934311, 21.46052821725607, 29.574679605662823, 46.83929770498071, 33.541933764267014, 32.01611076399422, 41.6576060038351, 37.983536292260396, 56.47323562880047, 22.02849796426017, 32.82008169288747, 36.1174738695845, 50.202385464683175, 38.39657839015126, 35.14475874276832, 23.538004816509783, 44.514926143572666, 37.13873267243616, 33.17395962806768, 41.41177752491785, 44.22356061451865, 26.16226004416785, 34.698701813093976, 23.3179221919454, 20.08650177941672, 19.730159480837756, 29.289054874301655, 17.54033882827207, 43.64226967394643, 24.062498959245204, 37.59782286299742, 25.41727211408579, 31.277114343072753, 26.573629980892292, 24.916366884717718, 25.32195799177134, 34.58566181453352, 32.56944449027651, 31.2326732860583

 30%|███       | 3/10 [11:21<26:44, 229.21s/it]

[83.17569065093994, 79.6760364472866, 126.9400363266468, 123.4854262471199, 150.73915219306946, 152.931607067585, 137.05412435531616, 170.10490530729294, 130.37322688102722, 145.60925686359406, 109.33590173721313, 99.14357316493988, 97.95660361647606, 82.63438504934311, 21.46052821725607, 29.574679605662823, 46.83929770498071, 33.541933764267014, 32.01611076399422, 41.6576060038351, 37.983536292260396, 56.47323562880047, 22.02849796426017, 32.82008169288747, 36.1174738695845, 50.202385464683175, 38.39657839015126, 35.14475874276832, 23.538004816509783, 44.514926143572666, 37.13873267243616, 33.17395962806768, 41.41177752491785, 44.22356061451865, 26.16226004416785, 34.698701813093976, 23.3179221919454, 20.08650177941672, 19.730159480837756, 29.289054874301655, 17.54033882827207, 43.64226967394643, 24.062498959245204, 37.59782286299742, 25.41727211408579, 31.277114343072753, 26.573629980892292, 24.916366884717718, 25.32195799177134, 34.58566181453352, 32.56944449027651, 31.2326732860583

 40%|████      | 4/10 [15:08<22:51, 228.61s/it]

[83.17569065093994, 79.6760364472866, 126.9400363266468, 123.4854262471199, 150.73915219306946, 152.931607067585, 137.05412435531616, 170.10490530729294, 130.37322688102722, 145.60925686359406, 109.33590173721313, 99.14357316493988, 97.95660361647606, 82.63438504934311, 21.46052821725607, 29.574679605662823, 46.83929770498071, 33.541933764267014, 32.01611076399422, 41.6576060038351, 37.983536292260396, 56.47323562880047, 22.02849796426017, 32.82008169288747, 36.1174738695845, 50.202385464683175, 38.39657839015126, 35.14475874276832, 23.538004816509783, 44.514926143572666, 37.13873267243616, 33.17395962806768, 41.41177752491785, 44.22356061451865, 26.16226004416785, 34.698701813093976, 23.3179221919454, 20.08650177941672, 19.730159480837756, 29.289054874301655, 17.54033882827207, 43.64226967394643, 24.062498959245204, 37.59782286299742, 25.41727211408579, 31.277114343072753, 26.573629980892292, 24.916366884717718, 25.32195799177134, 34.58566181453352, 32.56944449027651, 31.2326732860583

 50%|█████     | 5/10 [18:49<18:51, 226.24s/it]

[83.17569065093994, 79.6760364472866, 126.9400363266468, 123.4854262471199, 150.73915219306946, 152.931607067585, 137.05412435531616, 170.10490530729294, 130.37322688102722, 145.60925686359406, 109.33590173721313, 99.14357316493988, 97.95660361647606, 82.63438504934311, 21.46052821725607, 29.574679605662823, 46.83929770498071, 33.541933764267014, 32.01611076399422, 41.6576060038351, 37.983536292260396, 56.47323562880047, 22.02849796426017, 32.82008169288747, 36.1174738695845, 50.202385464683175, 38.39657839015126, 35.14475874276832, 23.538004816509783, 44.514926143572666, 37.13873267243616, 33.17395962806768, 41.41177752491785, 44.22356061451865, 26.16226004416785, 34.698701813093976, 23.3179221919454, 20.08650177941672, 19.730159480837756, 29.289054874301655, 17.54033882827207, 43.64226967394643, 24.062498959245204, 37.59782286299742, 25.41727211408579, 31.277114343072753, 26.573629980892292, 24.916366884717718, 25.32195799177134, 34.58566181453352, 32.56944449027651, 31.2326732860583

 60%|██████    | 6/10 [22:34<15:03, 225.80s/it]

[83.17569065093994, 79.6760364472866, 126.9400363266468, 123.4854262471199, 150.73915219306946, 152.931607067585, 137.05412435531616, 170.10490530729294, 130.37322688102722, 145.60925686359406, 109.33590173721313, 99.14357316493988, 97.95660361647606, 82.63438504934311, 21.46052821725607, 29.574679605662823, 46.83929770498071, 33.541933764267014, 32.01611076399422, 41.6576060038351, 37.983536292260396, 56.47323562880047, 22.02849796426017, 32.82008169288747, 36.1174738695845, 50.202385464683175, 38.39657839015126, 35.14475874276832, 23.538004816509783, 44.514926143572666, 37.13873267243616, 33.17395962806768, 41.41177752491785, 44.22356061451865, 26.16226004416785, 34.698701813093976, 23.3179221919454, 20.08650177941672, 19.730159480837756, 29.289054874301655, 17.54033882827207, 43.64226967394643, 24.062498959245204, 37.59782286299742, 25.41727211408579, 31.277114343072753, 26.573629980892292, 24.916366884717718, 25.32195799177134, 34.58566181453352, 32.56944449027651, 31.2326732860583

 70%|███████   | 7/10 [26:22<11:19, 226.57s/it]

[83.17569065093994, 79.6760364472866, 126.9400363266468, 123.4854262471199, 150.73915219306946, 152.931607067585, 137.05412435531616, 170.10490530729294, 130.37322688102722, 145.60925686359406, 109.33590173721313, 99.14357316493988, 97.95660361647606, 82.63438504934311, 21.46052821725607, 29.574679605662823, 46.83929770498071, 33.541933764267014, 32.01611076399422, 41.6576060038351, 37.983536292260396, 56.47323562880047, 22.02849796426017, 32.82008169288747, 36.1174738695845, 50.202385464683175, 38.39657839015126, 35.14475874276832, 23.538004816509783, 44.514926143572666, 37.13873267243616, 33.17395962806768, 41.41177752491785, 44.22356061451865, 26.16226004416785, 34.698701813093976, 23.3179221919454, 20.08650177941672, 19.730159480837756, 29.289054874301655, 17.54033882827207, 43.64226967394643, 24.062498959245204, 37.59782286299742, 25.41727211408579, 31.277114343072753, 26.573629980892292, 24.916366884717718, 25.32195799177134, 34.58566181453352, 32.56944449027651, 31.2326732860583

 80%|████████  | 8/10 [30:25<07:43, 231.62s/it]

[83.17569065093994, 79.6760364472866, 126.9400363266468, 123.4854262471199, 150.73915219306946, 152.931607067585, 137.05412435531616, 170.10490530729294, 130.37322688102722, 145.60925686359406, 109.33590173721313, 99.14357316493988, 97.95660361647606, 82.63438504934311, 21.46052821725607, 29.574679605662823, 46.83929770498071, 33.541933764267014, 32.01611076399422, 41.6576060038351, 37.983536292260396, 56.47323562880047, 22.02849796426017, 32.82008169288747, 36.1174738695845, 50.202385464683175, 38.39657839015126, 35.14475874276832, 23.538004816509783, 44.514926143572666, 37.13873267243616, 33.17395962806768, 41.41177752491785, 44.22356061451865, 26.16226004416785, 34.698701813093976, 23.3179221919454, 20.08650177941672, 19.730159480837756, 29.289054874301655, 17.54033882827207, 43.64226967394643, 24.062498959245204, 37.59782286299742, 25.41727211408579, 31.277114343072753, 26.573629980892292, 24.916366884717718, 25.32195799177134, 34.58566181453352, 32.56944449027651, 31.2326732860583

 90%|█████████ | 9/10 [33:56<03:45, 225.23s/it]

[83.17569065093994, 79.6760364472866, 126.9400363266468, 123.4854262471199, 150.73915219306946, 152.931607067585, 137.05412435531616, 170.10490530729294, 130.37322688102722, 145.60925686359406, 109.33590173721313, 99.14357316493988, 97.95660361647606, 82.63438504934311, 21.46052821725607, 29.574679605662823, 46.83929770498071, 33.541933764267014, 32.01611076399422, 41.6576060038351, 37.983536292260396, 56.47323562880047, 22.02849796426017, 32.82008169288747, 36.1174738695845, 50.202385464683175, 38.39657839015126, 35.14475874276832, 23.538004816509783, 44.514926143572666, 37.13873267243616, 33.17395962806768, 41.41177752491785, 44.22356061451865, 26.16226004416785, 34.698701813093976, 23.3179221919454, 20.08650177941672, 19.730159480837756, 29.289054874301655, 17.54033882827207, 43.64226967394643, 24.062498959245204, 37.59782286299742, 25.41727211408579, 31.277114343072753, 26.573629980892292, 24.916366884717718, 25.32195799177134, 34.58566181453352, 32.56944449027651, 31.2326732860583

100%|██████████| 10/10 [37:18<00:00, 223.88s/it]

[83.17569065093994, 79.6760364472866, 126.9400363266468, 123.4854262471199, 150.73915219306946, 152.931607067585, 137.05412435531616, 170.10490530729294, 130.37322688102722, 145.60925686359406, 109.33590173721313, 99.14357316493988, 97.95660361647606, 82.63438504934311, 21.46052821725607, 29.574679605662823, 46.83929770498071, 33.541933764267014, 32.01611076399422, 41.6576060038351, 37.983536292260396, 56.47323562880047, 22.02849796426017, 32.82008169288747, 36.1174738695845, 50.202385464683175, 38.39657839015126, 35.14475874276832, 23.538004816509783, 44.514926143572666, 37.13873267243616, 33.17395962806768, 41.41177752491785, 44.22356061451865, 26.16226004416785, 34.698701813093976, 23.3179221919454, 20.08650177941672, 19.730159480837756, 29.289054874301655, 17.54033882827207, 43.64226967394643, 24.062498959245204, 37.59782286299742, 25.41727211408579, 31.277114343072753, 26.573629980892292, 24.916366884717718, 25.32195799177134, 34.58566181453352, 32.56944449027651, 31.2326732860583




In [34]:
test_examples = []
for text, annotation in json_test:
    doc = nlp.make_doc(text)
    test_examples.append(Example.from_dict(doc, annotation))

scores = nlp.evaluate(test_examples)
print(scores)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.8477186501597445, 'ents_r': 0.8410430389777623, 'ents_f': 0.8443676503493026, 'ents_per_type': {'B-geo': {'p': 0.8758692267773741, 'r': 0.885705289672544, 'f': 0.8807597975264833}, 'I-geo': {'p': 0.851037851037851, 'r': 0.7410951621477937, 'f': 0.7922705314009661}, 'B-org': {'p': 0.7725941422594143, 'r': 0.7396354896855598, 'f': 0.7557556533306048}, 'B-art': {'p': 0.25, 'r': 0.028037383177570093, 'f': 0.05042016806722689}, 'I-org': {'p': 0.7552742616033755, 'r': 0.8140258496888464, 'f': 0.7835502822255501}, 'B-gpe': {'p': 0.9600512163892445, 'r': 0.9337484433374844, 'f': 0.9467171717171717}, 'B-tim': {'p': 0.9316168898043254, 'r': 0.8875588697017268, 'f': 0.9090543663953371}, 'B-per': {'p': 0.7784932388924662, 'r': 0.8548197030403016, 'f': 0.8148730622332061}, 'I-per': {'p': 0.8446729380879457, 'r': 0.8778990450204639, 'f': 0.8609655479986621}, 'I-tim': {'p': 0.8798534798534798, 'r': 0.7318708104814138, 'f':

In [35]:
results["Spacy"] ={}
results["Spacy"]["precision"] = scores["ents_p"]
results["Spacy"]["recall"] = scores["ents_r"]
results["Spacy"]["fscore"] = scores["ents_f"]

#NLTK

##Preparation


In [36]:
from nltk.chunk import tree2conlltags
from pprint import pprint
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [37]:
from sklearn.model_selection import train_test_split

sentences = sentences_loader.sentences
labels = sentences_loader.labels
pos_tags = sentences_loader.pos_tags

train_s, test_s, train_l, test_l = train_test_split(sentences, labels, random_state=118, test_size=0.25)
train_p, test_p, _ , _ = train_test_split(pos_tags, labels, random_state=118, test_size=0.25)

print(len(train_s), len(train_l), len(test_s), len(test_l), len(train_p), len(test_p))

35969 35969 11990 11990 35969 11990


In [38]:
def train_reader_generator ():
  for sent, pos, lab in zip(train_s, train_p, train_l):
    yield [((w, t), iob) for w, t, iob in zip(sent, pos, lab)]

def test_reader_generator ():
  for sent, pos, lab in zip(test_s, test_p, test_l):
    yield [((w, t), iob) for w, t, iob in zip(sent, pos, lab)]

In [39]:
train_reader = train_reader_generator()
test_reader = test_reader_generator()

In [40]:
from nltk.stem.snowball import SnowballStemmer
import string 

def features(tokens, index, history):
  stemmer = SnowballStemmer('english')
  tokens = [('[S2]', '[S2]'), ('[S1]', '[S1]')] + list(tokens) + [('[E1]', '[E1]'), ('[E2]', '[E2]')]
  history = ['S2', 'S1'] + list(history)

  #accomodate the padding
  index +=2

  word, pos = tokens[index]
  prev_w, prev_p = tokens[index-1]
  prev_prev_w, prev_prev_p = tokens[index-2]
  prev_iob = history[index - 1]
  next_w, next_p = tokens[index+1]
  next_next_w, next_next_p = tokens[index+2]

  contains_dash = '-' in word
  contains_dot  = '.' in word
  allascii = all([True for c in word if c in string.ascii_lowercase])
  allcaps = word == word.capitalize()
  capitalized = word[0] in string.ascii_uppercase

  prev_allcaps = prev_w == prev_w.capitalize()
  prev_capitalized = prev_w in string.ascii_uppercase

  next_allcaps = next_w == next_w.capitalize()
  next_capitalized = next_w in string.ascii_uppercase

  return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,
 
        'next-word': next_w,
        'next-lemma': stemmer.stem(next_w),
        'next-pos': next_p,
 
        'next-next-word': next_next_w,
        'nextnextpos': next_next_p,
 
        'prev-word': prev_w,
        'prev-lemma': stemmer.stem(prev_w),
        'prev-pos': prev_p,
 
        'prev-prev-word': prev_prev_w,
        'prev-prev-pos': prev_prev_p,
 
        'prev-iob': prev_iob,
 
        'contains-dash': contains_dash,
        'contains-dot': contains_dot,
 
        'all-caps': allcaps,
        'capitalized': capitalized,
 
        'prev-all-caps': prev_allcaps,
        'prev-capitalized': prev_capitalized,
 
        'next-all-caps': next_allcaps,
        'next-capitalized': next_capitalized,
    }

##Train and evaluate

In [41]:
import collections
from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree

class NERChunker(ChunkParserI):
  def __init__(self, train, **kwargs):
    assert isinstance(train, collections.Iterable)

    self.feature_detector = features
    self.tagger = ClassifierBasedTagger(feature_detector = self.feature_detector, train=train, **kwargs)

  def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)

    triplet_chunks = [(w, t, l) for ((w, t), l) in chunks]

    return conlltags2tree(triplet_chunks)

In [42]:
# without aditional training
train_data = list(train_reader_generator())
test_data = list(test_reader_generator())
chunker = NERChunker(train_data[:10])
score = chunker.evaluate([conlltags2tree([(w, t, iob) for (w, t), iob in iobs]) for iobs in test_data])
print(score)

  


ChunkParse score:
    IOB Accuracy:  53.7%%
    Precision:      6.7%%
    Recall:        21.5%%
    F-Measure:     10.2%%


In [43]:
#### training 
train_data = list(train_reader_generator())
test_data = list(test_reader_generator())

chunker = NERChunker(train_data)

score = chunker.evaluate([conlltags2tree([(w, t, iob) for (w, t), iob in iobs]) for iobs in test_data])
print(score)

ChunkParse score:
    IOB Accuracy:  93.9%%
    Precision:     65.7%%
    Recall:        76.8%%
    F-Measure:     70.8%%


In [44]:
results["NLTK"] = {}
results["NLTK"]["acc"] = score.accuracy()
results["NLTK"]["precision"] = score.precision()
results["NLTK"]["F-measure"] = score.f_measure()
results["NLTK"]["recall"] = score.recall()

#Results


In [45]:
from pprint import pprint
pprint(results)

{'BERT': {'acc': 0.9635559596627542, 'f1': 0.8425234770045751},
 'NLTK': {'F-measure': 0.7079149438865918,
          'acc': 0.9392790534136424,
          'precision': 0.6566923381327814,
          'recall': 0.7678043919279638},
 'Spacy': {'fscore': 0.8443676503493026,
           'precision': 0.8477186501597445,
           'recall': 0.8410430389777623}}
