In [2]:
import json
import spacy
from transformers import pipeline

# Load the annotated tweets. Each record carries the raw 'text' plus a list of
# character-offset 'labels' (start / end / label), as shown in the preview below.
# The encoding is given explicitly so decoding does not depend on the platform's
# locale default.
with open('ner-train.json', 'r', encoding='utf-8') as fin:
    data = json.load(fin)
data[:3]

[{'text': '#BREAKINGNEWS MALAYSIA AIRLINES FLIGHT #MH17 CONFIRMED SHOT DOWN OVER #DONETSK OBLAST, SHORTLY BEFORE REACHING RUSSIAN AIR SPACE',
  'labels': [{'start': 14, 'end': 31, 'label': 'ORG'},
   {'start': 70, 'end': 85, 'label': 'LOC'},
   {'start': 111, 'end': 118, 'label': 'LOC'}]},
 {'text': "Here's a route map of @MAS #MH17, which @flightaware lost track of right at the Ukraine/Poland border http://t.co/IGvQJnkU7B",
  'labels': [{'start': 22, 'end': 26, 'label': 'ORG'},
   {'start': 40, 'end': 52, 'label': 'ORG'},
   {'start': 80, 'end': 87, 'label': 'LOC'},
   {'start': 88, 'end': 101, 'label': 'LOC'}]},
 {'text': "Can somebody confirm this. RT @KennySkyNews: Reports Malaysia Airlines MH17 has crashed near Ukraine's border with Russia. #MH17 #Malaysia",
  'labels': [{'start': 53, 'end': 70, 'label': 'ORG'},
   {'start': 93, 'end': 100, 'label': 'LOC'},
   {'start': 115, 'end': 121, 'label': 'LOC'},
   {'start': 129, 'end': 138, 'label': 'ORG'}]}]

In [7]:
# json -> spacy docs
# Parse every tweet with spaCy and attach its gold annotations as the Doc's entities.
nlp = spacy.load('en_core_web_sm')
docs = []
for tweet in data:
    doc = nlp(tweet['text'])
    # Convert each character-offset annotation into a labelled Span on this Doc.
    gold_spans = [
        doc.char_span(ann['start'], ann['end'], label=ann['label'])
        for ann in tweet['labels']
    ]
    doc.set_ents(gold_spans)
    docs.append(doc)

In [11]:
# spacy docs to biluo tags
from spacy.training import offsets_to_biluo_tags

# One BILUO tag sequence per document, derived from the entities set on each Doc.
tags = [
    offsets_to_biluo_tags(
        parsed,
        [(span.start_char, span.end_char, span.label_) for span in parsed.ents],
    )
    for parsed in docs
]


In [12]:
# BILUO tags to IOB tags
def bilou2iob(tags):
    """Convert a sequence of BILUO-style tags to IOB-style tags.

    Only the two-character scheme prefix is rewritten: ``L-X`` becomes ``I-X``
    and ``U-X`` becomes ``B-X``. The label part is never touched, so labels that
    happen to contain ``L-`` or ``U-`` (e.g. ``B-CHANNEL-ID``) are preserved.
    Every other tag (``B-X``, ``I-X``, ``O``, ``-``, ...) is returned unchanged.

    Args:
        tags: iterable of tag strings.

    Returns:
        A new list of tag strings; the input is not modified.
    """
    prefix_map = {'L-': 'I-', 'U-': 'B-'}
    # Slice instead of str.replace so only the leading prefix can ever change.
    return [prefix_map.get(t[:2], t[:2]) + t[2:] for t in tags]

# Rebind `tags` to the IOB version of every per-document tag sequence.
tags = list(map(bilou2iob, tags))

In [31]:
from itertools import chain
# Distinct tags that occur anywhere in the reference sequences.
set(chain.from_iterable(tags))

{'B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O'}

In [13]:
# Huggingface training

In [14]:
# Evaluate on the data first

In [15]:
from datasets import load_metric

In [16]:
# Load the 'seqeval' metric object; later cells call metric.compute(...) on it.
# NOTE(review): load_metric may be deprecated or removed in newer `datasets`
# releases — confirm against the installed version.
metric = load_metric('seqeval')

In [35]:
# Build the NER pipeline. No model name is passed, so whichever checkpoint the
# library selects by default for the 'ner' task is loaded.
# NOTE(review): consider pinning the model name for reproducibility — confirm
# which default checkpoint is actually in use.
clf = pipeline('ner', aggregation_strategy='first')

In [36]:
# Run the pipeline over the raw text of every document and show the
# predictions for the first tweet.
pred = clf([parsed.text for parsed in docs])
pred[0]

[{'entity_group': 'LOC',
  'score': 0.4011522,
  'word': 'MALAYSIA',
  'start': 14,
  'end': 22},
 {'entity_group': 'ORG',
  'score': 0.5026202,
  'word': 'AIRLINES',
  'start': 23,
  'end': 31},
 {'entity_group': 'ORG',
  'score': 0.4480394,
  'word': 'DONETSK',
  'start': 71,
  'end': 78},
 {'entity_group': 'MISC',
  'score': 0.99558073,
  'word': 'RUSSIAN',
  'start': 111,
  'end': 118}]

In [41]:
# Predictions to IOB tags

def pred2biluo(text, entities):
    """Turn one text's pipeline predictions into a per-token tag sequence.

    Despite the name, the returned tags have been passed through ``bilou2iob``.

    Args:
        text: the raw string the predictions were made on; it is re-parsed with ``nlp``.
        entities: iterable of dicts with 'start', 'end' and 'entity_group' keys.

    Returns:
        The output of ``bilou2iob`` applied to the tags from ``offsets_to_biluo_tags``.
    """
    parsed = nlp(text)
    offsets = []
    for ent in entities:
        offsets.append((ent['start'], ent['end'], ent['entity_group']))
    # NOTE(review): predicted offsets that do not line up with spaCy token
    # boundaries presumably cause the warnings printed below — confirm.
    return bilou2iob(offsets_to_biluo_tags(parsed, offsets))
    
# Tag sequence for every document, paired with that document's predictions.
pred_iob = [
    pred2biluo(parsed.text, ents)
    for parsed, ents in zip(docs, pred)
]

  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_

In [50]:
# Compute the metrics
import pandas as pd

metrics = metric.compute(references=tags, predictions=pred_iob)
# Keep only the per-entity-type entries of the result for the summary table.
mdf = pd.DataFrame({
    name: scores
    for name, scores in metrics.items()
    if name in ['LOC', 'MISC', 'ORG', 'PER']
})
mdf

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,LOC,MISC,ORG,PER
precision,0.546099,0.0,0.544444,0.609756
recall,0.531034,0.0,0.471154,0.555556
f1,0.538462,0.0,0.505155,0.581395
number,145.0,0.0,104.0,45.0


In [55]:
# Train!!!

In [59]:
# Original data to dataset
# Each sample needs: {'text': [token, ...], 'ner_tags': [0, 1, 2, 3, ...]}, where each id maps to an IOB tag

In [60]:
# Peek at the first two raw records (text plus character-offset labels).
data[:2]

[{'text': '#BREAKINGNEWS MALAYSIA AIRLINES FLIGHT #MH17 CONFIRMED SHOT DOWN OVER #DONETSK OBLAST, SHORTLY BEFORE REACHING RUSSIAN AIR SPACE',
  'labels': [{'start': 14, 'end': 31, 'label': 'ORG'},
   {'start': 70, 'end': 85, 'label': 'LOC'},
   {'start': 111, 'end': 118, 'label': 'LOC'}]},
 {'text': "Here's a route map of @MAS #MH17, which @flightaware lost track of right at the Ukraine/Poland border http://t.co/IGvQJnkU7B",
  'labels': [{'start': 22, 'end': 26, 'label': 'ORG'},
   {'start': 40, 'end': 52, 'label': 'ORG'},
   {'start': 80, 'end': 87, 'label': 'LOC'},
   {'start': 88, 'end': 101, 'label': 'LOC'}]}]

In [70]:
def iob2nertags(iob):
    """Map IOB tag strings to integer ids via the pipeline model's ``label2id`` table.

    Tags that are missing from the table are mapped to 0.
    """
    ids = []
    for tag in iob:
        ids.append(clf.model.config.label2id.get(tag, 0))
    return ids

def span2nertags(text, labels):
    """Convert one sample's character-offset annotations into integer tag ids.

    Args:
        text: raw string; parsed with ``nlp``.
        labels: iterable of dicts with 'start', 'end' and 'label' keys.

    Returns:
        The result of ``iob2nertags`` on the IOB tags derived from the annotations.
    """
    parsed = nlp(text)
    # Attach the annotations as entities, then read them back as offset triples.
    parsed.set_ents([
        parsed.char_span(ann['start'], ann['end'], label=ann['label'])
        for ann in labels
    ])
    offsets = [(span.start_char, span.end_char, span.label_) for span in parsed.ents]
    biluo = offsets_to_biluo_tags(parsed, offsets)
    return iob2nertags(bilou2iob(biluo))
    

In [101]:
train_data = {
    # Token strings of each tweet, as split by the spaCy pipeline.
    'text': [[token.text for token in nlp(sample['text'])] for sample in data],
    # One list of integer tag ids per tweet, computed from its annotations.
    'ner_tags': [span2nertags(**sample) for sample in data],
}

In [102]:
# Token lists of the first two samples.
train_data['text'][:2]

[['#',
  'BREAKINGNEWS',
  'MALAYSIA',
  'AIRLINES',
  'FLIGHT',
  '#',
  'MH17',
  'CONFIRMED',
  'SHOT',
  'DOWN',
  'OVER',
  '#',
  'DONETSK',
  'OBLAST',
  ',',
  'SHORTLY',
  'BEFORE',
  'REACHING',
  'RUSSIAN',
  'AIR',
  'SPACE'],
 ['Here',
  "'s",
  'a',
  'route',
  'map',
  'of',
  '@MAS',
  '#',
  'MH17',
  ',',
  'which',
  '@flightaware',
  'lost',
  'track',
  'of',
  'right',
  'at',
  'the',
  'Ukraine',
  '/',
  'Poland',
  'border',
  'http://t.co/IGvQJnkU7B']]

In [103]:
# Tag-id sequences of the first two samples.
train_data['ner_tags'][:2]

[[0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 0, 0, 0, 7, 0, 0],
 [0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 7, 0, 7, 8, 0]]

In [73]:
from datasets import Dataset

In [104]:
# Build a Dataset from the column dict ('text' and 'ner_tags') assembled above.
ds = Dataset.from_dict(train_data)

In [122]:
def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize pre-split words and build a label list aligned to the resulting tokens.

    Args:
        examples: batch mapping with a "text" entry (one word list per sample)
            and a "ner_tags" entry (one label list per sample, indexed by word).
        tokenizer: callable invoked as
            ``tokenizer(words, truncation=True, is_split_into_words=True)``; its
            result must offer ``word_ids(batch_index=...)`` and item assignment.

    Returns:
        The tokenizer's result with an added "labels" entry. For each sample,
        the first token of every word gets that word's label; tokens with no
        word and all further tokens of the same word get -100.
    """
    # -100 marks positions that carry no label of their own
    # (presumably skipped when the loss is computed — confirm).
    skip_label = -100
    encoded = tokenizer(examples["text"], truncation=True, is_split_into_words=True)

    aligned = []
    for sample_idx, word_labels in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word in encoded.word_ids(batch_index=sample_idx):
            starts_new_word = word is not None and word != last_word
            row.append(word_labels[word] if starts_new_word else skip_label)
            last_word = word
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

# Apply the alignment function batch-wise, using the pipeline's own tokenizer.
tokenized = ds.map(lambda x: tokenize_and_align_labels(x, clf.tokenizer), batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [107]:
from transformers import TrainingArguments, Trainer

In [125]:
from transformers import DataCollatorForTokenClassification
# Collator built from the pipeline's tokenizer; passed to the Trainer as data_collator.
collator = DataCollatorForTokenClassification(tokenizer=clf.tokenizer)

# Outputs go to ./session; evaluation runs once per epoch.
args = TrainingArguments(output_dir="./session", evaluation_strategy="epoch")
# The Trainer is handed the pipeline's own model object (clf.model), so later
# clf(...) calls presumably use the fine-tuned weights.
# NOTE(review): eval_dataset is the same object as train_dataset, so the reported
# validation loss is measured on the training data, not on a held-out split.
trainer = Trainer(model=clf.model, args=args, train_dataset=tokenized, eval_dataset=tokenized, tokenizer=clf.tokenizer, data_collator=collator)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, text.
***** Running training *****
  Num examples = 200
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 75


Epoch,Training Loss,Validation Loss
1,No log,0.062503
2,No log,0.029004
3,No log,0.017964


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, text.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, text.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, text.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=75, training_loss=0.07440160751342774, metrics={'train_runtime': 21.594, 'train_samples_per_second': 27.785, 'train_steps_per_second': 3.473, 'total_flos': 71445315223872.0, 'train_loss': 0.07440160751342774, 'epoch': 3.0})

In [126]:
# Check the metrics again

In [130]:
# Move the model back to the CPU before reusing the pipeline.
# NOTE(review): presumably needed because training left the model on the GPU
# while the pipeline still feeds it CPU inputs — confirm.
clf.model.to('cpu')

# Re-run prediction and tag conversion on the same documents as before training.
pred = clf([parsed.text for parsed in docs])
pred_iob = [
    pred2biluo(parsed.text, ents)
    for parsed, ents in zip(docs, pred)
]

metrics = metric.compute(references=tags, predictions=pred_iob)
# Keep only the per-entity-type entries of the result for the summary table.
mdf = pd.DataFrame({
    name: scores
    for name, scores in metrics.items()
    if name in ['LOC', 'MISC', 'ORG', 'PER']
})
mdf

  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_

Unnamed: 0,LOC,ORG,PER
precision,0.964539,0.957895,0.948718
recall,0.937931,0.875,0.822222
f1,0.951049,0.914573,0.880952
number,145.0,104.0,45.0


In [129]:
# Show which device currently holds the pipeline's model.
clf.model.device

device(type='cuda', index=0)