In [1]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import pandas as pd

In [2]:
clf = pipeline('ner', aggregation_strategy='simple')

In [3]:
clf('Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO,therefore very close to the Manhattan Bridge which is visible from the window.')

[{'entity': 'I-ORG',
  'score': 0.9995786,
  'index': 1,
  'word': 'Hu',
  'start': 0,
  'end': 2},
 {'entity': 'I-ORG',
  'score': 0.9909764,
  'index': 2,
  'word': '##gging',
  'start': 2,
  'end': 7},
 {'entity': 'I-ORG',
  'score': 0.9982225,
  'index': 3,
  'word': 'Face',
  'start': 8,
  'end': 12},
 {'entity': 'I-ORG',
  'score': 0.99948806,
  'index': 4,
  'word': 'Inc',
  'start': 13,
  'end': 16},
 {'entity': 'I-LOC',
  'score': 0.9994345,
  'index': 11,
  'word': 'New',
  'start': 40,
  'end': 43},
 {'entity': 'I-LOC',
  'score': 0.9993196,
  'index': 12,
  'word': 'York',
  'start': 44,
  'end': 48},
 {'entity': 'I-LOC',
  'score': 0.9993794,
  'index': 13,
  'word': 'City',
  'start': 49,
  'end': 53},
 {'entity': 'I-LOC',
  'score': 0.98625815,
  'index': 19,
  'word': 'D',
  'start': 79,
  'end': 80},
 {'entity': 'I-LOC',
  'score': 0.95142686,
  'index': 20,
  'word': '##UM',
  'start': 80,
  'end': 82},
 {'entity': 'I-LOC',
  'score': 0.9336589,
  'index': 21,
  'word

In [4]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [5]:
# parse the azure dataset

In [6]:
import json
ROOT = '/tmp/LoanAgreements/'

with open('/tmp/LoanAgreements/loanAgreementsLabels.json', 'r') as fin:
    anno = json.load(fin)

In [10]:
clf.model.config.label2id

{'B-LOC': 7,
 'B-MISC': 1,
 'B-ORG': 5,
 'B-PER': 3,
 'I-LOC': 8,
 'I-MISC': 2,
 'I-ORG': 6,
 'I-PER': 4,
 'O': 0}

In [11]:
labelmap = {
    'BorrowerName': 'PER',
    'BorrowerAddress': 'LOC',
    'BorrowerCity': 'LOC',
    'BorrowerState': 'LOC',
    'LenderName': 'PER',
    'LenderAddress': 'LOC',
    'LenderCity': 'LOC',
    'LenderState': 'LOC',
}

In [19]:
pred = []
for doc in anno['documents']:
    with open(ROOT + doc['location'], 'r') as fin:
        text = fin.read()
    pred.append({'text': text, 'id': doc['location'], 'predictions': clf(text)})

In [21]:
from spacy import load
nlp = load('en_core_web_sm')

In [22]:
doc = nlp(pred[0]['text'])

In [40]:
ent_true = []
for label in anno['documents'][0]['extractors'][0]['labels']:
    start = label['offset'] - 1
    end = start + label['length']
    print(doc.char_span(start, end + 1), labelmap.get(label['extractorName'], 'MISC'))
    ent_true.append((start, end, labelmap.get(label['extractorName'], 'MISC')))

Fusce Rd, City LOC
None LOC
None LOC
None LOC
None LOC
None LOC
None MISC
None MISC
None MISC
None MISC
None PER
Williams with a PER


In [57]:
for label in anno['documents'][0]['extractors'][0]['labels']:
    start = label['offset'] - 1
    end = start + label['length'] + 1
    print(doc.char_span(start, end), label['extractorName'])

Fusce Rd, City BorrowerAddress
None BorrowerCity
None BorrowerState
None LenderAddress
None LenderCity
None LenderState
None LoanAmountWords
None LoanAmountNumbers
None Interest
None Date
None BorrowerName
Williams with a LenderName


In [43]:
doc.text[199:214]

'Fusce Rd, City '

In [36]:
doc.text[272:272 + 14]

'Williams with '

In [46]:
label

{'extractorName': 'LenderName', 'offset': 273, 'length': 14}

In [47]:
anno['documents'][0]['extractors'][0]['labels']

[{'extractorName': 'BorrowerAddress', 'offset': 200, 'length': 13},
 {'extractorName': 'BorrowerCity', 'offset': 223, 'length': 9},
 {'extractorName': 'BorrowerState', 'offset': 243, 'length': 8},
 {'extractorName': 'LenderAddress', 'offset': 314, 'length': 15},
 {'extractorName': 'LenderCity', 'offset': 339, 'length': 10},
 {'extractorName': 'LenderState', 'offset': 360, 'length': 8},
 {'extractorName': 'LoanAmountWords', 'offset': 446, 'length': 66},
 {'extractorName': 'LoanAmountNumbers', 'offset': 514, 'length': 11},
 {'extractorName': 'Interest', 'offset': 601, 'length': 2},
 {'extractorName': 'Date', 'offset': 5, 'length': 9},
 {'extractorName': 'BorrowerName', 'offset': 160, 'length': 13},
 {'extractorName': 'LenderName', 'offset': 273, 'length': 14}]

In [50]:
doc.text[199:(199 + 14)]

'Fusce Rd, City'

In [55]:
doc.char_span(199, 213)

Fusce Rd, City

In [58]:
import pandas as pd

In [59]:
df = pd.read_csv('/tmp/ner.csv', usecols=['Sentence', 'Tag'])

In [60]:
df.head()

Unnamed: 0,Sentence,Tag
0,Thousands of demonstrators have marched throug...,"['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Families of soldiers killed in the conflict jo...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,They marched from the Houses of Parliament to ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,"Police put the number of marchers at 10,000 wh...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,The protest comes on the eve of the annual con...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [61]:
print(df.iloc[0]['Sentence'])

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .


In [62]:
df['Tag'][0]

"['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']"

In [64]:
import ast

In [66]:
from tqdm import tqdm

In [68]:
df = df.sample(1000)
df.head()

Unnamed: 0,Sentence,Tag
34213,The International Criminal Tribunal for the fo...,"['B-org', 'I-org', 'I-org', 'I-org', 'O', 'O',..."
6007,"In the Great Plains states of the country , wh...","['O', 'O', 'B-geo', 'I-geo', 'O', 'O', 'O', 'O..."
30616,Pakistan and India often conduct such tests to...,"['B-geo', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O..."
1737,Iraq 's foreign minister says Syria is refusin...,"['B-geo', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O..."
42998,VOA 's Chris Simkins reports .,"['B-org', 'O', 'B-per', 'I-per', 'O', 'O']"


In [71]:
pred = []
for text in tqdm(df['Sentence'].values):
    pred.append(clf(text))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:16<00:00, 13.06it/s]


In [72]:
pred[0]

[{'entity_group': 'ORG',
  'score': 0.8809569,
  'word': 'International Criminal Tribunal for',
  'start': 4,
  'end': 39},
 {'entity_group': 'LOC',
  'score': 0.999504,
  'word': 'Yugoslavia',
  'start': 51,
  'end': 61},
 {'entity_group': 'PER',
  'score': 0.96618253,
  'word': 'Perisic',
  'start': 167,
  'end': 174},
 {'entity_group': 'ORG',
  'score': 0.61537665,
  'word': 'Staff',
  'start': 212,
  'end': 217},
 {'entity_group': 'MISC',
  'score': 0.73656905,
  'word': 'Yugoslav',
  'start': 225,
  'end': 233},
 {'entity_group': 'ORG',
  'score': 0.9765235,
  'word': 'Army',
  'start': 234,
  'end': 238}]

In [75]:
doc = nlp(df['Sentence'].iloc[0])

In [108]:
from spacy.training import offsets_to_biluo_tags

ent_pred_tags = []
for doc, ents in zip(df['Sentence'].values, pred):
    entities = [(k['start'], k['end'], k['entity_group']) for k in ents]
    tags = offsets_to_biluo_tags(nlp(doc), entities)
    ent_pred_tags.append(tags)

In [140]:
ent_true_tags = [ast.literal_eval(k.upper()) for k in df['Tag'].values]

In [99]:
from seqeval.metrics import classification_report, f1_score

In [98]:
print(classification_report(ent_true_tags[:2], ent_pred_tags[:2]))

              precision    recall  f1-score   support

         GEO       0.00      0.00      0.00         2
         GPE       0.00      0.00      0.00         1
         LOC       0.00      0.00      0.00         0
        MISC       0.00      0.00      0.00         0
         ORG       0.33      0.25      0.29         4
         PER       0.00      0.00      0.00         0
         TIM       0.00      0.00      0.00         1

   micro avg       0.14      0.12      0.13         8
   macro avg       0.05      0.04      0.04         8
weighted avg       0.17      0.12      0.14         8



In [100]:
f1_score(ent_true_tags[:2], ent_pred_tags[:2])



0.13333333333333333

In [141]:
def replace_tags(tags):
    tags = [tag.replace('L-', 'I-') for tag in tags]
    tags = [tag.replace('U-', 'B-') for tag in tags]
    return tags

ent_pred_tags = [replace_tags(tag) for tag in ent_pred_tags]

In [110]:
f1_score(ent_true_tags[:2], ent_pred_tags[:2])

0.13333333333333333

In [114]:
len(ent_true_tags)

1000

In [115]:
len(ent_pred_tags)

1000

In [130]:
count = 0
to_remove = []
for num, (i, j) in enumerate(zip(ent_true_tags, ent_pred_tags)):
    if len(i) != len(j):
        to_remove.append(num)

In [132]:
df.drop(df.index[to_remove], axis=0, inplace=True)

In [None]:
from spacy.training import offsets_to_biluo_tags

ent_pred_tags = []
for doc, ents in zip(df['Sentence'].values, pred):
    entities = [(k['start'], k['end'], k['entity_group']) for k in ents]
    tags = offsets_to_biluo_tags(nlp(doc), entities)
    ent_pred_tags.append(tags)

  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_

In [136]:
from datasets import load_metric

In [138]:
metric = load_metric('seqeval')

In [146]:
metric.compute(predictions=ent_pred_tags, references=ent_true_tags, zero_division="warn", average="micro")

TypeError: _compute() got an unexpected keyword argument 'average'

In [148]:
from itertools import chain

In [149]:
set(chain(*ent_pred_tags))

{'-',
 'B-LOC',
 'B-MISC',
 'B-ORG',
 'B-PER',
 'I-LOC',
 'I-MISC',
 'I-ORG',
 'I-PER',
 'O'}

In [150]:
set(chain(*ent_true_tags))

{'B-ART',
 'B-EVE',
 'B-GEO',
 'B-GPE',
 'B-NAT',
 'B-ORG',
 'B-PER',
 'B-TIM',
 'I-ART',
 'I-EVE',
 'I-GEO',
 'I-GPE',
 'I-ORG',
 'I-PER',
 'I-TIM',
 'O'}

In [151]:
with open('/tmp/e.json', 'r') as fin:
    tweets = [json.loads(k) for k in fin.readlines()]

In [157]:
from spacy.training import biluo_tags_to_offsets

def parse(text, entities, annotation_offsets, **kwargs):
    doc = nlp(text)
    return biluo_tags_to_offsets(doc, entities)

for tweet in tweets:
    ents = parse(**tweet)
    doc = nlp(tweet['text'])
    entities = tweet['entities']
    break
    

In [158]:
biluo_tags_to_offsets(doc, entities)

[]

In [159]:
entities

['O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'I-LOC',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'O']

In [160]:
doc

#BREAKINGNEWS MALAYSIA AIRLINES FLIGHT #MH17 CONFIRMED SHOT DOWN OVER #DONETSK OBLAST, SHORTLY BEFORE REACHING RUSSIAN AIR SPACE

In [161]:
len(doc)

21

In [162]:
len(entities)

18

In [170]:
ent_clean = []
for label, (start, end) in zip(tweet['entities'], tweet['annotation_offsets']):
    if label == 'O':
        continue
    span = doc.char_span(start, end)
    print(span, label)
    ent_clean.append(span)

MALAYSIA B-ORG
AIRLINES I-ORG
#DONETSK B-LOC
OBLAST I-LOC
RUSSIAN B-LOC


In [166]:
from spacy.util import filter_spans

In [169]:
filter_spans(ent_clean)

[MALAYSIA, AIRLINES, #DONETSK, OBLAST, RUSSIAN]

In [172]:
filter_spans?

[0;31mSignature:[0m [0mfilter_spans[0m[0;34m([0m[0mspans[0m[0;34m:[0m [0mIterable[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'Span'[0m[0;34m)[0m[0;34m][0m[0;34m)[0m [0;34m->[0m [0mList[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'Span'[0m[0;34m)[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Filter a sequence of spans and remove duplicates or overlaps. Useful for
creating named entities (where one token can only be part of one entity) or
when merging spans with `Retokenizer.merge`. When spans overlap, the (first)
longest span is preferred over shorter spans.

spans (Iterable[Span]): The spans to filter.
RETURNS (List[Span]): The filtered spans.
[0;31mFile:[0m      ~/conda/lib/python3.7/site-packages/spacy/util.py
[0;31mType:[0m      function
