In [3]:
import json
import random

import spacy

# Loaded once; its tokenizer and vocab are reused by the cells below.
nlp = spacy.load('en_core_web_sm')

# twitter.json is parsed as JSON Lines: one JSON object per line.
# The encoding is spelled out (JSON text is UTF-8 per RFC 8259) so decoding
# does not depend on the platform default, and blank lines are skipped
# because json.loads would raise on them.
with open('twitter.json', 'r', encoding='utf-8') as fin:
    docs = [json.loads(line) for line in fin if line.strip()]

In [1]:
# Output
# Illustrative example of the record layout this notebook produces: one dict
# per document holding its text and a list of labelled character spans.
# NOTE(review): the start/end values below look like dummy placeholders
# (e.g. end = -1), not real annotations.
[
    {
        "text": "Hello, World!",
        "labels": [
            {"start": 0, "end": -1, "label": "PER"},
            {"start": 0, "end": -1, "label": "LOC"},
        ]
    },
]

[{'text': 'Hello, World!',
  'labels': [{'start': 0, 'end': -1, 'label': 'PER'},
   {'start': 0, 'end': -1, 'label': 'LOC'}]}]

In [7]:
# Pick one annotated record at random to prototype the conversion on.
# (No seed is set, so a different record is drawn on every run.)
orgdoc = random.choice(docs)
doc = nlp(orgdoc['text'])

# Parallel sequences: one tag and one (start, end) character offset pair each.
labels = orgdoc['entities']
offsets = orgdoc['annotation_offsets']

def merge_offsets(doc, labels, offsets):
    """Replace ``doc.ents`` with spans built from character-offset annotations.

    Args:
        doc: spaCy ``Doc`` whose text the offsets index into. It is modified
            in place.
        labels: One tag per annotation. ``'O'`` entries are ignored; for any
            other tag only the part after the last ``'-'`` is used as the
            entity label (so ``'B-PER'`` becomes ``'PER'``).
        offsets: ``(start, end)`` character offsets, paired with ``labels``
            by position.

    Returns:
        The same ``doc`` object, after ``doc.set_ents`` has been called.
    """
    spans = []
    for label, (start, end) in zip(labels, offsets):
        if label == 'O':
            continue
        span = doc.char_span(
            start, end, label=label.split('-')[-1], alignment_mode="expand"
        )
        # char_span returns None when the offsets cannot be mapped onto
        # tokens; such an entry would make filter_spans fail, so drop it.
        if span is None:
            continue
        spans.append(span)
    # set_ents rejects overlapping spans, so reduce to a non-overlapping set.
    spans = spacy.util.filter_spans(spans)
    doc.set_ents(spans)
    return doc

# Attach the annotated entity spans to the sampled document.
doc = merge_offsets(doc, labels, offsets)

# Token matcher with one rule per entity type. Each rule is "one token of
# that type, followed by zero or more tokens of the same type", i.e. a run
# of consecutive tokens sharing an entity type.
matcher = spacy.matcher.Matcher(nlp.vocab)

orgpattern, locpattern, perpattern = (
    [{'ENT_TYPE': tag}, {'ENT_TYPE': tag, 'OP': '*'}]
    for tag in ('ORG', 'LOC', 'PER')
)

matcher.add("ORG", [orgpattern])
matcher.add("LOC", [locpattern])
matcher.add("PER", [perpattern])

# Preview: turn every match into a labelled Span, drop overlapping ones via
# filter_spans, and show (text, label) pairs for what remains.
[
    (span.text, span.label_)
    for span in spacy.util.filter_spans(
        [
            spacy.tokens.Span(doc, begin, finish, nlp.vocab.strings[match_id])
            for match_id, begin, finish in matcher(doc)
        ]
    )
]

[('Najib Razak', 'PER'), ('Emergency Ops Centre', 'LOC')]

In [18]:
# Convert every record into {'text': ..., 'labels': [...]} with
# character-level span boundaries.
payload = []
for orgdoc in docs:
    labels = orgdoc['entities']
    offsets = orgdoc['annotation_offsets']
    doc = merge_offsets(nlp(orgdoc['text']), labels, offsets)

    # Matches become labelled Spans; filter_spans removes overlapping ones.
    spans = spacy.util.filter_spans(
        [
            spacy.tokens.Span(doc, begin, finish, nlp.vocab.strings[match_id])
            for match_id, begin, finish in matcher(doc)
        ]
    )

    payload.append(
        {
            'text': doc.text,
            'labels': [
                {"start": span.start_char, "end": span.end_char, 'label': span.label_}
                for span in spans
            ],
        }
    )

In [19]:
# Sanity check: number of records built (the loop appends one per entry in docs).
len(payload)

200

In [21]:
# Persist the converted records as a single pretty-printed JSON array.
# The handle is named for what it is (an output file) and the encoding is
# explicit so the written file does not depend on the platform default.
with open('ner-train.json', 'w', encoding='utf-8') as fout:
    json.dump(payload, fout, indent=2)