In [None]:
import spacy
import json
import logging
import random

In [None]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export into spaCy-style training tuples.

    The file is read as UTF-8 and must hold one JSON object per line, each
    with a ``content`` string and an ``annotation`` entry (a list, or
    ``null``). Blank lines are ignored. Records whose ``annotation`` is
    ``null`` are skipped entirely.

    Args:
        dataturks_JSON_FilePath: Path of the export file (``str`` or
            ``os.PathLike``).

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples, or ``None`` if the file cannot be read or parsed (the error
        is logged with its traceback).
    """
    try:
        training_data = []
        with open(dataturks_JSON_FilePath, 'r', encoding="UTF-8") as f:
            for line in f:
                # A blank line (typically a trailing newline at end of file)
                # is not valid JSON and must not abort the whole conversion.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                annotations = data['annotation']
                if annotations is None:
                    continue

                entities = []
                for annotation in annotations:
                    # Only the first entry of `points` is used; any further
                    # points on the same annotation are ignored.
                    point = annotation['points'][0]
                    labels = annotation['label']

                    # `label` may be a single value or a list of values.
                    if not isinstance(labels, list):
                        labels = [labels]

                    # The +1 turns an inclusive end offset into the exclusive
                    # end used by the entity tuples (assumes the export uses
                    # inclusive end offsets -- confirm against your data).
                    for label in labels:
                        entities.append((point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        # Lazy %-formatting renders the same message as before but also works
        # when the path is not a str (e.g. pathlib.Path).
        logging.exception("Unable to process %s\nerror = %s", dataturks_JSON_FilePath, e)
        return None

In [None]:
data = convert_dataturks_to_spacy("Sample_work_125.json")
# The converter logs the error and returns None on failure; stop here with a
# clear message rather than failing later when `data` is sliced.
if data is None:
    raise RuntimeError("Could not convert Sample_work_125.json; see the logged traceback.")

In [None]:
# The first 120 converted examples are used for training; everything from
# index 120 onwards is held out as test data.
train_data, test_data = data[:120], data[120:]


In [None]:
def train_spacy(train_data, n_iter=50, drop=0.05,
                output_dir='sample_work_model_125_drop_0.05'):
    """Train a blank English NER pipeline on ``train_data`` and save it to disk.

    NOTE(review): the calls used here (``create_pipe``, passing a component
    object to ``add_pipe``, ``update`` with raw texts and annotation dicts)
    look like the spaCy v2 training API -- confirm the installed version.

    Args:
        train_data: Iterable of ``(text, annotations)`` pairs where
            ``annotations`` is a dict with an ``"entities"`` list of
            ``(start, end, label)`` tuples. It is copied, never reordered.
        n_iter: Number of passes over the training examples.
        drop: Dropout rate passed to ``nlp.update``.
        output_dir: Directory the trained pipeline is written to.

    Returns:
        None. Progress and the per-iteration losses dict are printed.
    """
    # Work on a private copy: shuffling must not reorder the caller's list.
    examples = list(train_data)

    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound when the pipeline already has an NER component.
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the training annotations.
    for _, annotations in examples:
        for ent in annotations.get('entities') or ():
            ner.add_label(ent[2])

    # Only the NER component is updated; all other pipes are disabled.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],
                    [annotations],
                    drop=drop,
                    sgd=optimizer,
                    losses=losses)
            print(losses)
    nlp.to_disk(output_dir)

In [None]:
# Train on the training split; train_spacy prints per-iteration losses and
# writes the model to 'sample_work_model_125_drop_0.05'.
train_spacy(train_data)

In [None]:
# Load the pipeline back from the directory train_spacy saved it to.
nlp = spacy.load('sample_work_model_125_drop_0.05')

In [None]:
# Run the loaded pipeline on the first held-out example; element [0] of each
# (text, annotations) tuple is the raw text.
doc = nlp(test_data[0][0])

In [None]:
def show_docs(doc):
    """Print each entity of ``doc`` on its own line as ``<text> --> <label>``.

    Args:
        doc: Object exposing ``ents``, an iterable of entities that each
            carry a ``text`` and a ``label_`` string.

    Returns:
        None. Nothing is printed when ``doc.ents`` is empty.
    """
    entities = doc.ents
    if not entities:
        return
    for entity in entities:
        print(" --> ".join((entity.text, entity.label_)))

In [None]:
# Print the entities predicted for the test example as "text --> label".
show_docs(doc)