In [None]:
import json
import random
import spacy
from spacy.util import minibatch, compounding
from spacy.tokens import DocBin
from tqdm import tqdm

In [None]:
def convert_doccano_to_spacy(doccano_data):
    """Convert Doccano span annotations into spaCy-style training tuples.

    Args:
        doccano_data: Iterable of dicts, one per exported record. Each record
            must contain a ``'text'`` entry. Spans are read from ``'label'``
            or, when that key is missing or null, from ``'labels'``. Each
            span is a sequence whose first three items are
            ``(start, end, label)``. A record with neither key is treated as
            having no entities.

    Returns:
        A list of ``(text, {'entities': [(start, end, label), ...]})`` tuples,
        one per input record and in the same order.

    Raises:
        KeyError: If a record has no ``'text'`` entry.
    """
    spacy_data = []
    for record in doccano_data:
        text = record['text']
        # NOTE(review): the plural 'labels' key is accepted on the assumption
        # that some Doccano export versions use it -- confirm against the
        # actual export file.
        spans = record.get('label')
        if spans is None:
            spans = record.get('labels') or []
        # Normalise each span to a plain 3-tuple regardless of its input type.
        entities = [(span[0], span[1], span[2]) for span in spans]
        spacy_data.append((text, {'entities': entities}))
    return spacy_data

# Read the Doccano JSONL export: one JSON object per line.
doccano_jsonl_file = r'xxxxxxx.jsonl'
doccano_data = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open(doccano_jsonl_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. an extra empty line at the end of the file),
        # which json.loads would reject.
        if not line.strip():
            continue
        data = json.loads(line)
        doccano_data.append(data)

# Convert Doccano data to spaCy training data format
spacy_data = convert_doccano_to_spacy(doccano_data)

In [None]:
# Ask spaCy to use a GPU if one is available.
spacy.prefer_gpu()

# Name of a pretrained pipeline to start from (e.g. "en_core_web_sm"), or
# None to start from a blank English pipeline. The notebook trains from
# scratch, so this stays None.
model = None
if model is not None:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")

# Fetch the NER component, creating it first when the pipeline has none.
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

# Register every entity label found in the training data with the NER
# component, walking the annotations in their original order.
label_stream = (
    ent_label
    for _text, annots in spacy_data
    for _start, _end, ent_label in annots['entities']
)
for ent_label in label_stream:
    ner.add_label(ent_label)

# Every pipe except NER is collected here; these are switched off for the
# duration of training by the `with nlp.disable_pipes(...)` block.
pipe_exceptions = ['ner']
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Training settings
n_iter = 8        # number of passes over the training data
batch_size = 4    # examples per nlp.update call
dropout = 0.4     # dropout rate passed to nlp.update
# NOTE(review): learn_rate is not referenced anywhere else in the visible
# notebook, so the optimizer runs with its own default -- TODO wire it in or
# remove it.
learn_rate = 0.0002

with nlp.disable_pipes(*other_pipes):
    # begin_training() is deprecated in spaCy v3 and simply forwards to
    # initialize(). A pretrained pipeline must not be re-initialised (that
    # would reset its weights), so resume training in that case instead.
    if model is None:
        optimizer = nlp.initialize()
    else:
        optimizer = nlp.resume_training()

    # Shuffle a copy so the converted `spacy_data` keeps its original order
    # and this cell can be re-run without side effects on earlier results.
    train_data = list(spacy_data)

    for itn in tqdm(range(n_iter)):
        losses = {}
        random.shuffle(train_data)

        # Train the model with each batch
        for batch in minibatch(train_data, size=batch_size):
            # Pair a freshly tokenised Doc with its gold entity spans.
            example_objs = [
                spacy.training.Example.from_dict(
                    nlp.make_doc(text), {"entities": annots['entities']}
                )
                for text, annots in batch
            ]
            nlp.update(example_objs, sgd=optimizer, drop=dropout, losses=losses)

        print(f"Iteration {itn+1}: Losses - {losses}")

In [None]:
# Write the trained pipeline to a directory on disk; the test cell reloads it
# from there with spacy.load.
output_dir = "output"
nlp.to_disk(path=output_dir)

In [None]:
# Test the model
from IPython.display import display
from spacy import displacy

# Reload the pipeline from the "output" directory and try it on one sentence.
nlp = spacy.load("output")
example_text = "Elon Musk is holding 15 mil of Bitcoin and Ethereum"

# Any failure while parsing or rendering is reported as a one-line message
# rather than a traceback.
try:
    doc = nlp(example_text)
    displacy.render(docs=doc, style="ent", jupyter=True)
except Exception as err:
    print("Error occurred:", err)