In [62]:
from prodigy.components.loaders import JSONL

data = JSONL('annotated_data.jsonl')

In [63]:
TRAINING_DATA = []

for line in data:
    try:
        if line['answer'] == 'accept':
            labels = []
            for span in line['spans']:
                labels.append((span['start'], span['end'], span['label']))

            TRAINING_DATA.append(
                (line['text'], {'entities': labels})
            )
    except:
        pass

print(TRAINING_DATA[:2])

[('Bitcoin bull bullish on bitcoin? Shocker.', {'entities': [(0, 7, 'COIN'), (24, 31, 'COIN')]}), ("I've owned bitcoin before Bitcoin Cash came out, do I own bitcoin cash in some way now?", {'entities': [(11, 18, 'COIN'), (26, 33, 'COIN'), (58, 65, 'COIN')]})]


### Spacy 2.x

In [64]:
import spacy

# !python -m spacy download en_core_web_md

nlp = spacy.load('en_core_web_md')
ner = nlp.get_pipe('ner')

In [65]:
LABEL = 'COIN'

# Add the new label to ner
ner.add_label(LABEL)

# Resume training
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

# List of pipes you want to train
pipe_exceptions = ["ner"]

# List of pipes which should remain unaffected in training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [66]:
from spacy.training.example import Example
import random

# Begin training by disabling other pipeline components
with nlp.disable_pipes(*other_pipes):
    random.shuffle(TRAINING_DATA)
    losses = {}
    for batch in spacy.util.minibatch(TRAINING_DATA, size=40):
        for text, annotations in batch:
            # create Example
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            # Update the model
            nlp.update([example], losses=losses, drop=0.3)
            print("Losses", losses)

Losses {'ner': 3.2809867044565006}
Losses {'ner': 17.148286672739545}
Losses {'ner': 19.143903895659488}
Losses {'ner': 23.31138708494987}
Losses {'ner': 24.982092594455338}
Losses {'ner': 26.193015469181525}
Losses {'ner': 29.375631557283}
Losses {'ner': 35.31397430056788}
Losses {'ner': 36.61637848970324}
Losses {'ner': 53.31381365031588}
Losses {'ner': 56.42848939680457}
Losses {'ner': 62.770804703857436}
Losses {'ner': 64.61610381812974}
Losses {'ner': 66.65294904622947}
Losses {'ner': 69.63741609304988}
Losses {'ner': 80.9910823803671}
Losses {'ner': 97.02652371437436}
Losses {'ner': 110.7332091786427}
Losses {'ner': 111.91463260771137}
Losses {'ner': 115.19788737811591}
Losses {'ner': 117.7527480469925}
Losses {'ner': 128.64379310646694}
Losses {'ner': 131.49994478887206}
Losses {'ner': 138.6326529527616}
Losses {'ner': 140.8014868369341}
Losses {'ner': 141.89273207906461}
Losses {'ner': 144.0253156260464}
Losses {'ner': 145.69008638920656}
Losses {'ner': 146.6561745490866}
Losse

In [67]:
test_text = "I want to buy bitcoin, usd and solana"
doc = nlp(test_text)
print("Entities in ", test_text)
for ent in doc.ents:
    print(ent)

Entities in  I want to buy bitcoin, usd and solana
bitcoin


## Spacy 3.0 - Prodigy

In [70]:
!python -m prodigy train ./model --ner ner_cryptos

[i] Using CPU
[1m
[i] Auto-generating config with spaCy
[+] Generated training config
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     33.33    1.70    1.49    1.97    0.02
  4     200         59.98   1305.01   95.68   96.64   94.74    0.96
  9     400         87.70    139.41   94.59   97.22   92.11    0.95
 15     600         84.12     71.04   95.74   95.42   96.05    0.96
 22     800         69.48     40.42   95.65   97.28   94.08    0.96
 32    1000         70.42     44.52   95.33   96.62   94.08    0.95
 43    1200         85.51     25.79   94.70   95.33   94.08    0.95
 57    1400         53.93     12.82   94.74   94.74   94.74    0.95
 75    1600         11.58      2.01   95.05   95.36   94.74    0.95
 96    1800        213.46     37.03   95.71   96.03   95.39    0.96

[2022-01-18 11:23:20,785] [INFO] Set up nlp object from config
Components: ner
Merging training and evaluation data for 1 components
  - [ner] Training: 413 | Evaluation: 103 (20% split)
Training: 351 | Evaluation: 95
Labels: ner (1)
[2022-01-18 11:23:20,874] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-01-18 11:23:20,878] [INFO] Created vocabulary
[2022-01-18 11:23:20,879] [INFO] Finished initializing nlp object
[2022-01-18 11:23:21,190] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
Components: ner
Merging training and evaluation data for 1 components
  - [ner] Training: 413 | Evaluation: 103 (20% split)
Training: 351 | Evaluation: 95
Labels: ner (1)


[i] Using CPU
[1m
[i] Auto-generating config with spaCy


[2022-01-18 11:25:55,310] [INFO] Set up nlp object from config
Components: ner
Merging training and evaluation data for 1 components
  - [ner] Training: 413 | Evaluation: 103 (20% split)
Training: 351 | Evaluation: 95
Labels: ner (1)
[2022-01-18 11:25:55,384] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-01-18 11:25:55,388] [INFO] Created vocabulary
[2022-01-18 11:25:55,389] [INFO] Finished initializing nlp object
[2022-01-18 11:25:55,748] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
Components: ner
Merging training and evaluation data for 1 components
  - [ner] Training: 413 | Evaluation: 103 (20% split)
Training: 351 | Evaluation: 95
Labels: ner (1)


[+] Generated training config
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     33.33    1.70    1.49    1.97    0.02
  4     200         59.98   1305.01   95.68   96.64   94.74    0.96
  9     400         87.70    139.41   94.59   97.22   92.11    0.95
 15     600         84.12     71.04   95.74   95.42   96.05    0.96
 22     800         69.48     40.42   95.65   97.28   94.08    0.96
 32    1000         70.42     44.52   95.33   96.62   94.08    0.95
 43    1200         85.51     25.79   94.70   95.33   94.08    0.95
 57    1400         53.93     12.82   94.74   94.74   94.74    0.95
 75    1600         11.58      2.01   95.05   95.36   94.74    0.95
 96    1800        213.46     37.03   95.71   96.03   95.39    0.96
122    2000        178.72     30.63   95.39   95.39   95

In [2]:
import spacy

nlp = spacy.load('./model/model-best')

In [3]:
test_text = "I want to buy bitcoin, usd and solana"
doc = nlp(test_text)
print("Entities in ", test_text)
for ent in doc.ents:
    print(ent)

Entities in  I want to buy bitcoin, usd and solana
bitcoin
solana
