In [1]:
import spacy
from spacy import displacy

In [5]:
# Load the small English pipeline. In the cells visible here `nlp` is only
# used through `nlp.make_doc(...)` when the training docs are built.
nlp = spacy.load("en_core_web_sm")

In [33]:
# Hand-labelled NER examples, one tuple per sentence:
#   (text, {"entities": [(start_char, end_char, label), ...]})
# Offsets are character positions into `text`; `end_char` is exclusive.
train = [
    (
        "The interest rate is the amount a lender charges a borrower and is a percentage of the principal—the amount loaned.",
        {"entities": [(4, 17, "Rates")]},  # text[4:17] == "interest rate"
    ),
    (
        "The repo rate is the rate of return that can be earned by simultaneously selling a bond futures or forward contract",
        {"entities": [(4, 13, "Rates")]},  # text[4:13] == "repo rate"
    ),
]

In [34]:
import pandas as pd
import os
from tqdm import tqdm
from spacy.tokens import DocBin

In [35]:
# Convert the (text, annotations) pairs in `train` into spaCy's binary
# training format and save them as ./train.spacy.
db = DocBin() # create a DocBin object

for text, annot in tqdm(train): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        # With alignment_mode="contract" the span is shrunk to the tokens that
        # lie completely inside [start, end); char_span returns None when no
        # such token exists.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Say exactly which annotation was dropped so it can be found and
            # corrected; a bare "Skipping entity" gives no way to locate it.
            print(
                f"Skipping entity {label!r} at [{start}:{end}] "
                f"({text[start:end]!r}): no whole token falls inside these offsets"
            )
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk("./train.spacy") # save the docbin object

100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 1986.88it/s]


In [36]:
# Expand base_config.cfg into a complete training config, written to config.cfg.
# NOTE(review): base_config.cfg is not created by any cell visible here; it
# must already exist in the working directory before this cell is run.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [37]:
# Train the pipeline described by config.cfg and write the results to ./output.
# NOTE(review): --paths.dev points at the same ./train.spacy file as
# --paths.train, so the ENTS_* scores printed during training are measured on
# the training examples themselves, not on held-out data.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4m[i] Saving to output directory: output[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     16.67    0.00    0.00    0.00    0.00
200     200          1.25    287.72  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  100.00  100.00  100.00    1.00
1200    1200          0.00      0.00  100.00  100.00  100.00    1.00
1400    1400          0.00      0.00  100.00  100.00  100.00    1.00
1600    1600          0.00      0.00  100.00  100.00  100.00    1.00
1800    1800          0.00      

[2023-04-07 18:01:21,943] [INFO] Set up nlp object from config
[2023-04-07 18:01:21,950] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-04-07 18:01:21,953] [INFO] Created vocabulary
[2023-04-07 18:01:21,954] [INFO] Finished initializing nlp object
[2023-04-07 18:01:22,009] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [38]:
# Reload the best checkpoint from the training output directory and
# visualise the entities it predicts on a sample sentence.
nlp1 = spacy.load(r"./output/model-best")
doc = nlp1(
    "Testing if our model is able to capture interest rate and repo rate"
)
displacy.render(doc, style="ent", jupyter=True)