In [113]:
import pandas as pd
import os
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example


Training data

In [173]:
# NER training samples in the (text, annotations) format.
# Every entity is (start_char, end_char, label) with an exclusive end offset,
# so text[start_char:end_char] is the annotated colour word.
TRAIN_DATA = [
    (
        "The sky is blue",
        {"entities": [(11, 15, "COLOR")]},  # blue
    ),
    (
        "She wore a red dress",
        {"entities": [(11, 14, "COLOR")]},  # red
    ),
    (
        "The car is painted in green",
        {"entities": [(22, 27, "COLOR")]},  # green
    ),
    (
        "The sunsets are often orange",
        {"entities": [(22, 28, "COLOR")]},  # orange
    ),
    (
        "The flag is red, white, and blue",
        {
            "entities": [
                (12, 15, "COLOR"),  # red
                (17, 22, "COLOR"),  # white
                (28, 32, "COLOR"),  # blue
            ]
        },
    ),
    (
        "The room was painted in shades of gray",
        {"entities": [(34, 38, "COLOR")]},  # gray
    ),
    (
        "The sky can be beautifully pink during sunsets",
        {"entities": [(27, 31, "COLOR")]},  # pink
    ),
    (
        "The leaves turn yellow in the fall",
        {"entities": [(16, 22, "COLOR")]},  # yellow
    ),
]

# Training a NER tagger

Train an NER tagger to recognize a new entity label called “COLOR”.

In [174]:
# Start from an empty English pipeline; it is used here to turn raw text into Doc objects.
nlp = spacy.blank("en")

# Container that collects the annotated docs before they are written to disk.
db = DocBin()

for text, annotations in tqdm(TRAIN_DATA):
    doc = nlp.make_doc(text)
    entity_spans = []
    for start, end, label in annotations["entities"]:
        # Map the character offsets onto a token span; None means no span could be built.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            entity_spans.append(span)
            continue
        # Report the sentence whose entity was dropped so the offsets can be fixed.
        print(text)
        print("Skipping entity")
    # Attach the collected spans as the gold entities of this doc.
    doc.ents = entity_spans
    db.add(doc)

# Serialize the corpus in the binary format read by `spacy train`.
db.to_disk("./train.spacy")

100%|██████████| 8/8 [00:00<00:00, 2666.65it/s]


In [175]:
import subprocess
import sys

# Expand the partial base config (tagger_config.cfg) into a complete config.cfg.
# sys.executable runs the spaCy CLI with the same interpreter as this kernel, instead of
# whatever `python` happens to be first on PATH. check=True raises CalledProcessError on a
# non-zero exit, so a failed step stops "Run All" rather than being silently ignored.
# The exit code is still the cell's displayed value (0 on success).
subprocess.run(
    [sys.executable, "-m", "spacy", "init", "fill-config", "tagger_config.cfg", "config.cfg"],
    check=True,
).returncode

0

In [176]:
import subprocess
import sys

# Train from config.cfg and write checkpoints under ./output.
# sys.executable keeps the CLI in the kernel's own environment, and check=True turns a
# failed training run into an exception instead of a silently returned exit code.
# NOTE(review): --paths.dev points at the training corpus, so the reported dev scores
# measure fit to the training data, not generalization.
subprocess.run(
    [
        sys.executable, "-m", "spacy", "train", "config.cfg",
        "--output", "./output",
        "--paths.train", "./train.spacy",
        "--paths.dev", "./train.spacy",
    ],
    check=True,
).returncode

0

testing

In [20]:
# Load the best checkpoint from the ./output directory.
nlp = spacy.load("./output/model-best")

# Sentences to run the trained pipeline on.
TEST_DATA = [
    "The grass is green.",
    "the blue carpet is beautifully shinning",
    "The sky is blue.",
    "the color of the sky was red because of the flames",
    "that room walls were painted in half gray and half white",
    "The sun is bright.",
]

# Run the pipeline on each sentence in turn and highlight the predicted entities inline.
for doc in map(nlp, TEST_DATA):
    spacy.displacy.render(doc, style="ent", jupyter=True)

# old training method

In [13]:
import random
from pathlib import Path
import spacy
from spacy.training.example import Example


# Colour-entity samples for the manual training loop: (text, {"entities": [...]}).
# Offsets are character positions with an exclusive end, i.e. text[start:end] is the colour.
TRAIN_DATA = [
    (
        "The sky is blue",
        {"entities": [(11, 15, "COLOR")]},  # blue
    ),
    (
        "She wore a red dress",
        {"entities": [(11, 14, "COLOR")]},  # red
    ),
    (
        "The car is painted in green",
        {"entities": [(22, 27, "COLOR")]},  # green
    ),
    (
        "The sunsets are often orange",
        {"entities": [(22, 28, "COLOR")]},  # orange
    ),
    (
        "The flag is red, white, and blue",
        {
            "entities": [
                (12, 15, "COLOR"),  # red
                (17, 22, "COLOR"),  # white
                (28, 32, "COLOR"),  # blue
            ]
        },
    ),
    (
        "The room was painted in shades of gray",
        {"entities": [(34, 38, "COLOR")]},  # gray
    ),
    (
        "The sky can be beautifully pink during sunsets",
        {"entities": [(27, 31, "COLOR")]},  # pink
    ),
    (
        "The leaves turn yellow in the fall",
        {"entities": [(16, 22, "COLOR")]},  # yellow
    ),
]

In [1]:
# Empty English pipeline with a single, untrained NER component.
nlp = spacy.blank('en')
ner = nlp.add_pipe('ner')

# Register every label that appears in the annotations with the NER component.
for _text, annotations in TRAIN_DATA:
    for entity in annotations.get('entities'):
        # entity is (start, end, label); only the label is needed here.
        ner.add_label(entity[2])

# Names of all components other than NER, so they can be disabled while training.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']



NameError: name 'spacy' is not defined

In [15]:
# Pair each raw text with its gold annotations as spaCy Example objects.
examples = [
    Example.from_dict(nlp.make_doc(input_data), annot)
    for input_data, annot in TRAIN_DATA
]

# Only train NER: every other component is disabled for the duration of the block.
# select_pipes(disable=...) is the spaCy v3 name for the legacy disable_pipes(*names).
with nlp.select_pipes(disable=other_pipes):
    # initialize() replaces the deprecated begin_training(); it sets up the model
    # from the real training examples and returns the optimizer.
    optimizer = nlp.initialize(lambda: examples)
    for itn in range(20):
        # Present the examples in a new order on every pass.
        random.shuffle(examples)
        losses = {}
        for example in examples:
            nlp.update(
                [example],
                drop=0.5,  # dropout: makes it harder to memorise the data
                sgd=optimizer,  # callable that updates the weights
                losses=losses,  # accumulates the per-component loss for this pass
            )

# Persist the trained pipeline so it can be reloaded with spacy.load().
nlp.to_disk("./model_ner_old")

In [16]:
# Reload the pipeline saved by the manual training loop.
nlp = spacy.load("./model_ner_old")

# Render the predicted entities for each training sentence (annotations are ignored here).
for sentence, _annotations in TRAIN_DATA:
    spacy.displacy.render(nlp(sentence), style="ent", jupyter=True)

# Training a POS Tagger

Train a POS tagger to recognize the plural-noun tag “NNS”; every other token is tagged “any”.

In [27]:
import random
from pathlib import Path
from spacy.pipeline.ner import Config
import spacy
from spacy.training.example import Example

# Tag inventory of the toy tagger. In the samples below "NNS" is placed on plural nouns
# and "any" on every other token.
TAG_MAP = {
    "NNS": {"pos": "NNS"},
    "any": {"pos": "any"},
}

# Tagger training samples: (text, {"tags": [...]}) with one tag per token.
# Note that the two "they'll" sentences carry one more tag than they have
# whitespace-separated words.
TRAIN_DATA = [
    (
        "there is many students in this class they'll pass student by student",
        {"tags": ["any", "any", "any", "NNS", "any", "any", "any",
                  "any", "any", "any", "any", "any", "any"]},
    ),
    (
        "There are several books on the shelf and they'll read book by book",
        {"tags": ["any", "any", "any", "NNS", "any", "any", "any",
                  "any", "any", "any", "any", "any", "any", "any"]},
    ),
    (
        "There are many cats in the park",
        {"tags": ["any", "any", "any", "NNS", "any", "any", "any"]},
    ),
    (
        "She has two dogs at home",
        {"tags": ["any", "any", "any", "NNS", "any", "any"]},
    ),
    (
        "She has one dog at home",
        {"tags": ["any", "any", "any", "any", "any", "any"]},
    ),
    (
        "there is a cat in the garden",
        {"tags": ["any", "any", "any", "any", "any", "any", "any"]},
    ),
    (
        "There are multiple houses in the neighborhood",
        {"tags": ["any", "any", "any", "NNS", "any", "any", "any"]},
    ),
    (
        "The kids play with colorful balls in the park",
        {"tags": ["any", "NNS", "any", "any", "any", "NNS", "any", "any", "any"]},
    ),
    (
        "She has three cats and four dogs at home",
        {"tags": ["any", "any", "any", "NNS", "any", "any", "NNS", "any", "any"]},
    ),
    (
        "We saw some birds in the sky",
        {"tags": ["any", "any", "any", "NNS", "any", "any", "any"]},
    ),
    (
        "The students are studying for their exams",
        {"tags": ["any", "NNS", "any", "any", "any", "any", "NNS"]},
    ),
    (
        "I counted the apples and there are twenty",
        {"tags": ["any", "any", "any", "NNS", "any", "any", "any", "any"]},
    ),
]

# Empty English pipeline with a single, untrained tagger component.
nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")

# Register every tag name from TAG_MAP with the tagger (only the keys are used).
for tag in TAG_MAP:
    tagger.add_label(tag)



In [30]:

# Pair each raw text with its gold tags as spaCy Example objects.
examples = [
    Example.from_dict(nlp.make_doc(input_data), annot)
    for input_data, annot in TRAIN_DATA
]

# initialize() sets the pipeline up from the training examples and returns the optimizer.
# The original followed it with the deprecated begin_training(), which initialized the
# pipeline a second time just to obtain that optimizer.
optimizer = nlp.initialize(lambda: examples)

for i in range(50):
    losses = {}
    # Present the examples in a new order on every pass.
    random.shuffle(examples)
    for example in examples:
        nlp.update([example], sgd=optimizer, losses=losses)
    

In [29]:
# Persist the trained tagger pipeline.
nlp.to_disk("./model_post")

# Smoke test on a sentence whose plural noun does not occur in the training samples.
doc = nlp("there is many laboratories")
tagged_tokens = [(token.text, token.tag_, token.pos_) for token in doc]
print('Tags', tagged_tokens)

Tags [('there', 'any', ''), ('is', 'any', ''), ('many', 'any', ''), ('laboratories', 'NNS', '')]


In [181]:
def getIndex(text, word_to_find):
    """Return the character offset of the first word in ``text`` equal to ``word_to_find``.

    Words are the whitespace-delimited chunks produced by ``text.split()``, and the
    comparison is an exact, case-sensitive match on the whole chunk (so ``"red,"`` does
    not match ``"red"``).

    The offset is measured in the original string. The previous implementation summed
    ``len(word) + 1`` over the preceding words, which is only correct when words are
    separated by exactly one whitespace character and the text has no leading whitespace.

    Args:
        text: The string to search.
        word_to_find: The word to look for.

    Returns:
        The 0-based index in ``text`` where the first matching word starts, or ``None``
        if no word matches (including when ``text`` is empty).
    """
    cursor = 0
    for word in text.split():
        # Everything between the cursor and the next word is whitespace, so the first
        # occurrence of `word` from the cursor onwards is exactly where that word starts.
        cursor = text.index(word, cursor)
        if word == word_to_find:
            return cursor
        # Step past this word before looking for the next one.
        cursor += len(word)
    return None