In [113]:
import pandas as pd
import os
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example


Training data

In [173]:
# NER training examples as (text, annotations) pairs.
# Every entity is (start_char, end_char, label) with an exclusive end offset,
# so text[start_char:end_char] is exactly the annotated colour word.
TRAIN_DATA = [
    (
        "The sky is blue",
        {"entities": [(11, 15, "COLOR")]},
    ),
    (
        "She wore a red dress",
        {"entities": [(11, 14, "COLOR")]},
    ),
    (
        "The car is painted in green",
        {"entities": [(22, 27, "COLOR")]},
    ),
    (
        "The sunsets are often orange",
        {"entities": [(22, 28, "COLOR")]},
    ),
    (
        # Three colours in one sentence; the commas are outside the spans.
        "The flag is red, white, and blue",
        {
            "entities": [
                (12, 15, "COLOR"),
                (17, 22, "COLOR"),
                (28, 32, "COLOR"),
            ]
        },
    ),
    (
        "The room was painted in shades of gray",
        {"entities": [(34, 38, "COLOR")]},
    ),
    (
        "The sky can be beautifully pink during sunsets",
        {"entities": [(27, 31, "COLOR")]},
    ),
    (
        "The leaves turn yellow in the fall",
        {"entities": [(16, 22, "COLOR")]},
    ),
]

# Training a NER tagger

Train an NER tagger to recognize a new entity label called “COLOR”.

In [174]:
# Start from a blank English pipeline; it is only used here to turn raw text into Doc objects.
nlp = spacy.blank("en")

# Container that collects the annotated docs and is written to ./train.spacy at the end.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # Convert the character offsets into a labelled span over the doc's tokens.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
            continue
        # No span could be built from these offsets: report the sentence and drop the entity.
        print(text)
        print("Skipping entity")
    # Attach the collected entity spans to the doc before storing it.
    doc.ents = ents
    db.add(doc)

# Serialise every annotated doc to the file consumed by the training command.
db.to_disk("./train.spacy")

100%|██████████| 8/8 [00:00<00:00, 2666.65it/s]


In [175]:
import subprocess
import sys

# Run spaCy's `init fill-config` command with tagger_config.cfg as input and
# config.cfg as output.
# The CLI is started through sys.executable so it runs under the same
# interpreter/environment as this kernel instead of whichever `python` happens
# to be first on PATH. Arguments are passed as a list, so no shell is involved.
# The exit code is the cell's last expression: 0 is displayed on success.
subprocess.run(
    [
        sys.executable, "-m", "spacy", "init", "fill-config",
        "tagger_config.cfg", "config.cfg",
    ]
).returncode

0

In [176]:
import subprocess
import sys

# Train the pipeline described by config.cfg and write the results to ./output.
# NOTE: --paths.dev points at the same file as --paths.train, so the scores
# reported during training are measured on the training data itself.
# The CLI is started through sys.executable so it runs under the same
# interpreter/environment as this kernel instead of whichever `python` happens
# to be first on PATH. Arguments are passed as a list, so no shell is involved.
# The exit code is the cell's last expression: 0 is displayed on success.
subprocess.run(
    [
        sys.executable, "-m", "spacy", "train", "config.cfg",
        "--output", "./output",
        "--paths.train", "./train.spacy",
        "--paths.dev", "./train.spacy",
    ]
).returncode

0

testing

In [179]:
# Load the model-best directory from ./output (the --output path of the training command).
nlp = spacy.load("./output/model-best")

# Run the model on a few sentences that are not in the training data and
# print every predicted entity together with its label.
for sentence in (
    "the blue carpet is beautifully shinning",
    "the color of the sky was red because of the flames",
    "that rooms walls were painted in half gray and half white",
):
    doc = nlp(sentence)
    for ent in doc.ents:
        print(ent.text,"->", ent.label_)

blue -> COLOR
red -> COLOR
gray -> COLOR
white -> COLOR


# Training a POS Tagger

Train a POS tagger to tag “USTHB” and “LRIA” as ORG.

In [128]:
import random
from pathlib import Path
from spacy.pipeline.ner import Config

# Tag inventory for the toy tagger: "ORG" plus "." for every other token.
# Only the keys are used below; the 'pos' values are never read in this cell.
TAG_MAP = {
    "ORG": {"pos": "ORG"},
    ".": {"pos": "any"},
}

# A single training sentence with one tag per whitespace-separated token.
# NOTE: this rebinds TRAIN_DATA, replacing the NER examples defined earlier.
TRAIN_DATA = [
    (
        "USTHB is the biggest university in algiers .",
        {"tags": ["ORG"] + ["."] * 7},
    ),
]

nlp = spacy.blank("en")
# Add a tagger component to the blank pipeline and register each tag as a label.
tagger = nlp.add_pipe("tagger")

for tag in TAG_MAP:
    tagger.add_label(tag)



In [129]:
# Number of passes over the (tiny) training set.
N_EPOCHS = 20

# nlp.initialize() is the spaCy v3 replacement for the deprecated
# nlp.begin_training(); it prepares the pipeline for training and returns
# the optimizer that is passed to nlp.update() below.
optimizer = nlp.initialize()

# Pair each raw text with its gold annotations as an Example.
examples = [
    Example.from_dict(nlp.make_doc(input_data), annot)
    for input_data, annot in TRAIN_DATA
]

for epoch in range(N_EPOCHS):
    random.shuffle(examples)
    # Pass the optimizer explicitly so it is clear which one drives the updates.
    nlp.update(examples, sgd=optimizer)

# Try the trained tagger on a new sentence and show (text, tag_, pos_) per token.
# In the recorded output pos_ is empty for every token; only tag_ is filled in.
doc = nlp("the LRIA is a research laboratory in the IT faculty at USTHB specialized in AI.")
print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])

Tags [('the', '.', ''), ('LRIA', '.', ''), ('is', '.', ''), ('a', '.', ''), ('research', '.', ''), ('laboratory', '.', ''), ('in', '.', ''), ('the', '.', ''), ('IT', '.', ''), ('faculty', '.', ''), ('at', '.', ''), ('USTHB', 'ORG', ''), ('specialized', '.', ''), ('in', '.', ''), ('AI', '.', ''), ('.', '.', '')]


In [181]:
def getIndex(text, word_to_find):
    """Return the character offset of the first token equal to ``word_to_find``.

    ``text`` is split on whitespace and each token is compared with
    ``word_to_find`` exactly, so punctuation attached to a word is part of the
    token (``"red,"`` does not match ``"red"``).

    The offset is looked up in ``text`` itself instead of being rebuilt from
    token lengths, so it stays correct when ``text`` has leading whitespace or
    words separated by more than one whitespace character.

    Args:
        text: The string to search.
        word_to_find: The whitespace-delimited token to locate.

    Returns:
        The 0-based index in ``text`` where the first matching token starts,
        or ``None`` if no token matches.
    """
    search_from = 0
    for word in text.split():
        # Tokens appear in order, and only whitespace lies between search_from
        # and the next token, so this finds that token's true start position.
        token_start = text.index(word, search_from)
        if word == word_to_find:
            return token_start
        search_from = token_start + len(word)

    return None