In [1]:
import random
import spacy
from spacy.training.example import Example
from tqdm import tqdm

In [2]:
# Hand-labelled training sentences as (text, tags) pairs, one tag per token
# (punctuation marks are separate tokens and carry "PUNCT").
# The tag set is coarse part-of-speech labels plus "NNS", which marks plural
# nouns; singular nouns stay "NOUN". Every plural noun must be "NNS" -- mixing
# the two labels for plurals gives the tagger contradictory examples.
# NOTE(review): two sentences end with a comma instead of a period -- confirm
# that is intentional.
TRAIN_DATA = [
    ("Cats are fascinating creatures with a wide range of behaviors.", ['NNS', 'AUX', 'ADJ', 'NNS', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'NNS', 'PUNCT']),
    ("I bought three pillows for my new couch.", ['PRON', 'VERB', 'NUM', 'NNS', 'ADP', 'PRON', 'ADJ', 'NOUN', 'PUNCT']),
    # "squirrels" is plural, so it is NNS just like "dogs".
    ("The dogs chased the squirrels up the tree.", ['DET', 'NNS', 'VERB', 'DET', 'NNS', 'ADP', 'DET', 'NOUN', 'PUNCT']),
    ("She collected seashells on the beach during her vacation.", ['PRON', 'VERB', 'NNS', 'ADP', 'DET', 'NOUN', 'ADP', 'PRON', 'NOUN', 'PUNCT']),
    # "Children" is an irregular plural and is labelled NNS as well.
    ("Children love to play with colorful balloons at parties.", ['NNS', 'VERB', 'PART', 'VERB', 'ADP', 'ADJ', 'NNS', 'ADP', 'NNS', 'PUNCT']),
    ("The shelves were filled with books of different genres.", ['DET', 'NNS', 'AUX', 'VERB', 'ADP', 'NNS', 'ADP', 'ADJ', 'NNS', 'PUNCT']),
    ("Bees are essential pollinators for many types of flowers,", ['NNS', 'AUX', 'ADJ', 'NNS', 'ADP', 'ADJ', 'NNS', 'ADP', 'NNS','PUNCT']),
    ("The cat is sleeping on the windowsill,", ['DET', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'NOUN', 'PUNCT']),
    ("A book on the shelf caught my attention", ['DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'VERB', 'PRON', 'NOUN']),
    # "Farmers" is plural, so it is NNS like "crops" and "soybeans".
    ("Farmers grow various crops, such as wheat, corn, and soybeans.", ['NNS', 'VERB', 'ADJ', 'NNS', 'PUNCT', 'ADJ', 'ADP', 'NOUN', 'PUNCT', 'NOUN', 'PUNCT', 'CCONJ', 'NNS', 'PUNCT']),
]


In [3]:
# Start from a blank English pipeline and attach a new, untrained tagger to it.
# add_pipe returns the component, which is kept so labels can be registered on it.
nlp = spacy.blank("en")
pos_tagger = nlp.add_pipe("tagger")

# Labels the tagger may predict. The order is kept as-is because labels are
# registered one by one in this sequence.
taggs = [
    "AUX", "PUNCT", "PROPN", "ADJ", "ADP", "NNS", "VERB",
    "PART", "DET", "NOUN", "ADV", "CCONJ", "PRON", "NUM",
]
for tag in taggs:
    pos_tagger.add_label(tag)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
epochs = 1000
seed = 0

# Seed Python's `random`, NumPy (and CuPy when present) so that shuffling and
# weight initialisation are repeatable across Restart & Run All.
spacy.util.fix_random_seed(seed)

# Build the Example objects once: tokenising and aligning the gold tags does
# not depend on the epoch, so doing it inside the loop is wasted work.
train_examples = [
    Example.from_dict(nlp.make_doc(text), {"tags": annotation})
    for text, annotation in TRAIN_DATA
]

# Train with only the tagger enabled; any other component is restored when the
# `with` block exits. (`select_pipes` replaces the deprecated `disable_pipes`.)
with nlp.select_pipes(enable=["tagger"]):
    # `initialize` replaces the deprecated `begin_training`; passing the real
    # examples lets the tagger set up its model from representative data.
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    progress = tqdm(range(epochs))
    for _ in progress:
        # Shuffle the local example list so TRAIN_DATA itself is never mutated.
        random.shuffle(train_examples)
        # Fresh dict per epoch: nlp.update accumulates each component's loss in it.
        losses = {}
        for example in train_examples:
            nlp.update([example], sgd=optimizer, drop=0.5, losses=losses)
        # Surface the epoch loss on the progress bar instead of discarding it.
        progress.set_postfix(loss=losses.get("tagger", 0.0))


100%|██████████| 1000/1000 [08:21<00:00,  1.99it/s]


In [5]:
# Run the in-memory pipeline on a sentence that is not part of TRAIN_DATA and
# print each token next to the tag predicted for it.
sample_text = "I saw a group of penguins waddling on the ice"
doc = nlp(sample_text)
for token in doc:
    print(f"{token.text} {token.tag_}")

I PRON
saw ADP
a DET
group NOUN
of ADP
penguins NNS
waddling VERB
on ADP
the DET
ice NOUN


In [6]:
# Write the whole pipeline to the "nns_nlp" directory so it can be reloaded later.
nlp.to_disk(path="nns_nlp")

In [7]:
# Load the pipeline back from the "nns_nlp" directory into a separate object.
nlp_nns = spacy.load(name="nns_nlp")

In [8]:
# Run the pipeline reloaded from disk on the sample sentence and print each
# token next to the tag predicted for it.
sample_text = "I saw a group of penguins waddling on the ice"
doc = nlp_nns(sample_text)
for token in doc:
    print(f"{token.text} {token.tag_}")

I PRON
saw ADP
a DET
group NOUN
of ADP
penguins NNS
waddling VERB
on ADP
the DET
ice NOUN
