In [1]:
import pandas as pd
import numpy as np
import re

# Shared character width for both numpy and pandas console-style output,
# so wide arrays and frames are not wrapped prematurely.
desired_width = 320
np.set_printoptions(linewidth=desired_width)

# pandas display limits, set in one call as (option, value) pairs:
# line width, then the caps on how many columns and rows are shown.
pd.set_option(
    'display.width', desired_width,
    'display.max_columns', 20,
    'display.max_rows', 500,
)


# Load language model
# spaCy supplies the NLP pipeline; displacy is imported from it and is used in
# the cells below to render the entities found in a doc.
import spacy
from spacy import displacy

# Load the pipeline named "en_core_web_sm". `nlp` is called on raw text in the
# following cells to obtain a doc whose `.ents` are printed and rendered.
nlp = spacy.load("en_core_web_sm")

  return torch._C._cuda_getDeviceCount() > 0



# NER - Named Entity Recognition


In [2]:
# Run the loaded pipeline over a sample jeans product description.
doc = nlp(
    '''The Slim Fit MVP is a slim fitting, ultra comfortable classic Jeans. 
The Extreme Motion series are the ultimate combination between extreme comfort, freedom to move 
and authentic denim, coming here in a slim fit execution, very much like Guess.
Featuring the amazing comfort, movement and unbeatable stretch of Lee’s Extreme Motion collection,
these slim fit jeans are made with motion in mind. Look for the grey elastic in the super comfy athletic
waistband and the signature donut button and brown leather back patch. Lee jeans are sold in the same
stores as Wrangler. Made using recycled fabric and constructed using a cleverly tapered slim fit.'''
)

# One "LABEL | text" line per entity the pipeline recognized.
for ent in doc.ents:
    print(ent.label_, ' | ', ent.text)

# Render the same entities highlighted inline in the text.
displacy.render(doc, style="ent")

PERSON  |  Slim Fit MVP
NORP  |  Jeans
WORK_OF_ART  |  The Extreme Motion
PERSON  |  Guess
ORG  |  Lee’s Extreme Motion
PERSON  |  Lee
PRODUCT  |  Wrangler


# Customize NER Pipeline

In [3]:
# Read the brands CSV into a DataFrame.
brands = pd.read_csv("Data/brands.csv")

# From the jeans CSV keep only the "name" column, as a Series.
jeans = pd.read_csv("Data/jeans.csv").loc[:, "name"]

FileNotFoundError: [Errno 2] No such file or directory: 'brands.csv'

In [None]:
# training data
def _label_leading_word(name):
    """Annotate the first space-delimited word of ``name`` as a BRAND span.

    Returns a ``(name, {"entities": [(0, end, "BRAND")]})`` tuple, where
    ``end`` is the number of characters before the first space in ``name``.
    """
    first_word = name.split(" ")[0]
    return (name, {"entities": [(0, len(first_word), "BRAND")]})


# One (text, annotations) pair per entry of `jeans`, in index order.
TRAIN_DATA = [_label_leading_word(jeans[label]) for label in jeans.index]

# Preview a small slice of the generated examples.
TRAIN_DATA[10:20]

In [None]:
# Load pre-existing spacy model
import random

import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding

nlp = spacy.load('en_core_web_sm')

# Getting the pipeline component
ner = nlp.get_pipe("ner")

# New label to add
LABEL = "BRAND"

# Add the new label to ner
ner.add_label(LABEL)

# Resume training; the returned optimizer is passed explicitly to every
# nlp.update() call below so it is clear which optimizer applies the updates.
optimizer = nlp.resume_training()
move_names = list(ner.move_names)  # not used by the training loop below

# List of pipes you want to train
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# List of pipes which should remain unaffected in training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Shuffle a copy so TRAIN_DATA itself keeps its original order; re-running
# this cell then does not silently change what earlier cells displayed.
shuffled_data = list(TRAIN_DATA)

# Begin training with the other pipeline components disabled.
# select_pipes() is the spaCy v3 replacement for the deprecated disable_pipes().
with nlp.select_pipes(disable=other_pipes):
    # Batch sizes grow from 1.0 towards 4.0 by a factor of 1.001 per batch
    sizes = compounding(1.0, 4.0, 1.001)
    # Training for 30 iterations
    for itn in range(30):
        # shuffle examples before training
        random.shuffle(shuffled_data)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(shuffled_data, size=sizes)
        # dictionary to store losses (accumulated over this iteration)
        losses = {}
        for batch in batches:
            # Build all Example objects for the batch first ...
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            # ... then take exactly one update step per batch. Updating inside
            # the per-example loop would re-train on the earlier examples of
            # the batch once for every example appended after them.
            nlp.update(
                examples,
                sgd=optimizer,
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
        # Report the accumulated losses once per iteration, not once per batch
        print("Losses", losses)
    

In [None]:
# Pass the sample product description through the pipeline currently held in `nlp`.
doc = nlp(
    '''The Slim Fit MVP is a slim fitting, ultra comfortable classic Jeans. 
The Extreme Motion series are the ultimate combination between extreme comfort, freedom to move 
and authentic denim, coming here in a slim fit execution, very much like Guess.
Featuring the amazing comfort, movement and unbeatable stretch of Lee’s Extreme Motion collection,
these slim fit jeans are made with motion in mind. Look for the grey elastic in the super comfy athletic
waistband and the signature donut button and brown leather back patch. Lee jeans are sold in the same
stores as Wrangler. Made using recycled fabric and constructed using a cleverly tapered slim fit.'''
)

# List every recognized entity as "LABEL | text".
for found in doc.ents:
    print(found.label_, ' | ', found.text)

# Visualize the entities inline with displacy.
displacy.render(doc, style="ent")