# 1. Training and updating models

In [1]:
import spacy
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

# Load the example texts. JSON is UTF-8 by specification, so decode explicitly
# instead of relying on the platform's default encoding.
with open('spaCy/exercises/iphone.json', encoding='utf8') as f:
    TEXTS = json.load(f)

In [2]:
# Inspect the example texts loaded from iphone.json
TEXTS

['How to preorder the iPhone X',
 'iPhone X is coming',
 'Should I pay $1,000 for the iPhone X?',
 'The iPhone 8 reviews are here',
 'Your iPhone goes up to 11 today',
 'I need a new phone! Any tips?']

In [19]:
# Blank English pipeline; the rule-based matcher shares its vocabulary
nlp = English()
matcher = Matcher(nlp.vocab)

# "iphone" followed by "x", both compared on their lowercase form
pattern1 = [
    {'LOWER': 'iphone'},
    {'LOWER': 'x'},
]

# "iphone" (lowercase form), optionally followed by one all-digit token
pattern2 = [
    {'LOWER': 'iphone'},
    {'IS_DIGIT': True, 'OP': '?'},
]

# Register both patterns under the 'GADGET' key (None: no on-match callback)
matcher.add('GADGET', None, pattern1, pattern2)



In [30]:
TRAINING_DATA = []

for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Both patterns can fire on the same tokens ("iPhone X" and the bare
    # "iPhone" inside it). Overlapping entities are not valid NER annotations,
    # so keep only the longest non-overlapping span at each position.
    spans = spacy.util.filter_spans(spans)
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, 'GADGET') for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {'entities': entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep='\n')

('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET')]})
('iPhone X is coming', {'entities': [(0, 8, 'GADGET')]})
('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET')]})
('The iPhone 8 reviews are here', {'entities': [(4, 12, 'GADGET')]})
('Your iPhone goes up to 11 today', {'entities': [(5, 11, 'GADGET')]})
('I need a new phone! Any tips?', {'entities': []})


# 5. The training loop

In [31]:
# One hand-labelled example: characters 20-28 of the text are tagged GADGET
TRAINING_DATA = [
    (
        "How to preorder the iPhone X",
        {'entities': [(20, 28, 'GADGET')]},
    ),
]

In [33]:
import random

In [35]:
# Make 10 passes over the training examples
for i in range(10):
    # Reorder the examples in place before each pass
    random.shuffle(TRAINING_DATA)
    # Walk through the shuffled examples one minibatch at a time
    for batch in spacy.util.minibatch(TRAINING_DATA):
        # Every example is a (text, annotation) pair; pull the two apart
        texts = [txt for txt, _ in batch]
        annotations = [annot for _, annot in batch]
        # Feed this batch to the pipeline
        nlp.update(texts, annotations)

# Persist the pipeline to disk
#nlp.to_disk(path_to_model)

In [36]:
import spacy
import random
import json

# Load the annotated examples. JSON is UTF-8 by specification, so decode
# explicitly instead of relying on the platform's default encoding.
with open("spaCy/exercises/gadgets.json", encoding="utf8") as f:
    TRAINING_DATA = json.load(f)

# Start from a blank English pipeline and give it an entity recognizer
nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
# Register the single label the recognizer is trained on
ner.add_label("GADGET")


In [37]:
# Inspect the annotated examples loaded from gadgets.json
TRAINING_DATA

[['How to preorder the iPhone X', {'entities': [[20, 28, 'GADGET']]}],
 ['iPhone X is coming', {'entities': [[0, 8, 'GADGET']]}],
 ['Should I pay $1,000 for the iPhone X?', {'entities': [[28, 36, 'GADGET']]}],
 ['The iPhone 8 reviews are here', {'entities': [[4, 12, 'GADGET']]}],
 ['Your iPhone goes up to 11 today', {'entities': [[5, 11, 'GADGET']]}],
 ['I need a new phone! Any tips?', {'entities': []}]]

In [39]:

# Initialise the pipeline for training
nlp.begin_training()

# Make 10 passes over the training examples
for itn in range(10):
    # New random order for every pass
    random.shuffle(TRAINING_DATA)
    # Fresh loss dict for this pass
    losses = {}

    # Process the examples two at a time
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [txt for txt, _ in batch]
        annotations = [annot for _, annot in batch]

        # Train on this batch, recording the loss in `losses`
        nlp.update(texts, annotations, losses=losses)
        # Printed after every batch; the captured output suggests the value
        # is a running total within each pass rather than a per-batch loss
        print(losses)

{'ner': 8.800000131130219}
{'ner': 23.329503118991852}
{'ner': 31.883775174617767}
{'ner': 8.860789120197296}
{'ner': 14.099116742610931}
{'ner': 18.96625143289566}
{'ner': 2.5280871465802193}
{'ner': 5.53269352670759}
{'ner': 6.658698276849464}
{'ner': 1.658433779492043}
{'ner': 3.4622879540766007}
{'ner': 6.737643931373896}
{'ner': 1.8413473030086607}
{'ner': 5.3009455972205615}
{'ner': 7.74266391475976}
{'ner': 2.312983350129798}
{'ner': 3.1025084856372587}
{'ner': 5.109129903692406}
{'ner': 0.9868378606624901}
{'ner': 1.2804984671300161}
{'ner': 4.070186218576779}
{'ner': 1.2328534185362514}
{'ner': 2.665048535258393}
{'ner': 2.771152643030007}
{'ner': 0.009034109793276457}
{'ner': 0.01292174999476714}
{'ner': 0.622420611123184}
{'ner': 0.00043256945906278155}
{'ner': 3.7201857728623136}
{'ner': 3.722092547046193}
