# 1. Training and updating models

In [3]:
import spacy
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

with open('spaCy/exercises/iphone.json') as f:
    TEXTS = json.loads(f.read())

In [2]:
TEXTS

['How to preorder the iPhone X',
 'iPhone X is coming',
 'Should I pay $1,000 for the iPhone X?',
 'The iPhone 8 reviews are here',
 'Your iPhone goes up to 11 today',
 'I need a new phone! Any tips?']

In [7]:
nlp = English()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match 'iphone' and 'x'
pattern1 = [{'LOWER':'iphone'}, {'LOWER':'x'}]

# Token whose lowercase form matches 'iphone' and an optional digit
pattern2 = [{'LOWER':'iphone'}, {'IS_DIGIT':True, 'OP':'?'}] 

# Add patterns to the matcher
matcher.add('GADGET', None, pattern1, pattern2)



In [30]:
TRAINING_DATA = []

for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, 'GADGET') for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {'entities':entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)
    
print(*TRAINING_DATA, sep='\n')

('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET'), (20, 26, 'GADGET')]})
('iPhone X is coming', {'entities': [(0, 8, 'GADGET'), (0, 6, 'GADGET')]})
('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET'), (28, 34, 'GADGET')]})
('The iPhone 8 reviews are here', {'entities': [(4, 12, 'GADGET')]})
('Your iPhone goes up to 11 today', {'entities': [(5, 11, 'GADGET')]})
('I need a new phone! Any tips?', {'entities': []})


# 5. The training loop

In [4]:
TRAINING_DATA = [
    ("How to preorder the iPhone X", {'entities': [(20, 28, 'GADGET')]})
]

In [5]:
import random

In [8]:
# Loop for 10 iterations
for i in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    # Create batches and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA):
        # Split the batch in texts and annotations
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        # Update the model
        nlp.update(texts, annotations)
        
# Save the model
#nlp.to_disk(path_to_model)

In [9]:
import spacy
import random
import json

with open("spaCy/exercises/gadgets.json") as f:
    TRAINING_DATA = json.loads(f.read())

nlp = spacy.blank("en") # tokenizer
ner = nlp.create_pipe("ner") # entitiy recognizer 를 pipe에 추가!
nlp.add_pipe(ner)
ner.add_label("GADGET")


In [10]:
TRAINING_DATA

[['How to preorder the iPhone X', {'entities': [[20, 28, 'GADGET']]}],
 ['iPhone X is coming', {'entities': [[0, 8, 'GADGET']]}],
 ['Should I pay $1,000 for the iPhone X?', {'entities': [[28, 36, 'GADGET']]}],
 ['The iPhone 8 reviews are here', {'entities': [[4, 12, 'GADGET']]}],
 ['Your iPhone goes up to 11 today', {'entities': [[5, 11, 'GADGET']]}],
 ['I need a new phone! Any tips?', {'entities': []}]]

In [14]:

# Start the training
nlp.begin_training()

# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}

    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]

        # Update the model
        nlp.update(texts, annotations, losses=losses)
        print(losses)

{'ner': 7.760874032974243}
{'ner': 12.152697265148163}
{'ner': 15.706751629710197}
{'ner': 1.812375183799304}
{'ner': 6.4459035091567785}
{'ner': 9.585161367780529}
{'ner': 2.1559822793351486}
{'ner': 4.640617244294845}
{'ner': 8.396100066951476}
{'ner': 3.2555352989584208}
{'ner': 4.697562226559967}
{'ner': 8.055177561473101}
{'ner': 0.9545068128209095}
{'ner': 2.9374474006181117}
{'ner': 4.395142194494838}
{'ner': 0.6991139204328647}
{'ner': 2.0966594541823724}
{'ner': 2.254309345508638}
{'ner': 0.07469282332397142}
{'ner': 1.3934085878148608}
{'ner': 1.3940566562108714}
{'ner': 0.00022648402086744568}
{'ner': 0.0012043443527964826}
{'ner': 0.8097342398529399}
{'ner': 0.0003391681372058031}
{'ner': 0.0005922831563225373}
{'ner': 2.2507665140497672}
{'ner': 8.111711790181886e-05}
{'ner': 8.231224039434437e-05}
{'ner': 2.074323022602053}
