In [5]:
import pandas as pd
import spacy
import pickle
import time
import random
import warnings
warnings.filterwarnings('ignore')

In [24]:
def reformat_tagged_data(data):
    output = []
    for item in data:
        new_data = (item[0],{"entities":item[1]})
        output.append(new_data)
    # Now we have reformatted TRAIN_DATA ready for model input
    return output

def test_model(nlp_model,examples_to_test):
    print("Testing {} examples".format(len(examples_to_test)))
    total = 0
    correct = 0
    completely_correct = 0
    for example in examples_to_test:
        doc = nlp_model(example[0])
        
        actual_positions = [(x[0],x[1]) for x in example[1]["entities"]]
        predictions = [str(x) for x in list(doc.ents)]
        actual_characters = [str(doc)[a:b] for (a,b) in actual_positions]
        initial_predictions = predictions
        
        for char in actual_characters:
            if char in predictions:
                correct += 1
                predictions.remove(char)
            total += 1
        
        if len(predictions) == 0:
            completely_correct += 1
        
#         print("Sentence: {}".format(doc))
#         print("Predictions: {}".format(initial_predictions))
#         print("Actual: {}".format(actual_characters))
#         print()
    print("{}% characters correctly tagged".format(round(100*(correct/total),0)))
    print("{}% sentences perfectly tagged".format(round(100*(completely_correct/len(examples_to_test)),0)))
    
def debug_n_predictions(nlp,TEST_DATA,n):
    for i in range(n):
        example = TEST_DATA[i]
        doc = nlp(example[0])
        
        actual_positions = [(x[0],x[1]) for x in example[1]["entities"]]
        predictions = [str(x) for x in list(doc.ents)]
        actual_characters = [str(doc)[a:b] for (a,b) in actual_positions]
    
        print("Sentence: {}".format(doc))
        print("Predictions: {}".format(sorted(predictions)))
        print("Actual:      {}".format(sorted(actual_characters)))
        print()

# 100 Training+Test, drop=0.5

In [22]:
train_data = pickle.load(open("data/1_train_100",'rb'))
TRAIN_DATA = reformat_tagged_data(train_data)
test_data = pickle.load(open("data/1_test_100",'rb'))
TEST_DATA = reformat_tagged_data(test_data)

In [23]:
nlp = spacy.load("en_core_web_sm")
optimizer = nlp.begin_training()
t1 = time.time()
for i in range(5):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], drop=0.5, sgd=optimizer)
t2 = time.time()
print("Training Took {} seconds".format(round(t2-t1,2)))
nlp.to_disk("C:/Users/jmarchant/Python/CDW/models/model1")

Training Took 1629.24 seconds


In [25]:
nlp = spacy.load("C:/Users/jmarchant/Python/CDW/models/model1")
test_model(nlp,TRAIN_DATA)

Testing 842 examples
91.0% characters correctly tagged
86.0% sentences perfectly tagged


In [26]:
test_model(nlp,TEST_DATA)

Testing 173 examples
63.0% characters correctly tagged
52.0% sentences perfectly tagged


In [27]:
debug_n_predictions(nlp,TEST_DATA,10)

Sentence: Twenty-three days after Thanos used the Infinity Gauntlet to disintegrate half of all life in the universe, Carol Danvers rescues Tony Stark and Nebula from deep space and returns them to Earth, where they reunite with the remaining Avengers—Bruce Banner, Steve Rogers, Thor, Natasha Romanoff, and James Rhodes—and Rocket
Predictions: ['Bruce', 'Carol', 'James', 'Natasha', 'Nebula', 'Rocket', 'Steve', 'Thanos', 'Thor', 'Tony']
Actual:      ['Bruce', 'Carol', 'James', 'Natasha', 'Nebula', 'Rocket', 'Steve', 'Thanos', 'Thor', 'Tony']

Sentence: Locating Thanos on an uninhabited planet, they plan to use the Infinity Stones to reverse "the Snap", but Thanos reveals he destroyed the Stones to prevent further use
Predictions: ['Thanos', 'Thanos']
Actual:      ['Thanos', 'Thanos']

Sentence: Enraged, Thor decapitates Thanos
Predictions: ['Thanos', 'Thor']
Actual:      ['Thanos', 'Thor']

Sentence: Five years later, Scott Lang escapes from the quantum realm
Predictions: ['Lang']
Actual

# 100 Training+Test, drop=0.5, fix find_overlapping_tags(), 5 shuffles

In [28]:
train_data_3 = pickle.load(open("data/3_train_100",'rb'))
TRAIN_DATA_3 = reformat_tagged_data(train_data_3)
test_data_3 = pickle.load(open("data/3_test_100",'rb'))
TEST_DATA_3 = reformat_tagged_data(test_data_3)

In [31]:
nlp3 = spacy.load("en_core_web_sm")
optimizer = nlp3.begin_training()
t1 = time.time()
for i in range(5):
    random.shuffle(TRAIN_DATA_3)
    for text, annotations in TRAIN_DATA_3:
        nlp3.update([text], [annotations], drop=0.5, sgd=optimizer)
t2 = time.time()
print("Training Took {} seconds".format(round(t2-t1,2)))
nlp3.to_disk("C:/Users/jmarchant/Python/CDW/models/model3")

Training Took 1650.55 seconds


In [32]:
nlp3 = spacy.load("C:/Users/jmarchant/Python/CDW/models/model3")
test_model(nlp3,TRAIN_DATA_3)

Testing 842 examples
91.0% characters correctly tagged
85.0% sentences perfectly tagged


In [33]:
test_model(nlp3,TEST_DATA_3)

Testing 174 examples
65.0% characters correctly tagged
53.0% sentences perfectly tagged


In [34]:
debug_n_predictions(nlp3,TEST_DATA_3,10)

Sentence: Twenty-three days after Thanos used the Infinity Gauntlet to kill half of all life in the universe, Carol Danvers rescues Tony Stark and Nebula from deep space and they reunite with the remaining Avengers—Bruce Banner, Steve Rogers, Thor, Natasha Romanoff, and James Rhodes—and Rocket on Earth
Predictions: ['Bruce', 'Carol', 'James', 'Nebula', 'Rocket', 'Steve', 'Thanos', 'Thor', 'Tony']
Actual:      ['Bruce', 'Carol', 'James', 'Natasha', 'Nebula', 'Rocket', 'Steve', 'Thanos', 'Thor', 'Tony']

Sentence: Locating Thanos on an uninhabited planet, they plan to use the Infinity Stones to reverse "the Snap", but Thanos destroyed the Stones to prevent further use
Predictions: ['Snap', 'Thanos', 'Thanos']
Actual:      ['Thanos', 'Thanos']

Sentence: Enraged, Thor decapitates Thanos
Predictions: ['Thanos', 'Thor']
Actual:      ['Thanos', 'Thor']

Sentence: Five years later, Scott Lang escapes from the quantum realm
Predictions: ['Lang']
Actual:      ['Scott']

Sentence: Theorizing the

# 10 shuffles

In [35]:
# Uses same TRAIN and TEST data

In [36]:
nlp4 = spacy.load("en_core_web_sm")
optimizer = nlp4.begin_training()
t1 = time.time()
for i in range(10):
    random.shuffle(TRAIN_DATA_3)
    for text, annotations in TRAIN_DATA_3:
        nlp4.update([text], [annotations], drop=0.5, sgd=optimizer)
t2 = time.time()
print("Training Took {} seconds".format(round(t2-t1,2)))
nlp4.to_disk("C:/Users/jmarchant/Python/CDW/models/model4")

Training Took 3844.69 seconds


In [37]:
nlp4 = spacy.load("C:/Users/jmarchant/Python/CDW/models/model4")
test_model(nlp4,TRAIN_DATA_3)

Testing 842 examples
91.0% characters correctly tagged
91.0% sentences perfectly tagged


In [38]:
test_model(nlp4,TEST_DATA_3)

Testing 174 examples
47.0% characters correctly tagged
69.0% sentences perfectly tagged


In [39]:
debug_n_predictions(nlp4,TEST_DATA_3,10)

Sentence: Twenty-three days after Thanos used the Infinity Gauntlet to kill half of all life in the universe, Carol Danvers rescues Tony Stark and Nebula from deep space and they reunite with the remaining Avengers—Bruce Banner, Steve Rogers, Thor, Natasha Romanoff, and James Rhodes—and Rocket on Earth
Predictions: ['Bruce', 'Carol', 'James', 'Nebula', 'Rocket', 'Steve', 'Thanos', 'Thor']
Actual:      ['Bruce', 'Carol', 'James', 'Natasha', 'Nebula', 'Rocket', 'Steve', 'Thanos', 'Thor', 'Tony']

Sentence: Locating Thanos on an uninhabited planet, they plan to use the Infinity Stones to reverse "the Snap", but Thanos destroyed the Stones to prevent further use
Predictions: ['Thanos', 'Thanos']
Actual:      ['Thanos', 'Thanos']

Sentence: Enraged, Thor decapitates Thanos
Predictions: ['Thanos', 'Thor']
Actual:      ['Thanos', 'Thor']

Sentence: Five years later, Scott Lang escapes from the quantum realm
Predictions: ['Lang']
Actual:      ['Scott']

Sentence: Theorizing the quantum realm c

# 5 shuffles, drop = 0.25

In [40]:
nlp5 = spacy.load("en_core_web_sm")
optimizer = nlp5.begin_training()
t1 = time.time()
for i in range(5):
    random.shuffle(TRAIN_DATA_3)
    for text, annotations in TRAIN_DATA_3:
        nlp5.update([text], [annotations], drop=0.25, sgd=optimizer)
t2 = time.time()
print("Training Took {} seconds".format(round(t2-t1,2)))
nlp5.to_disk("C:/Users/jmarchant/Python/CDW/models/model5")

Training Took 1721.67 seconds


In [41]:
nlp5 = spacy.load("C:/Users/jmarchant/Python/CDW/models/model5")
test_model(nlp5,TRAIN_DATA_3)

Testing 842 examples
91.0% characters correctly tagged
88.0% sentences perfectly tagged


In [42]:
test_model(nlp5,TEST_DATA_3)

Testing 174 examples
57.0% characters correctly tagged
60.0% sentences perfectly tagged


In [43]:
debug_n_predictions(nlp5,TEST_DATA_3,10)

Sentence: Twenty-three days after Thanos used the Infinity Gauntlet to kill half of all life in the universe, Carol Danvers rescues Tony Stark and Nebula from deep space and they reunite with the remaining Avengers—Bruce Banner, Steve Rogers, Thor, Natasha Romanoff, and James Rhodes—and Rocket on Earth
Predictions: ['Bruce', 'Carol', 'James', 'Nebula', 'Rocket', 'Thanos', 'Thor', 'Tony']
Actual:      ['Bruce', 'Carol', 'James', 'Natasha', 'Nebula', 'Rocket', 'Steve', 'Thanos', 'Thor', 'Tony']

Sentence: Locating Thanos on an uninhabited planet, they plan to use the Infinity Stones to reverse "the Snap", but Thanos destroyed the Stones to prevent further use
Predictions: ['Thanos', 'Thanos']
Actual:      ['Thanos', 'Thanos']

Sentence: Enraged, Thor decapitates Thanos
Predictions: ['Thanos', 'Thor']
Actual:      ['Thanos', 'Thor']

Sentence: Five years later, Scott Lang escapes from the quantum realm
Predictions: ['Lang']
Actual:      ['Scott']

Sentence: Theorizing the quantum realm co