In [None]:
from __future__ import unicode_literals, print_function
import pandas as pd
import mlflow
import pickle
import plac
import random
import os
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse
import spacy
from spacy.matcher import Matcher

In [2]:
notebook_out = None
artefacts_temp_dir = None
experiment_name = None
mode = 'local'
payload = {
    'model': 'de_core_news_md' # 'de_core_news_md' 'en_core_web_sm'
}

In [3]:
# For Collab Runs
if mode == 'google_collab': 
    import importlib
    if importlib.util.find_spec('mlflow') is None:
      !pip install mlflow
    if importlib.util.find_spec('papermill') is None:
      !pip install papermill
    
    # Run these commmands to get different language sets
    # You must restart runtime to be able to do a subsequent load with spacy
    !python -m spacy download en_core_web_sm
    !python -m spacy download de_core_news_md
    
    # TODO: Develop BERT 
    #!pip install spacy-transformers
    #!python -m spacy download de_trf_bertbasecased_lg
    #!pip install spacy-transformers
    #!python -m spacy download en_trf_bertbasecased_lg

    from google.colab import drive
    drive.mount('/content/gdrive')
    os.chdir('/content/gdrive')
    !cd /content/gdrive
    !ls

In [4]:
# Set Experiment
if (experiment_name): mlflow.set_experiment(experiment_name)

In [15]:
# Loads the Language Model
model_types = [
    'de_core_news_md',              # Medium German Core
    'de_trf_bertbasecased_lg',      # German BERT
    'en_core_web_sm',               # Small English Core
    'en_core_web_md',               # Medium English Core
    'en_core_web_lg',               # Medium English Core
    'en_trf_bertbaseuncased_lg',    # English BERT
    'fr_core_news_md',              # Medium French Core
    'pt_core_news_sm',              # Small Portuguese Core
]

# Validate Choice
if payload['model'] not in model_types: 
    raise("Invalid model type selected.")

# Loads selected language model
# nlp = spacy.load(payload['model'])

# Example Models
#nlp_de = spacy.load('de_core_news_md')
nlp_en = spacy.load('en_core_web_sm')

In [6]:
def pretty_wrap(func):
    def wrapper(*args, **kwargs):
        print('-'*70)
        func(*args, **kwargs)
        print('-'*70)
    return(wrapper)

@pretty_wrap        
def print_predict(nlp_model, sentences):
    
    docs = [nlp_model(s) for s in sentences]

    for d in docs:
        for ent in d.ents:
            print({
                'sentence': d,
                'entity': ent.text,
                'type': ent.label_,
                'start_ind': ent.start_char,
                'end_ind': ent.end_char
            })

In [None]:
# NER German Examples

sentences = [
    "Ich wohne in Berlin.",
    "Ich studiere an der TUM.",
    "Ich bin nicht Angela Merkel.",
    "Mieten Sie ein 2019 smart EQ fortwo Coupé für nur 139 EUR / Monat."
]

print_predict(nlp_de, sentences)


In [16]:
# NER English Examples

sentences = [
    "I am living in Colorado.",
    "In 2019, my rent was $1000 per month.",
    "I really miss Obamba.",
    "This year, I will get a Tesla."
]

print_predict(nlp_en, sentences)

----------------------------------------------------------------------
{'sentence': I am living in Colorado., 'entity': 'Colorado', 'type': 'GPE', 'start_ind': 15, 'end_ind': 23}
{'sentence': In 2019, my rent was $1000 per month., 'entity': '2019', 'type': 'DATE', 'start_ind': 3, 'end_ind': 7}
{'sentence': In 2019, my rent was $1000 per month., 'entity': '1000', 'type': 'MONEY', 'start_ind': 22, 'end_ind': 26}
{'sentence': I really miss Obamba., 'entity': 'Obamba', 'type': 'GPE', 'start_ind': 14, 'end_ind': 20}
{'sentence': This year, I will get a Tesla., 'entity': 'This year', 'type': 'DATE', 'start_ind': 0, 'end_ind': 9}
{'sentence': This year, I will get a Tesla., 'entity': 'Tesla', 'type': 'WORK_OF_ART', 'start_ind': 24, 'end_ind': 29}
----------------------------------------------------------------------


In [8]:
# Custom Entity Training (Option 1: Manual)

training_data = [      
    ("Lease a 2019 smart EQ fortwo coupe for as little as $139/month", 
     {"entities": [(19, 27, "custom_entity")]}
    )
]

print([i[1]['entities'] for i in training_data])

[[(19, 27, 'custom_entity')]]


In [9]:
# Custom Entity Training (Option 2: Parse)

texts = [
    "Lease a 2019 smart EQ fortwo coupe.",
    "You can lease a 2019 smart EQ fortwo cabrio for as little as $199/month.",
    "For navigating your city, or escaping it altogether, the 2019 smart EQ fortwo features a high-tech interior.",
    "I Love the smart EQ fortwo car, it's amazing.",
    "Reviews of the smart EQ fortwo have been phenomenal! Smart EQ fortwo cars are a hit."
    "Fortwo?",
    "Fortwo is for everyone.",
    "I am thinking of buying a fortwo coupe."
    "How much does a fortwo coupe cost?",
    "At signing, how much will be due for a new fortwo?",
    "Is fortwo the best car from Smart?",
    "Fortwo is a type of Smart car."
    "The fortwo model is sick!"   
]

# Run the Parser
matcher = Matcher(nlp_en.vocab)
pattern1 = [{"LOWER": "fortwo"}]
matcher.add("MODEL", None, pattern1)
new_labels = ['MODEL']
training_data = []

for text in texts:
    doc = nlp_en(text)
    matches = matcher(doc)
    entities = []
    for match_id, start, end in matches:
        string_id = nlp_en.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        #print(span.start_char, span.end_char)
        entities.append((span.start_char, span.end_char, string_id))
        #print(string_id, start, end, span.text)
        training_data.append((text, {'entities': entities}))

print([i[1]['entities'] for i in training_data])

[[(22, 28, 'MODEL')], [(30, 36, 'MODEL')], [(71, 77, 'MODEL')], [(20, 26, 'MODEL')], [(24, 30, 'MODEL'), (62, 68, 'MODEL'), (84, 90, 'MODEL')], [(24, 30, 'MODEL'), (62, 68, 'MODEL'), (84, 90, 'MODEL')], [(24, 30, 'MODEL'), (62, 68, 'MODEL'), (84, 90, 'MODEL')], [(0, 6, 'MODEL')], [(26, 32, 'MODEL'), (55, 61, 'MODEL')], [(26, 32, 'MODEL'), (55, 61, 'MODEL')], [(43, 49, 'MODEL')], [(3, 9, 'MODEL')], [(0, 6, 'MODEL'), (34, 40, 'MODEL')], [(0, 6, 'MODEL'), (34, 40, 'MODEL')]]


In [44]:
# Training Routine, for Training spaCy Models
def train_model(
        nlp_model, 
        training_data, 
        revision_data = None, 
        labels = None,
        output_dir = None, 
        n_iter = 30,
        dropout = 0.35
    ):
    
    """ Set up the pipeline and entity recognizer, and train the new entity. """

    nlp = nlp_model
    ner = nlp_model.get_pipe("ner")

    # Add new entity labels to NER model
    if labels:
        for i in labels:
            ner.add_label(i)  

    optimizer = nlp.resume_training()
    move_names = list(ner.move_names)

    # Get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        
        
        # Include Revision data if given
        if revision_data is None:
            examples = training_data
        else:
            print('Applying Revisions...')
            examples = revision_data + training_data

        # n_iter is number of epochs
        for itn in range(n_iter): 
            
            print("{} of {}".format(itn + 1, n_iter))
            # Shuffle new each time
            random.shuffle(examples)

            # Batch up the examples using spaCy's minibatch
            batches = minibatch(examples, size=compounding(1.0, 4.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts, 
                    annotations, 
                    sgd=optimizer, 
                    drop=dropout, 
                    losses=losses
                )
                
            #print("Losses", losses)
            
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        
            
    return (nlp)    

In [45]:
#from copy import deepcopy
#nlp_train = deepcopy(nlp_en)

# Train the Model
nlp_model = train_model(
    nlp_en, 
    training_data, 
    labels = ['MODEL'],
    output_dir = './test_model'
)

1 of 30
2 of 30
3 of 30
4 of 30
5 of 30
6 of 30
7 of 30
8 of 30
9 of 30
10 of 30
11 of 30
12 of 30
13 of 30
14 of 30
15 of 30
16 of 30
17 of 30
18 of 30
19 of 30
20 of 30
21 of 30
22 of 30
23 of 30
24 of 30
25 of 30
26 of 30
27 of 30
28 of 30
29 of 30
30 of 30
Saved model to test_model


In [48]:
# Load model from disk and test
test = spacy.load("./test_model")
doc = test("How do I lease a fortwo coupe?")
for ent in doc.ents:
    print(doc, ent.start, ent.end, ent.text, ent.label_)
    
# Use one in memory
doc = nlp_model("How do I lease a fortwo coupe?")
for ent in doc.ents:
    print(doc, ent.start, ent.end, ent.text, ent.label_)
    
    

How do I lease a fortwo coupe? 5 6 fortwo MODEL
How do I lease a fortwo coupe? 5 6 fortwo MODEL


In [49]:
# Revision Data is needed when training with new labels!
# See: Catestrophic forgetting resolution
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting

# Make a copy for revision
nlp_en_revised = spacy.load("en_core_web_sm")

# Add lots of texts here for training!
revision_texts = [
  "I am really smart."
]

revision_data = []
for doc in nlp_en_revised.pipe(revision_texts):
    tags = [w.tag_ for w in doc]
    heads = [w.head.i for w in doc]
    deps = [w.dep_ for w in doc]
    entities = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
    revision_data.append((doc, GoldParse(doc, tags=tags, heads=heads,
                                         deps=deps, entities=entities)))

In [50]:
nlp_model_2 = train_model(
    nlp_en, 
    training_data,  
    revision_data = revision_data, 
    labels = new_labels,
)

Applying Revisions...
1 of 30
2 of 30
3 of 30
4 of 30
5 of 30
6 of 30
7 of 30
8 of 30
9 of 30
10 of 30
11 of 30
12 of 30
13 of 30
14 of 30
15 of 30
16 of 30
17 of 30
18 of 30
19 of 30
20 of 30
21 of 30
22 of 30
23 of 30
24 of 30
25 of 30
26 of 30
27 of 30
28 of 30
29 of 30
30 of 30


In [51]:
doc = nlp_model_2("How do I lease a fortwo coupe?")
for ent in doc.ents:
    print('->', ent.text, ent.label_)


-> fortwo MODEL
