In [1]:
import joblib
import pathlib
import os

from refinery import Client
from embedders.extraction.contextual import TransformerTokenEmbedder
from sequencelearn.sequence_tagger import CRFTagger

import truss

In [2]:
# Log in using the credentials stored in secrets.json.
# If you run the application locally, you also need to include the uri http://localhost:4455
credentials_file = "secrets.json"
client = Client.from_secrets_file(credentials_file)

[38;5;4mℹ Connecting to https://app.kern.ai[0m
[38;5;2m✔ Logged in to system.[0m


In [3]:
# Export the records of the project into a DataFrame.
# tokenize=False is passed explicitly: with tokenize=True (the default), the
# project-specific spaCy tokenizer will process your textual data.
df = client.get_record_export(
    tokenize=False,
)

In [4]:
# Keep only the rows whose "__seen__MANUAL" column equals "yes",
# i.e. the records that carry manual labels.
has_manual_label = df["__seen__MANUAL"] == "yes"
df_yes = df.loc[has_manual_label]

In [5]:
# Should you get a CUDA error, you can block GPU usage with
# os.environ['CUDA_VISIBLE_DEVICES'] = ""

In [6]:
# If you don't have any spaCy models on your machine, you can download them like this:
#!python -m spacy download en_core_web_sm

In [7]:
# embedders converts raw text into token-level embeddings.
# The transformer model is downloaded automatically if it is not available yet.
embedder = TransformerTokenEmbedder(
    "distilbert-base-uncased",  # transformer checkpoint
    "en_core_web_sm",  # spaCy model (see the download hint above)
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Extract the corpus and the labels from the manually labeled records
corpus = df_yes['headline']
labels = df_yes['headline__entities__MANUAL']

# Reuse the `embedder` created in the previous cell. Instantiating a second,
# identical TransformerTokenEmbedder here would only load the same model again.

# contains a list of ragged shape [num_texts, num_tokens (text-specific), embedding_dimension]
embeddings = embedder.transform(corpus)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initializing model, might take some time...


  yield documents[idx : min(idx + batch_size, length)]
Encoding batches ...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:11<00:00,  5.87s/it]


In [9]:
# Set up the CRF tagger and train it on the token embeddings and manual labels.
# NOTE(review): no seed is passed, and the training log reports a different
# random_seed per run - consider fixing one for reproducibility (confirm the
# parameter name against the CRFTagger API).
tagger = CRFTagger(
    num_epochs=500,
    learning_rate=0.00001,
    verbose=True,
)
tagger.fit(embeddings, labels)

Settings for training:
num_epochs     500
learning_rate  1e-05
momentum       0.9
random_seed    727012968934355971
Epoch 1/500. Loss inf
Epoch 11/500. Loss 2289.665771484375
Epoch 21/500. Loss 779.0813598632812
Epoch 31/500. Loss 460.1942138671875
Epoch 41/500. Loss 340.2244873046875
Epoch 51/500. Loss 255.57073974609375
Epoch 61/500. Loss 198.02032470703125
Epoch 71/500. Loss 154.21685791015625
Epoch 81/500. Loss 118.7591552734375
Epoch 91/500. Loss 92.01459503173828
Epoch 101/500. Loss 72.25796508789062
Epoch 111/500. Loss 57.7586669921875
Epoch 121/500. Loss 47.1136474609375
Epoch 131/500. Loss 39.15240478515625
Epoch 141/500. Loss 33.07228088378906
Epoch 151/500. Loss 28.351837158203125
Epoch 161/500. Loss 24.662490844726562
Epoch 171/500. Loss 21.69512939453125
Epoch 181/500. Loss 19.28741455078125
Epoch 191/500. Loss 17.339141845703125
Epoch 201/500. Loss 15.697921752929688
Epoch 211/500. Loss 14.354385375976562
Epoch 221/500. Loss 13.180465698242188
Epoch 231/500. Loss 12.20492

In [10]:
# Sanity check: embed one example sentence and let the trained tagger label it
sample_text = "Barack Obama is the president of the United States"
embs = embedder.transform([sample_text])
tagger.predict(embs)

Initializing model, might take some time...


Encoding batches ...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.05it/s]


[['B-people',
  'I-people',
  'OUTSIDE',
  'OUTSIDE',
  'OUTSIDE',
  'OUTSIDE',
  'OUTSIDE',
  'B-GPE',
  'I-GPE']]

In [11]:
# Store the trained tagger in the data folder of our Truss.
# Replace "ner-truss" if your truss is named differently.
# parent_path is the resolved (absolute) parent of the relative notebook path.
parent_path = pathlib.Path('ner-deployment.ipynb').parent.resolve()
save_path = str(parent_path / "ner-truss" / "data" / "model.joblib")
joblib.dump(tagger, save_path)

['/Users/jhoetter/repos/demo-ner-ml-week/ner-truss/data/model.joblib']

In [12]:
# Absolute path of the directory the truss is saved in.
# Change truss_name if your truss is named differently.
truss_name = "ner-truss"
truss_parent = pathlib.Path(truss_name).parent.resolve()
truss_path = os.path.join(truss_parent, truss_name)

# Load the truss from the path computed above instead of repeating the
# directory name, so truss_name is the single place to adjust.
tr = truss.from_directory(truss_path)

In [13]:
# Embed a new sentence and request predictions for it from the truss
sentence = ["Facebook is not operating in Russia anymore"]
request_body = {"inputs": embedder.transform(sentence)}
predictions = tr.server_predict(request_body)

Initializing model, might take some time...


Encoding batches ...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.46it/s]


In [14]:
# Print out the entities we found in the sentence
# (the prediction result returned by the truss in the previous cell)
print(predictions)

{'predictions': [['B-organization', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'B-GPE', 'OUTSIDE']]}


In [15]:
# Call the running NER model from Azure container instance
import json
import requests

# Prediction endpoint of the deployed container; adjust to your own deployment
endpoint_url = "http://20.166.114.59:8080/v1/models/model:predict"

# Serialize the request body once, in memory. json.dumps emits a single line
# unless `indent` is set, so no newline stripping is required afterwards.
input_dict = {"inputs": embedder.transform(sentence)}
data = json.dumps(input_dict)

# Still write the payload to data.json, as this cell did before
with open("data.json", "w") as fp:
    fp.write(data)

headers = {
    "Content-Type": "application/json"
}

# Without a timeout, requests waits indefinitely if the container is unreachable
response = requests.post(endpoint_url, headers=headers, data=data, timeout=30)
print(response.text)

# Raise on 4xx/5xx (after printing the body) so a failed call is not mistaken
# for a successful prediction
response.raise_for_status()

Initializing model, might take some time...


Encoding batches ...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.82it/s]


{"predictions": [["B-organization", "OUTSIDE", "OUTSIDE", "OUTSIDE", "OUTSIDE", "B-GPE", "OUTSIDE"]]}
