In [None]:
# Print the Python version of the current runtime.
!python --version

Python 3.7.13


In [None]:
# Install the notebook-import helper package.
!pip install import-ipynb
# Install or upgrade (-U) PyDrive with reduced pip output (-q).
!pip install -U -q PyDrive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting import-ipynb
  Downloading import_ipynb-0.1.4-py3-none-any.whl (4.1 kB)
Installing collected packages: import-ipynb
Successfully installed import-ipynb-0.1.4


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 32.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 7.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unin

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive; drive.mount('/content/drive')

import pdb
import import_ipynb
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, shadowing the `google.colab.drive` module
# imported on the first line of this cell.
drive = GoogleDrive(gauth)

Mounted at /content/drive


#### **Data**

In [None]:
# Project locations; everything is derived from the NER project root.
MAIN_PATH = "./NLP/NER"
MODEL_PATH = MAIN_PATH + "/models"
DATA_PATH_MUNDI = MAIN_PATH + "/data/raw/classic_mundi.csv"

In [None]:
import pandas as pd
from itertools import chain
from collections import OrderedDict

In [None]:
# Load the raw CSV located at DATA_PATH_MUNDI into a DataFrame.
data = pd.read_csv(DATA_PATH_MUNDI)

In [None]:
# Show column names, dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47959 entries, 0 to 47958
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    47959 non-null  object
 1   labels  47959 non-null  object
dtypes: object(2)
memory usage: 749.5+ KB


In [None]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,text,labels
0,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...
2,Helicopter gunships Saturday pounded militant ...,O O B-tim O O O O O B-geo O O O O O B-org O O ...
3,They left after a tense hour-long standoff wit...,O O O O O O O O O O O
4,U.N. relief coordinator Jan Egeland said Sunda...,B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...


In [None]:
# Summary statistics per column (count / unique / top / freq for object columns).
data.describe()

Unnamed: 0,text,labels
count,47959,47959
unique,47575,33318
top,VOA 's Mil Arcega reports .,O O O O O O O O O O O
freq,17,450


In [None]:
# One sentence string per row, in row order.
texts = data["text"].tolist()
print(texts[0])

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .


In [None]:
# Turn each whitespace-separated tag string into a list of tag names.
labels = []
for tag_string in data["labels"].tolist():
    labels.append(tag_string.split())
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [None]:
# Alphabetically sorted list of every distinct tag name in the corpus.
distinct_tags = {tag for sentence_tags in labels for tag in sentence_tags}
unique_labels = sorted(distinct_tags)
print(unique_labels)

['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [None]:
# Map every tag name to an integer id. Id 0 is reserved for "PAD"; the real
# tags are numbered from 1 in the (sorted) order of `unique_labels`.
labels2id = {"PAD": 0}

# setdefault keeps the reserved "PAD" -> 0 entry even if a tag were named "PAD".
for idx, label in enumerate(unique_labels, start=1):
  labels2id.setdefault(label, idx)

# The former `OrderedDict(sorted(labels2id.items()))` statement was removed:
# its result was discarded, so it never affected `labels2id`.

print(labels2id)

{'PAD': 0, 'B-art': 1, 'B-eve': 2, 'B-geo': 3, 'B-gpe': 4, 'B-nat': 5, 'B-org': 6, 'B-per': 7, 'B-tim': 8, 'I-art': 9, 'I-eve': 10, 'I-geo': 11, 'I-gpe': 12, 'I-nat': 13, 'I-org': 14, 'I-per': 15, 'I-tim': 16, 'O': 17}


In [None]:
from sklearn.model_selection import train_test_split

# Uncomment to iterate on a 10k-sentence subset instead of the full corpus.
# texts_sample = texts[0:10000]
# labels_sample = labels[0:10000]

texts_sample = texts
labels_sample = labels

# First split: hold out 20% as a test set. This holdout used to be unpacked
# into val_texts/val_labels and was then immediately overwritten by the second
# split, so it was silently thrown away; it now keeps its own names.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts_sample, labels_sample, test_size=0.2, random_state=0)

# Second split: carve 1% of the remaining training data out for validation.
# train_* and val_* end up with exactly the same contents as before.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.01, random_state=0)

# The second line used to be labelled "test_texts" while printing the
# validation size; each split is now reported under its own name.
print("train_texts:", len(train_texts))
print("val_texts:", len(val_texts))
print("test_texts:", len(test_texts))

train_texts: 37983
test_texts: 384


#### **Pretrained model loading**

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from transformers import AutoConfig, AutoTokenizer, TFAutoModel

In [None]:
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"
# One output class per real tag plus one extra slot.
NUM_LABELS = 1 + len(unique_labels)
MAX_LENGTH = 64 # TODO: check length

In [None]:
# Checkpoint configuration, overridden with this task's label count and with
# hidden-state output enabled.
config_overrides = dict(num_labels=NUM_LABELS, output_hidden_states=True)
config = AutoConfig.from_pretrained(PRE_TRAINED_MODEL_NAME, **config_overrides)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint.
# NOTE(review): max_length / padding / truncation are passed again explicitly
# on every tokenizer(...) call in this notebook; confirm whether supplying them
# to from_pretrained has any effect.
tokenizer = AutoTokenizer.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    max_length=MAX_LENGTH,
    padding="max_length",
    truncation=True,
    )

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

#### **Preprocessing**

In [None]:
def tokenize_and_align_labels(texts, labels):
    """Tokenize sentences and align their word-level tags to sub-word tokens.

    Follows the recipe in
    https://huggingface.co/docs/transformers/main/en/tasks/token_classification

    Each sentence is split on whitespace and tokenized with
    ``is_split_into_words=True`` so that ``word_ids()`` indexes directly into
    the sentence's tag list. Only the first sub-token of every word receives
    the word's tag id; special tokens, padding and continuation sub-tokens
    receive ``labels2id["PAD"]``.

    Args:
        texts: iterable of sentence strings whose tokens are separated by
            whitespace.
        labels: iterable of tag-name lists, parallel to ``texts``, one tag per
            whitespace-separated token.

    Returns:
        dict with the keys "input_ids", "attention_mask" and "labels". Each
        value is a list holding one int32 tensor of length MAX_LENGTH per
        sentence.
    """
    pad_id = labels2id["PAD"]

    # TODO: a dict is used here as a temporary stand-in; a future version may
    # replace it with a class or a dataclass.
    encoded_inputs = {"input_ids": [], "attention_mask": [], "labels": []}

    for sent, lbs in zip(texts, labels):

        # Pre-split on whitespace: tokenizing the raw string lets the tokenizer
        # split words at punctuation as well, which shifts word_ids() relative
        # to the tag list.
        tokenized_input = tokenizer(
                sent.split(),
                is_split_into_words=True,
                max_length=MAX_LENGTH,
                padding="max_length",
                truncation=True
                )

        encoded_inputs["input_ids"].append(
            tf.convert_to_tensor(tokenized_input.input_ids, dtype="int32")
            )

        encoded_inputs["attention_mask"].append(
            tf.convert_to_tensor(tokenized_input.attention_mask, dtype="int32")
            )

        encoded_labels = []
        previous_word_idx = None

        for word_idx in tokenized_input.word_ids():

            if word_idx is None or word_idx == previous_word_idx:
                # Special/padding token, or a continuation sub-token.
                encoded_labels.append(pad_id)

            elif word_idx < len(lbs):
                # First sub-token of a word. Best effort is kept for tag names
                # missing from labels2id: they fall back to PAD.
                encoded_labels.append(labels2id.get(lbs[word_idx], pad_id))

            else:
                # More words than tags in this row: keep the previous
                # best-effort behaviour and ignore the position.
                encoded_labels.append(pad_id)

            # Must be updated on every token. It used to be assigned after the
            # loop, so continuation sub-tokens were never detected.
            previous_word_idx = word_idx

        encoded_inputs["labels"].append(
            tf.convert_to_tensor(encoded_labels, dtype="int32")
            )

    return encoded_inputs

In [None]:
def preprocess_features(encoded_inputs):
  """Split one encoded example into a ``(features, labels)`` pair.

  ``features`` is a dict holding only the "input_ids" and "attention_mask"
  entries of ``encoded_inputs``; ``labels`` is its "labels" entry.
  """
  feature_keys = ("input_ids", "attention_mask")
  features = {key: encoded_inputs[key] for key in feature_keys}
  targets = encoded_inputs["labels"]
  return features, targets

In [None]:
# Tokenize both splits and align their tags to the produced tokens.
train_encoded_inputs = tokenize_and_align_labels(train_texts, train_labels)
val_encoded_inputs = tokenize_and_align_labels(val_texts, val_labels)

In [None]:
# Training pipeline (unchanged): shuffle with a 1000-element buffer, batch by
# 32, and iterate the data 5 times per pass over the dataset.
tf_train_ds = (
    tf.data.Dataset.from_tensor_slices(train_encoded_inputs)
    .map(preprocess_features)
    .shuffle(1000)
    .batch(32)
    .repeat(5)
)

# Validation pipeline: shuffling and repeating evaluation data only multiplied
# the evaluation work by five, so both steps were dropped.
tf_val_ds = (
    tf.data.Dataset.from_tensor_slices(val_encoded_inputs)
    .map(preprocess_features)
    .batch(32)
)

#### **Loss masking test**

In [None]:
# for x, y in tf_train_ds:
#     X_1 = x
#     y_1 = y
#     break

In [None]:
# X_1["input_ids"]

<tf.Tensor: shape=(32, 64), dtype=int32, numpy=
array([[  101,  1996,  2417, ...,     0,     0,     0],
       [  101,  3078,  8269, ...,     0,     0,     0],
       [  101, 16214,  9317, ...,     0,     0,     0],
       ...,
       [  101,  3041,  2023, ...,     0,     0,     0],
       [  101,  5037,  2610, ...,     0,     0,     0],
       [  101,  2076,  1996, ...,     0,     0,     0]], dtype=int32)>

In [None]:
# X_1["attention_mask"]

<tf.Tensor: shape=(32, 64), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>

In [None]:
# y_1

<tf.Tensor: shape=(32, 64), dtype=int32, numpy=
array([[ 0, 17,  3, ...,  0,  0,  0],
       [ 0, 17, 17, ...,  0,  0,  0],
       [ 0, 17,  8, ...,  0,  0,  0],
       ...,
       [ 0, 17, 17, ...,  0,  0,  0],
       [ 0,  4, 17, ...,  0,  0,  0],
       [ 0, 17, 17, ...,  0,  0,  0]], dtype=int32)>

In [None]:
# mask = tf.cast((y_1 != 0), dtype=tf.float32)
# mask

<tf.Tensor: shape=(32, 64), dtype=float32, numpy=
array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.]], dtype=float32)>

#### **Model architecture**

In [None]:
# Pretrained encoder body, instantiated with the customised config.
transformer_encoder = TFAutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
# Two int32 Keras inputs of length MAX_LENGTH, named after the feature-dict
# keys produced by preprocess_features.
input_ids, attention_mask = (
    layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name=input_name)
    for input_name in ("input_ids", "attention_mask")
)

In [None]:
# Run the encoder and keep the first element of its output.
# NOTE(review): presumably the last hidden state, one vector per token — the
# (None, 64, 18) shape of the Dense layer applied to it is consistent with that.
embedding = transformer_encoder(input_ids, attention_mask=attention_mask)[0]

In [None]:
# Bias-free linear projection of every encoder output vector onto NUM_LABELS scores.
token_classifier = layers.Dense(units=NUM_LABELS, use_bias=False)
logits = token_classifier(embedding)
logits

<KerasTensor: shape=(None, 64, 18) dtype=float32 (created by layer 'dense')>

In [None]:
# logits = layers.Flatten()(logits)
# logits

<KerasTensor: shape=(None, 1088) dtype=float32 (created by layer 'flatten')>

In [None]:
# Turn the per-token scores into a probability distribution over the labels.
softmax_layer = layers.Activation("softmax")
probs = softmax_layer(logits)
probs

<KerasTensor: shape=(None, 64, 18) dtype=float32 (created by layer 'activation')>

In [None]:
# Functional model: (input_ids, attention_mask) -> per-token label probabilities.
model_inputs = [input_ids, attention_mask]
model = tf.keras.Model(inputs=model_inputs, outputs=probs)

In [None]:
def custom_loss_to_ignore_padding(y_true, y_pred):
  """Mean sparse categorical cross-entropy over the non-padding positions.

  Positions whose true id is 0 are excluded from both the summed loss and the
  count it is divided by.

  Args:
    y_true: integer label ids; id 0 marks positions to ignore.
    y_pred: per-class probabilities (the loss is built with from_logits=False).

  Returns:
    Scalar tensor: the summed loss of all positions with ``y_true > 0`` divided
    by the number of such positions, or 0.0 when there is no such position
    (this case used to evaluate 0/0 and return NaN).
  """
  # main source: https://discuss.pytorch.org/t/ignore-padding-area-in-loss-computation/95804/5
  loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=False, reduction=tf.keras.losses.Reduction.NONE
  )
  # Reduction.NONE keeps one loss value per position so it can be masked.
  loss = loss_fn(y_true, y_pred)
  mask = tf.cast((y_true > 0), dtype=tf.float32)
  loss = loss * mask
  # divide_no_nan yields 0 instead of NaN when the mask is all zeros.
  return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

In [None]:
# Adam optimizer with a learning rate of 5e-05.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-05)
# No metric is tracked at the moment; the accuracy metric below is disabled.
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

#### **Compilation and training**

In [None]:
# Compile with the padding-aware loss; no metrics are registered.
model.compile(optimizer=optimizer, loss=custom_loss_to_ignore_padding)

In [None]:
# Train for 3 Keras epochs. tf_train_ds is built with .repeat(5), so each epoch
# iterates the training data five times.
history = model.fit(tf_train_ds, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Compute the compiled loss on the validation dataset.
model.evaluate(tf_val_ds, verbose=1)



0.2497575432062149

In [None]:
# Persist the trained weights under MODEL_PATH.
model.save_weights(MODEL_PATH)

#### **Dummy model loader**

In [None]:
import numpy as np

In [None]:
# Restore the weights previously saved under MODEL_PATH into `model`.
model.load_weights(MODEL_PATH)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fe1eea0c850>

In [None]:
def tokenize_and_align_word_ids(text):
    """Tokenize one sentence for inference and mark the start of every word.

    The sentence is split on whitespace and tokenized with
    ``is_split_into_words=True`` so that the words seen by the tokenizer are
    exactly the items of ``text.split()``.

    Args:
        text: sentence string.

    Returns:
        dict with the keys "input_ids", "attention_mask" and "labels", each a
        one-element list holding an int64 tensor of length MAX_LENGTH. The
        "labels" tensor does not contain tag ids: it holds 1 at the first
        sub-token of every word and ``labels2id["PAD"]`` at special tokens,
        padding and continuation sub-tokens.
    """
    pad_id = labels2id["PAD"]
    # Any non-PAD value works as a marker; 1 is kept from the original code.
    word_start_marker = 1

    tokenized_input = tokenizer(
            text.split(),
            is_split_into_words=True,
            max_length=MAX_LENGTH,
            padding="max_length",
            truncation=True
            )

    word_start_marks = []
    previous_word_idx = None

    for word_idx in tokenized_input.word_ids():
        is_word_start = word_idx is not None and word_idx != previous_word_idx
        # The former try/except around append(1) could never raise and was removed.
        word_start_marks.append(word_start_marker if is_word_start else pad_id)
        previous_word_idx = word_idx

    return {
        "input_ids": [
            tf.convert_to_tensor(tokenized_input.input_ids, dtype="int64")
        ],
        "attention_mask": [
            tf.convert_to_tensor(tokenized_input.attention_mask, dtype="int64")
        ],
        "labels": [
            tf.convert_to_tensor(word_start_marks, dtype="int64")
        ],
    }

In [None]:
def ner_predictor(text):
  """Predict one NER tag per whitespace-separated word of ``text``.

  Args:
    text: sentence string.

  Returns:
    dict mapping each word of ``text.split()`` to its predicted tag name. As
    the result is a dict, a word occurring several times keeps only the tag of
    its last occurrence. Words beyond the MAX_LENGTH truncation are omitted.
  """
  encoded_text = tokenize_and_align_word_ids(text)

  # Built from labels2id: no inverse mapping is defined elsewhere in this notebook.
  id2label = {idx: label for label, idx in labels2id.items()}

  # Single sentence -> single batch; shuffling at inference had no purpose.
  # preprocess_features replaces the previously referenced, undefined
  # `input_to_features`.
  tf_ds_input = tf.data.Dataset.from_tensor_slices(
      encoded_text
      ).map(preprocess_features).batch(1)

  features, word_start_marks = next(iter(tf_ds_input))

  output = model.predict(features)
  prediction = np.argmax(output, axis=-1)[0]

  # Keep exactly one position per word: the first sub-token of each word.
  # Positions are no longer also filtered on `prediction > 0`, because dropping
  # one shifted every following tag onto the wrong word.
  word_start_mask = np.reshape(word_start_marks, [-1]) > 0
  predicted_tag_ids = prediction[word_start_mask]

  predicted_tags = [id2label[int(tag)] for tag in predicted_tag_ids]

  return {word:tag for word, tag in zip(text.split(), predicted_tags)}

In [None]:
# Ad-hoc sentences for trying out the predictor.
example_1 = "Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country"
# NOTE(review): "Ingland" looks like a misspelling of "England" — confirm whether it is intentional.
example_2 = "London is the capital of Ingland"
example_3 = "My name is Julián and I've lived in Argentina since 1993"

In [None]:
# Tag the third example sentence and display the word -> tag mapping.
# NOTE(review): the stored output lists a 'have' key although example_3 contains
# "I've", not "have" — it was presumably produced from a different input; re-run
# this cell to refresh it.
pred = ner_predictor(example_3)
pred

{'1993': 'I-tim',
 'Argentina': 'B-geo',
 'I': 'O',
 'Julián': 'B-per',
 'My': 'O',
 'and': 'O',
 'have': 'O',
 'in': 'O',
 'is': 'O',
 'lived': 'O',
 'name': 'O',
 'since': 'B-tim'}