In [None]:
import numpy as np
import pandas as pd

from transformers import AutoTokenizer

from tqdm import tqdm
import tensorflow as tf

import matplotlib.pyplot as plt

In [None]:
def load_sentences(filepath):
    """Read a CoNLL-style column file into a list of tagged sentences.

    Every data line is split on whitespace; the first column is used as the
    token and the fourth column as its tag. Blank (or whitespace-only) lines
    and lines whose first column is ``-DOCSTART-`` close the current sentence.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of sentences, each a list of ``(token, tag)`` tuples. Empty
        sentences are never emitted.

    Raises:
        IndexError: If a data line has fewer than four columns.
    """
    final = []
    sentence = []

    # Iterate over the file object directly so the whole file is never held
    # in memory at once.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()

            # Sentence boundary: blank line or document marker. Comparing the
            # first field (rather than the whole raw line) also covers marker
            # lines with different columns, CRLF endings or no trailing newline.
            if not fields or fields[0] == '-DOCSTART-':
                if sentence:
                    final.append(sentence)
                    sentence = []
            else:
                sentence.append((fields[0], fields[3]))

    # Flush the last sentence: a file that does not end with a blank line
    # would otherwise lose it silently.
    if sentence:
        final.append(sentence)

    return final

In [None]:


# Load the three dataset splits. Each split is a list of sentences and each
# sentence is a list of (token, tag) pairs.
train_samples = load_sentences('/content/train.txt')
test_samples = load_sentences('/content/test.txt')
valid_samples = load_sentences('/content/valid.txt')

# Kept as train + test so any code relying on this name sees the same value.
samples = train_samples + test_samples

# Build the label vocabulary from every split that gets encoded later, so a
# tag occurring only in the validation split cannot raise a KeyError during
# preprocessing. '_' is placed at index 0, which is also the fill value of
# the zero-initialised label arrays.
schema = ['_'] + sorted({tag
                         for split in (train_samples, test_samples, valid_samples)
                         for sentence in split
                         for _, tag in sentence})

In [None]:
# Show the label list; a tag's position in it is the integer class id used
# when the label arrays are built.
print(schema)

['_', 'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


In [None]:
from transformers import AutoConfig, TFAutoModelForTokenClassification

MODEL_NAME = 'bert-base-cased'

# Store the tag names in the config so a saved checkpoint can translate class
# ids back into readable labels instead of generic LABEL_<n> placeholders.
id2label = dict(enumerate(schema))
label2id = {tag: idx for idx, tag in id2label.items()}

# num_labels sizes the classification head: one output per entry in `schema`.
config = AutoConfig.from_pretrained(MODEL_NAME,
                                    num_labels=len(schema),
                                    id2label=id2label,
                                    label2id=label2id)
model = TFAutoModelForTokenClassification.from_pretrained(MODEL_NAME,
                                                          config=config)
model.summary()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  107719680 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  7690      
                                                                 
Total params: 107727370 (410.95 MB)
Trainable params: 107727370 (410.95 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Load the tokenizer published under the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_sample(sample):
    """Expand one tagged sentence into a list of tagged sub-token ids.

    Each word is split by the module-level ``tokenizer`` and every resulting
    sub-token inherits the tag of the word it came from. The sequence is then
    wrapped in the tokenizer's own start/end special tokens, both tagged 'O'.

    Args:
        sample: Iterable of ``(token, tag)`` pairs forming one sentence.

    Returns:
        A list of ``(subtoken_id, tag)`` pairs including the two special
        tokens, so it always has at least two elements.
    """
    seq = [
        (subtoken, tag)
        for token, tag in sample
        # [1:-1] removes the first and last ids of each per-word encoding,
        # i.e. the special tokens the tokenizer puts around its input.
        for subtoken in tokenizer(token)['input_ids'][1:-1]
    ]
    # Take the special-token ids from the tokenizer instead of hard-coding
    # them: the literals 3 and 4 are not [CLS]/[SEP] in every vocabulary
    # (bert-base-cased uses 101 and 102).
    return [(tokenizer.cls_token_id, 'O')] + seq + [(tokenizer.sep_token_id, 'O')]

def preprocess(samples):
    """Turn tagged sentences into zero-padded integer arrays.

    Args:
        samples: Sized collection of sentences, each a sequence of
            ``(token, tag)`` pairs.

    Returns:
        A tuple ``(X, y)`` of ``int32`` arrays with shape
        ``(len(samples), max_len)``, where ``max_len`` is the longest
        tokenized sentence in ``samples``. ``X`` holds sub-token ids and ``y``
        the index of each tag in ``schema``. Positions past the end of a
        sentence stay 0 in both arrays. An empty ``samples`` yields two
        arrays of shape ``(0, 0)``.

    Raises:
        KeyError: If a tag is not present in ``schema``.
    """
    tag_index = {tag: i for i, tag in enumerate(schema)}
    # Passing the total lets tqdm render a real progress bar with an ETA.
    tokenized_samples = list(tqdm(map(tokenize_sample, samples),
                                  total=len(samples)))
    # default=0 keeps an empty split from raising ValueError inside max().
    max_len = max(map(len, tokenized_samples), default=0)
    X = np.zeros((len(samples), max_len), dtype=np.int32)
    y = np.zeros((len(samples), max_len), dtype=np.int32)
    for i, sentence in enumerate(tokenized_samples):
        # Fill the leading part of the row in one slice assignment; the
        # remainder keeps its zero padding.
        length = len(sentence)
        X[i, :length] = [subtoken_id for subtoken_id, _ in sentence]
        y[i, :length] = [tag_index[tag] for _, tag in sentence]
    return X, y

# Encode each split separately. Every call pads to the longest sentence of
# its own split, so the three pairs of arrays may have different widths.
X_train, y_train = preprocess(train_samples)
X_test, y_test = preprocess(test_samples)
X_valid, y_valid = preprocess(valid_samples)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

14041it [00:09, 1519.76it/s]
3453it [00:01, 1809.71it/s]
3250it [00:02, 1517.89it/s]


In [None]:
EPOCHS=3
BATCH_SIZE=8

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# from_logits=True: the loss applies the softmax itself to the raw scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metrics is passed as a list, the form the Keras API documents.
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
# Monitor training on the validation split and keep the test split untouched
# for the final evaluation; validating on the test data would leak it into
# model selection.
# NOTE(review): no mask or sample weights are passed, so padded positions
# (label 0) appear to count towards both loss and accuracy - confirm that
# this is intended.
history = model.fit(tf.constant(X_train), tf.constant(y_train),
                    validation_data=(X_valid, y_valid),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
!zip -r ner_model.zip /content/ner_model
from google.colab import files
files.download("ner_model.zip")

  adding: content/ner_model/ (stored 0%)
  adding: content/ner_model/config.json (deflated 56%)
  adding: content/ner_model/tokenizer_config.json (deflated 75%)
  adding: content/ner_model/tf_model.h5 (deflated 7%)
  adding: content/ner_model/vocab.txt (deflated 49%)
  adding: content/ner_model/special_tokens_map.json (deflated 42%)
  adding: content/ner_model/tokenizer.json (deflated 70%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>