In [1]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.7 MB/s eta 0:00:01
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-macosx_10_11_x86_64.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 6.9 MB/s eta 0:00:01
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 17.8 MB/s eta 0:00:01
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
! python -m pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9672 sha256=389e97782c6e9ba19a374af49bd00d6b9423ef0ca4c3e6143aca74c33093559d
  Stored in directory: /Users/jsingh/Library/Caches/pip/wheels/04/5f/3e/46cc37c5d698415694d83f607f833f83f0149e49b3af9d0f38
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


# downloading the data

In [3]:
! wget http://noisy-text.github.io/2017/files/wnut17train.conll

--2022-12-05 15:20:06--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.111.153, 185.199.110.153, 185.199.108.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.111.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: ‘wnut17train.conll’


2022-12-05 15:20:07 (3.22 MB/s) - ‘wnut17train.conll’ saved [493781/493781]



In [4]:
from pathlib import Path
import re

def split_into_tokens(raw_text):
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            row = line.split('\t')
            if len(row) == 1:
                token = row[0]
                tag = None
            else:
                token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    token_docs, tag_docs = split_into_tokens(raw_text)

    return token_docs, tag_docs

texts, tags = read_wnut('wnut17train.conll')

In [5]:
print(texts[0][10:17], tags[0][10:17], sep='\n')


['for', 'two', 'weeks', '.', 'Empire', 'State', 'Building']
['O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location']


 O indicates the token does not correspond to any entity
 location is an entity.
 
 B- indicates the beginning of an entity, and I- indicates consecutive positions of the same entity.
 Thus, "Empire", "State", "Building" has tokens "B-location", "I-location", "I-location"

In [6]:
# Splitting our data into training and validation set

from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

To encode the tokens, we will use a pre-trained DistilBert tokenizer.
We can tell the tokenizer that we have ready-split tokens rather than full sentence strings by passing is_split_into_words=True
We’ll pass padding=True and truncation=True to pad the sequences to be the same length. 
We can tell the model to return information about the tokens that are split by the wordpiece tokenization process.

WordPiece Tokenization is the process by which single words are split into multiple tokens such that each token is likely to be in the vocabulary. Some words may not be in the vocabulary of a model. Thus the model splits the word into sub-words/tokens. Since we have only one tag per token, if the tokenizer splits a token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels. To resolve this, we will train on the tag labels for the first subtoken of a split token. We can do this by setting the labels we wish to ignore to -100.

In [8]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
train_encodings = tokenizer(
    train_texts,
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True
)
val_encodings = tokenizer(
    val_texts, 
    is_split_into_words=True, 
    return_offsets_mapping=True, 
    padding=True, 
    truncation=True
)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [9]:
import numpy as np

unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)


In [52]:
print(f"There are total {len(tag2id.keys())} entity tags in the data: {tag2id.keys()}")

There are total 13 entity tags in the data: dict_keys(['I-corporation', 'I-product', 'I-person', 'I-group', 'B-location', 'O', 'I-location', 'B-creative-work', 'B-group', 'I-creative-work', 'B-person', 'B-product', 'B-corporation'])


Next, we will create a dataset object

In [10]:
import tensorflow as tf

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

2022-12-05 15:23:02.500674: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Now we can load in a token classification model and specify the number of labels. Then, our model is ready for fine-tuning.



In [11]:
from transformers import TFDistilBertForTokenClassification
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))

Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

2022-12-05 15:24:07.866579: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForTokenClassification: ['vocab_layer_norm', 'vocab_projector', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-

# Fine-tuning our model

In [12]:
from transformers import TFDistilBertForSequenceClassification

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss) # you can also use any keras loss fn
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16)

Epoch 1/3


  return py_builtins.overload_of(f)(*args)


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fde979f8c70>

In [13]:
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "tra

In [49]:
from transformers import pipeline
                         
custom_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") 
output = custom_ner("""Tommy bought my Armani shoes by the New Town Mall in Paris.
     Ella Parker purchased a Samsung Galaxy s21+ from Elante mall.""")

print(output)

[{'entity_group': 'LABEL_10', 'score': 0.95119816, 'word': 'Tommy', 'start': 0, 'end': 5}, {'entity_group': 'LABEL_5', 'score': 0.82647365, 'word': 'bought my', 'start': 6, 'end': 15}, {'entity_group': 'LABEL_11', 'score': 0.3976339, 'word': 'Arm', 'start': 16, 'end': 19}, {'entity_group': 'LABEL_1', 'score': 0.5137349, 'word': '##ani', 'start': 19, 'end': 22}, {'entity_group': 'LABEL_5', 'score': 0.97369736, 'word': 'shoes by the', 'start': 23, 'end': 35}, {'entity_group': 'LABEL_4', 'score': 0.37326077, 'word': 'New', 'start': 36, 'end': 39}, {'entity_group': 'LABEL_6', 'score': 0.6780467, 'word': 'Town Mall', 'start': 40, 'end': 49}, {'entity_group': 'LABEL_5', 'score': 0.844704, 'word': 'in Paris.', 'start': 50, 'end': 59}, {'entity_group': 'LABEL_10', 'score': 0.97740185, 'word': 'Ella', 'start': 65, 'end': 69}, {'entity_group': 'LABEL_2', 'score': 0.97186667, 'word': 'Parker', 'start': 70, 'end': 76}, {'entity_group': 'LABEL_5', 'score': 0.9917011, 'word': 'purchased a', 'start':

In [51]:
def convert_id_to_name(label_result):
    output_result = {}
    label = label_result["entity_group"]
    output_result["entity_group"] = id2tag.get(
        int(label.split("_")[1]), 
        label
    )
    for key in label_result:
        if key != "entity_group":
            output_result[key] = label_result[key]
    return output_result

new_output = [convert_id_to_name(i) for i in output]
new_output

[{'entity_group': 'B-person',
  'score': 0.95119816,
  'word': 'Tommy',
  'start': 0,
  'end': 5},
 {'entity_group': 'O',
  'score': 0.82647365,
  'word': 'bought my',
  'start': 6,
  'end': 15},
 {'entity_group': 'B-product',
  'score': 0.3976339,
  'word': 'Arm',
  'start': 16,
  'end': 19},
 {'entity_group': 'I-product',
  'score': 0.5137349,
  'word': '##ani',
  'start': 19,
  'end': 22},
 {'entity_group': 'O',
  'score': 0.97369736,
  'word': 'shoes by the',
  'start': 23,
  'end': 35},
 {'entity_group': 'B-location',
  'score': 0.37326077,
  'word': 'New',
  'start': 36,
  'end': 39},
 {'entity_group': 'I-location',
  'score': 0.6780467,
  'word': 'Town Mall',
  'start': 40,
  'end': 49},
 {'entity_group': 'O',
  'score': 0.844704,
  'word': 'in Paris.',
  'start': 50,
  'end': 59},
 {'entity_group': 'B-person',
  'score': 0.97740185,
  'word': 'Ella',
  'start': 65,
  'end': 69},
 {'entity_group': 'I-person',
  'score': 0.97186667,
  'word': 'Parker',
  'start': 70,
  'end': 76}