# NER TOKEN CLASSIFICATION

Initial steps:
- Check CUDA compatibility
- Install dependencies
- Load the training data (**make sure the data path is correct**)

In [1]:
import torch
# Confirm that PyTorch can see a CUDA device before doing anything else.
torch.cuda.is_available()

True

In [2]:
# Show which GPU PyTorch reports for the current CUDA device.
torch.cuda.get_device_name()

'NVIDIA T1200 Laptop GPU'

In [3]:
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U datasets

In [4]:
from datasets import load_dataset

# Paths of the JSON-lines splits, relative to the notebook's working directory.
# Adjust these if the data lives somewhere else.
dfiles= {"train": "./ner_train_split1.jsonl", "validation": "./ner_validation_split1.jsonl"}
# Load both files into one DatasetDict with "train" and "validation" splits.
dataset = load_dataset("json", data_files=dfiles)


# Display the splits, their columns and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tokens', 'ner_labels'],
        num_rows: 218
    })
    validation: Dataset({
        features: ['tokens', 'ner_tokens', 'ner_labels'],
        num_rows: 40
    })
})

In [5]:
import pandas as pd


# NOTE(review): this expression is not the last one in the cell, so its value
# is computed and discarded; nothing is shown for it.
dataset['train'].features

# Materialise the whole training split as a DataFrame for a quick visual check.
pd.DataFrame(dataset['train'][:])

Unnamed: 0,tokens,ner_tokens,ner_labels
0,"[Organic, Cinnamon, Harvest, Cereal]","[B-HLTH, B-FLVR, O, B-PRDT]","[3, 1, 0, 7]"
1,"[KitKat, Caramel, Crisp, Wafer, Bar]","[O, B-FLVR, O, B-PRDT, I-PRDT]","[0, 1, 0, 7, 8]"
2,"[Roasted, Red, Pepper, Alfredo, Spaghetti, Pas...","[B-FLVR, I-FLVR, I-FLVR, B-FLVR, O, B-PRDT, I-...","[1, 2, 2, 1, 0, 7, 8]"
3,"[Lavender, +, Vanilla, Bean, Deodorant, With, ...","[B-FLVR, O, B-FLVR, I-FLVR, B-PRDT, O, B-FLVR,...","[1, 0, 1, 2, 7, 0, 1, 2]"
4,"[Garlic, Pasta, Sauce]","[B-FLVR, B-PRDT, I-PRDT]","[1, 7, 8]"
...,...,...,...
213,"[Naturally, Flavoured, Grape, Soda]","[O, O, B-FLVR, B-PRDT]","[0, 0, 1, 7]"
214,"[Garlic, Spread]","[B-FLVR, B-PRDT]","[1, 7]"
215,"[Thick, &, Juicy, Turkey, Burgers]","[O, O, O, B-PRDT, I-PRDT]","[0, 0, 0, 7, 8]"
216,"[Three, Cheese, Pasta, Sauce]","[O, B-FLVR, B-PRDT, I-PRDT]","[0, 1, 7, 8]"


# Load the model and tokenizer from Hugging Face
- Tokenize and align labels
- Convert tokens to trainable input

In [6]:
from transformers import AutoTokenizer

# Hub id of the pretrained checkpoint; reused later when the model is loaded.
model_checkpoint= 'distilbert/distilbert-base-uncased'

tokenizer= AutoTokenizer.from_pretrained(model_checkpoint)

# Later cells call `.word_ids()` on the tokenizer output, so check that a
# "fast" tokenizer was loaded.
tokenizer.is_fast

True

In [7]:
# Tokenize one training example (row 217) whose text is already split into
# words, to see how words are broken into sub-word pieces.
temp_inputs= tokenizer(dataset['train'][217]['tokens'], is_split_into_words=True)

temp_inputs.tokens()

['[CLS]',
 'lin',
 '##dor',
 '70',
 '%',
 'ca',
 '##cao',
 'dark',
 'chocolate',
 'tr',
 '##uf',
 '##fles',
 'bag',
 '[SEP]']

In [8]:
# For each produced token: index of the source word, or None for [CLS]/[SEP].
temp_inputs.word_ids()

[None, 0, 0, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, None]

In [9]:
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level label ids to one label id per tokenizer token.

  Args:
    labels: label ids, one per original word (indexed by word id).
    word_ids: for each token, the index of the word it came from, or None
      for tokens that belong to no word (e.g. [CLS], [SEP]).

  Returns:
    A list with one entry per element of ``word_ids``:
      * -100 for tokens whose word id is None,
      * the word's own label for the first token of each word,
      * for the remaining tokens of the same word, the word's label with odd
        ids bumped by one.

  The "odd id + 1" step turns a B-XXX label into the matching I-XXX label.
  It is only correct when every B- tag has an odd id and its I- tag is the
  next (even) id, as in this notebook's label list.
  """
  new_labels = []
  current_word = None
  for word_id in word_ids:
    if word_id is None:
      # Token that belongs to no word: mark it with -100.
      new_labels.append(-100)
    elif word_id != current_word:
      # First token of a new word keeps the word's label unchanged.
      new_labels.append(labels[word_id])
    else:
      # Continuation piece of the current word: B-XXX (odd) -> I-XXX (even).
      label = labels[word_id]
      if label % 2 == 1:
        label += 1
      new_labels.append(label)
    current_word = word_id

  return new_labels

In [10]:
# Sample data test: fetch the word-level labels and the token-to-word mapping
# of the example tokenized above, to feed into align_labels_with_tokens.

label_tags= dataset['train'][217]['ner_labels']
word_ids= temp_inputs.word_ids()
print(label_tags, word_ids)

[0, 0, 0, 7, 8, 8, 0] [None, 0, 0, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, None]


In [11]:
# Run the alignment on the sample: expect one label per token, -100 at both ends.
result= align_labels_with_tokens(label_tags, word_ids)
result

[-100, 0, 0, 0, 0, 0, 0, 7, 8, 8, 8, 8, 0, -100]

In [12]:

def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split examples and attach token-level labels.

  Reads ``examples['tokens']`` and ``examples['ner_labels']``, tokenizes the
  words with the notebook-level ``tokenizer`` (with truncation), and stores
  the output of ``align_labels_with_tokens`` for every example under the
  ``'labels'`` key of the tokenizer output, which is then returned.
  """
  encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

  # One aligned label list per example; `row` selects that example's word ids.
  encoded['labels'] = [
    align_labels_with_tokens(word_labels, encoded.word_ids(row))
    for row, word_labels in enumerate(examples['ner_labels'])
  ]

  return encoded

In [13]:
# Tokenize and label every split in batches. The original columns are dropped,
# so only the tokenizer outputs and the aligned 'labels' remain.
tokenized_aligned_dataset= dataset.map(tokenize_and_align_labels, batched= True, remove_columns= dataset['train'].column_names)


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [14]:
# Spot-check the same example as before after mapping.
tokenized_aligned_dataset['train'][217]

{'input_ids': [101,
  11409,
  7983,
  3963,
  1003,
  6187,
  20808,
  2601,
  7967,
  19817,
  16093,
  28331,
  4524,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 0, 0, 0, 0, 0, 0, 7, 8, 8, 8, 8, 0, -100]}

In [15]:
from transformers import DataCollatorForTokenClassification

# The collator pads every example in a batch to a common length.
data_collator= DataCollatorForTokenClassification(tokenizer= tokenizer)

# Collate the first two training examples to check the padding: in the output
# the shorter row is padded with 0 in input_ids/attention_mask and -100 in labels.
batch = data_collator([tokenized_aligned_dataset['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[  101,  7554, 21229, 11203, 20943,   102,     0,     0,     0,     0],
        [  101,  8934, 24498, 14418, 10199, 15594, 11333,  7512,  3347,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[-100,    3,    1,    0,    7, -100, -100, -100, -100, -100],
        [-100,    0,    0,    1,    2,    0,    7,    8,    8, -100]])}

In [16]:
!pip install seqeval
!pip install evaluate

import evaluate
metric = evaluate.load('seqeval')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




# !IMPORTANT NOTE
## !IMPORTANT NOTE
### !IMPORTANT NOTE

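Update `label_names` below if your dataset uses a different label set. The position of each name in the list must match the integer ids stored in `ner_labels`, and every `B-` tag must sit at an odd index with its `I-` tag directly after it, because `align_labels_with_tokens` relies on that layout.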

In [17]:
import numpy as np


# Label id -> tag name. "O" is id 0; each entity type then contributes a
# B- tag (odd id) followed by its I- tag (even id), in the order listed here.
label_names = ["O"] + [
    f"{prefix}-{entity}"
    for entity in ("FLVR", "HLTH", "MLKF", "PRDT")
    for prefix in ("B", "I")
]
def compute_metrics(eval_preds):
  """Convert raw evaluation output into overall seqeval scores.

  Args:
    eval_preds: a ``(logits, labels)`` pair. The predicted class of each
      token is the argmax of ``logits`` over the last axis.

  Returns:
    Dict with the keys "precision", "recall", "f1" and "accuracy", taken from
    the ``overall_*`` entries computed by the notebook-level ``metric``.

  Positions whose gold label is -100 are dropped from both the references and
  the predictions before scoring.
  """
  logits, labels = eval_preds
  predicted_ids = np.argmax(logits, axis=-1)

  # Gold tag names, skipping the ignored (-100) positions.
  true_labels = []
  for label_row in labels:
    true_labels.append([label_names[tag_id] for tag_id in label_row if tag_id != -100])

  # Predicted tag names at exactly the positions kept above.
  true_predictions = []
  for predicted_row, label_row in zip(predicted_ids, labels):
    kept = [label_names[pred_id]
            for pred_id, tag_id in zip(predicted_row, label_row)
            if tag_id != -100]
    true_predictions.append(kept)

  scores = metric.compute(predictions=true_predictions, references=true_labels)

  return dict(precision=scores['overall_precision'],
              recall=scores['overall_recall'],
              f1=scores['overall_f1'],
              accuracy=scores['overall_accuracy'])

In [18]:
# Two-way mapping between integer label ids and tag names, derived from the
# order of `label_names`; both are handed to the model config later.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

print(id2label, label2id)

{0: 'O', 1: 'B-FLVR', 2: 'I-FLVR', 3: 'B-HLTH', 4: 'I-HLTH', 5: 'B-MLKF', 6: 'I-MLKF', 7: 'B-PRDT', 8: 'I-PRDT'} {'O': 0, 'B-FLVR': 1, 'I-FLVR': 2, 'B-HLTH': 3, 'I-HLTH': 4, 'B-MLKF': 5, 'I-MLKF': 6, 'B-PRDT': 7, 'I-PRDT': 8}


In [19]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head. The label
# mappings are passed along with the checkpoint name.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# BEGIN TRAINING

Watch the validation loss: it should decrease from epoch to epoch.

In [20]:
from transformers import TrainingArguments
from transformers import Trainer

!pip install transformers[torch]

args = TrainingArguments("distilbert-finetuned-ner",
                         evaluation_strategy = "epoch",
                         save_strategy="epoch",
                         learning_rate = 2e-5,
                         num_train_epochs=5,
                         weight_decay=0.01)

trainer = Trainer(model=model,
                  args=args,
                  train_dataset = tokenized_aligned_dataset['train'],
                  eval_dataset = tokenized_aligned_dataset['validation'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)






Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.546538,0.22973,0.184783,0.204819,0.495082
2,No log,1.203265,0.266055,0.315217,0.288557,0.606557
3,No log,0.993956,0.347107,0.456522,0.394366,0.688525
4,No log,0.898639,0.443478,0.554348,0.492754,0.721311
5,No log,0.872021,0.42735,0.543478,0.478469,0.718033


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=140, training_loss=1.140830557686942, metrics={'train_runtime': 15.0652, 'train_samples_per_second': 72.352, 'train_steps_per_second': 9.293, 'total_flos': 4355484410844.0, 'train_loss': 1.140830557686942, 'epoch': 5.0})

# INFERENCE 

Checkpoints are saved automatically in the local `distilbert-finetuned-ner` folder, one per epoch.
Pick the checkpoint with the lowest validation loss. Checkpoints are numbered by training step, so a larger number means a later checkpoint (e.g. epoch 5 is `checkpoint-140`).


In [25]:
from transformers import pipeline

# Checkpoint written at step 140, the last step of the 5-epoch run above.
# Change this path if you trained for a different number of steps.
checkpoint = "./distilbert-finetuned-ner/checkpoint-140"
# With an aggregation strategy the pipeline reports merged entities
# (`entity_group`, `word`, `start`, `end`) instead of one entry per token.
token_classifier = pipeline(
    "token-classification", model=checkpoint, aggregation_strategy="average"
)

# Try the model on a raw, un-tokenized product title.
result =token_classifier("Schneider's Turkey Sausages, 450 g")

result

[{'entity_group': 'FLVR',
  'score': 0.41405544,
  'word': 'turkey',
  'start': 12,
  'end': 18},
 {'entity_group': 'PRDT',
  'score': 0.66121984,
  'word': 'sausages',
  'start': 19,
  'end': 27}]

# Save your checkpoint for later use
See the `ner_inference` notebook for a standalone example of running inference with the model.

In [22]:
# Archive the whole checkpoint folder. Note that this also includes the
# optimizer/scheduler state saved by the Trainer.
!zip -r ner.zip "./distilbert-finetuned-ner/checkpoint-140"

updating: distilbert-finetuned-ner/checkpoint-140/ (stored 0%)
updating: distilbert-finetuned-ner/checkpoint-140/trainer_state.json (deflated 69%)
updating: distilbert-finetuned-ner/checkpoint-140/tokenizer_config.json (deflated 76%)
updating: distilbert-finetuned-ner/checkpoint-140/scheduler.pt (deflated 56%)
updating: distilbert-finetuned-ner/checkpoint-140/optimizer.pt (deflated 40%)
updating: distilbert-finetuned-ner/checkpoint-140/tokenizer.json (deflated 71%)
updating: distilbert-finetuned-ner/checkpoint-140/special_tokens_map.json (deflated 42%)
updating: distilbert-finetuned-ner/checkpoint-140/vocab.txt (deflated 53%)
updating: distilbert-finetuned-ner/checkpoint-140/training_args.bin (deflated 51%)
updating: distilbert-finetuned-ner/checkpoint-140/model.safetensors (deflated 8%)
updating: distilbert-finetuned-ner/checkpoint-140/rng_state.pth (deflated 25%)
updating: distilbert-finetuned-ner/checkpoint-140/config.json (deflated 51%)
