In [1]:
import requests
import gensim.downloader as api
import numpy as np

import re

import tensorflow as tf

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, auc, precision_recall_curve

In [2]:
# Download the train / test / dev CoNLL-U files of the UD English-GUM treebank.
# NOTE(review): the URLs point at the moving `master` branch, so the corpus (and every
# number printed further down) may differ between runs — consider pinning a release tag.
data_train = requests.get('https://raw.githubusercontent.com/UniversalDependencies/UD_English-GUM/master/en_gum-ud-train.conllu', timeout=60)
data_test = requests.get('https://raw.githubusercontent.com/UniversalDependencies/UD_English-GUM/master/en_gum-ud-test.conllu', timeout=60)
data_dev = requests.get('https://raw.githubusercontent.com/UniversalDependencies/UD_English-GUM/master/en_gum-ud-dev.conllu', timeout=60)

# Fail fast on an HTTP error; otherwise the body of an error response would be
# handed to the parser as if it were CoNLL-U data.
for _response in (data_train, data_test, data_dev):
  _response.raise_for_status()

In [3]:
def tokenizer(data):
  """Parse a CoNLL-U payload into per-sentence token and tag lists.

  Args:
    data: object exposing the raw CoNLL-U text as ``data.text``.

  Returns:
    A pair ``(token_docs, tag_docs)`` of parallel lists with one entry per
    sentence. For every tab-separated row of a sentence, ``token_docs`` holds
    the lower-cased second column and ``tag_docs`` the fourth column.
  """
  raw_text = data.text.strip()
  # Sentences are separated by an empty line (optionally holding a single tab).
  raw_docs = re.split(r'\n\t?\n', raw_text)
  token_docs = []
  tag_docs = []

  for doc in raw_docs:
    tokens = []
    tags = []
    for line in doc.split('\n'):
      # '#' lines are sentence-level comments; they may contain tabs, so they
      # must be skipped explicitly rather than by counting columns.
      if line.startswith('#'):
        continue
      fields = line.split('\t')
      # A usable row needs at least the token (index 1) and tag (index 3) columns.
      if len(fields) < 4:
        continue
      tokens.append(fields[1].lower())
      tags.append(fields[3])

    token_docs.append(tokens)
    tag_docs.append(tags)

  return token_docs, tag_docs

#### Validity check for POS tags and tokens: both lists should have the same size per sentence (after subtracting 2 for the start & end pseudo-tokens)

In [4]:
# Parse every split into parallel per-sentence token / tag lists.
token_docs_train, tag_docs_train = tokenizer(data_train)
token_docs_test, tag_docs_test = tokenizer(data_test)
token_docs_dev, tag_docs_dev = tokenizer(data_dev)

# Sanity check: print the index of any training sentence whose token and tag counts differ.
for sent_idx, (sent_tokens, sent_tags) in enumerate(zip(token_docs_train, tag_docs_train)):
  if len(sent_tokens) != len(sent_tags):
    print(sent_idx)

# Show the first parsed training sentence and its tags.
print(token_docs_train[0])
print(tag_docs_train[0])

['aesthetic', 'appreciation', 'and', 'spanish', 'art', ':']
['ADJ', 'NOUN', 'CCONJ', 'ADJ', 'NOUN', 'PUNCT']


#### Size of training, development and test datasets

In [5]:
# Number of parsed sentences in each split.
for split_label, split_docs in (
    ('Training sentences :', token_docs_train),
    ('Test sentences :', token_docs_test),
    ('Development sentences :', token_docs_dev),
):
  print(split_label, len(split_docs))

Training sentences : 8548
Test sentences : 1096
Development sentences : 1117


### Get Number of labels and check support for each label

##### '_' ---> word contains an apostrophe, while 'X' ---> not English

In [6]:
# Flatten the per-sentence tag lists into one long tag sequence per split.
flatten_tags_dev = [tag for sent_tags in tag_docs_dev for tag in sent_tags]
flatten_tags_test = [tag for sent_tags in tag_docs_test for tag in sent_tags]
flatten_tags_train = [tag for sent_tags in tag_docs_train for tag in sent_tags]

# Distinct labels seen in each split.
train_label_set = set(flatten_tags_train)
print('Training set labels :', sorted(train_label_set))
print('Test set labels :', sorted(set(flatten_tags_test)))
print('Development set labels :', sorted(set(flatten_tags_dev)))

# The label inventory and the vocabulary are both taken from the training split only.
N_CLASSES = len(train_label_set)
print('\nNumber of labels :', N_CLASSES)
train_vocabulary = {word for sent_tokens in token_docs_train for word in sent_tokens}
print('Vocabulary size :', len(train_vocabulary))


def average_sentence_length(sentences):
  """Return the mean number of items per sentence, truncated to an ``int``."""
  lengths = np.array([len(sentence) for sentence in sentences])
  return int(np.mean(lengths))

# Average (truncated) sentence length of each split.
for split_label, split_docs in (
    ('\nAverage sentence length of training set :', token_docs_train),
    ('Average sentence length of development set :', token_docs_dev),
    ('Average sentence length of test set :', token_docs_test),
):
  print(split_label, average_sentence_length(split_docs))

Training set labels : ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', '_']
Test set labels : ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', '_']
Development set labels : ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', '_']

Number of labels : 18
Vocabulary size : 14826

Average sentence length of training set : 17
Average sentence length of development set : 17
Average sentence length of test set : 18


### Transform to dataset format and match tags to ids

In [7]:
# Label inventory in id order: the position of a tag in this tuple is its integer id.
_POS_TAGS = (
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', '_',
)

# Tag name -> integer id.
tag2id = {tag: idx for idx, tag in enumerate(_POS_TAGS)}

# Integer id -> tag name (exact inverse of tag2id).
id2tag = {idx: tag for tag, idx in tag2id.items()}


def docs_to_dict(x, y):
  """Build one record per sentence of ``x``.

  Each record is a dict with the sentence index (``'id'``), the sentence's
  tokens (``'tokens'``) and the tags of ``y`` at the same index converted to
  integer ids through ``tag2id`` (``'pos_tags'``).
  """
  return [
      {
          'id': idx,
          'tokens': sent_tokens,
          'pos_tags': [tag2id[tag] for tag in y[idx]],
      }
      for idx, sent_tokens in enumerate(x)
  ]

# Only a prefix of each split is used: 2000 training and 600 development / test sentences.
n_train_sentences = 2000
n_eval_sentences = 600

# Tokens and tags are cut to the same prefix so the two arguments stay parallel.
train_dataset = docs_to_dict(token_docs_train[:n_train_sentences], tag_docs_train[:n_train_sentences])
dev_dataset = docs_to_dict(token_docs_dev[:n_eval_sentences], tag_docs_dev[:n_eval_sentences])
test_dataset = docs_to_dict(token_docs_test[:n_eval_sentences], tag_docs_test[:n_eval_sentences])

print(train_dataset[0])

{'id': 0, 'tokens': ['aesthetic', 'appreciation', 'and', 'spanish', 'art', ':'], 'pos_tags': [0, 7, 4, 0, 7, 12]}


### Create 1-hot vectors for y_true label

In [8]:
# One-hot encode the flat tag sequences. Each tag is wrapped in a one-element
# list before it is handed to the binarizer.
mlb = MultiLabelBinarizer()

train_tag_sets = [[tag] for tag in flatten_tags_train]
test_tag_sets = [[tag] for tag in flatten_tags_test]
dev_tag_sets = [[tag] for tag in flatten_tags_dev]

# The binarizer is fitted on the training tags only and reused for the other splits.
y_train_1_hot = mlb.fit_transform(train_tag_sets)
y_test_1_hot = mlb.transform(test_tag_sets)
y_dev_1_hot = mlb.transform(dev_tag_sets)

In [9]:
!pip install transformers[torch] sentence-transformers datasets
!pip install -U accelerate
!pip install -U transformers
!pip install evaluate

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.26.0-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.7/270.7 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.

# Tokenize into the input format expected by DistilBERT

In [10]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import datasets
import gc
import evaluate

# Metric object loaded by name through `evaluate`; compute_metrics in this cell calls
# its `compute(predictions=..., references=...)` method.
poseval = evaluate.load("poseval")


# NOTE(review): this rebinds the name `tokenizer`, which earlier in the notebook is the
# CoNLL-U parsing function. After this cell has run, re-executing the parsing cell no
# longer reaches that function unless its defining cell is run again first.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def _align_word_labels(word_ids, word_labels):
    """Spread word-level labels over word-piece positions.

    ``word_ids`` gives, for every word-piece position, the index of the word
    it came from, or ``None`` for positions that belong to no word. The first
    word-piece of each word receives that word's label; every other position
    receives -100, the marker that compute_metrics filters out again.
    """
    aligned = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None or word_idx == previous_word_idx:
            # Special position, or a continuation piece of the current word.
            aligned.append(-100)
        else:
            aligned.append(word_labels[word_idx])
        previous_word_idx = word_idx
    return aligned


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach aligned labels.

    Args:
        examples: batch mapping with ``"tokens"`` (one word list per sentence)
            and ``"pos_tags"`` (one list of integer tag ids per sentence).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one label list per sentence, aligned to word-piece positions.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    tokenized_inputs["labels"] = [
        _align_word_labels(tokenized_inputs.word_ids(batch_index=i), word_labels)
        for i, word_labels in enumerate(examples["pos_tags"])
    ]
    return tokenized_inputs

# Wrap each list of records in a `datasets.Dataset`, then add the model inputs and
# aligned labels by mapping tokenize_and_align_labels over it in batched mode.
raw_train_ds = datasets.Dataset.from_list(train_dataset)
tokenized_x_train = raw_train_ds.map(tokenize_and_align_labels, batched=True)

raw_dev_ds = datasets.Dataset.from_list(dev_dataset)
tokenized_x_dev = raw_dev_ds.map(tokenize_and_align_labels, batched=True)

raw_test_ds = datasets.Dataset.from_list(test_dataset)
tokenized_x_test = raw_test_ds.map(tokenize_and_align_labels, batched=True)


# Collator handed to the Trainer instances further down, built on the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


# Despite its name, this value is used as a count of named parameter tensors, not of
# transformer layers: the fine-tuning cells freeze `list(model.named_parameters())[:k]`
# with k computed as a fraction of it.
# NOTE(review): 102 is presumably the total number of named parameter tensors of the
# DistilBERT token-classification model — confirm against the loaded model.
num_layers = 102

# Inspect the tokenized training set and its first example.
print(tokenized_x_train)
print(tokenized_x_train[0])


def compute_metrics(p):
    """Compute accuracy and macro-F1 for a batch of token-level predictions.

    Args:
        p: pair ``(predictions, labels)``; ``predictions`` is reduced with
            ``argmax`` over axis 2, and ``labels`` holds integer tag ids with
            -100 at positions that must be ignored.

    Returns:
        Dict with ``"accuracy"`` and ``"f1"`` (the macro-average f1-score)
        taken from the ``poseval`` result.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        # Keep only positions that carry a real label.
        kept = [(int(pred), int(gold)) for pred, gold in zip(pred_row, label_row) if gold != -100]
        # Decode with id2tag — the same mapping the model is configured with — instead
        # of relying on the binarizer's class order happening to match it.
        true_predictions.append([id2tag[pred] for pred, _ in kept])
        true_labels.append([id2tag[gold] for _, gold in kept])

    results = poseval.compute(predictions=true_predictions, references=true_labels)
    return {
        "accuracy": results["accuracy"],
        "f1": results["macro avg"]["f1-score"]
    }

Downloading builder script:   0%|          | 0.00/4.46k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})
{'id': 0, 'tokens': ['aesthetic', 'appreciation', 'and', 'spanish', 'art', ':'], 'pos_tags': [0, 7, 4, 0, 7, 12], 'input_ids': [101, 12465, 12284, 1998, 3009, 2396, 1024, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 7, 4, 0, 7, 12, -100]}


# Fine-tune Distilled BERT

In [11]:
# Hyperparameter search over how much of the network to freeze: the first 40%, 60% and
# 80% of the `num_layers` leading entries of named_parameters().

layers_to_freeze = [int(num_layers * 0.4), int(num_layers * 0.6), int(num_layers * 0.8)]
metrics = dict()


for portion in layers_to_freeze:

  # Start every setting from a freshly loaded pre-trained checkpoint.
  proxy_model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", id2label=id2tag, label2id=tag2id, num_labels=N_CLASSES)

  print('\n-----Printing frozen layers {}-----'.format(portion))

  # Freeze the first `portion` parameter tensors, echoing each one that was trainable.
  for name, param in list(proxy_model.named_parameters())[:portion]:
    if param.requires_grad:
      print(name)
    param.requires_grad = False


  # NOTE(review): no eval_steps / save_steps are given; if the library's default save
  # interval exceeds the length of this short run, no checkpoint is written and
  # load_best_model_at_end has nothing to load — confirm.
  training_args = TrainingArguments(
    output_dir='./tok_cls_example{}/'.format(portion),
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    logging_steps=20,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
  )

  print('\nFine-tuning DistilledBERT frozen:', (int((portion / num_layers) * 100) + 1), '%')
  trainer = Trainer(
      proxy_model,
      training_args,
      train_dataset=tokenized_x_train,
      eval_dataset=tokenized_x_dev,
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )


  trainer.train()

  # Development-set metrics for this setting, keyed by the number of frozen tensors.
  metrics[portion] = trainer.evaluate()

  print(metrics[portion])

  # The Trainer keeps its own reference to the model, so both names must be dropped
  # before garbage collection can actually release the model's memory.
  del proxy_model, trainer
  gc.collect()
  torch.cuda.empty_cache()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



-----Printing frozen layers 40-----
distilbert.embeddings.word_embeddings.weight
distilbert.embeddings.position_embeddings.weight
distilbert.embeddings.LayerNorm.weight
distilbert.embeddings.LayerNorm.bias
distilbert.transformer.layer.0.attention.q_lin.weight
distilbert.transformer.layer.0.attention.q_lin.bias
distilbert.transformer.layer.0.attention.k_lin.weight
distilbert.transformer.layer.0.attention.k_lin.bias
distilbert.transformer.layer.0.attention.v_lin.weight
distilbert.transformer.layer.0.attention.v_lin.bias
distilbert.transformer.layer.0.attention.out_lin.weight
distilbert.transformer.layer.0.attention.out_lin.bias
distilbert.transformer.layer.0.sa_layer_norm.weight
distilbert.transformer.layer.0.sa_layer_norm.bias
distilbert.transformer.layer.0.ffn.lin1.weight
distilbert.transformer.layer.0.ffn.lin1.bias
distilbert.transformer.layer.0.ffn.lin2.weight
distilbert.transformer.layer.0.ffn.lin2.bias
distilbert.transformer.layer.0.output_layer_norm.weight
distilbert.transformer.

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
20,1.8016,0.922621,0.801107,0.544649
40,0.5276,0.295681,0.92259,0.784673
60,0.2637,0.193335,0.945079,0.825511
80,0.1716,0.160198,0.954684,0.875089
100,0.1491,0.140259,0.957057,0.890948
120,0.1314,0.135447,0.958639,0.899619


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.13489629328250885, 'eval_accuracy': 0.9584133800429427, 'eval_f1': 0.899707636059965, 'eval_runtime': 51.035, 'eval_samples_per_second': 11.757, 'eval_steps_per_second': 0.372, 'epoch': 2.0}


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



-----Printing frozen layers 61-----
distilbert.embeddings.word_embeddings.weight
distilbert.embeddings.position_embeddings.weight
distilbert.embeddings.LayerNorm.weight
distilbert.embeddings.LayerNorm.bias
distilbert.transformer.layer.0.attention.q_lin.weight
distilbert.transformer.layer.0.attention.q_lin.bias
distilbert.transformer.layer.0.attention.k_lin.weight
distilbert.transformer.layer.0.attention.k_lin.bias
distilbert.transformer.layer.0.attention.v_lin.weight
distilbert.transformer.layer.0.attention.v_lin.bias
distilbert.transformer.layer.0.attention.out_lin.weight
distilbert.transformer.layer.0.attention.out_lin.bias
distilbert.transformer.layer.0.sa_layer_norm.weight
distilbert.transformer.layer.0.sa_layer_norm.bias
distilbert.transformer.layer.0.ffn.lin1.weight
distilbert.transformer.layer.0.ffn.lin1.bias
distilbert.transformer.layer.0.ffn.lin2.weight
distilbert.transformer.layer.0.ffn.lin2.bias
distilbert.transformer.layer.0.output_layer_norm.weight
distilbert.transformer.

Step,Training Loss,Validation Loss,Accuracy,F1
20,1.9352,1.224146,0.741666,0.47441
40,0.7416,0.433362,0.896146,0.736451
60,0.3587,0.26757,0.926997,0.784725
80,0.2436,0.218084,0.938863,0.807881
100,0.2079,0.192681,0.944627,0.855559
120,0.1871,0.183529,0.946774,0.869541


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.18297314643859863, 'eval_accuracy': 0.9463216182619505, 'eval_f1': 0.857518413364946, 'eval_runtime': 47.818, 'eval_samples_per_second': 12.548, 'eval_steps_per_second': 0.397, 'epoch': 2.0}


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



-----Printing frozen layers 81-----
distilbert.embeddings.word_embeddings.weight
distilbert.embeddings.position_embeddings.weight
distilbert.embeddings.LayerNorm.weight
distilbert.embeddings.LayerNorm.bias
distilbert.transformer.layer.0.attention.q_lin.weight
distilbert.transformer.layer.0.attention.q_lin.bias
distilbert.transformer.layer.0.attention.k_lin.weight
distilbert.transformer.layer.0.attention.k_lin.bias
distilbert.transformer.layer.0.attention.v_lin.weight
distilbert.transformer.layer.0.attention.v_lin.bias
distilbert.transformer.layer.0.attention.out_lin.weight
distilbert.transformer.layer.0.attention.out_lin.bias
distilbert.transformer.layer.0.sa_layer_norm.weight
distilbert.transformer.layer.0.sa_layer_norm.bias
distilbert.transformer.layer.0.ffn.lin1.weight
distilbert.transformer.layer.0.ffn.lin1.bias
distilbert.transformer.layer.0.ffn.lin2.weight
distilbert.transformer.layer.0.ffn.lin2.bias
distilbert.transformer.layer.0.output_layer_norm.weight
distilbert.transformer.

Step,Training Loss,Validation Loss,Accuracy,F1
20,2.3574,2.067529,0.493276,0.199826
40,1.6586,1.314934,0.690247,0.463225
60,1.0621,0.841867,0.810035,0.61842
80,0.7305,0.640808,0.856029,0.686876
100,0.6053,0.553207,0.875014,0.710045
120,0.5427,0.521471,0.87863,0.710064


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5194765329360962, 'eval_accuracy': 0.8793083964289751, 'eval_f1': 0.711555632671605, 'eval_runtime': 48.8665, 'eval_samples_per_second': 12.278, 'eval_steps_per_second': 0.389, 'epoch': 2.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Evaluation on Train, Development and Test Datasets

In [12]:
# Train the final model with the 40% freezing setting.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    id2label=id2tag,
    label2id=tag2id,
    num_labels=N_CLASSES,
)

# Freeze the leading 40% of the parameter tensors.
n_frozen = int(num_layers * 0.4)
for _, frozen_param in list(model.named_parameters())[:n_frozen]:
    frozen_param.requires_grad = False

training_args = TrainingArguments(
    output_dir='./tok_cls_exampleBest/',
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    logging_steps=20,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
)

print('\nFine-tuning DistilledBERT frozen: 40 %')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_x_train,
    eval_dataset=tokenized_x_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Metrics on the development set after training.
final_dev_metrics = trainer.evaluate()
print(final_dev_metrics)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Fine-tuning DistilledBERT frozen: 40 %


Step,Training Loss,Validation Loss,Accuracy,F1
20,1.7801,0.95759,0.784495,0.525753
40,0.5478,0.309802,0.922929,0.784755
60,0.2651,0.200834,0.943044,0.813154
80,0.1793,0.167211,0.951859,0.868243
100,0.1522,0.144919,0.957961,0.889732
120,0.1318,0.13821,0.960335,0.902304


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.1377260386943817, 'eval_accuracy': 0.9605605153124647, 'eval_f1': 0.9025828230703392, 'eval_runtime': 48.378, 'eval_samples_per_second': 12.402, 'eval_steps_per_second': 0.393, 'epoch': 2.0}


In [13]:
# Evaluation uses a fixed prefix of each split (not a random sample): the first 1000
# training sentences and the first 600 development / test sentences.

train_set = [' '.join(x_doc) for x_doc in token_docs_train[:1000]]
dev_set = [' '.join(x_doc) for x_doc in token_docs_dev[:600]]
test_set = [' '.join(x_doc) for x_doc in token_docs_test[:600]]


def _predict_tags(sentences):
  """Return, per sentence, the predicted tag name at every word-piece position.

  Positions of special tokens are included; they are filtered out afterwards
  with the help of the -100 entries of the aligned labels.
  """
  # Run on whatever device the model currently lives on.
  device = next(model.parameters()).device
  tagged = []
  for sentence in sentences:
    # truncation=True mirrors the tokenisation used to build the aligned labels.
    encoded = tokenizer(sentence, truncation=True, return_tensors='pt')
    inputs = {key: tensor.to(device) for key, tensor in encoded.items()}
    logits = model(**inputs).logits
    # A single sentence is passed per call, hence batch index 0.
    predicted_ids = logits.argmax(dim=-1)[0].tolist()
    tagged.append([model.config.id2label[tag_id] for tag_id in predicted_ids])
  return tagged


model.eval()
with torch.no_grad():
  predictions_train = _predict_tags(train_set)
  predictions_dev = _predict_tags(dev_set)
  predictions_test = _predict_tags(test_set)

In [14]:
# Drop the predictions at positions whose aligned label is -100, then flatten.

def clean_and_flat_preds(predictions, labels):
  """Return one flat list of the predictions at positions with a real label.

  ``predictions[i]`` is paired position by position with
  ``labels[i]['labels']``; a prediction is kept only where the label is not
  -100. Sentences are concatenated in order.
  """
  return [
      pred
      for i, sentence_preds in enumerate(predictions)
      for pred, gold in zip(sentence_preds, labels[i]['labels'])
      if gold != -100
  ]


# One-hot encode the cleaned, flattened predictions with the binarizer fitted on the
# training tags.
flat_pred_train = clean_and_flat_preds(predictions_train, tokenized_x_train)
y_predictions_train = mlb.transform([[tag] for tag in flat_pred_train])

flat_pred_dev = clean_and_flat_preds(predictions_dev, tokenized_x_dev)
y_predictions_dev = mlb.transform([[tag] for tag in flat_pred_dev])

flat_pred_test = clean_and_flat_preds(predictions_test, tokenized_x_test)
y_predictions_test = mlb.transform([[tag] for tag in flat_pred_test])


# The model is not needed past this point; drop the name and ask for memory to be reclaimed.
del model
gc.collect()
torch.cuda.empty_cache()

# Metrics

Precision, Recall, F1 and AUC scores for the fine-tuned DistilBERT classifier

In [15]:
def prec_rec_auc(y_true, y_pred, N_CLASSES):
  """Compute a precision-recall curve and its area for each class column.

  Args:
    y_true: 2-D indicator array of reference labels, one column per class.
    y_pred: 2-D array of predictions with the same column layout.
    N_CLASSES: number of leading columns to evaluate.

  Returns:
    ``(precision, recall, auc_score)``: the first two are dicts keyed by
    column index, the last is keyed by the class name in ``mlb.classes_``.
  """
  precision, recall, auc_score = {}, {}, {}

  for class_idx in range(N_CLASSES):
    curve = precision_recall_curve(y_true[:, class_idx], y_pred[:, class_idx])
    precision[class_idx], recall[class_idx] = curve[0], curve[1]
    auc_score[mlb.classes_[class_idx]] = auc(recall[class_idx], precision[class_idx])

  return precision, recall, auc_score


def macro_auc(auc_scores):
  """Return the unweighted mean of the values stored in ``auc_scores``."""
  per_class_values = [auc_scores[label] for label in auc_scores]
  return np.mean(np.array(per_class_values))



# The one-hot references are cut to the number of predictions that were evaluated.
y_true_train = y_train_1_hot[:len(y_predictions_train)]
y_true_dev = y_dev_1_hot[:len(y_predictions_dev)]
y_true_test = y_test_1_hot[:len(y_predictions_test)]

# Only the per-class AUC dict of each split is kept.
_, _, distillbert_auc_train = prec_rec_auc(y_true_train, y_predictions_train, N_CLASSES)
_, _, distillbert_auc_dev = prec_rec_auc(y_true_dev, y_predictions_dev, N_CLASSES)
_, _, distillbert_auc_test = prec_rec_auc(y_true_test, y_predictions_test, N_CLASSES)


print('\n--------- Fine-tuned Distilled BERT ---------\n')
# Same report layout for every split: heading, per-class report, per-class AUC, macro AUC.
for heading, auc_label, y_true_split, y_pred_split, auc_by_class in (
    ('Training set\n', 'AUC training :', y_true_train, y_predictions_train, distillbert_auc_train),
    ('\nDevelopment set\n', 'AUC development :', y_true_dev, y_predictions_dev, distillbert_auc_dev),
    ('\nTest set\n', 'AUC test :', y_true_test, y_predictions_test, distillbert_auc_test),
):
  print(heading)
  print(classification_report(y_true_split, y_pred_split, target_names = mlb.classes_, zero_division=0.0))
  print(auc_label, auc_by_class, '\n')
  print('Macro AUC :', macro_auc(auc_by_class), '\n')


--------- Fine-tuned Distilled BERT ---------

Training set

              precision    recall  f1-score   support

         ADJ       0.94      0.94      0.94      1894
         ADP       0.99      1.00      0.99      2930
         ADV       0.96      0.96      0.96       781
         AUX       0.99      1.00      0.99       861
       CCONJ       0.99      0.99      0.99       752
         DET       1.00      1.00      1.00      2126
        INTJ       0.71      1.00      0.83        10
        NOUN       0.97      0.98      0.97      4900
         NUM       0.97      0.99      0.98       914
        PART       0.99      0.99      0.99       432
        PRON       0.99      1.00      1.00      1042
       PROPN       0.93      0.94      0.93      2128
       PUNCT       1.00      1.00      1.00      3733
       SCONJ       0.94      0.92      0.93       332
         SYM       0.98      0.73      0.83        55
        VERB       0.98      0.97      0.97      2153
           X       