In [None]:
import nltk
nltk.download('punkt')

import numpy as np
import tensorflow as tf

from sklearn.metrics import classification_report, auc, precision_recall_curve

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load IMDB Dataset

In [None]:
# Id layout used by the Keras IMDB loader and relied on by the decoding below:
# id 1 marks the start of a review, id 2 stands for out-of-vocabulary words and
# every real word id is its rank in `word_index` shifted by `index_from`.
start_char = 1
oov_char = 2
index_from = 3

# Load the reviews as sequences of integer word ids together with 0/1 labels.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
    path='imdb.npz',
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=start_char,
    oov_char=oov_char,
    index_from=index_from)


word_index = tf.keras.datasets.imdb.get_word_index()

# Reverse lookup (id -> word), shifted by the same offset the loader applied.
inverted_word_index = {rank + index_from: word for word, rank in word_index.items()}

inverted_word_index[start_char] = '[START]'
inverted_word_index[oov_char] = '[OOV]'


def decode_review(encoded_review):
  """Turn one review from a sequence of word ids into a space-separated string.

  Args:
    encoded_review: iterable of integer word ids as produced by `load_data`.

  Returns:
    The words looked up in `inverted_word_index`, joined by single spaces.

  Raises:
    KeyError: if an id is missing from `inverted_word_index`.
  """
  return ' '.join(inverted_word_index[word_id] for word_id in encoded_review)


# Replace every encoded review with its decoded text, in place. A distinct
# index name is used so it is never confused with the word ids of a review.
for doc_idx in range(len(x_train)):
  x_train[doc_idx] = decode_review(x_train[doc_idx])

for doc_idx in range(len(x_test)):
  x_test[doc_idx] = decode_review(x_test[doc_idx])



# Re-split the corpus: everything in the original test split past the first
# HELD_OUT_SIZE documents is appended to the training pool; the first
# HELD_OUT_SIZE documents of that pool become the development set and the first
# HELD_OUT_SIZE documents of the original test split remain the test set.
# NOTE: this rebinds x_train / x_test from their own previous values, so running
# it twice without reloading the data produces different splits.
HELD_OUT_SIZE = 10000

x_pool = np.concatenate((x_train, x_test[HELD_OUT_SIZE:]))
y_pool = np.concatenate((y_train, y_test[HELD_OUT_SIZE:]))

x_dev, y_dev = x_pool[:HELD_OUT_SIZE], y_pool[:HELD_OUT_SIZE]
x_train, y_train = x_pool[HELD_OUT_SIZE:], y_pool[HELD_OUT_SIZE:]

x_test, y_test = x_test[:HELD_OUT_SIZE], y_test[:HELD_OUT_SIZE]

print('\nTraining set shape :', x_train.shape)
print('Development set shape :', x_dev.shape)
print('Test set shape :', x_test.shape)


# flatten_x_train = [token for doc in x_train for token in nltk.tokenize.word_tokenize(doc)]
# print('\nVocabulary size :', len(set(flatten_x_train)) - 3)


def average_doc_length(docs_tokenized):
  """Return the mean number of tokens per document, truncated to an int.

  Args:
    docs_tokenized: iterable of tokenized documents (each a sized sequence).

  Returns:
    The average document length rounded towards zero, or 0 when there are no
    documents (the mean of an empty collection is NaN, which cannot be
    converted to an int).
  """
  doc_sizes = [len(doc) for doc in docs_tokenized]

  if not doc_sizes:
    return 0

  return int(np.mean(np.array(doc_sizes)))


# Tokenize every review with NLTK and drop the first three tokens of each
# document (presumably the pieces of the leading '[START]' marker — TODO confirm).
tokenized_splits = []
for split_docs in (x_train, x_dev, x_test):
  tokenized_splits.append([nltk.tokenize.word_tokenize(doc)[3:] for doc in split_docs])

x_train_tokenized, x_dev_tokenized, x_test_tokenized = tokenized_splits

# Report the average token count of each split.
for caption, split_tokens in (
    ('\nAverage doc length of training set :', x_train_tokenized),
    ('Average doc length of development set :', x_dev_tokenized),
    ('Average doc length of test set :', x_test_tokenized),
):
  print(caption, average_doc_length(split_tokens))


Training set shape : (30000,)
Development set shape : (10000,)
Test set shape : (10000,)

Average doc length of training set : 238
Average doc length of development set : 242
Average doc length of test set : 234


In [None]:
# Display names handed to `classification_report` for the two label values.
target_names = ['negative', 'positive']


# Only a small part of the reviews is used for fine-tuning: the first 1000
# training documents and the first 100 development documents. Each document's
# tokens are joined back into one space-separated string.

train_set = list(map(' '.join, x_train_tokenized[:1000]))
dev_set = list(map(' '.join, x_dev_tokenized[:100]))

def docs_to_dict(x, y):
  """Pair each document with its label as a list of records.

  Args:
    x: indexable, sized collection of documents.
    y: indexable collection of labels; only the first `len(x)` entries are read.

  Returns:
    A list with one `{'text': ..., 'label': ...}` dict per document, in order.
  """
  return [{'text': x[pos], 'label': y[pos]} for pos in range(len(x))]


# Build the record lists for fine-tuning. The label arrays are longer than the
# document subsets; docs_to_dict only reads as many labels as there are documents.
train_dataset = docs_to_dict(x=train_set, y=y_train)
dev_dataset = docs_to_dict(x=dev_set, y=y_dev)

# Show one record as a sanity check.
print(train_dataset[0])

{'text': "i think this is one of the weakest of the kenneth branagh shakespearian works after such great efforts as much ado about nothing etc i thought this was poor the cast was weaker alicia silverstone nivoli mcelhone but my biggest gripe was that they messed with the bard 's work and cut out some of the play to put in the musical dance sequences br br you just do n't do shakespeare and then mess with the play sorry but that is just wrong i love some cole porter just like the next person but jeez do n't mess with the shakespeare skip this and watch prospero 's books if you want to see a brilliant shakespearean adaptation of the tempest", 'label': 0}


In [None]:
!pip install transformers[torch] sentence-transformers datasets
!pip install -U accelerate
!pip install -U transformers



# Tokenize the reviews into the input format expected by DistilBERT




In [None]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import datasets
import gc


# Tokenizer of the same checkpoint that is fine-tuned in the cells below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Called by `Dataset.map(..., batched=True)`, so `examples["text"]` is
    presumably a list of strings. Truncation is enabled; no padding is applied
    here (the `DataCollatorWithPadding` created below handles padding).
    """
    return tokenizer(examples["text"], truncation=True)

# Convert the record lists into `datasets.Dataset` objects and tokenize them.
# The printed result below shows the added `input_ids` / `attention_mask` columns.
tokenized_x_train = datasets.Dataset.from_list(train_dataset).map(preprocess_function, batched=True)
tokenized_x_dev = datasets.Dataset.from_list(dev_dataset).map(preprocess_function, batched=True)

print(tokenized_x_train)


from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a Trainer evaluation pass.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores); the predicted class is the argmax over the
            last axis.

    Returns:
        A dict with the keys 'accuracy' and 'f1'.
    """
    true_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)

    return {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'f1': f1_score(true_labels, predicted_labels, average='weighted'),
    }


# Collator that pads the examples of a batch (the tokenization step applies no
# padding) and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


# Despite its name, this value is used as a count of entries in
# `model.named_parameters()` (weight / bias tensors), not of transformer layers:
# the fine-tuning cells freeze the first `int(num_layers * fraction)` of them.
# NOTE(review): 104 is presumably the number of named parameters of
# DistilBertForSequenceClassification — confirm against the loaded model.
num_layers = 104

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1000
})


# Fine-tune DistilBERT

In [None]:
# Hyperparameter search over the share of parameter tensors to freeze: 40%, 60%, 80%.
# `num_layers` counts entries of `named_parameters()`; the first `portion` of
# them (embeddings first, then the lower transformer blocks) are frozen.

freeze_fractions = [0.4, 0.6, 0.8]
layers_to_freeze = [int(num_layers * fraction) for fraction in freeze_fractions]
metrics = dict()


for fraction, portion in zip(freeze_fractions, layers_to_freeze):

  # The classification head is newly initialised when the checkpoint is loaded
  # (see the load warning). Seeding before instantiation gives every freeze
  # setting the same starting head, so the comparison is reproducible.
  transformers.set_seed(42)
  proxy_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

  print('\n-----Printing frozen layers {}-----'.format(portion))

  for name, param in list(proxy_model.named_parameters())[:portion]:
    if param.requires_grad:
      print(name)
    param.requires_grad = False


  # Checkpoints are written on the same 20-step schedule as evaluation. With
  # the default save interval no checkpoint is written during this short run,
  # so `load_best_model_at_end` would have nothing to restore.
  # `save_total_limit` bounds the disk space used by the checkpoints.
  # NOTE(review): newer transformers releases rename `evaluation_strategy` to
  # `eval_strategy` — confirm against the installed version.
  training_args = TrainingArguments(
    output_dir='./txt_cls_example{}/'.format(portion),
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    logging_steps=20,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=1,
    load_best_model_at_end=True,
  )

  print('\nFine-tuning DistilledBERT frozen:', round(fraction * 100), '%')
  trainer = Trainer(
      proxy_model,
      training_args,
      train_dataset=tokenized_x_train,
      eval_dataset=tokenized_x_dev,
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )


  trainer.train()

  # Development-set metrics for this freeze setting, keyed by the number of
  # frozen parameter tensors.
  metrics[portion] = trainer.evaluate()

  print(metrics[portion])

  # The trainer holds its own reference to the model, so both names must be
  # dropped before the memory can actually be reclaimed.
  del trainer, proxy_model
  gc.collect()
  torch.cuda.empty_cache()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



-----Printing frozen layers 41-----
distilbert.embeddings.word_embeddings.weight
distilbert.embeddings.position_embeddings.weight
distilbert.embeddings.LayerNorm.weight
distilbert.embeddings.LayerNorm.bias
distilbert.transformer.layer.0.attention.q_lin.weight
distilbert.transformer.layer.0.attention.q_lin.bias
distilbert.transformer.layer.0.attention.k_lin.weight
distilbert.transformer.layer.0.attention.k_lin.bias
distilbert.transformer.layer.0.attention.v_lin.weight
distilbert.transformer.layer.0.attention.v_lin.bias
distilbert.transformer.layer.0.attention.out_lin.weight
distilbert.transformer.layer.0.attention.out_lin.bias
distilbert.transformer.layer.0.sa_layer_norm.weight
distilbert.transformer.layer.0.sa_layer_norm.bias
distilbert.transformer.layer.0.ffn.lin1.weight
distilbert.transformer.layer.0.ffn.lin1.bias
distilbert.transformer.layer.0.ffn.lin2.weight
distilbert.transformer.layer.0.ffn.lin2.bias
distilbert.transformer.layer.0.output_layer_norm.weight
distilbert.transformer.

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
20,0.5847,0.290141,0.86,0.860848
40,0.4232,0.282395,0.89,0.890699
60,0.2207,0.234247,0.88,0.880343


{'eval_loss': 0.23243600130081177, 'eval_accuracy': 0.89, 'eval_f1': 0.8901687979539642, 'eval_runtime': 104.8126, 'eval_samples_per_second': 0.954, 'eval_steps_per_second': 0.067, 'epoch': 2.0}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



-----Printing frozen layers 62-----
distilbert.embeddings.word_embeddings.weight
distilbert.embeddings.position_embeddings.weight
distilbert.embeddings.LayerNorm.weight
distilbert.embeddings.LayerNorm.bias
distilbert.transformer.layer.0.attention.q_lin.weight
distilbert.transformer.layer.0.attention.q_lin.bias
distilbert.transformer.layer.0.attention.k_lin.weight
distilbert.transformer.layer.0.attention.k_lin.bias
distilbert.transformer.layer.0.attention.v_lin.weight
distilbert.transformer.layer.0.attention.v_lin.bias
distilbert.transformer.layer.0.attention.out_lin.weight
distilbert.transformer.layer.0.attention.out_lin.bias
distilbert.transformer.layer.0.sa_layer_norm.weight
distilbert.transformer.layer.0.sa_layer_norm.bias
distilbert.transformer.layer.0.ffn.lin1.weight
distilbert.transformer.layer.0.ffn.lin1.bias
distilbert.transformer.layer.0.ffn.lin2.weight
distilbert.transformer.layer.0.ffn.lin2.bias
distilbert.transformer.layer.0.output_layer_norm.weight
distilbert.transformer.

Step,Training Loss,Validation Loss,Accuracy,F1
20,0.5941,0.400121,0.82,0.82
40,0.3816,0.238445,0.92,0.919333
60,0.242,0.216006,0.92,0.919333


{'eval_loss': 0.20826655626296997, 'eval_accuracy': 0.93, 'eval_f1': 0.92958605664488, 'eval_runtime': 165.1124, 'eval_samples_per_second': 0.606, 'eval_steps_per_second': 0.042, 'epoch': 2.0}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



-----Printing frozen layers 83-----
distilbert.embeddings.word_embeddings.weight
distilbert.embeddings.position_embeddings.weight
distilbert.embeddings.LayerNorm.weight
distilbert.embeddings.LayerNorm.bias
distilbert.transformer.layer.0.attention.q_lin.weight
distilbert.transformer.layer.0.attention.q_lin.bias
distilbert.transformer.layer.0.attention.k_lin.weight
distilbert.transformer.layer.0.attention.k_lin.bias
distilbert.transformer.layer.0.attention.v_lin.weight
distilbert.transformer.layer.0.attention.v_lin.bias
distilbert.transformer.layer.0.attention.out_lin.weight
distilbert.transformer.layer.0.attention.out_lin.bias
distilbert.transformer.layer.0.sa_layer_norm.weight
distilbert.transformer.layer.0.sa_layer_norm.bias
distilbert.transformer.layer.0.ffn.lin1.weight
distilbert.transformer.layer.0.ffn.lin1.bias
distilbert.transformer.layer.0.ffn.lin2.weight
distilbert.transformer.layer.0.ffn.lin2.bias
distilbert.transformer.layer.0.output_layer_norm.weight
distilbert.transformer.

Step,Training Loss,Validation Loss,Accuracy,F1
20,0.6539,0.577639,0.85,0.850586
40,0.5152,0.345134,0.87,0.870516
60,0.3726,0.285119,0.89,0.890436


{'eval_loss': 0.2843807339668274, 'eval_accuracy': 0.89, 'eval_f1': 0.8904363747329873, 'eval_runtime': 100.3676, 'eval_samples_per_second': 0.996, 'eval_steps_per_second': 0.07, 'epoch': 2.0}


# Evaluation on Train, Development and Test Datasets

In [None]:
# Final run with the first 60% of the parameter tensors frozen.
# Seed before instantiating the model so the newly initialised classification
# head is reproducible across runs.
transformers.set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

for name, param in list(model.named_parameters())[:int(num_layers * 0.6)]:
    param.requires_grad = False


# Checkpoints are written on the same 20-step schedule as evaluation. With the
# default save interval no checkpoint is written during this short run, so
# `load_best_model_at_end` would have nothing to restore.
# `save_total_limit` bounds the disk space used by the checkpoints.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./txt_cls_exampleBest/',
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    logging_steps=20,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=1,
    load_best_model_at_end=True,
  )

print('\nFine-tuning DistilledBERT frozen: 60 %')


trainer = Trainer(
      model,
      training_args,
      train_dataset=tokenized_x_train,
      eval_dataset=tokenized_x_dev,
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )


trainer.train()

# Development-set metrics of the model left in place after training.
print(trainer.evaluate())


# Evaluate on fixed 100-document slices of each split (deterministic slices,
# not random draws): the first 100 training and development documents and test
# documents 400-499.
# NOTE: this rebinds `train_set` and `tokenized_x_train`, which the fine-tuning
# cells expect to hold the 1000-document training data — re-run the data
# preparation cells before fine-tuning again.

train_set = [' '.join(x_doc) for x_doc in x_train_tokenized[:100]]
dev_set = [' '.join(x_doc) for x_doc in x_dev_tokenized[:100]]
test_set = [' '.join(x_doc) for x_doc in x_test_tokenized[400:500]]


tokenized_x_train = tokenizer(train_set, truncation=True, padding=True, return_tensors='pt')
tokenized_x_dev = tokenizer(dev_set, truncation=True, padding=True, return_tensors='pt')
tokenized_x_test = tokenizer(test_set, truncation=True, padding=True, return_tensors='pt')


def _forward_on_model_device(encoded_batch):
  """Run `model` on a tokenized batch and return its output with CPU logits.

  The tokenizer creates CPU tensors, while the Trainer may have moved the model
  to an accelerator; feeding CPU tensors to such a model raises a device
  mismatch error. The inputs are therefore copied to the model's device, and
  the logits are moved back to the CPU so that later NumPy conversion works.
  """
  inputs = {key: tensor.to(model.device) for key, tensor in encoded_batch.items()}
  outputs = model(**inputs)
  outputs.logits = outputs.logits.cpu()
  return outputs


# Inference only: evaluation mode and no gradient tracking.
model.eval()
with torch.no_grad():
  predictions_train = _forward_on_model_device(tokenized_x_train)
  predictions_dev = _forward_on_model_device(tokenized_x_dev)
  predictions_test = _forward_on_model_device(tokenized_x_test)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Fine-tuning DistilledBERT frozen: 60 %


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
20,0.6145,0.340392,0.87,0.870717
40,0.4034,0.226958,0.91,0.909011
60,0.252,0.20608,0.92,0.91889


{'eval_loss': 0.1954440474510193, 'eval_accuracy': 0.92, 'eval_f1': 0.918890290037831, 'eval_runtime': 115.4738, 'eval_samples_per_second': 0.866, 'eval_steps_per_second': 0.061, 'epoch': 2.0}


In [None]:
def softmax(x):
  """Return the softmax of a vector of scores.

  The maximum is subtracted before exponentiating. This leaves the result
  mathematically unchanged but keeps `np.exp` from overflowing to inf (and the
  division from producing NaN) for large scores.

  Args:
    x: array-like of scores; the normalisation runs over all elements.

  Returns:
    An array of the same shape whose entries are non-negative and sum to 1
    (an empty array for empty input).
  """
  x = np.asarray(x)
  if x.size == 0:
    return np.exp(x)

  exponentials = np.exp(x - np.max(x))
  return exponentials / np.sum(exponentials)


def _argmax_labels(model_output):
  """Return, for each row of `model_output.logits`, the index of the largest softmax value."""
  return [np.argmax(softmax(row)) for row in np.array(model_output.logits)]


# Hard class predictions for the three evaluation slices.
y_predictions_train = _argmax_labels(predictions_train)
y_predictions_dev = _argmax_labels(predictions_dev)
y_predictions_test = _argmax_labels(predictions_test)


# Drop the model name and ask Python / CUDA to release cached memory.
# NOTE(review): `trainer` presumably still references the model, so the memory
# may only be released once that object is dropped as well — confirm.
del model
gc.collect()
torch.cuda.empty_cache()

# Metrics

Precision, recall, F1 and precision-recall AUC scores for the fine-tuned DistilBERT classifier

In [None]:
def _positive_class_scores(model_output):
  """Return the softmax probability of class 1 for every row of the logits."""
  probabilities = torch.softmax(model_output.logits.detach().float().cpu(), dim=-1)
  return probabilities[:, 1].numpy()


# The precision-recall curve needs a continuous score per document. Hard 0/1
# predictions collapse it to a single operating point, so the area under it
# says little. The positive-class probability is used as the score instead.
# The classification reports still use the hard predictions.
y_scores_train = _positive_class_scores(predictions_train)
y_scores_dev = _positive_class_scores(predictions_dev)
y_scores_test = _positive_class_scores(predictions_test)

distillbert_precision_train, distillbert_recall_train, thresholds = precision_recall_curve(y_train[:100], y_scores_train)

distillbert_precision_dev, distillbert_recall_dev, thresholds = precision_recall_curve(y_dev[:100], y_scores_dev)

distillbert_precision_test, distillbert_recall_test, thresholds = precision_recall_curve(y_test[400:500], y_scores_test)


# The "AUC" printed below is the area under the precision-recall curve.
print('\n--------- Fine-tuned Distilled BERT ---------\n')
print('Training set\n')
print(classification_report(y_train[:100], y_predictions_train, target_names = target_names))
print('AUC training :', auc(distillbert_recall_train, distillbert_precision_train), '\n')
print('\nDevelopment set\n')
print(classification_report(y_dev[:100], y_predictions_dev, target_names = target_names))
print('AUC development :', auc(distillbert_recall_dev, distillbert_precision_dev), '\n')
print('\nTest set\n')
print(classification_report(y_test[400:500], y_predictions_test, target_names = target_names))
print('AUC test :', auc(distillbert_recall_test, distillbert_precision_test))


--------- Fine-tuned Distilled BERT ---------

Training set

              precision    recall  f1-score   support

    negative       0.92      0.92      0.92        59
    positive       0.88      0.88      0.88        41

    accuracy                           0.90       100
   macro avg       0.90      0.90      0.90       100
weighted avg       0.90      0.90      0.90       100

AUC training : 0.9030487804878049 


Development set

              precision    recall  f1-score   support

    negative       0.93      0.91      0.92        46
    positive       0.93      0.94      0.94        54

    accuracy                           0.93       100
   macro avg       0.93      0.93      0.93       100
weighted avg       0.93      0.93      0.93       100

AUC development : 0.9508585858585858 


Test set

              precision    recall  f1-score   support

    negative       0.92      0.84      0.88        55
    positive       0.82      0.91      0.86        45

    accuracy    