After having obtained a fine-tuned student model, we now want to evaluate its performance on the non-augmented test set and compare it to the teacher model's performance.

To this end, we compare the predictive distributions of the teacher and the student model on individual input samples (preferably taken from the test set). The two distributions can be compared both visually and quantitatively, the latter by means of the Kullback-Leibler divergence.

In [26]:
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

from src.utils.data import Dataset
from src.data.robustness_study.bert_data_preprocessing import bert_preprocess, get_tf_dataset
from src.models.bert_model import AleatoricMCDropoutBERT, create_bert_config
from src.utils.loss_functions import aleatoric_loss, null_loss

In [27]:
# Read the preprocessed test split: a tab-separated file whose first column
# is used as the DataFrame index.
TEST_CSV_PATH = '../data/robustness_study/preprocessed/test.csv'
df_test = pd.read_csv(TEST_CSV_PATH, sep='\t', index_col=0)
# Show the first rows as a quick sanity check.
df_test.head()

Unnamed: 0,text,target
0,in class still drunk from last night and this ...,1
1,sighs of relief from beijing guoan fans <hasht...,0
2,<user> <user> <user> they do why else do teach...,1
3,preparing to be called a nigger all night i me...,1
4,rt <user> we dont love these hoes <url>,1


In [57]:
# Settings passed to the tokenizer (max_length) and the tf.data pipeline (batch size).
MAX_LENGTH = 48
BATCH_SIZE = 16

# Upper bound on the number of test samples evaluated in this notebook.
subset_size = 25

dataset = Dataset()
dataset.test = df_test

# Draw a reproducible random subset; never request more rows than exist.
n_samples = min(subset_size, len(dataset.test))
dataset.test = dataset.test.sample(n=n_samples, random_state=42)

# Tokenize the subset and wrap it in a batched, cached, prefetching tf.data pipeline.
tokenized_dataset = {'test': bert_preprocess(dataset.test, max_length=MAX_LENGTH)}
test_data = (
    get_tf_dataset(tokenized_dataset, 'test')
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

In [52]:
# Load and initialize the teacher model.
TEACHER_PATH = '../out/bert_teacher/final_hd030_ad020_cd035/model'

# The dropout probabilities are read from the config.json stored in the model directory.
with open(os.path.join(TEACHER_PATH, 'config.json'), 'r') as f:
    teacher_config = json.load(f)

config = create_bert_config(teacher_config['hidden_dropout_prob'],
                            teacher_config['attention_probs_dropout_prob'],
                            teacher_config['classifier_dropout'])

teacher = AleatoricMCDropoutBERT(config=config, custom_loss_fn=aleatoric_loss)
checkpoint_path = os.path.join(TEACHER_PATH, 'cp-{epoch:02d}.ckpt')
checkpoint_dir = os.path.dirname(checkpoint_path)

# tf.train.latest_checkpoint returns None when no checkpoint is found in the directory.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint:
    print("Loading weights from", checkpoint_dir)
    teacher.load_weights(latest_checkpoint)
else:
    # Do not continue silently: every metric computed below would otherwise
    # describe a model whose fine-tuned weights were never restored.
    print("WARNING: no checkpoint found in", checkpoint_dir,
          "- the teacher keeps the weights it was constructed with")

# Compile with the same loss/metric setup for the two named outputs
# ('classifier' and 'log_variance').
teacher.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss={'classifier': aleatoric_loss, 'log_variance': null_loss},
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
    run_eagerly=True,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Loading weights from ../out/bert_teacher/final_hd030_ad020_cd035/model




In [None]:
# Load and initialize the student model.
STUDENT_PATH = '../out/bert_student/aleatoric_and_epistemic/m5_k5/2epochs/model'

with open(os.path.join(STUDENT_PATH, 'config.json'), 'r') as f:
    student_config = json.load(f)

# Build the BERT config from the student's own config.json. The previous
# version loaded student_config but then read the dropout values from
# teacher_config, which ignored the student's settings and made this cell
# depend on the teacher cell having been run first.
# Assumes the student's config.json provides the same three dropout keys as
# the teacher's - TODO confirm.
config = create_bert_config(student_config['hidden_dropout_prob'],
                            student_config['attention_probs_dropout_prob'],
                            student_config['classifier_dropout'])

student = AleatoricMCDropoutBERT(config=config, custom_loss_fn=aleatoric_loss)
checkpoint_path = os.path.join(STUDENT_PATH, 'cp-{epoch:02d}.ckpt')
checkpoint_dir = os.path.dirname(checkpoint_path)

# tf.train.latest_checkpoint returns None when no checkpoint is found in the directory.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint:
    print("Loading weights from", checkpoint_dir)
    student.load_weights(latest_checkpoint)
else:
    # Do not continue silently: every metric computed below would otherwise
    # describe a model whose fine-tuned weights were never restored.
    print("WARNING: no checkpoint found in", checkpoint_dir,
          "- the student keeps the weights it was constructed with")

# Compile with the same loss/metric setup for the two named outputs
# ('classifier' and 'log_variance').
student.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss={'classifier': aleatoric_loss, 'log_variance': null_loss},
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
    run_eagerly=True,
)

##### Test set evaluation: weight averaging (WA) and MC dropout (MCD)

In [80]:
from src.training.train_bert_teacher import compute_metrics, compute_mc_dropout_metrics
from src.distribution_distillation.uncertainty_distillation import compute_student_metrics, compute_student_mc_dropout_metrics

In [55]:
### weight averaging

In [60]:
# Evaluate the teacher model on the test data (weight-averaging section).
# compute_metrics returns a dict holding the labels, predictions and scores.
teacher_metrics_wa = compute_metrics(teacher, test_data)

In [61]:
# Show which entries the teacher's metrics dict provides.
teacher_metrics_wa.keys()

dict_keys(['y_true', 'y_pred', 'y_prob', 'variance', 'average_inference_time', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score', 'nll_score', 'brier_score', 'ece_score'])

In [62]:
# F1 score of the teacher in the weight-averaging evaluation.
teacher_metrics_wa['f1_score']

0.9787234042553191

In [63]:
# Evaluate the student model on the same test data with the same
# compute_metrics helper that was used for the teacher.
student_metrics_wa = compute_metrics(student, test_data)

In [64]:
# F1 score of the student in the weight-averaging evaluation.
student_metrics_wa['f1_score']

0.9787234042553191

In [65]:
### mc dropout

In [66]:
# MC dropout evaluation of the teacher on the test data.
# n=5 is presumably the number of stochastic forward passes - confirm against
# compute_mc_dropout_metrics.
teacher_metrics_mcd = compute_mc_dropout_metrics(teacher, test_data, n=5)

In [67]:
# Show which entries the teacher's MC dropout metrics dict provides.
teacher_metrics_mcd.keys()

dict_keys(['y_true', 'y_pred', 'y_prob', 'variance', 'total_uncertainty', 'average_inference_time', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score', 'nll_score', 'brier_score', 'avg_pred_entropy_score', 'ece_score'])

In [68]:
# F1 score of the teacher in the MC dropout evaluation.
teacher_metrics_mcd['f1_score']

0.9787234042553191

In [71]:
# Average inference time reported by the MC dropout evaluation of the teacher.
# NOTE(review): the unit is not visible here (presumably milliseconds) - confirm
# in compute_mc_dropout_metrics.
teacher_metrics_mcd['average_inference_time']

1899.8313617706299

In [77]:
# Pair of (mean of the 'total_uncertainty' values, 'avg_pred_entropy_score')
# for the teacher's MC dropout evaluation.
np.mean(teacher_metrics_mcd['total_uncertainty']), teacher_metrics_mcd['avg_pred_entropy_score']

(0.30245292514562605, 0.09658627212047577)

## TODO: the student requires a custom function for the MCD metrics that takes body caching into account

In [81]:
# MC dropout evaluation of the student via the student-specific helper.
# NOTE(review): the recorded run of this cell (In [81]) failed with
# "ValueError: too many values to unpack (expected 2)", so it did not assign
# student_metrics_mcd. The student MCD outputs recorded in the cells below carry
# lower execution counts (In [70], [72], [78]) and therefore come from earlier runs.
# The failure is presumably inside compute_student_mc_dropout_metrics - confirm.
student_metrics_mcd = compute_student_mc_dropout_metrics(student, test_data, n=5)

ValueError: too many values to unpack (expected 2)

In [70]:
# F1 score of the student in the MC dropout evaluation.
# NOTE(review): the recorded output is from execution In [70], i.e. from before
# the failing In [81] call above - re-run once that call succeeds.
student_metrics_mcd['f1_score']

0.9787234042553191

In [72]:
# Average inference time reported by the MC dropout evaluation of the student.
# NOTE(review): the unit is not visible here (presumably milliseconds) - confirm.
# The recorded output is from execution In [72], i.e. from before the failing
# In [81] call above.
student_metrics_mcd['average_inference_time']

1937.8099250793457

In [78]:
# Pair of (mean of the 'total_uncertainty' values, 'avg_pred_entropy_score')
# for the student's MC dropout evaluation.
# NOTE(review): the recorded output is from execution In [78], i.e. from before
# the failing In [81] call above.
np.mean(student_metrics_mcd['total_uncertainty']), student_metrics_mcd['avg_pred_entropy_score']

(0.2996355628967285, 0.8415776491165161)

##### Test set evaluation: Predictive distribution