# Evaluating a Classifier based on deep learning

This notebook is based on a fine-tuned distilBERT model. Please execute this notebook `model_2_deep_learning_training.ipynb` in advance.

## Import necessary dependencies and data

In [28]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, auc, roc_curve
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

## Load the Fine-Tuned Model

In [5]:
# Load the Model
model_path = os.path.join(os.curdir, 'models', 'model_2_deep_learning_distilBERT_pretrained_tuned.h5')
try:
    model = tf.keras.models.load_model(model_path)
    model.summary()
except OSError as e:
    print(f"Error loading model: {e}")



## Load the Validation Set and the Test Set Data for Evaluation

In [8]:
DATA_ROOT = os.path.realpath(os.path.join(os.path.dirname(os.curdir), '..', 'data'))
DATASET_PATH = os.path.join(DATA_ROOT, 'Numpy Data')
X_val = np.load(os.path.join(DATASET_PATH, 'Text', 'X_val_text.npy'))
X_test = np.load(os.path.join(DATASET_PATH, 'Text', 'X_test_text.npy'))
y_val = np.load(os.path.join(DATASET_PATH, 'y_val_text.npy'))
assert X_val.shape[0] == y_val.shape[0]

In [9]:
# Inspect the Shapes of the data
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")

X_val shape: (100, 3, 768)
y_val shape: (100,)
X_test shape: (18679, 3, 768)


## Make Predictions based on the Model

In [20]:
pred_val = model.predict(X_val)
print(f"pred_val shape: {pred_val.shape}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
pred_val shape: (100, 1)


In [22]:
# Shape of ground truth labels
y_val.shape

(100,)

In [23]:
print(f"A value of prediction: {pred_val[0]}")
print(f"A value of ground truth: {y_val[0]}")

A value of prediction: [0.99548554]
A value of ground truth: 1


Looks like the prediction is a float ranging between 0 and 1. It could possibly be the confidence (or the probability) that the model believes the text to be in class 1 (i.e., suicidal).

In [None]:
# Flatten the predictions 
pred_val = pred_val.flatten()
pred_val.shape

(100,)

In [None]:
# Round predictions in Validation Set
pred_val_binary = np.where(pred_val > 0.5, 1, 0)
print(f"A value of binary prediction: {pred_val_binary[0]}")

A value of binary prediction: 1


## Evaluate the Prediction on the Validation Set

In [34]:
# Ensure y_val and pred_val_binary have the same number of samples
min_samples = min(len(y_val), len(pred_val_binary))
y_val_trimmed = y_val[:min_samples]
pred_val_binary_trimmed = pred_val_binary[:min_samples]

# Calculate metrics
acc = accuracy_score(y_val_trimmed, pred_val_binary_trimmed)
prec = precision_score(y_val_trimmed, pred_val_binary_trimmed)
rec = recall_score(y_val_trimmed, pred_val_binary_trimmed)
macro_f1 = f1_score(y_val_trimmed, pred_val_binary_trimmed, average='macro')
micro_f1 = f1_score(y_val_trimmed, pred_val_binary_trimmed, average='micro')
auc_score = auc(y_val_trimmed, pred_val_binary_trimmed)

print("Evaluation on Dev Set:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"Macro F1: {macro_f1:.4f}")
print(f"Micro F1: {micro_f1:.4f}")
print(f"ROC AUC: {auc_score:.4f}")


Evaluation on Dev Set:
Accuracy: 0.9300
Precision: 1.0000
Recall: 0.9300
Macro F1: 0.4819
Micro F1: 0.9300
ROC AUC: 0.0000
