# Evaluating a Classifier based on deep learning

This notebook is based on a fine-tuned distilBERT model. Please execute this notebook `model_2_deep_learning_training.ipynb` in advance.

## Import necessary dependencies and data

In [1]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, auc, roc_curve
import os
import tensorflow as tf
import numpy as np
import json
import pandas as pd

## Load the Fine-Tuned Model

In [2]:
# Load the Model
model_path = os.path.join(os.curdir, 'models', 'model_2_deep_learning_customized.h5')
try:
    model = tf.keras.models.load_model(model_path)
    model.summary()
except OSError as e:
    print(f"Error loading model: {e}")



## Load the Validation Set and the Test Set Data for Evaluation

In [3]:
DATA_ROOT = os.path.realpath(os.path.join(os.path.dirname(os.curdir), '..', 'data'))
DATASET_PATH = os.path.join(DATA_ROOT, 'Numpy Data')
X_val = np.load(os.path.join(DATASET_PATH, 'Text', 'X_val_text.npy'))
X_test = np.load(os.path.join(DATASET_PATH, 'Text', 'X_test_text.npy'))
y_val = np.load(os.path.join(DATASET_PATH, 'y_val_text.npy'))
assert X_val.shape[0] == y_val.shape[0]

In [4]:
# Inspect the Shapes of the data
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")

X_val shape: (100, 3, 768)
y_val shape: (100,)
X_test shape: (18679, 3, 768)


## Make Predictions based on the Model

In [5]:
# Reshape inputs to include the time-step dimension
input_title = tf.expand_dims(X_val[:, 0, :], axis=1)  # Shape: (batch_size, 1, 768)
input_content = tf.expand_dims(X_val[:, 1, :], axis=1)  # Shape: (batch_size, 1, 768)
input_hashtags = tf.expand_dims(X_val[:, 2, :], axis=1)  # Shape: (batch_size, 1, 768)

# Pass the reshaped inputs as a list to the model
pred_val = model.predict([input_title, input_content, input_hashtags])

print(f"pred_val shape: {pred_val.shape}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 190ms/step
pred_val shape: (100, 1)


In [6]:
# Shape of ground truth labels
y_val.shape

(100,)

In [7]:
print(f"A value of prediction: {pred_val[0]}")
print(f"A value of ground truth: {y_val[0]}")

A value of prediction: [1.]
A value of ground truth: 1


Looks like the prediction is a float ranging between 0 and 1. It could possibly be the confidence (or the probability) that the model believes the text to be in class 1 (i.e., suicidal).

In [8]:
# Flatten the predictions 
pred_val = pred_val.flatten()
pred_val.shape

(100,)

In [9]:
# Convert the predictions to binary values
pred_val_binary = np.round(pred_val).astype(int)
np.unique(pred_val_binary)

array([1])

## Evaluate the Prediction on the Validation Set

In [10]:
acc = accuracy_score(y_val, pred_val_binary)
prec = precision_score(y_val, pred_val_binary)
rec = recall_score(y_val, pred_val_binary)
macro_f1 = f1_score(y_val, pred_val_binary, average='macro')
micro_f1 = f1_score(y_val, pred_val_binary, average='micro')
fpr, tpr, _ = roc_curve(y_val, pred_val_binary)
auc_score = auc(fpr, tpr)
if not isinstance(auc_score, int):
    auc_score = 0


print(f"Validation Set Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"Macro F1: {macro_f1:.4f}")
print(f"Micro F1: {micro_f1:.4f}")
print(f"ROC-AUC: {auc_score:.4f}")


Validation Set Metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
Macro F1: 1.0000
Micro F1: 1.0000
ROC-AUC: 0.0000




This results however makes sense because the dataset Reddit SuicideWatch is a dataset with completely suicidal posts. But then since the ROC-AUC = 0, we are unable to plot the graph.

## Evaluation on the Test Set

In [11]:
# Reshape inputs to include the time-step dimension
input_title = tf.expand_dims(X_test[:, 0, :], axis=1)  # Shape: (batch_size, 1, 768)
input_content = tf.expand_dims(X_test[:, 1, :], axis=1)  # Shape: (batch_size, 1, 768)
input_hashtags = tf.expand_dims(X_test[:, 2, :], axis=1)  # Shape: (batch_size, 1, 768)

# Pass the reshaped inputs as a list to the model
pred_test = model.predict([input_title, input_content, input_hashtags])

print(f"pred_val shape: {pred_test.shape}")

[1m584/584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
pred_val shape: (18679, 1)


In [12]:
print(f"A sample of prediction: {pred_test[0]}")

A sample of prediction: [0.99999905]


In [13]:
# Flatten the predictions 
pred_test = pred_test.flatten()
pred_test.shape

(18679,)

In [14]:
# Convert the predictions to binary values
pred_test_binary = np.round(pred_test).astype(int)
np.unique(pred_test_binary)

array([0, 1])

In [15]:
if not os.path.exists("Results"):
    os.makedirs("Results")

In [16]:
# Load original test data (raw text)
TEST_DATA_PATH = os.path.join(DATA_ROOT, 'Depression_Tweets')
test_data = pd.read_json(os.path.join(TEST_DATA_PATH, 'depression_json'))
print(f"Number of records: {len(test_data)}")

Number of records: 18679


In [17]:
# Prepare results in the desired format
results = []
for idx, (text, label) in enumerate(zip(test_data['content'], pred_test_binary)):
    results.append({
        "id": idx,
        "predicted_label": int(label),
        "raw_text": text
    })

# Define the output file path
output_file_path = os.path.join("Results", "Result_deep_learning_customized.jsonl")

# Save the results to a JSONL file
with open(output_file_path, "w") as jsonl_file:
    for record in results:
        jsonl_file.write(json.dumps(record) + "\n")

print(f"Results saved to {output_file_path}")


Results saved to Results\Result_deep_learning_customized.jsonl
