# LLaVA evaluation
This notebook performs a simple evaluation of the LLaVA finetuning results. It computes evaluation metrics (accuracy and ROC AUC) on both the train and test sets.

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from warnings import filterwarnings
filterwarnings('ignore')

Train set

In [12]:
# Train split, with finetuning: load the saved predictions.
data = pd.read_json('../../../data/llava/train_predictions_finetuned.json')

# Guard: every raw answer must be exactly 'Yes' or 'No' before it is binarised.
assert data['prediction'].isin(['Yes', 'No']).all()
# Encode 'Yes' as 1 and 'No' as 0.
data['label_pred'] = np.where(data['prediction'].eq('Yes'), 1, 0)

y, y_pred = data['label'], data['label_pred'].astype('Int64')

# NOTE(review): y_pred holds hard 0/1 labels rather than scores, so the ROC AUC
# below is computed from a single operating point — confirm this is intended.
for metric_label, scorer in (('Accuracy:', accuracy_score), ('AUC:', roc_auc_score)):
    print(metric_label, scorer(y, y_pred))

Accuracy: 0.9998823529411764
AUC: 0.9998343822457767


Test set (*dev_seen*)

In [10]:
# dev_seen split, without finetuning: load the saved predictions.
test_original = pd.read_json('../../../data/llava/dev_seen_predictions_not_finetuned.json')

# Guard: every raw answer must be exactly 'Yes' or 'No' before it is binarised.
assert test_original['prediction'].isin(['Yes', 'No']).all()
# Encode 'Yes' as 1 and 'No' as 0.
test_original['label_pred'] = np.where(test_original['prediction'].eq('Yes'), 1, 0)

y, y_pred = test_original['label'], test_original['label_pred'].astype('Int64')

# NOTE(review): y_pred holds hard 0/1 labels rather than scores, so the ROC AUC
# below is computed from a single operating point — confirm this is intended.
for metric_label, scorer in (('Accuracy:', accuracy_score), ('AUC:', roc_auc_score)):
    print(metric_label, scorer(y, y_pred))

Accuracy: 0.584
AUC: 0.5824038661567266


In [11]:
# dev_seen split, with finetuning: load the saved predictions.
test_finetuned = pd.read_json('../../../data/llava/dev_seen_predictions_finetuned.json')

# Guard: every raw answer must be exactly 'Yes' or 'No' before it is binarised.
assert test_finetuned['prediction'].isin(['Yes', 'No']).all()
# Encode 'Yes' as 1 and 'No' as 0.
test_finetuned['label_pred'] = np.where(test_finetuned['prediction'].eq('Yes'), 1, 0)

y, y_pred = test_finetuned['label'], test_finetuned['label_pred'].astype('Int64')

# NOTE(review): y_pred holds hard 0/1 labels rather than scores, so the ROC AUC
# below is computed from a single operating point — confirm this is intended.
for metric_label, scorer in (('Accuracy:', accuracy_score), ('AUC:', roc_auc_score)):
    print(metric_label, scorer(y, y_pred))

Accuracy: 0.714
AUC: 0.711870509353347
