In [3]:
from pathlib import Path 
import json
from transformers import pipeline

from src.nli import classify_intent
from src.util import load_config

# Project configuration, read from the YAML file under configs/.
# NOTE(review): `config` is not referenced in the cells visible here — confirm
# it is still needed by later cells.
config = load_config(Path("configs/config.yaml"))

# Locations of the local model checkpoint and the labelled evaluation set.
# NOTE(review): the model path is relative to the parent directory ("../data/...")
# while the config and evaluation paths are relative to the working directory
# ("configs/...", "data/...") — confirm both resolve from the same cwd.
MODEL_PATH = "../data/models/deberta-v3-large-zeroshot-v1.1-all-33"
EVALUATION_PATH = "data/evaluation/intent_classification.json"

# `pipeline` comes from the import block at the top of this cell.
zeroshot_classifier = pipeline("zero-shot-classification", model=MODEL_PATH)

# Evaluation samples; the classification cell unpacks each entry as a
# (text, expected_intent) pair.
with open(EVALUATION_PATH, "r", encoding="utf-8") as infile:
    data = json.load(infile)


In [4]:
import time

# Run the zero-shot intent classifier over every evaluation sample, collecting
# the predicted and the expected intent side by side, and time the whole pass.
start = time.perf_counter()  # monotonic timer: immune to system clock adjustments
predictions, references = [], []
for text, expected_intent in data:
    # classify_intent also returns a score; only the label is evaluated below.
    intent, score = classify_intent(text, zeroshot_classifier)
    predictions.append(intent)
    references.append(expected_intent)
runtime = time.perf_counter() - start

# Average seconds per sample; guarded so an empty evaluation set does not
# raise ZeroDivisionError.
seconds_per_sample = runtime / len(data) if data else 0.0

# `.2f` (precision) rather than `2f` (minimum width), so both numbers are
# printed with two decimals.
print(f"runtime {runtime:.2f} - {seconds_per_sample:.2f}")
print("classified ", len(predictions), " documents")

[38;20m2024-02-03 13:10:05,888 - src.nli - INFO - classified intent: conversation 0.87 (nli.py:42)[0m
[38;20m2024-02-03 13:10:10,573 - src.nli - INFO - classified intent: conversation 0.96 (nli.py:42)[0m
[38;20m2024-02-03 13:10:15,781 - src.nli - INFO - classified intent: conversation 0.45 (nli.py:42)[0m
[38;20m2024-02-03 13:10:21,000 - src.nli - INFO - classified intent: conversation 0.99 (nli.py:42)[0m
[38;20m2024-02-03 13:10:25,909 - src.nli - INFO - classified intent: conversation 0.67 (nli.py:42)[0m
[38;20m2024-02-03 13:10:30,731 - src.nli - INFO - classified intent: conversation 0.99 (nli.py:42)[0m
[38;20m2024-02-03 13:10:35,576 - src.nli - INFO - classified intent: rules 0.35 (nli.py:42)[0m
[38;20m2024-02-03 13:10:40,743 - src.nli - INFO - classified intent: conversation 0.82 (nli.py:42)[0m
[38;20m2024-02-03 13:10:45,632 - src.nli - INFO - classified intent: conversation 0.92 (nli.py:42)[0m
[38;20m2024-02-03 13:10:54,262 - src.nli - INFO - classified intent: d

runtime 314.84 - 5.523514
classified  57  documents


In [5]:
from sklearn.metrics import f1_score, confusion_matrix, recall_score
import pandas as pd

# The position of a label in this list is its integer class id.
classes = ["deckbuilding", "rules", "conversation"]
label_ids = list(range(len(classes)))

# Encode the string labels as class ids; a label missing from `classes`
# raises ValueError.
prediction_classes = list(map(classes.index, predictions))
reference_classes = list(map(classes.index, references))

print(classes)

# average=None returns one score per class, in the order given by `labels`.
per_class_kwargs = dict(
    y_true=reference_classes,
    y_pred=prediction_classes,
    labels=label_ids,
    average=None,
)
f1_scores = f1_score(**per_class_kwargs)
recall_scores = recall_score(**per_class_kwargs)

# One row per class, weakest F1 first.
evaluation = pd.DataFrame(
    {"labels": classes, "f1": f1_scores, "recall": recall_scores}
).sort_values(by="f1")
evaluation

['deckbuilding', 'rules', 'conversation']


Unnamed: 0,labels,f1,recall
2,conversation,0.8,0.666667
0,deckbuilding,0.837209,1.0
1,rules,0.888889,0.888889


In [6]:
# Confusion matrix over the integer-encoded labels: rows are the reference
# (true) intents, columns the predicted intents, both in `classes` order.
label_ids = list(range(len(classes)))
matrix = confusion_matrix(
    y_true=reference_classes,
    y_pred=prediction_classes,
    labels=label_ids,
)
pd.DataFrame(data=matrix, index=classes, columns=classes)

Unnamed: 0,deckbuilding,rules,conversation
deckbuilding,18,0,0
rules,2,16,0
conversation,5,2,14
