# Uploading classified data

In [74]:
from google.colab import files
import zipfile
import os

import pandas as pd
import glob
from sklearn.metrics import (
    accuracy_score, confusion_matrix, precision_score,
    recall_score, f1_score, classification_report
)

In [75]:
uploaded = files.upload()

Saving llama-3.3-70b-instruct.zip to llama-3.3-70b-instruct (1).zip


In [76]:
zip_path = "llama-3.3-70b-instruct.zip"

with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall('/content/')

# Intial filtering - which data is avaiable in Gold standard

In [77]:
# Loading gold dataset
gold = pd.read_excel("Gold_standard_Mirela.xlsx")

# Loading prediction files
pred_files = glob.glob("llama-3.3-70b-instruct/*.xlsx")

all_predictions = []

for f in pred_files:
    df = pd.read_excel(f)
    df['Source_File'] = f
    all_predictions.append(df)

pred = pd.concat(all_predictions, ignore_index=True)


In [78]:
pred = pred.rename(columns={"text": "Sentence"})

#merging on sentence in both (Gold standard and ALL other files)
merged = gold.merge(
    pred,
    on=["Sentence"],
    suffixes=("_gold", "_pred"),
    how="left"
)


In [79]:
#merged.to_csv("llama-3.3-70b-instruct.csv")

In [81]:
merged = merged.fillna(0)


# Binary Evaulation

In [82]:
y_true_eval = merged["Evaluative"]
y_pred_eval = merged["binary"].astype(int)


In [83]:
print("=== Evaluative (Binary) ===")
print("Accuracy:", accuracy_score(y_true_eval, y_pred_eval))
print("Confusion Matrix:\n", confusion_matrix(y_true_eval, y_pred_eval, labels=[0,1]))

# Full per-class report
print("\nClassification Report:\n")
print(classification_report(y_true_eval, y_pred_eval, labels=[0,1]))

# ----- Precision / Recall / F1 -----
print("\n=== Metrics ===")

print("Precision (macro):", precision_score(y_true_eval, y_pred_eval, average='macro'))
print("Recall (macro):", recall_score(y_true_eval, y_pred_eval, average='macro'))
print("F1-score (macro):", f1_score(y_true_eval, y_pred_eval, average='macro'))

print("\nPrecision (weighted):", precision_score(y_true_eval, y_pred_eval, average='weighted'))
print("Recall   (weighted):", recall_score(y_true_eval, y_pred_eval, average='weighted'))
print("F1-score (weighted):", f1_score(y_true_eval, y_pred_eval, average='weighted'))


=== Evaluative (Binary) ===
Accuracy: 0.6517857142857143
Confusion Matrix:
 [[162  16]
 [140 130]]

Classification Report:

              precision    recall  f1-score   support

           0       0.54      0.91      0.68       178
           1       0.89      0.48      0.62       270

    accuracy                           0.65       448
   macro avg       0.71      0.70      0.65       448
weighted avg       0.75      0.65      0.64       448


=== Metrics ===
Precision (macro): 0.713417399981856
Recall (macro): 0.6957969205160216
F1-score (macro): 0.65

Precision (weighted): 0.7497642915462475
Recall   (weighted): 0.6517857142857143
F1-score (weighted): 0.6448660714285713


# Multiclass classification (Without Ambiguous)

In [84]:
multi_cols = ["Affect", "Judgement", "Appreciation"]

In [85]:
pred_cols = [
    "multiclass_probability_affect",
    "multiclass_probability_judgment",
    "multiclass_probability_appreciation"
]

merged[pred_cols] = (merged[pred_cols] >= 0.5).astype(int)
merged["multiclass_ambiguity"] = (merged[pred_cols] == 1).sum(axis=1).gt(1).astype(int)


In [86]:
multi_cols = ["Affect", "Judgement", "Appreciation"]

In [87]:
pred_cols = [
    "multiclass_probability_affect",
    "multiclass_probability_judgment",
    "multiclass_probability_appreciation",
]

# Ambiguous if more than one predicted label is 1
merged["multiclass_ambiguity"] = (merged[pred_cols] == 1).sum(axis=1).gt(1).astype(int)

# Filter: non-ambiguous in gold + non-ambiguous in multiclass prediction
non_amb = merged[(merged["Ambiguous"] == 0)]

# non_amb = merged[(merged["Ambiguous"] == 0) & (merged["multiclass_ambiguity"] == 0) - we can take this as well

In [88]:
# --- GOLD (one-hot among Affect / Judgement / Appreciation) ---
y_true_mc = non_amb[multi_cols].idxmax(axis=1)

col_to_label = {
    "multiclass_probability_affect": "Affect",
    "multiclass_probability_judgment": "Judgement",
    "multiclass_probability_appreciation": "Appreciation",
}

y_pred_mc = non_amb[pred_cols].idxmax(axis=1).map(col_to_label)


In [89]:
print("=== Multiclass (Affect vs Judgement vs Appreciation) ===")
print("Accuracy:", accuracy_score(y_true_mc, y_pred_mc))
print("Confusion Matrix:\n", confusion_matrix(y_true_mc, y_pred_mc, labels=multi_cols))

# Full per-class report
print("\nClassification Report:\n")
print(classification_report(y_true_mc, y_pred_mc, labels=multi_cols))

# ----- Precision / Recall / F1 -----
print("\n=== Metrics ===")

print("Precision (macro):", precision_score(y_true_mc, y_pred_mc, average='macro'))
print("Recall (macro):", recall_score(y_true_mc, y_pred_mc, average='macro'))
print("F1-score (macro):", f1_score(y_true_mc, y_pred_mc, average='macro'))

print("\nPrecision (weighted):", precision_score(y_true_mc, y_pred_mc, average='weighted'))
print("Recall   (weighted):", recall_score(y_true_mc, y_pred_mc, average='weighted'))
print("F1-score (weighted):", f1_score(y_true_mc, y_pred_mc, average='weighted'))


=== Multiclass (Affect vs Judgement vs Appreciation) ===
Accuracy: 0.5985748218527316
Confusion Matrix:
 [[197   5  17]
 [ 71  18  22]
 [ 51   3  37]]

Classification Report:

              precision    recall  f1-score   support

      Affect       0.62      0.90      0.73       219
   Judgement       0.69      0.16      0.26       111
Appreciation       0.49      0.41      0.44        91

    accuracy                           0.60       421
   macro avg       0.60      0.49      0.48       421
weighted avg       0.61      0.60      0.55       421


=== Metrics ===
Precision (macro): 0.5989015521683398
Recall (macro): 0.48943298258366746
F1-score (macro): 0.47940983417259037

Precision (weighted): 0.6090102126643333
Recall   (weighted): 0.5985748218527316
F1-score (weighted): 0.5460193257324104


# Multi Label Classification - With Ambiguous

In [90]:
amb = merged[merged["Ambiguous"] == 1]

y_true_ml = amb[multi_cols].values

y_pred_ml = (amb[pred_cols] >= 0.5).astype(int).values


In [91]:

print("=== Multilabel (Ambiguous cases) ===")

print("Accuracy:", accuracy_score(y_true_ml, y_pred_ml))
#print("Confusion Matrix:\n", confusion_matrix(y_true_ml, y_pred_ml, labels=multi_cols))

# Full per-class report
print("\nClassification Report:\n")
print(classification_report(y_true_ml, y_pred_ml, target_names=multi_cols))


print("\n=== Metrics ===")
print("Precision (micro):", precision_score(y_true_ml, y_pred_ml, average="micro"))
print("Recall (micro):", recall_score(y_true_ml, y_pred_ml, average="micro"))
print("F1 (micro):", f1_score(y_true_ml, y_pred_ml, average="micro"))

print("\nPrecision (macro):", precision_score(y_true_ml, y_pred_ml, average="macro"))
print("Recall (macro):", recall_score(y_true_ml, y_pred_ml, average="macro"))
print("F1 (macro):", f1_score(y_true_ml, y_pred_ml, average="macro"))


print("\nPrecision (weighted):", precision_score(y_true_mc, y_pred_mc, average='weighted'))
print("Recall   (weighted):", recall_score(y_true_mc, y_pred_mc, average='weighted'))
print("F1-score (weighted):", f1_score(y_true_mc, y_pred_mc, average='weighted'))



=== Multilabel (Ambiguous cases) ===
Accuracy: 0.0

Classification Report:

              precision    recall  f1-score   support

      Affect       1.00      0.20      0.33        20
   Judgement       1.00      0.06      0.11        18
Appreciation       0.62      0.50      0.55        16

   micro avg       0.72      0.24      0.36        54
   macro avg       0.87      0.25      0.33        54
weighted avg       0.89      0.24      0.32        54
 samples avg       0.48      0.24      0.32        54


=== Metrics ===
Precision (micro): 0.7222222222222222
Recall (micro): 0.24074074074074073
F1 (micro): 0.3611111111111111

Precision (macro): 0.8717948717948718
Recall (macro): 0.2518518518518518
F1 (macro): 0.3301068763863682

Precision (weighted): 0.6090102126643333
Recall   (weighted): 0.5985748218527316
F1-score (weighted): 0.5460193257324104


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
