# Uploading classified data

In [92]:
from google.colab import files
import zipfile
import os

import pandas as pd
import glob
from sklearn.metrics import (
    accuracy_score, confusion_matrix, precision_score,
    recall_score, f1_score, classification_report
)

In [93]:
uploaded = files.upload()

Saving mistral-large-instruct.zip to mistral-large-instruct.zip


In [94]:
zip_path = "mistral-large-instruct.zip"

with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall('/content/')

# Intial filtering - which data is avaiable in Gold standard

In [95]:
# Loading gold dataset
gold = pd.read_excel("Gold_standard_Mirela.xlsx")

# Loading prediction files
pred_files = glob.glob("mistral-large-instruct/*.xlsx")

all_predictions = []

for f in pred_files:
    df = pd.read_excel(f)
    df['Source_File'] = f
    all_predictions.append(df)

pred = pd.concat(all_predictions, ignore_index=True)


In [96]:
pred = pred.rename(columns={"text": "Sentence"})

#merging on sentence in both (Gold standard and ALL other files)
merged = gold.merge(
    pred,
    on=["Sentence"],
    suffixes=("_gold", "_pred"),
    how="left"
)


In [99]:
merged.to_csv("mistral-large-instruct.csv")

In [97]:
merged = merged.fillna(0)


# Binary Evaulation

In [100]:
y_true_eval = merged["Evaluative"]
y_pred_eval = merged["binary"].astype(int)


In [101]:
print("=== Evaluative (Binary) ===")
print("Accuracy:", accuracy_score(y_true_eval, y_pred_eval))
print("Confusion Matrix:\n", confusion_matrix(y_true_eval, y_pred_eval, labels=[0,1]))

# Full per-class report
print("\nClassification Report:\n")
print(classification_report(y_true_eval, y_pred_eval, labels=[0,1]))

# ----- Precision / Recall / F1 -----
print("\n=== Metrics ===")

print("Precision (macro):", precision_score(y_true_eval, y_pred_eval, average='macro'))
print("Recall (macro):", recall_score(y_true_eval, y_pred_eval, average='macro'))
print("F1-score (macro):", f1_score(y_true_eval, y_pred_eval, average='macro'))

print("\nPrecision (weighted):", precision_score(y_true_eval, y_pred_eval, average='weighted'))
print("Recall   (weighted):", recall_score(y_true_eval, y_pred_eval, average='weighted'))
print("F1-score (weighted):", f1_score(y_true_eval, y_pred_eval, average='weighted'))


=== Evaluative (Binary) ===
Accuracy: 0.6607142857142857
Confusion Matrix:
 [[164  14]
 [138 132]]

Classification Report:

              precision    recall  f1-score   support

           0       0.54      0.92      0.68       178
           1       0.90      0.49      0.63       270

    accuracy                           0.66       448
   macro avg       0.72      0.71      0.66       448
weighted avg       0.76      0.66      0.65       448


=== Metrics ===
Precision (macro): 0.7235779733284949
Recall (macro): 0.7051186017478153
F1-score (macro): 0.658974358974359

Precision (weighted): 0.7606514301266183
Recall   (weighted): 0.6607142857142857
F1-score (weighted): 0.6539720695970697


# Multiclass classification (Without Ambiguous)

In [102]:
multi_cols = ["Affect", "Judgement", "Appreciation"]

In [103]:
pred_cols = [
    "multiclass_probability_affect",
    "multiclass_probability_judgment",
    "multiclass_probability_appreciation"
]

merged[pred_cols] = (merged[pred_cols] >= 0.5).astype(int)
merged["multiclass_ambiguity"] = (merged[pred_cols] == 1).sum(axis=1).gt(1).astype(int)


In [104]:
multi_cols = ["Affect", "Judgement", "Appreciation"]

In [105]:
pred_cols = [
    "multiclass_probability_affect",
    "multiclass_probability_judgment",
    "multiclass_probability_appreciation",
]

# Ambiguous if more than one predicted label is 1
merged["multiclass_ambiguity"] = (merged[pred_cols] == 1).sum(axis=1).gt(1).astype(int)

# Filter: non-ambiguous in gold + non-ambiguous in multiclass prediction
non_amb = merged[(merged["Ambiguous"] == 0)]

# non_amb = merged[(merged["Ambiguous"] == 0) & (merged["multiclass_ambiguity"] == 0) - we can take this as well

In [106]:
# --- GOLD (one-hot among Affect / Judgement / Appreciation) ---
y_true_mc = non_amb[multi_cols].idxmax(axis=1)

col_to_label = {
    "multiclass_probability_affect": "Affect",
    "multiclass_probability_judgment": "Judgement",
    "multiclass_probability_appreciation": "Appreciation",
}

y_pred_mc = non_amb[pred_cols].idxmax(axis=1).map(col_to_label)


In [107]:
print("=== Multiclass (Affect vs Judgement vs Appreciation) ===")
print("Accuracy:", accuracy_score(y_true_mc, y_pred_mc))
print("Confusion Matrix:\n", confusion_matrix(y_true_mc, y_pred_mc, labels=multi_cols))

# Full per-class report
print("\nClassification Report:\n")
print(classification_report(y_true_mc, y_pred_mc, labels=multi_cols))

# ----- Precision / Recall / F1 -----
print("\n=== Metrics ===")

print("Precision (macro):", precision_score(y_true_mc, y_pred_mc, average='macro'))
print("Recall (macro):", recall_score(y_true_mc, y_pred_mc, average='macro'))
print("F1-score (macro):", f1_score(y_true_mc, y_pred_mc, average='macro'))

print("\nPrecision (weighted):", precision_score(y_true_mc, y_pred_mc, average='weighted'))
print("Recall   (weighted):", recall_score(y_true_mc, y_pred_mc, average='weighted'))
print("F1-score (weighted):", f1_score(y_true_mc, y_pred_mc, average='weighted'))


=== Multiclass (Affect vs Judgement vs Appreciation) ===
Accuracy: 0.6057007125890737
Confusion Matrix:
 [[198   3  18]
 [ 68  22  21]
 [ 53   3  35]]

Classification Report:

              precision    recall  f1-score   support

      Affect       0.62      0.90      0.74       219
   Judgement       0.79      0.20      0.32       111
Appreciation       0.47      0.38      0.42        91

    accuracy                           0.61       421
   macro avg       0.63      0.50      0.49       421
weighted avg       0.63      0.61      0.56       421


=== Metrics ===
Precision (macro): 0.6264589712865575
Recall (macro): 0.4956410572848929
F1-score (macro): 0.49228288879541854

Precision (weighted): 0.6322704530583965
Recall   (weighted): 0.6057007125890737
F1-score (weighted): 0.5580517274960747


# Multi Label Classification - With Ambiguous

In [108]:
amb = merged[merged["Ambiguous"] == 1]

y_true_ml = amb[multi_cols].values

y_pred_ml = (amb[pred_cols] >= 0.5).astype(int).values


In [109]:

print("=== Multilabel (Ambiguous cases) ===")

print("Accuracy:", accuracy_score(y_true_ml, y_pred_ml))
#print("Confusion Matrix:\n", confusion_matrix(y_true_ml, y_pred_ml, labels=multi_cols))

# Full per-class report
print("\nClassification Report:\n")
print(classification_report(y_true_ml, y_pred_ml, target_names=multi_cols))


print("\n=== Metrics ===")
print("Precision (micro):", precision_score(y_true_ml, y_pred_ml, average="micro"))
print("Recall (micro):", recall_score(y_true_ml, y_pred_ml, average="micro"))
print("F1 (micro):", f1_score(y_true_ml, y_pred_ml, average="micro"))

print("\nPrecision (macro):", precision_score(y_true_ml, y_pred_ml, average="macro"))
print("Recall (macro):", recall_score(y_true_ml, y_pred_ml, average="macro"))
print("F1 (macro):", f1_score(y_true_ml, y_pred_ml, average="macro"))


print("\nPrecision (weighted):", precision_score(y_true_mc, y_pred_mc, average='weighted'))
print("Recall   (weighted):", recall_score(y_true_mc, y_pred_mc, average='weighted'))
print("F1-score (weighted):", f1_score(y_true_mc, y_pred_mc, average='weighted'))



=== Multilabel (Ambiguous cases) ===
Accuracy: 0.0

Classification Report:

              precision    recall  f1-score   support

      Affect       1.00      0.10      0.18        20
   Judgement       1.00      0.11      0.20        18
Appreciation       0.54      0.44      0.48        16

   micro avg       0.65      0.20      0.31        54
   macro avg       0.85      0.22      0.29        54
weighted avg       0.86      0.20      0.28        54
 samples avg       0.41      0.20      0.27        54


=== Metrics ===
Precision (micro): 0.6470588235294118
Recall (micro): 0.2037037037037037
F1 (micro): 0.30985915492957744

Precision (macro): 0.8461538461538461
Recall (macro): 0.2162037037037037
F1 (macro): 0.28819226750261234

Precision (weighted): 0.6322704530583965
Recall   (weighted): 0.6057007125890737
F1-score (weighted): 0.5580517274960747


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
