## Imports

In [1]:
import sklearn.metrics as sk_metrics
import sklearn.model_selection as sk
import seaborn
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import csv
import os
import collections
from google.colab import files
from google.colab import drive

## Nombre

In [35]:
#nombre_archivo = "resultados_llama"
#carpeta = "clasificacion llama"

#nombre_archivo = "resultados_orca"
#carpeta = "clasificacion orca"

nombre_archivo = "resultados_bert"
carpeta = "bert"

folder_name = "mineria de datos/resultados/"
exp_min = 0
exp_max = 1

## Carga de los datos y division del data set

In [3]:
# Check if the zip file is present and has been unzipped
if not os.path.exists("cefr-levelled-english-texts.zip"):
    # Download the dataset if the zip file is not present
    !kaggle datasets download -d amontgomerie/cefr-levelled-english-texts

if not os.path.exists("cefr_leveled_texts.csv"):  # Adjust this to match the folder name after unzipping
    # Unzip the file if the unzipped folder does not exist
    !unzip cefr-levelled-english-texts.zip

Dataset URL: https://www.kaggle.com/datasets/amontgomerie/cefr-levelled-english-texts
License(s): CC0-1.0
Downloading cefr-levelled-english-texts.zip to /content
  0% 0.00/1.36M [00:00<?, ?B/s]
100% 1.36M/1.36M [00:00<00:00, 61.4MB/s]
Archive:  cefr-levelled-english-texts.zip
  inflating: cefr_leveled_texts.csv  


In [4]:
# Load the CSV file into a DataFrame
df = pd.read_csv('cefr_leveled_texts.csv')

# Get the minimum number of samples in any class
min_samples = df['label'].value_counts().min()

# Downsample each class to have the same number of samples as the smallest class
df_balanced = df.groupby('label').apply(lambda x: x.sample(n=min_samples, random_state=60)).reset_index(drop=True)

distribution = df_balanced['label'].value_counts()
train, div = sk.train_test_split(df_balanced, test_size=0.2, random_state=70)
dev, holdout = sk.train_test_split(div, test_size=0.5, random_state=50)

train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)
holdout = holdout.reset_index(drop=True)

  df_balanced = df.groupby('label').apply(lambda x: x.sample(n=min_samples, random_state=60)).reset_index(drop=True)


In [5]:
true_list = pd.concat([dev, train])
true_labels = true_list['label'].tolist()

## Carga de los experimentos

In [36]:
drive.mount('/content/drive', force_remount=False)
folder_root = "/content/drive/My Drive/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
file_path = os.path.join(folder_root, folder_name, carpeta)

all_results = []
for i in range (exp_min, exp_max + 1):
  file_name= file_path + f"/experimento_{i}.csv"
  df = pd.read_csv(file_name)
  labels_list = df['Predicted Labels'].tolist()
  all_results.append(labels_list)

## Metricas de Interes

In [41]:
# Map CEFR levels to numerical values
cefr_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}

def weighted_hit_ratio(true_labels, predicted_labels, weight_M):
    total_hits = 0
    total_penalty = 0

    for true_label, predicted_label in zip(true_labels, predicted_labels):

        if predicted_label == 'Unknown' or predicted_label == 'LABEL_0':
            continue

        true_value = cefr_mapping[true_label]
        predicted_value = cefr_mapping[predicted_label]

        # Calculate the distance between actual and predicted classes
        distance = abs(true_value - predicted_value)

        if distance == 0:
            # It's a hit, so add weight M
            total_hits += weight_M
        else:
            # It's an error, so add the penalty (the distance)
            total_penalty += distance

    # Weighted hit ratio = Total hits / (Total hits + Total penalties)
    if total_hits + total_penalty == 0:
        return 0  # To avoid division by zero
    return total_hits / (total_hits + total_penalty)

In [42]:
def aproximate_acuracy(true_labels, predicted_labels):
    total_hits = 0
    total_penalty = 0

    for true_label, predicted_label in zip(true_labels, predicted_labels):

        if predicted_label == 'Unknown' or predicted_label == 'LABEL_0':
            continue

        true_value = cefr_mapping[true_label]
        predicted_value = cefr_mapping[predicted_label]

        # Calculate the distance between actual and predicted classes
        distance = abs(true_value - predicted_value)

        if distance == 0:
            # It's a hit, so add weight M
            total_hits += 1
        elif distance == 1:
            total_hits += 0.5

    return total_hits / len(true_labels)

## Calcular metricas y armar csv

In [43]:
# Initialize a list to store metrics for each experiment
metrics_data = []

hit_weight = 2

# Define class labels for F1 scores per class
class_labels = ["A1", "A2", "B1", "B2", "C1", "C2", 'Unknown']

for idx, results in enumerate(all_results):
  #calculate accuracy
  accuracy = sk_metrics.accuracy_score(true_labels, results)

  #calculate weighted_hit_ratio
  hit_ratio = weighted_hit_ratio(true_labels, results, hit_weight)
  aproximate = aproximate_acuracy(true_labels, results)

  # Calculate F1 scores
  micro_f1 = sk_metrics.f1_score(true_labels, results, average='micro')
  macro_f1 = sk_metrics.f1_score(true_labels, results, average='macro')

  # Calculate F1 scores for each class
  f1_per_class = sk_metrics.f1_score(true_labels, results, labels=["A1", "A2", "B1", "B2", "C1", "C2", 'Unknown'], average=None)

  # Calculate precision and recall for each class
  precision_per_class = sk_metrics.precision_score(true_labels, results, labels=class_labels, average=None)
  recall_per_class = sk_metrics.recall_score(true_labels, results, labels=class_labels, average=None)

  # Count the predicted labels for each class
  predicted_counts = collections.Counter(results)
  true_counts = collections.Counter(true_labels)

  # Ensure that all class labels are present in the counts, even if some classes are missing
  predicted_counts = [predicted_counts.get(label, 0) for label in class_labels]
  true_counts = [true_counts.get(label, 0) for label in class_labels]

  # Append the metrics to the list (one row for each experiment)
  # For each class, the order will be: Predicted Count, Precision, Recall, F1
  row = [f'Experiment {idx}', accuracy, aproximate, hit_ratio, macro_f1, micro_f1]  # Common metrics
  for i in range(len(class_labels)):
      row += [predicted_counts[i], true_counts[i], precision_per_class[i], recall_per_class[i], f1_per_class[i]]

  metrics_data.append(row)


# Save metrics to a CSV file
filename = file_path + f"/{nombre_archivo}.csv"
with open(filename, 'w', newline='') as csvfile:
  writer = csv.writer(csvfile)

  # Write the header
  header = ['Experiment', 'Accuracy', 'Accuracy aproximada', 'Accuracy Pesada', 'Macro-F1', 'Micro-F1']
  for label in class_labels:
      header += [f'{label} Predicted Count', f'{label} cantidad de mustras de una clase en el experimento', f'{label} Precision', f'{label} Recall', f'{label} F1']

  writer.writerow(header)

  # Write the metrics data
  writer.writerows(metrics_data)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Matriz de confucion

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
for idx, results in enumerate(all_results):
  # Compute the confusion matrix
  conf_matrix = sk_metrics.confusion_matrix(true_labels, results)

  # Plot the confusion matrix as a heatmap
  plt.figure(figsize=(8, 6))
  seaborn.heatmap(
      conf_matrix, annot=True, fmt='d', cmap='Blues',
      xticklabels=numpy.unique(true_labels),
      yticklabels=numpy.unique(true_labels))
  plt.xlabel('Predicted label')
  plt.ylabel('True label')
  plt.title(f'Confusion Matrix para exp: {idx}')

  file_path = f'/content/drive/My Drive/mineria de datos/matriz de confucion/llama/{idx}.png'  # Change path as needed
  plt.savefig(file_path, dpi=600)
  print(f"Plot saved to {file_path}")