In [None]:
# Download the Spanish spaCy model (es_core_news_md) and the Spanish data for
# spacy_affixes. NOTE(review): presumably both are runtime requirements of
# rantanplan's get_scansion — confirm against the rantanplan documentation.
!python -m spacy download es_core_news_md
!python -m spacy_affixes download es

In [None]:
from rantanplan import get_scansion
import pandas as pd
import re
import json
from tqdm.notebook import tqdm
import sklearn.metrics

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Load the annotation CSV; the cells below read its `Stanza_text` and
# `ST_Correct` columns.
df = pd.read_csv(
    filepath_or_buffer='annotation-final-sorted-filtered.csv',
    sep=',',
)

In [None]:
def clean_text(string):
    """Normalise a raw stanza string before it is analysed.

    The following steps are applied in order:

    1. Surrounding whitespace is stripped.
    2. Symbols that carry no information (``_``, ``-``, ``#`` and the token
       ``x000D``) become a space; ``[``, ``]`` and ``'`` are deleted.
    3. Digits are removed.
    4. Runs of asterisks that end a line (separator lines) are removed while
       keeping a line break in their place.
    5. Tabs become spaces; repeated newlines and repeated spaces are each
       collapsed to a single one.

    Args:
        string: Raw stanza text (must be a ``str``).

    Returns:
        The cleaned text, with one verse per line and no surrounding
        whitespace.
    """
    output = string.strip()

    # Drop symbols that add no information. "x000D" is presumably what is
    # left of an "_x000D_" carriage-return escape once "_" has been replaced
    # (TODO confirm against the data export).
    replacements = (
        ("_", " "), ("x000D", " "), ("-", " "),
        ("[", ""), ("]", ""), ("'", ""), ("#", " "),
    )
    for old, new in replacements:
        output = output.replace(old, new)

    output = re.sub(r'[0-9]', '', output)  # Remove digits
    # Remove asterisk separators. `\s*` also consumes the newline that
    # precedes the asterisks, so the match is replaced by a single newline:
    # substituting '' would glue the previous and the next verse into one line.
    output = re.sub(r'(\s*\*)+\n', '\n', output)
    output = re.sub(r'\t+', ' ', output)  # Tabs -> single space
    output = re.sub(r'\n{2,}', '\n', output)  # Collapse consecutive newlines
    output = re.sub(r' {2,}', ' ', output)  # Collapse consecutive spaces

    return output.strip()

In [None]:
# Replace each raw stanza with its cleaned version (see clean_text).
df["Stanza_text"] = df["Stanza_text"].apply(clean_text)

In [None]:
errors = []     # stanzas for which the analysis raised an exception
eval_dict = {}  # row position -> predicted structure, annotated label, text

# Pair each stanza with its annotated label by position, so the lookup does
# not depend on the DataFrame index labels, and give tqdm the total so the
# progress bar can show completion.
stanza_rows = zip(df.Stanza_text, df.ST_Correct)
for idx, (stanza, correct) in enumerate(tqdm(stanza_rows, total=len(df))):
    try:
        scansion = get_scansion(stanza, rhyme_analysis=True)
        eval_dict[idx] = {
            # Only the first element of the result is inspected.
            "rantanplan": scansion[0].get("structure"),
            "correct": str(correct),
            "text": stanza,
        }
    except Exception:
        # Best effort: record and show the failing stanza, then move on.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt propagate,
        # so the loop can still be stopped by interrupting the kernel.
        errors.append(stanza)
        print('#######', stanza)

## Evaluation

In [None]:
# Predicted structures that are excluded from the evaluation; a prediction of
# one of these is treated the same as "no structure found".
out_of_evaluation = ["sonnet", "terceto_encadenado"]
dict_revision = {}
text, annotated_type, rantanplan_type = ([], [], [])
for value in eval_dict.values():
    annotated_type.append(value["correct"])
    # Missing and excluded predictions are both mapped to the string label
    # 'None'. (The original condition referenced an undefined, garbled name
    # and raised NameError; it now uses `out_of_evaluation`.)
    if value["rantanplan"] is None or value["rantanplan"] in out_of_evaluation:
        value["rantanplan"] = 'None'
    rantanplan_type.append(value["rantanplan"])
    text.append(value["text"])
# Column-oriented layout: one list per column of the evaluation frame.
dict_revision.update({'text': text, 'y_true': annotated_type, 'y_pred': rantanplan_type})

In [None]:
# One row per evaluated stanza, with columns `text`, `y_true` and `y_pred`.
df_eval = pd.DataFrame(data=dict_revision)

In [None]:
# Macro-averaged precision / recall / F1 (the fourth returned value, support,
# is discarded). zero_division=0 is passed through to scikit-learn for
# classes whose score would otherwise involve a division by zero.
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    df_eval.y_true,
    df_eval.y_pred,
    average="macro",
    zero_division=0,
)
# Matthews correlation coefficient over the same label pairs.
mcc = sklearn.metrics.matthews_corrcoef(df_eval.y_true, df_eval.y_pred)

for metric_label, metric_value in (
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1: ", f1),
    ("MCC :", mcc),
):
    print(metric_label, metric_value)

In [None]:
# Fraction of stanzas whose predicted label equals the annotated one.
accuracy = sklearn.metrics.accuracy_score(
    y_true=df_eval.y_true,
    y_pred=df_eval.y_pred,
)
print("Accuracy: ", accuracy)

In [None]:
# Confusion matrix restricted to the labels that occur in y_true, in sorted
# order. NOTE(review): per the scikit-learn docs `labels` selects a subset, so
# predictions whose label never appears in y_true are not counted — confirm
# this is intended.
conf_matrix = sklearn.metrics.confusion_matrix(
    y_true=df_eval.y_true,
    y_pred=df_eval.y_pred,
    labels=sorted(df_eval.y_true.unique()),
)

In [None]:
# Global matplotlib settings: large font and a 40x40 figure for the matrix.
plt.rcParams.update({'font.size': 25, "figure.figsize": (40, 40)})

# Tick labels use the same sorted y_true labels the matrix was built with.
display_labels = sorted(df_eval.y_true.unique())
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=display_labels,
).plot(include_values=True, cmap='Blues', xticks_rotation='vertical')
plt.show()