In [None]:
import os
import datetime
import pandas as pd
import numpy as np
from simpletransformers.classification import ClassificationArgs, ClassificationModel
from sklearn.metrics import root_mean_squared_error, cohen_kappa_score
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Run-wide hyperparameters. Keys stay in this order because the report cell
# iterates over the dict to list them.
hyperparams = dict(
    TARGET=5,  # competence to model: 1-5 (6 = global score, not used)

    MANUAL_SEED=42,
    NUM_EPOCHS=6,
    LEARNING_RATE=2e-5,
    WEIGHT_DECAY=0.01,
    TRAIN_BATCH_SIZE=8,
    EVAL_BATCH_SIZE=8,
    GRADIANT_ACCUMULATION_STEPS=1,  # (sic) key is read back by MODEL_ARGS and printed in the report
    EVAL_DURING_TRAIN_STEPS=30,
    EARLY_STOP_MET="eval_loss",
    EARLY_STOP_PAT=2,

    REGRESSION=True,
    # MAX_SEQ_LENGTH=256,
    NO_CACHE=False,
    LOGGING_STEPS=20,
)

# Directories are suffixed with the target competence ("c<TARGET>").
OUTPUT_PATH = "../outputs/transformers/c{}".format(hyperparams["TARGET"])
BEST_MODEL_PATH = OUTPUT_PATH + "/best-model"
LOG_PATH = "../resultados/c{}".format(hyperparams["TARGET"])

# Model identity: architecture family, size variant and the resulting checkpoint name.
MODEL_TYPE = "bert"
MODEL_VARIATION = "base" # base/large
MODEL_NAME = "neuralmind/" + "-".join((MODEL_TYPE, MODEL_VARIATION, "portuguese-cased"))

# Training configuration handed to ClassificationModel; tunable values come from `hyperparams`.
MODEL_ARGS = ClassificationArgs(
    # reproducibility and task type
    manual_seed=hyperparams["MANUAL_SEED"],
    regression=hyperparams["REGRESSION"],
    # max_seq_length=hyperparams["MAX_SEQ_LENGTH"],

    # optimisation
    num_train_epochs=hyperparams["NUM_EPOCHS"],
    learning_rate=hyperparams["LEARNING_RATE"],
    weight_decay=hyperparams["WEIGHT_DECAY"],
    train_batch_size=hyperparams["TRAIN_BATCH_SIZE"],
    eval_batch_size=hyperparams["EVAL_BATCH_SIZE"],
    gradient_accumulation_steps=hyperparams["GRADIANT_ACCUMULATION_STEPS"],

    # evaluation during training and early stopping
    evaluate_during_training=True,
    evaluate_each_epoch=True,
    evaluate_during_training_steps=hyperparams["EVAL_DURING_TRAIN_STEPS"],
    use_early_stopping=True,
    early_stopping_metric=hyperparams["EARLY_STOP_MET"],
    early_stopping_patience=hyperparams["EARLY_STOP_PAT"],

    # outputs and checkpoints: only the best model is kept
    output_dir=OUTPUT_PATH,
    best_model_dir=BEST_MODEL_PATH,
    overwrite_output_dir=True,
    save_best_model=True,
    save_eval_checkpoints=False,
    save_model_every_epoch=False,

    # data processing and logging
    reprocess_input_data=True,
    use_multiprocessing=False,
    no_cache=hyperparams["NO_CACHE"],
    logging_steps=hyperparams["LOGGING_STEPS"],
)

# Where the parquet splits live and how their file names are composed:
# <DATASET_PATH><division><DATASET_NAME>
DATASET_PATH = "../corpus/"
DATASET_NAME = "-00000-of-00001.parquet"
DIVISIONS = ("train", "validation", "test")


def target_dataset_path(target: str):
    """Return the parquet file path for one dataset division.

    target: name of the division; must be one of DIVISIONS.

    Raises ValueError when `target` is not a known division.
    """
    if target not in DIVISIONS:
        raise ValueError("ERROR: Invalid target for dataset.")
    return f"{DATASET_PATH}{target}{DATASET_NAME}"

In [None]:
def normalize_score(row, target):
    """Return the grade at position `target` of the row's 'grades' entry, floor-divided by 40.

    row: a dataset row exposing a 'grades' sequence.
    target: zero-based index of the grade to pick.
    """
    competence_grade = row['grades'][target]
    return competence_grade // 40


# Build one frame per division with just the essay text and the normalized label.
target_index = hyperparams["TARGET"] - 1  # TARGET is 1-based; 'grades' is indexed from 0
essays_set = {}
for division in DIVISIONS:
    raw_split = pd.read_parquet(target_dataset_path(division), engine='pyarrow')
    essays = raw_split[['essay_text', 'grades']].rename(columns={'essay_text': 'text'})
    essays['label'] = essays.apply(normalize_score, axis=1, args=(target_index,))
    essays_set[division] = essays[['text', 'label']]

In [None]:
# Model to be fine-tuned: a single output unit, since the task is set up as regression.
model = ClassificationModel(MODEL_TYPE, MODEL_NAME, num_labels=1, args=MODEL_ARGS)

In [None]:
# Fine-tune on the train split; the validation split is used for the evaluations during training.
train_df = essays_set['train']
validation_df = essays_set['validation']
global_step, training_details = model.train_model(train_df, eval_df=validation_df)

In [None]:
# Training loss history, as recorded in the details returned by train_model().
train_loss_df = pd.DataFrame({
    'step': training_details['global_step'],
    'running_loss': training_details['train_loss']
})

print('Training Loss History:')
print(train_loss_df)

# Evaluation loss history.
# .get() avoids an error when no evaluation loss was recorded.
eval_losses = list(training_details.get('eval_loss', []))
recorded_steps = list(training_details.get('global_step', []))

if len(recorded_steps) == len(eval_losses):
    # Use the steps that were actually recorded next to each evaluation.
    # Rebuilding them as multiples of evaluate_during_training_steps misplaces
    # the points once end-of-epoch evaluations (evaluate_each_epoch=True) are
    # interleaved, because those do not fall on a multiple of the interval.
    eval_steps = recorded_steps
else:
    # Fallback: the lists do not line up, so assume one evaluation per interval.
    eval_interval = model.args.evaluate_during_training_steps
    eval_steps = [eval_interval * (i + 1) for i in range(len(eval_losses))]

eval_loss_df = pd.DataFrame({
    'step': eval_steps,
    'eval_loss': eval_losses
})

print('Evaluation Loss History:')
print(eval_loss_df)

# Learning curve: train vs. validation loss over the training steps.
fig, ax = plt.subplots(figsize=(14, 7))

ax.plot(train_loss_df['step'], train_loss_df['running_loss'], label='Train Loss (Running Loss)')
if not eval_loss_df.empty:
    ax.plot(eval_loss_df['step'], eval_loss_df['eval_loss'], label='Validation Loss (Eval Loss)', marker='o', linestyle='--')

ax.set_title('Learning Curve: Train Loss vs. Validation Loss', fontsize=16)
ax.set_xlabel('Training Steps', fontsize=12)
ax.set_ylabel('Loss', fontsize=12)
ax.legend(fontsize=12)
ax.grid(True)
plt.show()

In [None]:
# Reload the checkpoint stored in BEST_MODEL_PATH (written during training because
# save_best_model=True) with the same architecture and arguments.
best_model = ClassificationModel(MODEL_TYPE, BEST_MODEL_PATH, num_labels=1, args=MODEL_ARGS)

In [None]:
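# NOTE: test-set evaluation is currently done with best_model.predict() (see the metrics cell).
# The eval_model() alternative is kept here for reference only:
# result, model_outputs, wrong_predictions = best_model.eval_model(essays_set['test'])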

In [None]:
def enem_accuracy_score(true_values, predicted_values, tolerance=2):
    """
    Proportion of predictions that do not diverge from the true grade.

    A prediction counts as non-divergent when its absolute difference to the
    true grade is at most `tolerance`. The default of 2 is the 80-point ENEM
    divergence limit expressed on the normalized scale used in this notebook
    (grades divided by 40); pass `tolerance=80` for grades on the raw scale.

    true_values: sequence of true grades.
    predicted_values: sequence of predicted grades, same length as `true_values`.
    tolerance: largest absolute difference still counted as non-divergent.

    Returns the fraction (0.0 to 1.0) of non-divergent predictions.

    Raises AssertionError if the two sequences differ in length and ValueError
    if they are empty (the proportion is undefined).
    """

    assert len(true_values) == len(predicted_values), "Mismatched length between true and predicted values."

    total = len(true_values)
    if total == 0:
        # Previously this surfaced as an opaque ZeroDivisionError.
        raise ValueError("Cannot compute ENEM accuracy on empty inputs.")

    non_divergent_count = sum(1 for t, p in zip(true_values, predicted_values) if abs(t - p) <= tolerance)

    return non_divergent_count / total

In [None]:
test_text_list = essays_set['test']['text'].tolist()
y_true = essays_set['test']['label'].tolist()

y_pred, raw_outputs = best_model.predict(test_text_list)
# Continuous regression outputs -> nearest valid grade level (0..5).
y_pred_round = np.clip(np.round(y_pred), 0, 5).astype(int)

# Fixed label set shared by every classification-style metric below. Without it
# the metrics infer the classes from the data: classification_report then rejects
# the six target names if any level is missing, the confusion matrix changes
# shape, and the quadratic kappa measures distances between positions of the
# present labels instead of between the grade levels themselves.
grade_levels = list(range(6))
grade_names = [f'Nota {level * 40}' for level in grade_levels]

# get metrics
qwk = cohen_kappa_score(y_true, y_pred_round, labels=grade_levels, weights='quadratic')
enem_acc = enem_accuracy_score(y_true, y_pred_round)
class_rep_mtx = classification_report(y_true, y_pred_round, labels=grade_levels, target_names=grade_names, zero_division=0.0)
conf_mtx = confusion_matrix(y_true, y_pred_round, labels=grade_levels)

# RMSE is computed on the original 0-200 scale, using the unrounded predictions.
scaled_predictions = y_pred * 40
scaled_original = np.array(y_true) * 40
rmse = root_mean_squared_error(scaled_original, scaled_predictions)

print(f'Quadratic Weighted Kappa: {qwk:.4f}\n')
print(f'ENEM Accuracy: {enem_acc:.4f}\n')
print(f'Root Mean Squared Error: {rmse:.4f}\n')
print('Classification Report Matrix:')
print(class_rep_mtx)
print('\nConfusion Matrix:')
print(conf_mtx)

# How to read the classification report (per class):
# precision: share of the predictions for that class that were correct
# recall: share of the actual instances of that class that the model identified
# f1-score: harmonic mean of precision and recall
# support: number of actual occurrences of that class in the dataset

In [None]:
# Timestamped report file inside LOG_PATH (the directory is created on demand).
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_filename = f"transformer_regressao_c{hyperparams['TARGET']}_{timestamp}.txt"
os.makedirs(LOG_PATH, exist_ok=True)
full_log_path = os.path.join(LOG_PATH, log_filename)

# --- fixed section banners -------------------------------------------------
report_header = f"""
======================================================================
           RELATÓRIO DE EXPERIMENTO DE MODELO
======================================================================

Data e Hora: {timestamp}
Arquivo de Log: {log_filename}

----------------------------------------------------------------------
                   INFORMAÇÕES DO MODELO
----------------------------------------------------------------------
"""
hyperparams_banner = """
----------------------------------------------------------------------
               HIPERPARÂMETROS DE TREINAMENTO
----------------------------------------------------------------------
"""
loss_banner = """
----------------------------------------------------------------------
                      HISTÓRICO DE LOSS
----------------------------------------------------------------------
"""
metrics_banner = """


----------------------------------------------------------------------
             MÉTRICAS DE AVALIAÇÃO (CONJUNTO DE TESTE)
----------------------------------------------------------------------
"""
report_footer = """

======================================================================
                       FIM DO RELATÓRIO
======================================================================
"""

# --- variable content: one "name : value" line per entry ---------------------
model_info_lines = "".join(
    f"{label:<28}: {setting}\n"
    for label, setting in (
        ("MODEL_TYPE", MODEL_TYPE),
        ("MODEL_VARIATION", MODEL_VARIATION),
        ("MODEL_NAME", MODEL_NAME),
    )
)
hyperparam_lines = "".join(f"{name:<28}: {setting}\n" for name, setting in hyperparams.items())

# Assemble every piece in reading order and join once.
report_content = "".join([
    report_header,
    model_info_lines,
    hyperparams_banner,
    hyperparam_lines,
    loss_banner,
    "\nTraining Loss History:\n",
    train_loss_df.to_string(),
    "\n\nEvaluation Loss History:\n",
    eval_loss_df.to_string(),
    metrics_banner,
    f"\nQuadratic Weighted Kappa: {qwk:.4f}\n",
    f"ENEM Accuracy: {enem_acc:.4f}\n",
    f"Root Mean Squared Error: {rmse:.4f}\n",
    "\nClassification Report:\n",
    class_rep_mtx,  # already a string
    "\n\nConfusion Matrix:\n(Linhas = Real, Colunas = Previsto)\n",
    str(conf_mtx),  # numpy array -> text
    report_footer,
])

with open(full_log_path, 'w', encoding='utf-8') as report_file:
    report_file.write(report_content)

print(f"\nRelatório do experimento salvo com sucesso em: '{full_log_path}'")