In [None]:
import nltk
nltk.download('stopwords')
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import mord as m
import os
from nltk.corpus import stopwords
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_regression, SelectKBest, chi2
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, cohen_kappa_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from functools import partial

# Analysis target selection:
# Values 0 to 4: ENEM competencies (C1 to C5)
# Value 5: final grade (sum of the competencies)
REFERENCE_CONCEPT = 4

# Carregamento do dataset 


In [None]:
# Location and naming scheme of the parquet files, one file per split.
DATASET_PATH = "../corpus/"
DATASET_NAME = "-00000-of-00001.parquet"
DIVISIONS = ("train", "validation", "test")


def target_dataset_path(target: str) -> str:
    """Build the parquet file path for one dataset split.

    Args:
        target: Split name; must be one of ``DIVISIONS``.

    Returns:
        ``DATASET_PATH`` + ``target`` + ``DATASET_NAME`` concatenated.

    Raises:
        ValueError: If ``target`` is not a known split.
    """
    # Reject unknown splits up front so the happy path is a single return.
    if target not in DIVISIONS:
        raise ValueError("ERROR: Invalid target for dataset.")
    return f"{DATASET_PATH}{target}{DATASET_NAME}"

# Read every split from its parquet file and wrap it as a Hugging Face Dataset.
datasets_dict = {}

for division in DIVISIONS:
    file_path = target_dataset_path(division)
    if os.path.exists(file_path):
        df = pd.read_parquet(file_path, engine='pyarrow')
        datasets_dict[division] = Dataset.from_pandas(df)
    else:
        # Stop at the first missing split instead of continuing with partial data.
        raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")

# No filtering happens here; the splits are only grouped into one DatasetDict.
dataset_filtrado = DatasetDict(datasets_dict)

In [None]:
# Maps a raw competency score (0, 40, ..., 200) to an ordinal level (0..5).
# Not applied to the final grade.
grade_mapping = {pontos: nivel for nivel, pontos in enumerate(range(0, 201, 40))}


def create_label(row):
    """Derive the training label for one dataset row.

    Args:
        row: Mapping with a ``"grades"`` sequence, indexed by competency
            position, whose last entry is used as the final grade.

    Returns:
        A dict ``{"label": value}``. When ``REFERENCE_CONCEPT`` is 5 the value
        is the last entry of ``row["grades"]``; otherwise it is the entry at
        index ``REFERENCE_CONCEPT`` converted through ``grade_mapping``.

    Raises:
        KeyError: If the selected competency score is not a key of
            ``grade_mapping``.
    """
    if REFERENCE_CONCEPT != 5:
        return {"label": grade_mapping[row["grades"][REFERENCE_CONCEPT]]}
    return {"label": row["grades"][-1]}


dataset = dataset_filtrado.map(create_label)

# Função para Calcular a Acurácia do ENEM

In [None]:
def enem_accuracy_score(true_values, predicted_values, limite_pontos=None):
    """Fraction of predictions within the allowed distance of the true value.

    A prediction counts as non-divergent when ``abs(true - predicted)`` is at
    most ``limite_pontos``.

    Args:
        true_values: Sized iterable of reference values.
        predicted_values: Sized iterable of predictions, same length as
            ``true_values``.
        limite_pontos: Maximum absolute difference still counted as a hit.
            When ``None`` (default) it is derived from ``REFERENCE_CONCEPT``:
            80 for the final grade (``REFERENCE_CONCEPT == 5``), otherwise 2
            (labels are mapped levels in that case).

    Returns:
        Number of non-divergent pairs divided by the number of pairs.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    if limite_pontos is None:
        limite_pontos = 80 if REFERENCE_CONCEPT == 5 else 2

    total = len(true_values)
    # Explicit exceptions instead of `assert`: assertions vanish under `python -O`.
    if total != len(predicted_values):
        raise ValueError("Mismatched length between true and predicted values.")
    if total == 0:
        # Avoid an opaque ZeroDivisionError on the final division.
        raise ValueError("Cannot compute ENEM accuracy on empty input.")

    non_divergent_count = sum(
        1 for t, p in zip(true_values, predicted_values) if abs(t - p) <= limite_pontos
    )

    return non_divergent_count / total

# Função regression_report inspirada na classification_report

In [None]:
def regression_report(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    Reports R², MSE, MAE, RMSE, the ENEM accuracy (via
    ``enem_accuracy_score``) and the quadratic weighted kappa. Nothing is
    returned; the report goes to standard output.

    Args:
        y_true: Reference values.
        y_pred: Predicted values, aligned with ``y_true``.
    """
    mse = mean_squared_error(y_true, y_pred)

    # QWK works on discrete categories, so both sides are rounded to integers.
    true_int = np.round(y_true).astype(int)
    pred_int = np.round(y_pred).astype(int)

    # All metrics are computed before anything is printed.
    linhas = [
        "Regression Report:",
        f"R² Score: {r2_score(y_true, y_pred):.4f}",
        f"Mean Squared Error (MSE): {mse:.4f}",
        f"Mean Absolute Error (MAE): {mean_absolute_error(y_true, y_pred):.4f}",
        f"Root Mean Squared Error (RMSE): {np.sqrt(mse):.4f}",
        f"ENEM Accuracy Score: {enem_accuracy_score(y_true, y_pred):.2f}",
        f"Quadratic Weighted Kappa (QWK): {cohen_kappa_score(true_int, pred_int, weights='quadratic'):.4f}",
    ]

    for linha in linhas:
        print(linha)
    print()

# Pré-Processamento

In [None]:
# TF-IDF configuration
stop_words = stopwords.words("portuguese")

X_train, X_test = dataset["train"]["essay_text"], dataset["test"]["essay_text"]
y_train = np.array(dataset["train"]["label"]).ravel()
y_test = np.array(dataset["test"]["label"]).ravel()

# Settings shared by every target; each branch adds its own options and the
# number of features kept by SelectKBest.
params = {'sublinear_tf': True, 'min_df': 5}

if REFERENCE_CONCEPT == 0:
    # C1 (grammar): character n-grams, stop words kept
    params.update(analyzer='char_wb', ngram_range=(3, 5), max_features=5000)
    k_best_val = 500
elif REFERENCE_CONCEPT == 3:
    # C4 (cohesion): words + connectives, stop words kept
    params.update(ngram_range=(1, 3))
    k_best_val = 2000
else:
    # C2, C3, C5 (theme/argumentation) and the final grade: words without stop words
    params.update(ngram_range=(1, 2), stop_words=stop_words)
    k_best_val = 2000

# Regression pipeline: features ranked by mutual information with the target
preprocessador = make_pipeline(
    TfidfVectorizer(**params),
    SelectKBest(score_func=partial(mutual_info_regression, random_state=1), k=k_best_val),
    Normalizer(norm='l2'),
)
X_train_transf = preprocessador.fit_transform(X_train, y_train)

# Classification pipeline (competencies only): features ranked by chi-squared
if REFERENCE_CONCEPT != 5:
    preprocessador_clf = make_pipeline(
        TfidfVectorizer(**params),
        SelectKBest(score_func=chi2, k=k_best_val),
        Normalizer(norm='l2'),
    )
    X_train_clf = preprocessador_clf.fit_transform(X_train, y_train)

# Modelos

In [None]:
# Dictionary of models, keyed by the display name used in the reports
modelos = {
    # Regression models
    "Lasso": Lasso(),
    "Regressao Linear": LinearRegression(),
    "Ridge": Ridge(),
    "Random Forest Regression": RandomForestRegressor(random_state=1),
    "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=1),
    "SVR": SVR(),
    "MLP": MLPRegressor(random_state=1, max_iter=1000),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=1),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=1),

    # Ordinal regression models (competencies only)
    "Ordinal Regression LogisticAT": m.LogisticAT(alpha=1.0),
    "Ordinal Regression LogisticIT": m.LogisticIT(alpha=1.0),
    "Ordinal Regression OrdinalRidge": m.OrdinalRidge(),
    "Least Absolute Deviation (LAD)": m.LAD(random_state=1) # Like a linear regression, but uses the absolute error instead of the squared error
}

# Treinamento e Avaliação

In [None]:
print(f"++++++++++++ ANÁLISE DA {'NOTA FINAL' if REFERENCE_CONCEPT == 5 else f'COMPETÊNCIA {REFERENCE_CONCEPT + 1}'} ++++++++++++")

# The fitted preprocessing pipeline is the same for every model, so the test
# split is transformed once here instead of once per loop iteration.
X_test_transf = preprocessador.transform(X_test)

for nome, modelo in modelos.items():
    # The mord models are skipped for the final grade, which is not mapped to
    # ordinal levels.
    if REFERENCE_CONCEPT == 5 and (nome.startswith("Ordinal Regression") or nome == "Least Absolute Deviation (LAD)"):
        continue

    if nome == "HistGradientBoostingRegressor":
        # This estimator is fed dense arrays instead of the sparse matrices.
        X_train_transf_dense = X_train_transf.toarray()
        X_test_transf_dense = X_test_transf.toarray()
        modelo.fit(X_train_transf_dense, y_train)
        y_pred = modelo.predict(X_test_transf_dense)
    else:
        modelo.fit(X_train_transf, y_train)
        y_pred = modelo.predict(X_test_transf)

    print(f"****  Modelo: {nome} ****")
    regression_report(y_test, y_pred)

    if REFERENCE_CONCEPT != 5:
        # Competency labels are discrete levels, so rounded predictions can
        # also be scored as a classification.
        y_pred_round = np.round(y_pred).astype(int)
        print("Classification Report:")
        print(classification_report(y_test, y_pred_round, zero_division=0))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred_round))
        print()

# Classificação (Somente para Competências)


SVC (Support Vector Classification) é a implementação do SVM no scikit-learn para problemas de classificação.


In [None]:
# Grid search over SVC hyper-parameters; only run for competencies, where the
# classification pipeline and features were built.
if REFERENCE_CONCEPT != 5:
    print("Otimização de SVC para Classificação:")
    param_grid = {'kernel': ['linear', 'rbf', 'poly'], 'C': [0.1, 1, 10, 100]}
    grid_search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train_clf, y_train)

    y_pred_clf = grid_search.predict(preprocessador_clf.transform(X_test))
    print("\nMelhores parâmetros:", grid_search.best_params_)
    print("\nClassification Report:")
    # zero_division=0 matches the per-model reports and avoids
    # UndefinedMetricWarning for classes that are never predicted.
    print(classification_report(y_test, y_pred_clf, zero_division=0))