In [1]:
#1 - LIBRARIES

from pathlib import Path

import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, QuantileTransformer, PowerTransformer, Normalizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, accuracy_score,
    roc_auc_score, f1_score, log_loss, brier_score_loss, average_precision_score,  
    balanced_accuracy_score, matthews_corrcoef, classification_report
)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.metrics import AUC, Precision, Recall, TruePositives, TrueNegatives, FalsePositives, FalseNegatives
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint



import torch
from tabpfn import TabPFNClassifier



In [3]:
# 2 - LOADING DATA
# Run-level settings: input files, column subset, scaling, balancing and
# training hyper-parameters.

separator = ','
fold = "4"  # fold number embedded in the input file names

url_train = f"data/6h/train_df_kfold_{fold}.csv"
url_val = f"data/6h/val_df_kfold_{fold}.csv"
url_test = f"data/6h/test_df_kfold_{fold}.csv"


# CHOOSE COLUMNS TO DELETE

qtde_attributes = 18  # 18 or 10
# NOTE(review): the active column list below is the 10-attribute SHAP one
# while this value is 18 -- confirm which of the two is intended.

# Columns to remove from every split, one list per candidate attribute set.
colunas_por_conjunto = {
    # 18 attributes
    'all_18': ['Class_Flare'],
    # 10 Bobra's attributes
    'bobra_10': ['Class_Flare', 'MEANGAM', 'MEANGBH', 'MEANGBT', 'MEANGBZ',
                 'MEANJZD', 'MEANJZH', 'MEANALP', 'MEANSHR'],
    # 10 SHAP attributes
    'shap_10': ['Class_Flare', 'ABSNJZH', 'MEANGAM', 'MEANJZD', 'MEANJZH',
                'SAVNCPP', 'TOTPOT', 'TOTUSJH', 'TOTUSJZ'],
}

# Attribute set in effect for this run
list_col_delete = colunas_por_conjunto['shap_10']


# NORMALIZATION
# One of: 'StandardScaler', 'RobustScaler', 'MinMaxScaler', 'NormalizerL1',
# 'PowerTransformer', 'QuantileTransformer'
scaler_name = 'StandardScaler'


# BALANCING
# One of: 'smote', 'oversampling', 'undersampling', 'class_weight', 'none'
balanceamento = 'smote'


# Configuration
epoch, batch = 100, 64

optimizer = keras.optimizers.Adam(learning_rate=1e-4)
focal_gamma, focal_alpha = 2, 0.25



In [4]:
# 3 - READ AND PREPARE DATA


def _parse_t_rec(raw_dates):
    """Convert a raw T_REC column to timezone-naive datetimes.

    Each value is first parsed as '%Y-%m-%d %H:%M:%S'; values that do not
    match are parsed as '%Y-%m-%d' instead. Values matching neither format
    become NaT (errors='coerce').

    Args:
        raw_dates: pandas Series holding the raw T_REC values.

    Returns:
        pandas Series of datetimes with any timezone information removed.
    """
    with_time = pd.to_datetime(raw_dates, errors='coerce', format='%Y-%m-%d %H:%M:%S')
    date_only = pd.to_datetime(raw_dates, errors='coerce', format='%Y-%m-%d')
    # Remove timezone to avoid date shifts
    return with_time.fillna(date_only).dt.tz_localize(None)


def _load_split(path):
    """Read one split CSV, parse its T_REC column and order the rows by date.

    Args:
        path: location of the CSV file (read with the configured separator).

    Returns:
        DataFrame sorted by 'T_REC'; the original row index is kept.
    """
    split_df = pd.read_csv(path, sep=separator)
    split_df['T_REC'] = _parse_t_rec(split_df['T_REC'])
    return split_df.sort_values(by='T_REC')


train_df = _load_split(url_train)
val_df = _load_split(url_val)
test_df = _load_split(url_test)


# Save the identifying columns of the test set, in the same (sorted) row
# order as the features, so predictions can be matched back by position.
harpnum_test = test_df['harpnum'].values
t_rec_test = test_df['T_REC'].values
letra_class_test = test_df['Letra_Class'].values

info_test = pd.DataFrame({
    'harpnum': harpnum_test,
    'T_REC': t_rec_test,
    'Letra_Class': letra_class_test
})

# to_csv does not create missing directories, so make sure the target exists
Path('results').mkdir(parents=True, exist_ok=True)
info_test.to_csv('results/tabpfn-info_test-fold'+fold+'.csv', index=False)


# Delete the columns excluded by the configuration (KeyError if one is missing)
train_df = train_df.drop(columns=list_col_delete)
val_df = val_df.drop(columns=list_col_delete)
test_df = test_df.drop(columns=list_col_delete)
    



  train_df = pd.read_csv(url_train, sep=separator)
  val_df = pd.read_csv(url_val, sep=separator)
  test_df = pd.read_csv(url_test, sep=separator)


In [5]:
# 4 - TabPFN random undersampling


def _cap_rows(frame, limit, seed=42):
    """Return `frame` limited to at most `limit` rows.

    Args:
        frame: DataFrame to limit.
        limit: maximum number of rows to keep.
        seed: random_state used when sampling.

    Returns:
        `frame` itself when it has `limit` rows or fewer, otherwise a random
        sample of exactly `limit` rows.
    """
    if len(frame) > limit:
        return frame.sample(n=limit, random_state=seed)
    return frame


# Total maximum number that TabPFN recommends
max_samples = 1000
max_per_class = max_samples // 2  # 500 for each class

# Separate positives and negatives
positivos = train_df[train_df['Class'] == 1]
negativos = train_df[train_df['Class'] == 0]

# Subsample if necessary
positivos_sample = _cap_rows(positivos, max_per_class)
negativos_sample = _cap_rows(negativos, max_per_class)

# Combine and shuffle
train_balanceado = pd.concat([positivos_sample, negativos_sample]).sample(frac=1, random_state=42)

print(f"Samples in balanced training: {len(train_balanceado)}")
print(f"Positives: {len(positivos_sample)}, Negatives: {len(negativos_sample)}")

# Columns not used in training
colunas_extras = ['T_REC', 'harpnum', 'Letra_Class']

# Feature columns are taken from the training frame and applied by name to
# every split, so all three matrices share the same column order.
feature_cols = list(train_balanceado.drop(columns=['Class'] + colunas_extras).columns)

# Separate features and targets for training and validation (removing extras)
X_train = train_balanceado[feature_cols].values
y_train = train_balanceado['Class'].values

X_val = val_df[feature_cols].values
y_val = val_df['Class'].values

# For the test set, we keep the extra columns before removing
test_extras = test_df[colunas_extras].copy()
X_test = test_df[feature_cols].values
y_test = test_df['Class'].values

# Normalization
# `scaler_name` is taken from the configuration cell; it is intentionally not
# reassigned here so that the configured choice is the one actually used.
scaler_standard = StandardScaler()
scaler_robust = RobustScaler()
scaler_minmax = MinMaxScaler()
scaler_l1 = Normalizer(norm='l1')
transformer_yeo = PowerTransformer(method='yeo-johnson')
transformer_quantile = QuantileTransformer(output_distribution='normal')

# Choose the scaler based on the scaler_name variable
scalers_by_name = {
    'StandardScaler': scaler_standard,
    'RobustScaler': scaler_robust,
    'MinMaxScaler': scaler_minmax,
    'NormalizerL1': scaler_l1,
    'PowerTransformer': transformer_yeo,
    'QuantileTransformer': transformer_quantile,
}
if scaler_name not in scalers_by_name:
    raise ValueError(f"Scaler '{scaler_name}' not recognized. Please choose a valid one.")
scaler = scalers_by_name[scaler_name]

# Fit on the (undersampled) training data only, then reuse the fitted
# transformation for validation and test.
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


Samples in balanced training: 1000
Positives: 500, Negatives: 500


In [6]:

# Training
# Use the GPU when torch reports one as available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = TabPFNClassifier(device=device)

model.fit(X_train_scaled, y_train)


Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client


In [7]:
#6 - Model Evaluate


def _rates_and_skill_scores(tn, fp, fn, tp, accuracy):
    """Derive rate metrics and skill scores from binary confusion-matrix counts.

    Args:
        tn, fp, fn, tp: confusion-matrix counts (true/false negatives/positives).
        accuracy: fraction of correctly classified samples.

    Returns:
        dict with the keys 'tss', 'hss', 'true_positive_rate',
        'true_negative_rate', 'false_positive_rate' and 'false_negative_rate'.
        A ratio whose denominator is zero is reported as 0.
    """
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    total = tp + tn + fp + fn
    # Agreement expected by chance given the row/column totals (used by HSS)
    pe = ((tp + fn) * (tp + fp) + (tn + fn) * (tn + fp)) / (total ** 2)
    return {
        'tss': sensitivity + specificity - 1,
        'hss': (accuracy - pe) / (1 - pe) if (1 - pe) != 0 else 0,
        'true_positive_rate': sensitivity,
        'true_negative_rate': specificity,
        'false_positive_rate': fp / (fp + tn) if (fp + tn) > 0 else 0,
        'false_negative_rate': fn / (fn + tp) if (fn + tp) > 0 else 0,
    }


# Probability of the positive class for every test sample
y_pred_probs = model.predict_proba(X_test_scaled)[:, 1]

# Re-open the auxiliary identifiers saved while preparing the data
info_test = pd.read_csv('results/tabpfn-info_test-fold' + fold + '.csv')[['harpnum', 'T_REC', 'Letra_Class']]
info_test = info_test.reset_index(drop=True)

# Identifiers and predictions are matched by row position, so the two must
# describe the same number of samples.
if len(info_test) != len(y_test):
    raise ValueError(
        f"info_test has {len(info_test)} rows but y_test has {len(y_test)}; "
        "they must be aligned row by row."
    )

# Plain arrays allow positional indexing without per-row DataFrame lookups
harpnum_arr = info_test['harpnum'].to_numpy()
t_rec_arr = info_test['T_REC'].to_numpy()
letra_class_arr = info_test['Letra_Class'].to_numpy()
# Python's round() is kept so the stored values match the 6-decimal rounding
# of each individual probability.
probs_rounded = np.array([round(float(p), 6) for p in y_pred_probs], dtype=float)

# --- CSV 3: harpnum + real class + probability + t_rec + Letra_Class ---
probs_com_classe_real = pd.DataFrame({
    'harpnum': harpnum_arr,
    'classe_real': np.asarray(y_test).astype(int),
    'probabilidade_modelo': probs_rounded,
    'T_REC': t_rec_arr,
    'Letra_Class': letra_class_arr,
})

# --- Loop of thresholds ---
# 0.10, 0.11, ..., 0.99. linspace + round gives exact two-decimal values;
# arange with a float step accumulates error (e.g. 0.15000000000000002), so the
# applied threshold would differ from the reported one.
thresholds = np.round(np.linspace(0.10, 0.99, 90), 2)

# Threshold-independent values, computed once instead of on every iteration
auc_score = roc_auc_score(y_test, y_pred_probs)
# No loss is computed for this model; the column is kept as a placeholder so
# the metrics CSV layout stays the same.
computed_loss = "null"

metrics_list = []
positive_frames = []

for threshold in thresholds:
    threshold_label = round(float(threshold), 2)
    y_pred = (y_pred_probs >= threshold).astype(int)

    # labels=[0, 1] forces a 2x2 matrix even if only one class is present,
    # so the 4-way unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = (int(count) for count in cm.ravel())

    print("Matriz")
    print(cm)

    # Metrics
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    metrics_list.append({
        'threshold': threshold_label,
        'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp,
        'loss': computed_loss,
        'auc': auc_score,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'mcc': mcc,
        **_rates_and_skill_scores(tn, fp, fn, tp, accuracy),
    })

    # --- CSV 2: harpnums predicted positive + probability + Letra_Class ---
    indices_positivos = np.flatnonzero(y_pred == 1)
    if indices_positivos.size:
        positive_frames.append(pd.DataFrame({
            'threshold': threshold_label,
            'harpnum': harpnum_arr[indices_positivos],
            'probabilidade': probs_rounded[indices_positivos],
            'Letra_Class': letra_class_arr[indices_positivos],
        }))

# --- Save files ---
# CSV 1: metrics
pd.DataFrame(metrics_list).to_csv('results/tabpfn-metrics-fold' + fold + '.csv', index=False)

# CSV 2: unique positives with probability
if positive_frames:
    positivos_info_total = pd.concat(positive_frames, ignore_index=True)
else:
    # No sample was predicted positive at any threshold: keep the header only
    positivos_info_total = pd.DataFrame(
        columns=['threshold', 'harpnum', 'probabilidade', 'Letra_Class']
    )
df_positivos = positivos_info_total.drop_duplicates()
df_positivos.to_csv('results/tabpfn-harpnums_all_thresholds_fold' + fold + '.csv', index=False)

# CSV 3: all samples with real class and probability
probs_com_classe_real.to_csv('results/tabpfn-real_class_prob_harpnum_' + fold + '.csv', index=False)

print("\nResults saved successfully")

Matriz
[[109170  29563]
 [    12   1108]]
Matriz
[[110026  28707]
 [    13   1107]]
Matriz
[[110790  27943]
 [    13   1107]]
Matriz
[[111479  27254]
 [    15   1105]]
Matriz
[[112106  26627]
 [    16   1104]]
Matriz
[[112725  26008]
 [    16   1104]]
Matriz
[[113261  25472]
 [    17   1103]]
Matriz
[[113808  24925]
 [    17   1103]]
Matriz
[[114281  24452]
 [    17   1103]]
Matriz
[[114727  24006]
 [    17   1103]]
Matriz
[[115198  23535]
 [    19   1101]]
Matriz
[[115627  23106]
 [    19   1101]]
Matriz
[[116031  22702]
 [    19   1101]]
Matriz
[[116403  22330]
 [    20   1100]]
Matriz
[[116759  21974]
 [    21   1099]]
Matriz
[[117122  21611]
 [    21   1099]]
Matriz
[[117462  21271]
 [    22   1098]]
Matriz
[[117793  20940]
 [    23   1097]]
Matriz
[[118147  20586]
 [    23   1097]]
Matriz
[[118453  20280]
 [    26   1094]]
Matriz
[[118797  19936]
 [    26   1094]]
Matriz
[[119118  19615]
 [    26   1094]]
Matriz
[[119423  19310]
 [    28   1092]]
Matriz
[[119731  19002]
 [    30  