In [11]:
#1 - LIBRARIES

import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, QuantileTransformer, PowerTransformer, Normalizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, accuracy_score,
    roc_auc_score, f1_score, log_loss, brier_score_loss, average_precision_score,  
    balanced_accuracy_score, matthews_corrcoef
)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.metrics import AUC, Precision, Recall, TruePositives, TrueNegatives, FalsePositives, FalseNegatives
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint


import keras.ops as ops


In [None]:
# 2 - LOADING DATA
separator = ','
fold = "4" # set the fold number for the files

url_train = "data/6h/train_df_kfold_"+fold+".csv"
url_val = "data/6h/val_df_kfold_"+fold+".csv"
url_test = "data/6h/test_df_kfold_"+fold+".csv"


# CHOOSE COLUMNS TO DELETE

qtde_attributes = 18 #18 10

#18 attributes
list_col_delete = ['Class_Flare','Letra_Class', 'T_REC', 'harpnum']

#10 Bobra'attributes
list_col_delete = ['Class_Flare','Letra_Class','T_REC', 'harpnum', 'MEANGAM','MEANGBH','MEANGBT','MEANGBZ', 'MEANJZD','MEANJZH','MEANALP','MEANSHR']

#10 SHAP'atrributes
list_col_delete = ['Class_Flare','Letra_Class', 'T_REC', 'harpnum' ,'ABSNJZH','MEANGAM','MEANJZD','MEANJZH','SAVNCPP','TOTPOT','TOTUSJH','TOTUSJZ']


# NORMALIZATION
scaler_name = 'StandardScaler'  #'StandardScaler', 'RobustScaler', 'MinMaxScaler', 'NormalizerL1', 'PowerTransformer', 'QuantileTransformer'


# BALANCING
balanceamento = 'smote'  #'smote', 'oversampling', 'undersampling', 'class_weight', ou 'none'

# Feature Transformer (FT) model configuration
embed_dim=128
num_heads=8
ff_dim=128
num_transformer_blocks=6
dropout_rate=0.3

epoch = 100 
batch = 64 

optimizer=keras.optimizers.Adam(learning_rate=1e-4) #1e4
focal_gamma = 2
focal_alpha = 0.25



In [13]:
# 3 - READ AND PREPARE DATA

train_df = pd.read_csv(url_train, sep=separator)
val_df = pd.read_csv(url_val, sep=separator)
test_df = pd.read_csv(url_test, sep=separator)


# Convert datetime
date1_ta = pd.to_datetime(train_df['T_REC'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
date2_ta = pd.to_datetime(train_df['T_REC'], errors='coerce', format='%Y-%m-%d')

date1_va = pd.to_datetime(val_df['T_REC'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
date2_va = pd.to_datetime(val_df['T_REC'], errors='coerce', format='%Y-%m-%d')

date1_te = pd.to_datetime(test_df['T_REC'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
date2_te = pd.to_datetime(test_df['T_REC'], errors='coerce', format='%Y-%m-%d')

train_df['T_REC'] = date1_ta.fillna(date2_ta)
val_df['T_REC'] = date1_va.fillna(date2_va)
test_df['T_REC'] = date1_te.fillna(date2_te)

# Remove timezone to avoid date shifts
train_df['T_REC'] = train_df['T_REC'].dt.tz_localize(None)
val_df['T_REC'] = val_df['T_REC'].dt.tz_localize(None)
test_df['T_REC'] = test_df['T_REC'].dt.tz_localize(None)

# order date
train_df = train_df.sort_values(by='T_REC')
val_df = val_df.sort_values(by='T_REC')
test_df = test_df.sort_values(by='T_REC')


#save test extra columns
harpnum_test = test_df['harpnum'].values
t_rec_test = test_df['T_REC'].values
letra_class_test = test_df['Letra_Class'].values

info_test = pd.DataFrame({
    'harpnum': harpnum_test,
    'T_REC': t_rec_test,
    'Letra_Class': letra_class_test
})
info_test.to_csv('results/ft-transformers-info_test-fold'+fold+'.csv', index=False)


#delete columns
for lcd in list_col_delete:
    train_df.pop(lcd)
    val_df.pop(lcd)
    test_df.pop(lcd)
    

target_col = 'Class'
X_train = train_df.drop(columns=[target_col]).values
y_train = train_df[target_col].values
X_val = val_df.drop(columns=[target_col]).values
y_val = val_df[target_col].values
X_test = test_df.drop(columns=[target_col]).values
y_test = test_df[target_col].values


  train_df = pd.read_csv(url_train, sep=separator)
  val_df = pd.read_csv(url_val, sep=separator)
  test_df = pd.read_csv(url_test, sep=separator)


In [14]:
#4 - NORMALIZATION AND BALANCING


scaler_standard = StandardScaler()
scaler_robust = RobustScaler()
scaler_minmax = MinMaxScaler()
scaler_l1 = Normalizer(norm='l1')
transformer_yeo = PowerTransformer(method='yeo-johnson')
transformer_quantile = QuantileTransformer(output_distribution='normal')

# Choose the scaler based on the scaler_name variable
if scaler_name == 'StandardScaler':
    scaler = scaler_standard
elif scaler_name == 'RobustScaler':
    scaler = scaler_robust
elif scaler_name == 'MinMaxScaler':
    scaler = scaler_minmax
elif scaler_name == 'NormalizerL1':
    scaler = scaler_l1
elif scaler_name == 'PowerTransformer':
    scaler = transformer_yeo
elif scaler_name == 'QuantileTransformer':
    scaler = transformer_quantile
else:
    raise ValueError(f"Scaler '{scaler_name}' não reconhecido. Escolha um válido.")


X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)


if balanceamento == 'smote':
    smote = SMOTE(sampling_strategy=0.6, k_neighbors=3, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    class_weights = None  # not necessary in this case

elif balanceamento == 'oversampling':
    oversample = RandomOverSampler(sampling_strategy=0.6, random_state=42)
    X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
    class_weights = None

elif balanceamento == 'undersampling':
    undersample = RandomUnderSampler(sampling_strategy=0.6, random_state=42)
    X_train_res, y_train_res = undersample.fit_resample(X_train, y_train)
    class_weights = None

elif balanceamento == 'class_weight':
    # Don't change X_train/y_train, just calculate the weights
    X_train_res, y_train_res = X_train, y_train
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))

elif balanceamento == 'none':
    X_train_res, y_train_res = X_train, y_train
    class_weights = None

else:
    raise ValueError("Invalid balancing method. Choose from: 'smote', 'oversample', 'undersample', 'class_weight', 'none'.")




In [18]:
#5- FT-TRANSFORMEFS MODEL

n_features = X_train.shape[1]


class AddCLSToken(layers.Layer):
    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def build(self, input_shape):
        self.cls_token = self.add_weight(
            shape=(1, 1, self.embed_dim),
            initializer=tf.keras.initializers.RandomNormal(),
            trainable=True,
            name="cls_token"
        )

    def call(self, x):
        batch_size = ops.shape(x)[0]
        cls_tokens = ops.tile(self.cls_token, [batch_size, 1, 1])
        return ops.concatenate([cls_tokens, x], axis=1)


def build_ft_transformer(
    n_features,
    embed_dim=64,
    num_heads=8,
    ff_dim=128,
    num_transformer_blocks=2,
    dropout_rate=0.1
):
    inputs = keras.Input(shape=(n_features,))
    
    # Tokenization of numeric tabular data
    x = NumericalFeatureTokenizer(n_features, embed_dim)(inputs)

    # Add learnable [CLS] token
    x = AddCLSToken(embed_dim)(x)

    # Transformer blocks
    for _ in range(num_transformer_blocks):
        # Multi-head self-attention + residual
        x_norm = layers.LayerNormalization(epsilon=1e-6)(x)
        attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x_norm, x_norm)
        x = layers.Add()([x, attn_output])

        # Feed-forward + residual
        x_norm = layers.LayerNormalization(epsilon=1e-6)(x)
        ff_output = layers.Dense(ff_dim, activation="gelu")(x_norm)
        ff_output = layers.Dense(embed_dim)(ff_output)
        x = layers.Add()([x, ff_output])

        x = layers.Dropout(dropout_rate)(x)

    # Token output [CLS]
    cls_output = x[:, 0, :]  # (batch_size, embed_dim)

    # Final classification layer
    outputs = layers.Dense(1, activation="sigmoid")(cls_output)

    return keras.Model(inputs, outputs)




#ft-transformer customer for tabular data
def build_ft_transformer_old2(
    n_features, 
    embed_dim=64, 
    num_heads=8, 
    ff_dim=128, 
    num_transformer_blocks=2, 
    dropout_rate=0.1
):
    inputs = keras.Input(shape=(n_features,))

    # (batch_size, n_features, 1)
    x = layers.Reshape((n_features, 1))(inputs)

    # Embedding de cada feature
    x = layers.TimeDistributed(layers.Dense(embed_dim))(x)

    # --- Adiciona o CLS token ---
    cls_token = tf.Variable(initial_value=tf.random.normal([1, 1, embed_dim]), trainable=True, name="cls_token")
    batch_size = tf.shape(x)[0]
    cls_tokens = tf.tile(cls_token, [batch_size, 1, 1])  # (batch_size, 1, embed_dim)
    x = tf.concat([cls_tokens, x], axis=1)  # (batch_size, n_features+1, embed_dim)

    # Transformer blocks
    for _ in range(num_transformer_blocks):
        # LayerNorm antes da Attention (Pre-LN)
        x_norm = layers.LayerNormalization(epsilon=1e-6)(x)
        attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x_norm, x_norm)
        x = layers.Add()([x, attn_output])

        # Feedforward
        x_norm = layers.LayerNormalization(epsilon=1e-6)(x)
        ff_output = layers.Dense(ff_dim, activation="relu")(x_norm)
        ff_output = layers.Dense(embed_dim)(ff_output)
        x = layers.Add()([x, ff_output])
        
        x = layers.Dropout(dropout_rate)(x)

    # --- Só usa o CLS token ---
    cls_output = x[:, 0, :]  # Pega só o primeiro token (CLS)

    # Classificador final
    outputs = layers.Dense(1, activation="sigmoid")(cls_output)

    model = keras.Model(inputs, outputs)
    return model



class NumericalFeatureTokenizer(layers.Layer):
    def __init__(self, n_features, embed_dim):
        super().__init__()
        self.n_features = n_features
        self.embed_dim = embed_dim
        # Um peso aprendível para cada feature
        self.linear_weights = self.add_weight(
            shape=(n_features, embed_dim),
            initializer='glorot_uniform',
            trainable=True,
            name="feature_embedding_weights"
        )
        # Um bias aprendível por feature
        self.bias = self.add_weight(
            shape=(n_features, embed_dim),
            initializer='zeros',
            trainable=True,
            name="feature_embedding_bias"
        )
    
    def call(self, inputs):
        # inputs: (batch_size, n_features)
        # Applies a linear transformation per feature
        x = tf.expand_dims(inputs, -1)  # (batch_size, n_features, 1)
        out = x * self.linear_weights + self.bias  # broadcasting
        return out  # (batch_size, n_features, embed_dim)

#ft-transformer - papper
def build_ft_transformer_old(
    n_features,
    embed_dim=64,
    num_heads=8,
    ff_dim=128,
    num_transformer_blocks=2,
    dropout_rate=0.1
):
    inputs = keras.Input(shape=(n_features,))
    
    # Tokenization of numeric tabular data
    x = NumericalFeatureTokenizer(n_features, embed_dim)(inputs)

    # Add learnable [CLS] token
    cls_token = tf.Variable(
        initial_value=tf.random.normal([1, 1, embed_dim]),
        trainable=True,
        name="cls_token"
    )
    batch_size = ops.shape(x)[0]
    cls_tokens = ops.tile(cls_token, [batch_size, 1, 1])
    x = ops.concat([cls_tokens, x], axis=1)  # (batch_size, n_features+1, embed_dim)

    # Transformer blocks
    for _ in range(num_transformer_blocks):
        # Multi-head self-attention + residual
        x_norm = layers.LayerNormalization(epsilon=1e-6)(x)
        attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x_norm, x_norm)
        x = layers.Add()([x, attn_output])

        # Feed-forward + residual
        x_norm = layers.LayerNormalization(epsilon=1e-6)(x)
        ff_output = layers.Dense(ff_dim, activation="gelu")(x_norm)
        ff_output = layers.Dense(embed_dim)(ff_output)
        x = layers.Add()([x, ff_output])

        x = layers.Dropout(dropout_rate)(x)

    # Token output [CLS]
    cls_output = x[:, 0, :]  # (batch_size, embed_dim)

    # Final classification layer
    outputs = layers.Dense(1, activation="sigmoid")(cls_output)

    return keras.Model(inputs, outputs)


def binary_focal_loss(gamma=2., alpha=0.25):
    def focal_loss(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        bce = tf.keras.backend.binary_crossentropy(y_true, y_pred)
        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        alpha_factor = y_true * alpha + (1 - y_true) * (1 - alpha)
        modulating_factor = tf.pow((1 - p_t), gamma)
        return tf.reduce_mean(alpha_factor * modulating_factor * bce)
    return focal_loss




# --- Create model ---
model = build_ft_transformer(
    n_features=n_features,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
    num_transformer_blocks=num_transformer_blocks,
    dropout_rate=dropout_rate
)



#compile model 
model.compile(
    optimizer=optimizer,
    loss=binary_focal_loss(gamma=focal_gamma, alpha=focal_alpha),
    metrics=[
        AUC(name='auc'),
        Precision(name='precision'),
        Recall(name='recall'),
        TruePositives(name='tp'),
        TrueNegatives(name='tn'),
        FalsePositives(name='fp'),
        FalseNegatives(name='fn')
    ]
)




csv_logger = CSVLogger('results/ft-transformers-trainning_log_fold_'+fold+'.csv', append=True)

callbacks = [
    EarlyStopping(monitor='val_auc', patience=10, mode='max', restore_best_weights=True),
    ModelCheckpoint('best_model.keras', monitor='val_auc', mode='max', save_best_only=True),
    ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=5, mode='max', verbose=1),
    csv_logger
]


#Fit Model

if class_weights == None:
    history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=epoch,
    batch_size=batch,
    verbose=1
    )
else:
    history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=epoch,
    batch_size=batch,
    callbacks=callbacks,
    verbose=1
)



[1m1722/1722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m903s[0m 516ms/step - auc: 0.7245 - fn: 2917.0000 - fp: 1204.0000 - loss: 0.0114 - precision: 0.0452 - recall: 0.0192 - tn: 436630.0000 - tp: 57.0000 - val_auc: 0.9290 - val_fn: 857.0000 - val_fp: 0.0000e+00 - val_loss: 0.0061 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - val_tn: 107907.0000 - val_tp: 0.0000e+00


In [19]:
#6 - Salve training e val metrics


# Convert history to DataFrame
history_df = pd.DataFrame(history.history)



# Save to a CSV file
history_df.to_csv('results/ft-transformers-trainning_val_history_fold_'+fold+'.csv', index=False)

In [20]:
#7 - Model Evaluate


results = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Results - Loss: {results[0]:.4f}, AUC: {results[1]:.4f}")

#probabilities
y_pred_probs = model.predict(X_test).flatten()

# open auxiliar
info_test = pd.read_csv('results/ft-transformers-info_test-fold' + fold + '.csv')[['harpnum', 'T_REC', 'Letra_Class']]
info_test = info_test.reset_index(drop=True)


metrics_list = []
positivos_info_total = []
probs_com_classe_real = []

# --- CSV 3: harpnum + real class + probability + t_rec + Letra_Class ---
for idx, (true_class, prob) in enumerate(zip(y_test, y_pred_probs)):
    harpnum = info_test.loc[idx, 'harpnum']
    t_rec = info_test.loc[idx, 'T_REC']
    letra_class = info_test.loc[idx, 'Letra_Class']
    probs_com_classe_real.append({
        'harpnum': harpnum,
        'classe_real': int(true_class),
        'probabilidade_modelo': round(float(prob), 6),
        'T_REC': t_rec,
        'Letra_Class': letra_class
    })

# --- Loop of thresholds  ---
thresholds = np.arange(0.10, 1.00, 0.01)

for threshold in thresholds:
    y_pred = (y_pred_probs >= threshold).astype(int)
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    # Métricas
    bce = tf.keras.losses.BinaryCrossentropy()
    computed_loss = bce(y_test, y_pred_probs).numpy()
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_probs)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    tss = sensitivity + specificity - 1
    total = tp + tn + fp + fn
    pe = ((tp + fn)*(tp + fp) + (tn + fn)*(tn + fp)) / (total**2)
    hss = (accuracy - pe) / (1 - pe) if (1 - pe) != 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

    metrics_list.append({
        'threshold': round(threshold, 2),
        'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp,
        'loss': computed_loss,
        'auc': auc_score,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'mcc': mcc,
        'tss': tss,
        'hss': hss,
        'true_positive_rate': sensitivity,
        'true_negative_rate': specificity,
        'false_positive_rate': fpr,
        'false_negative_rate': fnr
    })

    # --- CSV 2: harpnums positivos únicos + probabilidade + Letra_Class ---
    indices_positivos = np.where(y_pred == 1)[0]
    for idx in indices_positivos:
        prob = y_pred_probs[idx]
        harpnum = info_test.loc[idx, 'harpnum']
        letra_class = info_test.loc[idx, 'Letra_Class']
        positivos_info_total.append({
            'threshold': round(threshold, 2),
            'harpnum': harpnum,
            'probabilidade': round(float(prob), 6),
            'Letra_Class': letra_class
        })

# --- Save files ---
# CSV 1: metrics
pd.DataFrame(metrics_list).to_csv('results/ft-transformers-metrics-fold' + fold + '.csv', index=False)

# CSV 2: unique positives with probability
df_positivos = pd.DataFrame(positivos_info_total).drop_duplicates()
df_positivos.to_csv('results/ft-transformers-harpnums_all_thresholds_fold' + fold + '.csv', index=False)

# CSV 3: all samples with real class and probability
pd.DataFrame(probs_com_classe_real).to_csv('results/ft-transformers-real_class_prob_harpnum_' + fold + '.csv', index=False)

print("\nResults saved successfully.")

Test Results - Loss: 0.0058, AUC: 0.9483
[1m4371/4371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 31ms/step

Results saved successfully.
