In [2]:
#1 - LIBRARIES


# Data manipulation
import pandas as pd
import numpy as np
import dask.dataframe as dd

# Class balancing techniques
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Preprocessing and metrics (Scikit-Learn)
from sklearn.preprocessing import (
    StandardScaler, RobustScaler, MinMaxScaler,
    QuantileTransformer, PowerTransformer, Normalizer
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, accuracy_score,
    roc_auc_score, f1_score, log_loss, brier_score_loss,
    average_precision_score, balanced_accuracy_score, matthews_corrcoef
)

# TensorFlow and Keras (main API)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.metrics import (
    AUC, Precision, Recall,
    TruePositives, TrueNegatives, FalsePositives, FalseNegatives
)
from tensorflow.keras.callbacks import (
    CSVLogger, EarlyStopping, ReduceLROnPlateau,
    ModelCheckpoint, LearningRateScheduler
)
from tensorflow.keras.layers import (
    LSTM, Dense, Dropout, BatchNormalization,
    Conv1D, Conv2D, MaxPooling1D, MaxPooling2D,
    Flatten, Reshape
)

#Only SVM model
#from tensorflow.keras.layers.experimental import RandomFourierFeatures
#from tensorflow.keras.layers.experimental import RandomFourierFeatures

# Standalone Keras (only if explicitly needed)
from keras.models import Sequential
from keras.layers import Input, Bidirectional



In [None]:
# 2 - LOADING DATA

separator = ','
fold = "4" # set the fold number for the files

url_train = "data/6h/train_df_kfold_"+fold+".csv"
url_val = "data/6h/val_df_kfold_"+fold+".csv"
url_test = "data/6h/test_df_kfold_"+fold+".csv"


# CHOOSE COLUMNS TO DELETE

qtde_attributes = 18 #18 10

#18 attributes
list_col_delete = ['Class_Flare','Letra_Class']

#10 Bobra'attributes
list_col_delete = ['Class_Flare','Letra_Class']

#10 SHAP'atrributes
list_col_delete = ['Class_Flare','Letra_Class']


# NORMALIZATION
scaler_name = 'StandardScaler'  #'StandardScaler', 'RobustScaler', 'MinMaxScaler', 'NormalizerL1', 'PowerTransformer', 'QuantileTransformer'


# BALANCING
set_balancing = 'oversampling'  #'smote', 'oversampling', 'undersampling', 'class_weight', ou 'none'

# Model name
set_model_name = "mlp" #'mlp', 'svm', 'lstm'

epoch = 100
batch = 64

optimizer=keras.optimizers.Adam(learning_rate=1e-4) #1e4




In [38]:
# 3 - READ AND PREPARE DATA

train_df = pd.read_csv(url_train, sep=separator)
val_df = pd.read_csv(url_val, sep=separator)
test_df = pd.read_csv(url_test, sep=separator)


# Convert datetime
date1_ta = pd.to_datetime(train_df['T_REC'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
date2_ta = pd.to_datetime(train_df['T_REC'], errors='coerce', format='%Y-%m-%d')

date1_va = pd.to_datetime(val_df['T_REC'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
date2_va = pd.to_datetime(val_df['T_REC'], errors='coerce', format='%Y-%m-%d')

date1_te = pd.to_datetime(test_df['T_REC'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
date2_te = pd.to_datetime(test_df['T_REC'], errors='coerce', format='%Y-%m-%d')

train_df['T_REC'] = date1_ta.fillna(date2_ta)
val_df['T_REC'] = date1_va.fillna(date2_va)
test_df['T_REC'] = date1_te.fillna(date2_te)

# Remove timezone to avoid date shifts
train_df['T_REC'] = train_df['T_REC'].dt.tz_localize(None)
val_df['T_REC'] = val_df['T_REC'].dt.tz_localize(None)
test_df['T_REC'] = test_df['T_REC'].dt.tz_localize(None)

# order date
train_df = train_df.sort_values(by='T_REC')
val_df = val_df.sort_values(by='T_REC')
test_df = test_df.sort_values(by='T_REC')


#save test extra columns
harpnum_test = test_df['harpnum'].values
t_rec_test = test_df['T_REC'].values
letra_class_test = test_df['Letra_Class'].values

info_test = pd.DataFrame({
    'harpnum': harpnum_test,
    'T_REC': t_rec_test,
    'Letra_Class': letra_class_test
})
info_test.to_csv('results/'+set_model_name+'-info_test-fold'+fold+'.csv', index=False)


#delete columns
for lcd in list_col_delete:
    train_df.pop(lcd)
    val_df.pop(lcd)
    test_df.pop(lcd)
    

  train_df = pd.read_csv(url_train, sep=separator)
  val_df = pd.read_csv(url_val, sep=separator)
  test_df = pd.read_csv(url_test, sep=separator)


In [39]:
#4 - functions

def calculate_class_weight(train_df):

    neg = len(train_df [train_df.Class == 0])
    pos = len(train_df [train_df.Class == 1])

    total = pos  + neg
    weight_for_0 = (1 / neg) * (total / 2.0)
    weight_for_1 = (1 / pos) * (total / 2.0)

    class_weight = {0: weight_for_0, 1: weight_for_1}

    print('Weight class 0: {:.2f}'.format(weight_for_0))
    print('Weight class 1: {:.2f}'.format(weight_for_1))
    print(class_weight)
    
    return class_weight

def count_train_val_test(train_df, val_df, test_df):
    #4.2 count train, val, test sets

    neg_t = len(train_df [train_df.Class == 0])
    pos_t = len(train_df [train_df.Class == 1])

    print("\n Train: neg=>", neg_t, " pos=>", pos_t)


    neg_v = len(val_df [val_df .Class == 0])
    pos_v = len(val_df [val_df .Class == 1])

    print("\nVal: neg=>", neg_v, " pos=>", pos_v)

    neg_te = len(test_df [test_df .Class == 0])
    pos_te = len(test_df [test_df .Class == 1])

    print("\nTest: neg=>", neg_te, " pos=>", pos_te)

def smote_balancing(train_df):
    
    print("SMOTE balancing")
    
    train_df['T_REC'] = train_df['T_REC'].apply(lambda x: x.value)
    X = train_df.loc[:, train_df.columns != 'Class']
    y = train_df.Class
    sm = SMOTE(sampling_strategy=0.6, k_neighbors=1, random_state=100)
    X_res, y_res = sm.fit_resample(X, y)
    
    print("depois dos smote")
    train_df = pd.concat([pd.DataFrame(X_res), pd.DataFrame(y_res)], axis=1)
    
    train_df['T_REC'] = train_df['T_REC'].apply(pd.Timestamp)

    train_df = train_df.sort_values(by='T_REC')

    train_df = dd.from_pandas(train_df, npartitions=10)
    print("depois do dd")
    return train_df


def oversampling_balancing(train_df):
    pos_flare = train_df[train_df['Class']==1]
    neg_flare = train_df[train_df['Class']==0]

    pos_flare = pos_flare.sample(frac = len(neg_flare) / len(pos_flare), replace = True, random_state=101)
    train_df = dd.concat([pos_flare, neg_flare], interleave_partitions=True)   
    train_df.sort_values(by='T_REC')
    
    return train_df

def undersampling_balancing(train_df):
    pos_flare = train_df[train_df['Class']==1]
    neg_flare = train_df[train_df['Class']==0]
    
    print(len(neg_flare), - len(pos_flare))

    #neg_flare = neg_flare.sample(frac= 1 / ( len(neg_flare)/len(pos_flare)) , random_state=101)
    neg_flare = neg_flare.sample(frac=1 / (len(neg_flare) / len(pos_flare)), replace=True, random_state=101)

    train_df = dd.concat([pos_flare, neg_flare], interleave_partitions=True)   
    train_df.sort_values(by='T_REC')
    
    return train_df



def normalization_data(train_features, val_features, test_features, method):
        
    #Dataset normalization - StandardScaler
    scalerS = StandardScaler()
    scalerR = RobustScaler()
    scalerM = MinMaxScaler()
    scalerL = Normalizer(norm='l1')
    
    #metodo quantile Transformer
    transformerJ = PowerTransformer(method='yeo-johnson')
    transformer = QuantileTransformer(output_distribution='normal')  # ou 'uniform' or 'normal'
    
    scaler = scalerS
    
    if method == "Standard":
        scaler == scalerS
    elif method == "Robust":
        scaler == scalerR
    elif method == "MinMax":
        scaler == scalerM
    elif method == "Normalizer":
        scaler == scalerL
    elif method == "PowerTransformer":
        scaler = transformerJ
    elif method == "QuantileTransformer":
        scaler = transformer


    train_features = train_features.astype(float)
    val_features = val_features.astype(float)
    test_features = test_features.astype(float)
    

    train_features = scaler.fit_transform(train_features)
    val_features = scaler.transform(val_features)
    test_features = scaler.transform(test_features)

    train_features = np.clip(train_features, -5, 5)
    val_features = np.clip(val_features, -5, 5)
    test_features = np.clip(test_features, -5, 5)
    
    return train_features, val_features, test_features

def binary_focal_loss(gamma=2., alpha=0.25):
    def focal_loss(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        bce = tf.keras.backend.binary_crossentropy(y_true, y_pred)
        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        alpha_factor = y_true * alpha + (1 - y_true) * (1 - alpha)
        modulating_factor = tf.pow((1 - p_t), gamma)
        return tf.reduce_mean(alpha_factor * modulating_factor * bce)
    return focal_loss


#MLP model
def make_model_MLP(METRICS, train_features,output_bias=None):
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)
    

    model = keras.Sequential([
        keras.layers.Dense(256, activation='relu', input_shape=(train_features.shape[-1],), kernel_regularizer=keras.regularizers.l2(0.001)),
        keras.layers.Dropout(0.4),

        keras.layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.003)), #0.001
        keras.layers.Dropout(0.3),

        keras.layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.003)), #0.001
        keras.layers.Dropout(0.2),

        keras.layers.Dense(32, activation='relu'),
        
        keras.layers.Dense(1, activation='sigmoid')
    ])


    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        loss=keras.losses.BinaryCrossentropy(),
        #loss=binary_focal_loss(gamma=focal_gamma, alpha=focal_alpha),
        #metrics=METRICS
        metrics=[
            tf.keras.metrics.BinaryAccuracy(name="accuracy"),
            tf.keras.metrics.AUC(name="auc"),
            tf.keras.metrics.Precision(name="precision"),
            tf.keras.metrics.Recall(name="recall"),
        ])
    
    return model


#LSTM model
def make_model_LSTM(METRICS, output_bias=None):
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    model = Sequential()
    model.add(LSTM(32, input_shape=(qtde_attributes, 1), return_sequences=True))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(LSTM(16, return_sequences=False))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Para classificação binária



    model.compile(
                #loss='mse',
                #optimizer ='adam',
                optimizer=keras.optimizers.Adam(learning_rate=1e-4),
                loss=keras.losses.BinaryCrossentropy(),
                  metrics=[
                    tf.keras.metrics.BinaryAccuracy(name="accuracy"),
                    tf.keras.metrics.AUC(name="auc"),
                    tf.keras.metrics.Precision(name="precision"),
                    tf.keras.metrics.Recall(name="recall"),
                 ])


    return model


def make_model_SVM(METRICS,output_bias=None):
    
    model = keras.Sequential([
        keras.Input(shape=(qtde_attributes, )),  # Número de features do CSV
        RandomFourierFeatures(output_dim=1024, scale=5.0, kernel_initializer="gaussian"),
        layers.Dense(256, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ])

    
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        #loss=keras.losses.hinge,
        loss=keras.losses.BinaryCrossentropy(),
        #loss=binary_focal_loss(gamma=focal_gamma, alpha=focal_alpha),
       #metrics=METRICS,
        metrics=[
            tf.keras.metrics.BinaryAccuracy(name="accuracy"),
            tf.keras.metrics.AUC(name="auc"),
            tf.keras.metrics.Precision(name="precision"),
            tf.keras.metrics.Recall(name="recall"),
        ])
        
    return model
        

In [40]:
# 5 - Callbacks

def early_stopping(monitor, patience, verbose, restoreWeight):
    early_stopping = EarlyStopping(
        monitor=monitor,       # Métrica a ser monitorada val_accuracy val_loss
        patience=patience,              # Número de épocas sem melhoria para interromper o treinamento
        verbose=verbose,                # Verbose para exibir mensagens durante o treinamento
        restore_best_weights=restoreWeight # Restaura os melhores pesos quando o treinamento é interrompido
    )
    
    return early_stopping



def model_checkpoint(monitor, salve, mode, verbose):
    
    checkpoint_callback = ModelCheckpoint(
        'best_model_transformers.keras',  # Caminho para salvar o modelo
        monitor=monitor,  # Métrica para monitorar
        save_best_only=salve,  # Salvar apenas o modelo com a melhor métrica
        mode=mode,  # Modo para monitorar a métrica (min para perda, max para precisão)
        verbose=verbose  # Verbose level
    )
    
    return checkpoint_callback


def lr_scheduleold(epoch, lr):
# Exemplo simples: decaimento exponencial da taxa de aprendizado
    if epoch < 5:
        return lr  # Use a taxa de aprendizado inicial
    else:
        return lr * 0.95  # Decaimento a cada época após a 5ª
    

def lr_schedule(epoch, lr):
    if epoch < 10:
        return 0.01
    elif epoch < 50:
        return 0.005
    elif epoch < 100:
        return 0.001
    else:
        return 0.0005
    
    




In [41]:
# 5 - Execute Model

#class_weight
class_weight = calculate_class_weight(train_df)
print("pos:=> ", len(test_df[test_df['Class'] == 1]), "neg:=> ", len(test_df[test_df['Class'] == 0]))

cols_to_pop = ['harpnum']

{col: train_df.pop(col) for col in cols_to_pop}




#4.4 Balancing training set M and C
if set_balancing == 'smote':
    train_df = smote_balancing(train_df)
elif set_balancing == 'oversampling':
    train_df = oversampling_balancing(train_df)
elif set_balancing == 'undersampling':
    train_df = undersampling_balancing(train_df)


count_train_val_test(train_df, val_df, test_df) 

cols_to_pop = ['T_REC']
        
{col: train_df.pop(col) for col in cols_to_pop}


cols_to_pop = ['T_REC','harpnum']


        
{col: val_df.pop(col) for col in cols_to_pop}
{col: test_df.pop(col) for col in cols_to_pop}



#Form np arrays of labels and features.
train_labels = np.array(train_df.pop('Class'))
bool_train_labels = train_labels != 0
val_labels = np.array(val_df.pop('Class'))
test_labels = np.array(test_df.pop('Class'))


train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)

#Normalization data
train_features, val_features, test_features = normalization_data(train_features, val_features, test_features, "Standard")

METRICS = ["accuracy"]
if set_model_name == "mlp":
    model = make_model_MLP(METRICS,train_features)
elif set_model_name == "lstm":
    model = make_model_LSTM(METRICS)
elif set_model_name == "svm":
    model = make_model_SVM(METRICS)




#Callbacks

model_checkpoint_fraud = [keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.keras")]
early_stopping = early_stopping('val_auc', 50, 1, True)  
model_checkpoint = model_checkpoint('val_loss', True, 'min', 1)
reduce_lr = ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=5, min_lr=1e-5)
lr_callback = LearningRateScheduler(lr_schedule)

callbacks  = [reduce_lr, lr_callback, early_stopping]


if set_balancing == "weight":
    history = model.fit(
                train_features,
                train_labels,
                batch_size=batch,
                epochs=epoch,
                callbacks= callbacks,
                validation_data=(val_features, val_labels),
                class_weight=class_weight)
else:
    history = model.fit(
            train_features,
            train_labels,
            batch_size=batch,
            epochs=
            epoch,
            callbacks= callbacks,
            validation_data=(val_features, val_labels),
        )



     


Weight class 0: 0.50
Weight class 1: 74.11
{0: 0.5033962643376256, 1: 74.1102891728312}
pos:=>  1120 neg:=>  138733

 Train: neg=> 437834  pos=> 437834

Val: neg=> 107907  pos=> 857

Test: neg=> 138733  pos=> 1120


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m13683/13683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 3ms/step - accuracy: 0.8913 - auc: 0.9550 - loss: 0.2902 - precision: 0.8686 - recall: 0.9222 - val_accuracy: 0.8717 - val_auc: 0.9691 - val_loss: 0.3058 - val_precision: 0.0563 - val_recall: 0.9697 - learning_rate: 0.0100
Restoring model weights from the end of the best epoch: 1.


In [42]:
#6 - Salve training e val metrics


# Convert history to DataFrame
history_df = pd.DataFrame(history.history)



# Saved to a CSV file
history_df.to_csv('results/'+set_model_name+'-trainning_val_history_fold_'+fold+'.csv', index=False)

In [44]:
#7 - Model Evaluate

X_test = test_features
y_test = test_labels

results = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Results - Loss: {results[0]:.4f}, AUC: {results[1]:.4f}")

#probabilities
y_pred_probs = model.predict(X_test)

# open auxiliar
info_test = pd.read_csv('results/'+set_model_name+'-info_test-fold' + fold + '.csv')[['harpnum', 'T_REC', 'Letra_Class']]
info_test = info_test.reset_index(drop=True)


metrics_list = []
positivos_info_total = []
probs_com_classe_real = []

# --- CSV 3: harpnum + real class + probability + t_rec + Letra_Class ---
for idx, (true_class, prob) in enumerate(zip(y_test, y_pred_probs)):
    harpnum = info_test.loc[idx, 'harpnum']
    t_rec = info_test.loc[idx, 'T_REC']
    letra_class = info_test.loc[idx, 'Letra_Class']
    probs_com_classe_real.append({
        'harpnum': harpnum,
        'classe_real': int(true_class),
        'probabilidade_modelo': round(float(prob), 6),
        'T_REC': t_rec,
        'Letra_Class': letra_class
    })

# --- Loop of thresholds  ---
thresholds = np.arange(0.10, 1.00, 0.01)

for threshold in thresholds:
    y_pred = (y_pred_probs >= threshold).astype(int)
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    # Métricas
    bce = tf.keras.losses.BinaryCrossentropy()
    #computed_loss = bce(y_test, y_pred_probs).numpy()
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_probs)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    tss = sensitivity + specificity - 1
    total = tp + tn + fp + fn
    pe = ((tp + fn)*(tp + fp) + (tn + fn)*(tn + fp)) / (total**2)
    hss = (accuracy - pe) / (1 - pe) if (1 - pe) != 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

    metrics_list.append({
        'threshold': round(threshold, 2),
        'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp,
        'loss': results[0],
        'auc': auc_score,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'mcc': mcc,
        'tss': tss,
        'hss': hss,
        'true_positive_rate': sensitivity,
        'true_negative_rate': specificity,
        'false_positive_rate': fpr,
        'false_negative_rate': fnr
    })

    # --- CSV 2: harpnums positivos únicos + probabilidade + Letra_Class ---
    indices_positivos = np.where(y_pred == 1)[0]
    for idx in indices_positivos:
        prob = y_pred_probs[idx]
        harpnum = info_test.loc[idx, 'harpnum']
        letra_class = info_test.loc[idx, 'Letra_Class']
        positivos_info_total.append({
            'threshold': round(threshold, 2),
            'harpnum': harpnum,
            'probabilidade': round(float(prob), 6),
            'Letra_Class': letra_class
        })

# --- Save files ---
# CSV 1: metrics
pd.DataFrame(metrics_list).to_csv('results/'+set_model_name+'-metrics-fold' + fold + '.csv', index=False)

# CSV 2: unique positives with probability
df_positivos = pd.DataFrame(positivos_info_total).drop_duplicates()
df_positivos.to_csv('results/'+set_model_name+'-harpnums_all_thresholds_fold' + fold + '.csv', index=False)

# CSV 3: all samples with real class and probability
pd.DataFrame(probs_com_classe_real).to_csv('results/'+set_model_name+'-real_class_prob_harpnum_' + fold + '.csv', index=False)

print("\nResultados salvos com sucesso.")

Test Results - Loss: 0.2825, AUC: 0.8867
[1m4371/4371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 964us/step


  'probabilidade_modelo': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(float(prob), 6),
  'probabilidade': round(fl


Resultados salvos com sucesso.
