In [18]:
#1 - LIBRARIES

import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, QuantileTransformer, PowerTransformer, Normalizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, accuracy_score,
    roc_auc_score, f1_score, log_loss, brier_score_loss, average_precision_score,  
    balanced_accuracy_score, matthews_corrcoef
)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.metrics import AUC, Precision, Recall, TruePositives, TrueNegatives, FalsePositives, FalseNegatives
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint


In [27]:
# 2 - CONFIGURATION (input files, feature selection, preprocessing, model hyper-parameters)

separator = ','
fold = "4"  # k-fold index; used in the input file names and in every result file name

url_train = "data/6h/train_df_kfold_" + fold + ".csv"
url_val = "data/6h/val_df_kfold_" + fold + ".csv"
url_test = "data/6h/test_df_kfold_" + fold + ".csv"


# FEATURE SELECTION
# Previously `list_col_delete` was assigned three times in a row, so only the last
# assignment (the SHAP subset) ever took effect. The alternatives are now named and
# the active one is chosen explicitly through `feature_set`.
TOTAL_ATTRIBUTES = 18  # number of attributes when only the label columns are removed

# Label-like columns that are always removed from the model inputs.
ALWAYS_DELETED_COLUMNS = ['Class_Flare', 'Letra_Class']

# Extra attribute columns removed for each named feature subset.
FEATURE_SETS = {
    # all 18 attributes
    'all18': [],
    # 10 Bobra attributes
    'bobra10': ['MEANGAM', 'MEANGBH', 'MEANGBT', 'MEANGBZ', 'MEANJZD', 'MEANJZH', 'MEANALP', 'MEANSHR'],
    # 10 SHAP attributes
    'shap10': ['ABSNJZH', 'MEANGAM', 'MEANJZD', 'MEANJZH', 'SAVNCPP', 'TOTPOT', 'TOTUSJH', 'TOTUSJZ'],
}

feature_set = 'shap10'  # 'all18', 'bobra10' or 'shap10'

if feature_set not in FEATURE_SETS:
    raise ValueError(f"Unknown feature_set '{feature_set}'. Choose from: {sorted(FEATURE_SETS)}.")

list_col_delete = ALWAYS_DELETED_COLUMNS + FEATURE_SETS[feature_set]

# Derived from the selection so it cannot drift out of sync with `list_col_delete`
# (it used to be hard-coded to 18 while the SHAP subset of 10 was actually active).
qtde_attributes = TOTAL_ATTRIBUTES - len(FEATURE_SETS[feature_set])


# NORMALIZATION
scaler_name = 'StandardScaler'  #'StandardScaler', 'RobustScaler', 'MinMaxScaler', 'NormalizerL1', 'PowerTransformer', 'QuantileTransformer'


# BALANCING
balanceamento = 'smote'  #'smote', 'oversampling', 'undersampling', 'class_weight', or 'none'

# Transformer model configuration
head_size = 192               # key dimension of each attention head
num_heads = 12
ff_dim = 256                  # width of the position-wise feed-forward layer
num_transformer_blocks = 6
mlp_units = [128, 64, 32]     # sizes of the dense layers in the classification head
dropout = 0.2                 # dropout inside the encoder blocks
mlp_dropout = 0.2             # dropout inside the classification head

epoch = 1
batch = 512

optimizer = keras.optimizers.Adam(learning_rate=1e-4)  # learning rate is 1e-4 (0.0001)
focal_gamma = 2
focal_alpha = 0.25






In [28]:
# 3 - READ AND PREPARE DATA

def parse_t_rec(raw_dates):
    """Parse a ``T_REC`` column that mixes full timestamps and date-only values.

    Each value is first parsed as ``%Y-%m-%d %H:%M:%S``; values that do not match
    are parsed as ``%Y-%m-%d``. Values matching neither format become ``NaT``.
    Any timezone information is dropped so that dates are not shifted.

    Parameters
    ----------
    raw_dates : pandas.Series
        The raw ``T_REC`` column as read from the CSV file.

    Returns
    -------
    pandas.Series
        Timezone-naive datetimes, same index as ``raw_dates``.
    """
    with_time = pd.to_datetime(raw_dates, errors='coerce', format='%Y-%m-%d %H:%M:%S')
    date_only = pd.to_datetime(raw_dates, errors='coerce', format='%Y-%m-%d')
    return with_time.fillna(date_only).dt.tz_localize(None)


def load_split(url):
    """Read one CSV split, convert ``T_REC`` to datetime and sort the rows by it."""
    split_df = pd.read_csv(url, sep=separator)
    split_df['T_REC'] = parse_t_rec(split_df['T_REC'])
    return split_df.sort_values(by='T_REC')


train_df = load_split(url_train)
val_df = load_split(url_val)
test_df = load_split(url_test)


# Save the identifying columns of the test split before they are dropped.
# NOTE: this file has one row per test RECORD, in T_REC order. It does not have
# one row per sliding window, so it cannot be indexed directly by window position.
harpnum_test = test_df['harpnum'].values
t_rec_test = test_df['T_REC'].values
letra_class_test = test_df['Letra_Class'].values

info_test = pd.DataFrame({
    'harpnum': harpnum_test,
    'T_REC': t_rec_test,
    'Letra_Class': letra_class_test
})
info_test.to_csv('results/window-transformers-info_test-fold'+fold+'.csv', index=False)


# Drop the columns that must not reach the model (labels and unselected attributes).
# The stray indented block that re-read the three CSV files at the end of this cell
# was removed: it was a syntax error, and re-reading would have discarded the date
# parsing, the sorting and the column removal done above.
train_df = train_df.drop(columns=list_col_delete)
val_df = val_df.drop(columns=list_col_delete)
test_df = test_df.drop(columns=list_col_delete)


In [29]:
# Sliding-window settings

SEQUENCE_SIZE = 36  # number of consecutive earlier records that make up one input window

# Every remaining column is a model input except the target ('Class') and the two
# bookkeeping columns used for ordering and grouping ('T_REC', 'harpnum').
FEATURES = train_df.columns.drop(['Class', 'T_REC', 'harpnum'], errors='ignore').tolist()

def create_sequences(df, sequence_size, features):
    """Build sliding windows of consecutive records, never crossing a ``harpnum``.

    Rows are grouped by ``harpnum`` and each group is sorted by ``T_REC``. For every
    position ``i`` that leaves room for it, one sample is produced: the window is
    rows ``i .. i + sequence_size - 1`` restricted to ``features``, and the label is
    the ``Class`` value of the row that immediately follows the window.

    Groups with ``sequence_size`` rows or fewer contribute no samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``harpnum``, ``T_REC``, ``Class`` and ``features``.
    sequence_size : int
        Number of rows in each window.
    features : list of str
        Column names used as model inputs.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, sequence_size, len(features))
    y : numpy.ndarray, shape (n_windows,)
        When no group is long enough, ``X`` still has three dimensions, with shape
        ``(0, sequence_size, len(features))``, so that callers which reshape on the
        last axis keep working.
    """
    windows, targets = [], []

    for _, group in df.groupby('harpnum'):
        group = group.sort_values('T_REC')  # chronological order within the region
        data = group[features].values
        labels = group['Class'].values

        for start in range(len(group) - sequence_size):
            stop = start + sequence_size
            windows.append(data[start:stop])
            targets.append(labels[stop])  # predict the record right after the window

    if not windows:
        # np.array([]) would have shape (0,), losing the window and feature axes.
        return np.empty((0, sequence_size, len(features))), np.array(targets)

    return np.array(windows), np.array(targets)


x_train, y_train = create_sequences(train_df, SEQUENCE_SIZE, FEATURES)
x_val, y_val = create_sequences(val_df, SEQUENCE_SIZE, FEATURES)
x_test, y_test = create_sequences(test_df, SEQUENCE_SIZE, FEATURES)

print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_val shape: {x_val.shape}")
print(f"x_test shape: {x_test.shape}")


# --- Normalization ---
# Only the scaler selected by `scaler_name` is instantiated.
SCALER_FACTORIES = {
    'StandardScaler': StandardScaler,
    'RobustScaler': RobustScaler,
    'MinMaxScaler': MinMaxScaler,
    'NormalizerL1': lambda: Normalizer(norm='l1'),
    'PowerTransformer': lambda: PowerTransformer(method='yeo-johnson'),
    'QuantileTransformer': lambda: QuantileTransformer(output_distribution='normal'),
}

if scaler_name not in SCALER_FACTORIES:
    raise ValueError(f"Scaler '{scaler_name}' não reconhecido. Escolha um válido.")
scaler = SCALER_FACTORIES[scaler_name]()


def scale_windows(windows):
    """Apply the fitted scaler to a (samples, steps, features) array, keeping its shape."""
    as_rows = windows.reshape(-1, windows.shape[-1])
    return scaler.transform(as_rows).reshape(windows.shape)


# The scaler works on 2-D data, so the windows are collapsed to (rows, features).
# It is fitted on the training windows only, so no statistics leak from val/test.
scaler.fit(x_train.reshape(-1, x_train.shape[-1]))

x_train = scale_windows(x_train)
x_val = scale_windows(x_val)
x_test = scale_windows(x_test)


# --- Balancing ---
# The imbalanced-learn samplers need 2-D input: one flat row per window.
n_samples, seq_len, n_features = x_train.shape
x_train_flat = x_train.reshape((n_samples, seq_len * n_features))

RESAMPLER_FACTORIES = {
    'smote': lambda: SMOTE(sampling_strategy=0.6, k_neighbors=3, random_state=42),
    'oversampling': lambda: RandomOverSampler(sampling_strategy=0.6, random_state=42),
    'undersampling': lambda: RandomUnderSampler(sampling_strategy=0.6, random_state=42),
}

class_weights = None  # only set by the 'class_weight' mode

if balanceamento in RESAMPLER_FACTORIES:
    # Every resampler receives the flattened windows. The random over/under-samplers
    # used to be given the 3-D array, which imbalanced-learn does not accept.
    resampler = RESAMPLER_FACTORIES[balanceamento]()
    X_train_res, y_train_res = resampler.fit_resample(x_train_flat, y_train)

elif balanceamento == 'class_weight':
    # Training data is left unchanged; only per-class loss weights are computed.
    X_train_res, y_train_res = x_train, y_train
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    # Plain Python int/float so the mapping can be handed to Keras as-is.
    class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}

elif balanceamento == 'none':
    X_train_res, y_train_res = x_train, y_train

else:
    raise ValueError("Invalid balancing method. Choose from: 'smote', 'oversampling', 'undersampling', 'class_weight', 'none'.")


# Back to the (samples, steps, features) layout expected by the model
x_train = X_train_res.reshape((-1, seq_len, n_features))
y_train = y_train_res

print(f"New x_train shape after balancing ({balanceamento}): {x_train.shape}")
print(f"New y_train shape after balancing ({balanceamento}): {y_train.shape}")


# Transformers' Model
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one Transformer encoder block to a sequence tensor.

    The block has two sub-layers, each preceded by layer normalization and wrapped
    in a residual connection:

    1. multi-head self-attention (query and value are both the normalized input);
    2. a feed-forward network built from two kernel-size-1 ``Conv1D`` layers, the
       second of which projects back to the channel count of ``inputs``.

    Parameters
    ----------
    inputs : tensor
        Input sequence; its last dimension is the channel count.
    head_size : int
        ``key_dim`` of the attention layer.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Number of filters in the first feed-forward convolution.
    dropout : float, default 0
        Dropout rate used in the attention layer and after each sub-layer's
        first transformation.

    Returns
    -------
    tensor
        Output with the same last dimension as ``inputs``.
    """
    # Sub-layer 1: self-attention with a skip connection around it
    normed_inputs = layers.LayerNormalization(epsilon=1e-6)(inputs)
    self_attention = layers.MultiHeadAttention(
        key_dim=head_size,
        num_heads=num_heads,
        dropout=dropout,
    )
    attended = self_attention(normed_inputs, normed_inputs)
    attended = layers.Dropout(dropout)(attended)
    attention_out = attended + inputs

    # Sub-layer 2: feed-forward network with a skip connection around it
    n_channels = inputs.shape[-1]
    hidden = layers.LayerNormalization(epsilon=1e-6)(attention_out)
    hidden = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(hidden)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Conv1D(filters=n_channels, kernel_size=1)(hidden)

    return projected + attention_out

def build_model(input_shape, head_size, num_heads, ff_dim, num_transformer_blocks, mlp_units, dropout=0, mlp_dropout=0):
    """Assemble the Transformer-based binary classifier.

    The network stacks ``num_transformer_blocks`` encoder blocks, averages the
    result over the sequence axis, passes it through a ReLU MLP (one ``Dense`` plus
    ``Dropout`` per entry of ``mlp_units``) and ends in a single sigmoid unit.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, without the batch axis.
    head_size, num_heads, ff_dim : int
        Forwarded to ``transformer_encoder``.
    num_transformer_blocks : int
        Number of stacked encoder blocks.
    mlp_units : iterable of int
        Width of each dense layer in the classification head.
    dropout : float, default 0
        Dropout rate inside the encoder blocks.
    mlp_dropout : float, default 0
        Dropout rate after each dense layer of the head.

    Returns
    -------
    keras.Model
        Uncompiled model producing one probability per sample.
    """
    model_input = keras.Input(shape=input_shape)

    # Encoder stack
    encoded = model_input
    for _block in range(num_transformer_blocks):
        encoded = transformer_encoder(encoded, head_size, num_heads, ff_dim, dropout)

    # Collapse the sequence axis, then classify
    hidden = layers.GlobalAveragePooling1D()(encoded)
    for units in mlp_units:
        hidden = layers.Dense(units, activation="relu")(hidden)
        hidden = layers.Dropout(mlp_dropout)(hidden)

    probability = layers.Dense(1, activation="sigmoid")(hidden)
    return keras.Model(model_input, probability)



# Shape of a single training sample: x_train.shape without the leading sample axis (no reshape happens here)
input_shape = x_train.shape[1:]


# --- Focal loss ---
def binary_focal_loss(gamma=2., alpha=0.25):
    """Create a binary focal-loss function usable as a Keras ``loss``.

    The returned function scales the per-element binary cross-entropy by two factors:

    * ``alpha`` for positive targets and ``1 - alpha`` for negative targets;
    * ``(1 - p_t) ** gamma``, where ``p_t`` is the predicted probability of the
      true class, so confidently correct predictions contribute less.

    Parameters
    ----------
    gamma : float, default 2.
        Exponent of the ``(1 - p_t)`` factor.
    alpha : float, default 0.25
        Weight applied to positive targets.

    Returns
    -------
    callable
        ``focal_loss(y_true, y_pred)`` returning the mean weighted loss as a scalar.
    """
    def focal_loss(y_true, y_pred):
        targets = tf.cast(y_true, tf.float32)
        non_targets = 1 - targets

        # Probability the model assigned to the class that is actually true
        prob_true_class = targets * y_pred + non_targets * (1 - y_pred)
        class_balance = targets * alpha + non_targets * (1 - alpha)
        focusing = tf.pow((1 - prob_true_class), gamma)

        cross_entropy = tf.keras.backend.binary_crossentropy(targets, y_pred)
        return tf.reduce_mean(class_balance * focusing * cross_entropy)

    return focal_loss



# Build the network from the hyper-parameters set in the configuration cell
model = build_model(
    input_shape,
    head_size=head_size,
    num_heads=num_heads,
    ff_dim=ff_dim,
    num_transformer_blocks=num_transformer_blocks,
    mlp_units=mlp_units,
    mlp_dropout=mlp_dropout,
    dropout=dropout
)

# Confusion-matrix counts are tracked as metrics so they are logged every epoch
model.compile(
    optimizer=optimizer,
    loss=binary_focal_loss(gamma=focal_gamma, alpha=focal_alpha),
    metrics=[
        AUC(name='auc'),
        Precision(name='precision'),
        Recall(name='recall'),
        TruePositives(name='tp'),
        TrueNegatives(name='tn'),
        FalsePositives(name='fp'),
        FalseNegatives(name='fn')
    ]
)


# append=True: re-running this cell adds rows to the existing per-fold log file
csv_logger = CSVLogger('results/window-transformers-trainning_log_fold_'+fold+'.csv', append=True)

# All callbacks watch validation AUC, where higher is better (mode='max')
callbacks = [
    EarlyStopping(monitor='val_auc', patience=10, mode='max', restore_best_weights=True),
    ModelCheckpoint('best_model.keras', monitor='val_auc', mode='max', save_best_only=True),
    ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=5, mode='max', verbose=1),
    csv_logger
]


# `class_weights` is None unless the 'class_weight' balancing mode is active, and
# None is the default of `fit`, so a single call covers both cases. The keyword
# accepted by `Model.fit` is `class_weight` (singular); the previous
# `class_weights=` spelling raised a TypeError whenever that branch ran.
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=epoch,
    batch_size=batch,
    callbacks=callbacks,
    class_weight=class_weights,
    verbose=1
)
    





x_train shape: (321616, 36, 10)
y_train shape: (321616,)
x_val shape: (79260, 36, 10)
x_test shape: (102572, 36, 10)
New x_train shape after SMOTE: (510364, 36, 10)
New y_train shape after SMOTE: (510364,)
[1m997/997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8293s[0m 8s/step - auc: 0.9529 - fn: 55899.0000 - fp: 16640.0000 - loss: 0.0297 - precision: 0.8906 - recall: 0.7079 - tn: 302338.0000 - tp: 135487.0000 - val_auc: 0.9541 - val_fn: 207.0000 - val_fp: 4068.0000 - val_loss: 0.0210 - val_precision: 0.1114 - val_recall: 0.7113 - val_tn: 74475.0000 - val_tp: 510.0000 - learning_rate: 1.0000e-04


In [30]:
#6 - Save training and validation metrics

# Destination of the history table for this fold
history_csv_path = 'results/window-transformers-trainning_val_history_fold_' + fold + '.csv'

# `history.history` maps each logged metric name to its list of recorded values;
# it becomes one column per metric.
history_df = pd.DataFrame.from_dict(history.history)

history_df.to_csv(history_csv_path, index=False)

In [31]:
#7 - Model Evaluate


results = model.evaluate(x_test, y_test, verbose=0)
print(f"Test Results - Loss: {results[0]:.4f}, AUC: {results[1]:.4f}")

# Predicted probability of the positive class, one value per test window
y_pred_probs = model.predict(x_test).flatten()

# --- Align the saved test info with the windows ---
# The info file has one row per test RECORD in T_REC order, whereas y_test has one
# entry per WINDOW, produced region by region (ascending harpnum, chronological
# inside each region), each labelled by the record that follows its SEQUENCE_SIZE
# rows. Indexing the raw file by window position therefore attached the wrong
# harpnum / T_REC / Letra_Class to every prediction.
# The same ordering is rebuilt here: a stable sort by harpnum keeps the T_REC order
# inside each region, then the first SEQUENCE_SIZE records of each region are
# dropped because they are never a window label.
# NOTE(review): if one harpnum contains duplicated T_REC values, the order among
# those tied rows may differ from the one used when the windows were built.
info_test = pd.read_csv('results/window-transformers-info_test-fold' + fold + '.csv')[['harpnum', 'T_REC', 'Letra_Class']]
info_test = info_test.sort_values('harpnum', kind='stable')
position_in_region = info_test.groupby('harpnum').cumcount()
info_test = info_test[position_in_region >= SEQUENCE_SIZE].reset_index(drop=True)

if len(info_test) != len(y_test):
    raise ValueError(
        f"Test info rows ({len(info_test)}) do not match the number of test windows "
        f"({len(y_test)}); the info file is out of sync with the current data."
    )

# Values exactly as they are written to the CSV files
true_classes = [int(label) for label in y_test]
rounded_probs = np.array([round(float(prob), 6) for prob in y_pred_probs])

# --- CSV 3: harpnum + real class + probability + t_rec + Letra_Class ---
df_real_class_prob = pd.DataFrame({
    'harpnum': info_test['harpnum'],
    'classe_real': true_classes,
    'probabilidade_modelo': rounded_probs,
    'T_REC': info_test['T_REC'],
    'Letra_Class': info_test['Letra_Class']
})

# Columns of CSV 2 other than the threshold, shared by every threshold
positives_base = pd.DataFrame({
    'harpnum': info_test['harpnum'],
    'probabilidade': rounded_probs,
    'Letra_Class': info_test['Letra_Class']
})

# Threshold-independent metrics: computed once instead of once per threshold
bce = tf.keras.losses.BinaryCrossentropy()
computed_loss = bce(y_test, y_pred_probs).numpy()
auc_score = roc_auc_score(y_test, y_pred_probs)

# --- Loop of thresholds ---
# Rounded up front so the threshold applied is exactly the threshold reported
# (np.arange accumulates floating-point error, e.g. 0.30000000000000004).
thresholds = np.round(np.arange(0.10, 1.00, 0.01), 2)

metrics_list = []
positive_frames = []

for threshold in thresholds:
    threshold = float(threshold)
    y_pred = (y_pred_probs >= threshold).astype(int)

    # labels=[0, 1] guarantees a 2x2 matrix even when only one class is present,
    # so the four-way unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = (int(count) for count in cm.ravel())

    # Metrics
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    tss = sensitivity + specificity - 1
    total = tp + tn + fp + fn
    # Agreement expected by chance, used to turn accuracy into the HSS
    pe = ((tp + fn)*(tp + fp) + (tn + fn)*(tn + fp)) / (total**2)
    hss = (accuracy - pe) / (1 - pe) if (1 - pe) != 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

    metrics_list.append({
        'threshold': threshold,
        'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp,
        'loss': computed_loss,
        'auc': auc_score,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'mcc': mcc,
        'tss': tss,
        'hss': hss,
        'true_positive_rate': sensitivity,
        'true_negative_rate': specificity,
        'false_positive_rate': fpr,
        'false_negative_rate': fnr
    })

    # --- CSV 2: unique positive predictions (harpnum + probability + Letra_Class) ---
    # Duplicates are removed per threshold; since the threshold is part of each row
    # this matches removing them once over the whole table.
    positives = positives_base.loc[y_pred == 1].drop_duplicates()
    if not positives.empty:
        positives.insert(0, 'threshold', threshold)
        positive_frames.append(positives)

# --- Save files ---
# CSV 1: metrics
pd.DataFrame(metrics_list).to_csv('results/window-transformers-metrics-fold' + fold + '.csv', index=False)

# CSV 2: unique positives with probability
if positive_frames:
    df_positivos = pd.concat(positive_frames, ignore_index=True)
else:
    df_positivos = pd.DataFrame(columns=['threshold', 'harpnum', 'probabilidade', 'Letra_Class'])
df_positivos.to_csv('results/window-transformers-harpnums_all_thresholds_fold' + fold + '.csv', index=False)

# CSV 3: all samples with real class and probability
df_real_class_prob.to_csv('results/window-transformers-real_class_prob_harpnum_' + fold + '.csv', index=False)

print("\nResults saved successfully.")

Test Results - Loss: 0.0178, AUC: 0.9518
[1m3206/3206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m273s[0m 85ms/step

Results saved successfully.
