In [1]:
import os
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
import optuna

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Import library with current code functions
sys.path.append(os.path.join("..", "lib"))
import neural_network_functions as neural_net_fun
import general_functions as gf
import files_paths as fp


2024-07-19 12:58:41.105152: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report which GPU devices (if any) TensorFlow has detected.
physical_devices = tf.config.list_physical_devices('GPU')

if not physical_devices:
    # Empty list: nothing was registered under the 'GPU' device type.
    print("No GPU found. TensorFlow is running on the CPU.")
else:
    print("TensorFlow found the following GPU(s):")
    for device in physical_devices:
        print(device)

TensorFlow found the following GPU(s):
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


2024-07-19 12:58:43.603093: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-19 12:58:43.693705: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-19 12:58:43.693918: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

## Balancing the number of sequences per seed

In [3]:
def balance_sequences_per_seed(dt, n, list_seed_name, label):
    """Randomly down-sample the sequences of the given seeds to at most ``n`` each.

    For every seed in ``list_seed_name``, the distinct ``sample_id`` values of
    rows whose ``label`` column equals ``label`` and whose ``seed_name`` column
    equals the seed are collected. ``n`` of them are drawn at random (without
    replacement, using NumPy's global random state) and every row belonging to
    one of the non-selected ``sample_id`` values is dropped from the frame.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with at least the columns ``label``, ``seed_name`` and
        ``sample_id``.
    n : int
        Number of sequences to keep per seed.
    list_seed_name : sequence
        Seed names to balance. An empty sequence leaves ``dt`` untouched.
    label : object
        Value of the ``label`` column the balancing is restricted to.

    Returns
    -------
    pandas.DataFrame
        ``dt`` itself when nothing had to be removed, otherwise a filtered
        frame.
    """
    if len(list_seed_name) == 0:
        return dt
    for seed in list_seed_name:
        # A boolean mask (instead of a string-built query) compares against the
        # seed value itself, so string seed names work as well as numeric ones.
        selected = (dt['label'] == label) & (dt['seed_name'] == seed)
        sample_ids = dt.loc[selected, 'sample_id'].unique()
        if len(sample_ids) <= n:
            # Nothing to trim; sampling n items without replacement from fewer
            # than n candidates would raise a ValueError.
            continue
        kept_ids = np.random.choice(sample_ids, n, replace=False)
        ids_to_remove = np.setdiff1d(sample_ids, kept_ids)
        dt = dt[~dt['sample_id'].isin(ids_to_remove)]
    return dt

In [4]:
def preprocess_data_current_dataset(dataset_path):
    """Load a sequences CSV and build padded, normalized train/test arrays.

    Steps:
      1. Read the CSV and drop the stray ``'Unnamed: 0'`` index column if present.
      2. Keep only rows labelled ``'happy'`` or ``'surprise'`` and encode the
         label as an integer with ``LabelEncoder``.
      3. Use the columns at positions 3..24 as features.
      4. Left-pad every ``sample_id`` group with zero rows up to the length of
         the longest group.
      5. Normalize with ``neural_net_fun.normalize_data`` and make an 80/20
         shuffled split (``random_state=42``).

    Parameters
    ----------
    dataset_path : str or path-like
        CSV file containing at least the columns ``sample_id`` and ``label``.

    Returns
    -------
    tuple
        ``(timesteps, n_classes, n_features, X_balanced_normalized, Y_balanced,
        X_train, X_test, y_train, y_test)``. ``y_train`` and ``y_test`` are
        reshaped to column vectors; ``Y_balanced`` is one label per sequence.

    Raises
    ------
    ValueError
        If the file has no ``'happy'`` / ``'surprise'`` rows.
    """
    SEQUENCES = pd.read_csv(dataset_path)
    if 'Unnamed: 0' in SEQUENCES.columns:
        SEQUENCES = SEQUENCES.drop(columns=['Unnamed: 0'])

    SEQUENCES_DT = SEQUENCES.query("label == 'happy' | label == 'surprise'").copy()
    if SEQUENCES_DT.empty:
        raise ValueError(f"No 'happy' or 'surprise' rows found in {dataset_path}")

    encoder = LabelEncoder()
    SEQUENCES_DT['label_numerical'] = encoder.fit_transform(SEQUENCES_DT['label'])

    # Select features (by column position, so this depends on the CSV layout)
    features = SEQUENCES_DT.columns.to_list()[3:25]
    n_features = len(features)
    len_sample_max = SEQUENCES_DT.sample_id.value_counts().max()

    timesteps = len_sample_max
    # A single output unit is requested from the model builder.
    n_classes = 1

    # Padding: prepend zero rows so every sequence has `len_sample_max` rows.
    padded_sequences = []
    labels = []
    for _, group in SEQUENCES_DT.groupby('sample_id'):
        sequence_features = group[features]
        n_missing_rows = len_sample_max - len(sequence_features)
        if n_missing_rows > 0:
            pad = pd.DataFrame(np.zeros((n_missing_rows, n_features)), columns=sequence_features.columns)
            sequence_features_pad = pd.concat([pad, sequence_features], ignore_index=True)
        else:
            # Already at full length. Without this branch the previous group's
            # padded frame would be reused for this sample.
            sequence_features_pad = sequence_features.reset_index(drop=True)
        padded_sequences.append(sequence_features_pad)
        # All rows of a group share the sample's label; read it from the group
        # rather than re-filtering the whole frame.
        labels.append(group['label_numerical'].iloc[0])

    X = np.array(padded_sequences)
    Y = np.array(labels)

    # No class balancing is applied here; the names are kept for the callers.
    X_balanced, Y_balanced = X, Y

    X_balanced_normalized = neural_net_fun.normalize_data(X_balanced)

    X_train, X_test, y_train, y_test = train_test_split(
        X_balanced_normalized, Y_balanced, test_size=0.2, random_state=42, shuffle=True
    )

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    return timesteps, n_classes, n_features, X_balanced_normalized, Y_balanced, X_train, X_test, y_train, y_test


## Tuning hyperparameters

In [5]:
def objective(trial, model_def_func, timesteps, n_classes, n_features, X_balanced_normalized, Y_balanced):
    """Optuna objective: mean 5-fold validation accuracy for one hyperparameter set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    model_def_func : callable
        Model builder. It is called positionally with ``(timesteps, n_features,
        n_classes, conv_filters, kernel_size, lstm_units, dropout_conv,
        dropout_lstm, learning_rate, kernel_regularizer_l1,
        kernel_regularizer_l2, act, act_rec, m1_weight, m3_weight, m12_weight)``
        and must return an object exposing ``fit`` and ``predict``.
    timesteps, n_classes, n_features : int
        Forwarded unchanged to ``model_def_func``.
    X_balanced_normalized : numpy.ndarray
        Samples, indexed along the first axis by the fold indices.
    Y_balanced : numpy.ndarray
        One label per sample.

    Returns
    -------
    float
        Accuracy averaged over the folds (rounded predictions vs. labels).
    """
    # Suggest hyperparameters
    conv_filters = trial.suggest_int('conv_filters', 16, 64)
    kernel_size = trial.suggest_int('kernel_size', 2, 5)
    lstm_units = trial.suggest_int('lstm_units', 20, 100)
    dropout_conv = trial.suggest_float('dropout_conv', 0.2, 0.5)
    dropout_lstm = trial.suggest_float('dropout_lstm', 0.2, 0.5)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    kernel_regularizer_l1 = trial.suggest_float('kernel_regularizer_l1', 1e-6, 1e-2, log=True)
    kernel_regularizer_l2 = trial.suggest_float('kernel_regularizer_l2', 1e-6, 1e-2, log=True)
    batch_size = trial.suggest_int('batch_size', 16, 64)
    patience = trial.suggest_int('patience', 3, 10)
    act = trial.suggest_categorical('act', ['tanh', 'relu'])
    act_rec = trial.suggest_categorical('act_rec', ['tanh', 'sigmoid'])
    m1_weight = trial.suggest_float('m1_weight', 1.5, 5)
    m3_weight = trial.suggest_float('m3_weight', 1.5, 5)
    m12_weight = trial.suggest_float('m12_weight', 1.5, 5)

    # Initialize KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Store scores
    scores = []

    for train_index, val_index in kf.split(X_balanced_normalized):
        # A fresh model is built for every fold of every trial; clearing the
        # Keras global state first keeps memory from growing across them.
        tf.keras.backend.clear_session()

        X_train_fold, X_val = X_balanced_normalized[train_index], X_balanced_normalized[val_index]
        y_train_fold, y_val = Y_balanced[train_index], Y_balanced[val_index]

        # Create model
        model = model_def_func(
            timesteps,
            n_features,
            n_classes,
            conv_filters,
            kernel_size,
            lstm_units,
            dropout_conv,
            dropout_lstm,
            learning_rate,
            kernel_regularizer_l1,
            kernel_regularizer_l2,
            act,
            act_rec,
            m1_weight,
            m3_weight,
            m12_weight
        )

        # Early stopping and learning rate reduction callbacks; the learning
        # rate is halved after half the early-stopping patience.
        early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=patience // 2, min_lr=1e-6)

        # Train the model
        model.fit(
            X_train_fold,
            y_train_fold,
            epochs=50,
            batch_size=batch_size,
            verbose=0,
            validation_data=(X_val, y_val),
            callbacks=[early_stopping, lr_scheduler],
        )

        # Evaluate the model (silently, like fit) and round the outputs to the
        # nearest integer label.
        y_pred = np.round(model.predict(X_val, verbose=0))

        # Calculate accuracy score for this fold
        scores.append(accuracy_score(y_val, y_pred))

    # Calculate average score across folds
    return np.mean(scores)


In [6]:
def training_process(model_def_func, timesteps, n_classes, n_features,  X_train, X_test, y_train, y_test, model_parameters):
    """Build a model from ``model_parameters``, fit it, and plot its learning curves.

    Parameters
    ----------
    model_def_func : callable
        Model builder, called positionally with the three size arguments
        followed by the hyperparameters in the order used below.
    timesteps, n_classes, n_features : int
        Forwarded unchanged to ``model_def_func``.
    X_train, y_train : array-like
        Data the model is fitted on.
    X_test, y_test : array-like
        Passed to ``fit`` as ``validation_data``. Early stopping monitors
        ``val_loss``, so this split also decides when training stops.
    model_parameters : dict
        Hyperparameters. ``'learning_rate'``, ``'batch_size'`` and
        ``'patience'`` are required (``KeyError`` otherwise); every other key
        falls back to ``None`` when absent.

    Returns
    -------
    tuple
        ``(history, name)``: the object returned by ``model.fit`` and
        ``model_def_func.__name__``.
    """
    param = model_parameters.get  # missing optional keys resolve to None

    # Build the model from the supplied hyperparameters.
    model = model_def_func(
        timesteps,
        n_features,
        n_classes,
        param('conv_filters'),
        param('kernel_size'),
        param('lstm_units'),
        param('dropout_conv'),
        param('dropout_lstm'),
        model_parameters['learning_rate'],
        param('kernel_regularizer_l1'),
        param('kernel_regularizer_l2'),
        param('act'),
        param('act_rec'),
        param('m1_weight'),
        param('m3_weight'),
        param('m12_weight'),
    )

    batch_size = model_parameters['batch_size']
    patience = model_parameters['patience']

    # Stop once val_loss has not improved for `patience` epochs and roll back
    # to the best weights seen.
    stop_on_plateau = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    fit_options = dict(
        epochs=100,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        callbacks=[stop_on_plateau],
    )
    history = model.fit(X_train, y_train, **fit_options)

    neural_net_fun.plot_learning_curves(history)

    return history, model_def_func.__name__


## Run using a specific model with a specific dataset

In [7]:
# `model` is the model *builder* function, not a fitted model.
model = neural_net_fun.define_model

dataset_list = gf.find_files_in_all_subdirectories([os.path.join('.', 'result_sequences')], '*.CSV')
if len(dataset_list) == 0:
    # Fail with a readable message instead of a bare IndexError below.
    raise FileNotFoundError("No '*.CSV' dataset found under ./result_sequences")
dataset = dataset_list[0]

timesteps, n_classes, n_features, X_balanced_normalized, Y_balanced, X_train, X_test, y_train, y_test = preprocess_data_current_dataset(dataset)

# Getting the best hyperparameters
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, model, timesteps, n_classes, n_features, X_balanced_normalized, Y_balanced), n_trials=5)
print("Best params found:", study.best_params)

# Training model. The returned name goes into a local variable; unpacking it
# into `model.__name__` would overwrite an attribute of the imported builder.
current_history, model_name = training_process(model, timesteps, n_classes, n_features, X_train, X_test, y_train, y_test, study.best_params)

# Save the current history to a file
dataset_name = os.path.basename(dataset)
history_filename = f"history_{model_name}_{dataset_name}.pkl"
history_folder = "history_folder"
os.makedirs(history_folder, exist_ok=True)
history_file_path = os.path.join(history_folder, history_filename)
# NOTE(review): this pickles the whole object returned by fit(); if only the
# metric curves are needed downstream, dumping `current_history.history` would
# be lighter and would not depend on the model being picklable - confirm with
# the consumers of these files before changing the format.
with open(history_file_path, 'wb') as f:
    pickle.dump(current_history, f)

[I 2024-07-19 12:58:43,963] A new study created in memory with name: no-name-2dd5ad66-978f-4303-9b55-194c4ac9ff01
2024-07-19 12:58:43.972296: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355




2024-07-19 12:58:43.972502: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-19 12:58:43.972663: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-19 12:58:44.043670: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Cause: Unable to locate the source code of <function weighted_mse.<locals>.loss at 0x736d081743a0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
Cause: Unable to locate the source code of <function weighted_mse.<locals>.loss at 0x736d081743a0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


2024-07-19 12:58:45.344764: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8907
2024-07-19 12:58:45.489513: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-07-19 12:58:45.653428: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-07-19 12:58:45.693671: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x736b105a3330 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-07-19 12:58:45.693696: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2024-07-19 12:58:45.713777: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-19 12:58:45.82057

Cause: Unable to locate the source code of <function weighted_mse.<locals>.loss at 0x736d038c64c0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
Cause: Unable to locate the source code of <function weighted_mse.<locals>.loss at 0x736d038c64c0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
Cause: Unable to locate the source code of <function weighted_mse.<locals>.loss at 0x736c39901c10>. Note that function

[W 2024-07-19 12:59:15,775] Trial 0 failed with parameters: {'conv_filters': 39, 'kernel_size': 3, 'lstm_units': 60, 'dropout_conv': 0.25376140858612367, 'dropout_lstm': 0.25519195944862955, 'learning_rate': 0.009327015555256212, 'kernel_regularizer_l1': 0.0001607110812638946, 'kernel_regularizer_l2': 0.00674938997840212, 'batch_size': 56, 'patience': 8, 'act': 'tanh', 'act_rec': 'tanh', 'm1_weight': 2.1799082993320225, 'm3_weight': 3.0847938580126484, 'm12_weight': 3.625867635539239} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/orlandomota/miniconda3/envs/tensor-gpu-env/lib/python3.8/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_56534/2761237924.py", line 10, in <lambda>
    study.optimize(lambda trial: objective(trial, model, timesteps, n_classes, n_features, X_balanced_normalized, Y_balanced), n_trials=5)
  File "/tmp/ipykernel_56534/2093420806.py", l

KeyboardInterrupt: 

## Save the models

In [None]:
# `model` in this notebook is the builder function (neural_net_fun.define_model)
# and has no .save(). The fitted network is the one attached to the History
# object returned by fit().
# NOTE(review): assumes `current_history` is a Keras History callback, which
# exposes the trained model as `.model` - confirm.
trained_model = current_history.model
trained_model.save('lstm_time_series_model.keras')

# Save the classes object
# The label encoder lives inside preprocess_data_current_dataset and is not
# returned, so the class names are restated here. Preprocessing keeps only the
# 'happy' and 'surprise' labels, and LabelEncoder numbers labels in sorted
# order, so index 0 is 'happy' and index 1 is 'surprise' when both are present.
classes = np.array(['happy', 'surprise'])
np.save('classes.npy', classes)