Aquí voy a definir mediante un notebook, paso a paso, el train_LSE

In [2]:
# %pip install tensorflow
# %pip install scikit-learn
# %pip freeze

In [1]:
# Tensorflow is an open-source ML framework developed by Google
import tensorflow as tf

# Upper bound (in MB) on the memory TensorFlow may allocate on the first GPU.
GPU_MEMORY_LIMIT_MB = 5000

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Stable (non-experimental) names of set_virtual_device_configuration /
        # VirtualDeviceConfiguration; only the first detected GPU is configured.
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=GPU_MEMORY_LIMIT_MB)]
        )
        print(f"We limit the memory of the GPU to {GPU_MEMORY_LIMIT_MB}MB")
    except RuntimeError as e:
        # NOTE(review): presumably raised when the GPU was already initialized
        # before this cell ran — confirm against the TensorFlow documentation.
        print("Error al configurar la GPU:", e)
else:
    print("No se detectó ninguna GPU.")


# Python library for scientific computing (arrays)
import numpy as np

# Confusion matrix used to evaluate the performance of the model
from sklearn.metrics import ConfusionMatrixDisplay

# A module that allows us to interact with the OS
import os

print(tf.__version__) # Show the installed TensorFlow version
print(tf.config.list_physical_devices()) # List the physical devices TensorFlow detected

# Keras is a Python high-level NNs API (application programming interface) integrated into TensorFlow
# Import regularization techniques (L1, L2...) 
from tensorflow.keras                       import regularizers

# Import the Sequential and Model classes
    # a) Sequential allows for linear stacking of layers
    # b) Model provides more flexibility for defining complex NN architectures
from tensorflow.keras.models                import Sequential, Model

# Import all layer classes from tensorflow.keras.layers (Dense, Conv2D, MaxPooling2D...)
from tensorflow.keras.layers                import *

# Import ModelCheckpoint and ReduceLROnPlateau classes
    # These classes are usually called during training
    # a) ModelCheckpoint is used to save model weights
    # b) ReduceLROnPlateau is used to reduce the learning rate 
    # If the monitored metric did not improve after some epochs, the learning rate will be reduced
    # Let us assume the "factor" parameter is 0.1, then the learning rate will be reduced by a factor of 10
    # We can define whether we want it to get activated when the metric has stopped improving or when it has stopped decreasing
from tensorflow.keras.callbacks             import ModelCheckpoint, ReduceLROnPlateau

# Import the SGD (Stochastic Gradient Descent) to minimize the loss function during the training of a NN
from tensorflow.keras.optimizers            import SGD

import matplotlib.pyplot as plt

2025-04-28 21:20:05.816936: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-28 21:20:05.873989: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


No se detectó ninguna GPU.
2.16.1
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


2025-04-28 21:20:09.385700: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Ahora se definen algunas funciones que se utilizarán más adelante

In [4]:
# This function will be useful after your model has been trained, during the evaluation phase, to visualize how well your model performs per class. It builds a confusion matrix
def getConfusionMatrix(model, dataset, class_names=None):
    """
    Generate and display a confusion matrix for a trained model on a given dataset.

    Parameters:
        model (tf.keras.Model): Trained model; only its ``predict`` method is used.
        dataset (tf.data.Dataset): Iterable yielding ``(image_batch, label_batch)`` pairs,
            where every label row holds one score per class (one-hot encoded labels).
        class_names (list[str] | None): Tick labels of the matrix, ordered by class index.
            When omitted, the notebook-level ``class_names`` variable is used if it exists,
            so existing two-argument calls keep working.

    Returns:
        sklearn ConfusionMatrixDisplay object (also displays the matrix).

    Raises:
        ValueError: If ``dataset`` yields no batches.
    """
    if class_names is None:
        # Backward compatibility: earlier versions read the global directly.
        class_names = globals().get("class_names")

    true_indices = []       # one array of true class indices per batch
    predicted_indices = []  # one array of predicted class indices per batch

    # Labels and predictions are collected in the same pass, so they stay aligned
    # even when the dataset reshuffles on every iteration.
    for image_batch, label_batch in dataset:
        probabilities = model.predict(image_batch, verbose=0)
        predicted_indices.append(np.argmax(probabilities, axis=-1))
        # argmax (instead of searching for the value 1.0) also handles labels that
        # are not exactly one-hot and avoids a per-sample Python loop.
        true_indices.append(np.argmax(np.asarray(label_batch), axis=-1))

    if not true_indices:
        raise ValueError("Cannot build a confusion matrix from an empty dataset.")

    y_true = np.concatenate(true_indices, axis=0)
    y_pred = np.concatenate(predicted_indices, axis=0)

    display_kwargs = {"display_labels": class_names, "cmap": plt.cm.Blues}
    if class_names is not None:
        # Pin the full label set so the matrix keeps one row/column per class
        # even if some class never appears in this dataset.
        display_kwargs["labels"] = np.arange(len(class_names))

    return ConfusionMatrixDisplay.from_predictions(y_true, y_pred, **display_kwargs)


In [5]:
# This function, logModelTrainFinished(...), is meant to be executed after training the model. It saves your final model, logs a confusion matrix 
# tracks misclassified examples, stores all this into Neptune.ai for experiment tracking
# It’s a post-training logging utility, essential for experiment reproducibility and understanding model behavior.

def logModelTrainFinished(run, model, validation_ds, class_names):
    """
    Post-training logging: save the model, evaluate it and upload the artifacts to Neptune.

    Steps, in order:
        1. Save ``model`` to ``model_trainer/current_run_model.keras`` and upload it.
        2. Save and upload the confusion matrix computed on ``validation_ds``.
        3. Save and upload every misclassified image of ``validation_ds``.
        4. Store the number of logged epochs and the model parameter count.
        5. Re-save and upload the checkpoint found at ``model_best.keras`` (if present).
        6. Store the training duration derived from the ``eval/accuracy`` timestamps.
        7. Wait for pending uploads and stop the run (done even if a step above fails).

    Parameters:
        run: Neptune run object (used to log files and metrics)
        model: your trained Keras model
        validation_ds: dataset used to evaluate performance (ideally validation or test);
            must yield ``(image_batch, label_batch)`` pairs with one-hot labels
        class_names: list of string names of classes, ordered by class index

    Raises:
        Any exception raised by a step is propagated after the run has been stopped.
    """
    import shutil  # local import: keeps this cell self-contained

    try:
        # Folder for temporary files (models, confusion matrix, misclassified images)
        os.makedirs("model_trainer", exist_ok=True)

        # Save the latest model and upload it.
        # NOTE: the Neptune field is named "model_last.h5" although the file is in .keras format.
        model.save("model_trainer/current_run_model.keras")
        run["model_last.h5"].upload("model_trainer/current_run_model.keras")

        # Confusion matrix figure -> PNG -> Neptune
        confusion_display = getConfusionMatrix(model, validation_ds)
        confusion_display.figure_.savefig("model_trainer/confusion_matrix.png")
        run["eval/conf_matrix"].upload("model_trainer/confusion_matrix.png")

        # Start from an empty "missclasified" folder so images from previous trainings are dropped
        # (portable replacement for `rm -rf`).
        shutil.rmtree("model_trainer/missclasified", ignore_errors=True)
        os.makedirs("model_trainer/missclasified", exist_ok=True)

        # The batch counter is part of the file name so that names are unique across batches.
        for cubatch, (image_batch, label_batch) in enumerate(validation_ds):
            preds = model.predict(image_batch)
            predicted_classes = np.argmax(preds, axis=-1)
            true_classes = np.argmax(np.asarray(label_batch), axis=-1)

            # Dump and upload only the samples whose prediction differs from the label
            for i in np.flatnonzero(predicted_classes != true_classes):
                i = int(i)
                imgName = (
                    "batch_" + str(cubatch) + "_img_" + str(i)
                    + "_true_" + class_names[true_classes[i]]
                    + "_pred_" + class_names[predicted_classes[i]] + ".png"
                )
                imgPath = "missclasified/" + imgName
                plt.imsave("model_trainer/" + imgPath, image_batch[i].numpy())
                run["eval/" + imgPath].upload("model_trainer/" + imgPath)

        # Accuracy series logged to Neptune during training (one row per logged epoch)
        try:
            accuracy_dataframe = run["eval/accuracy"].fetch_values()
        except Exception as error:
            # Best effort: without the series only the epoch count and duration are affected.
            print(f"Could not fetch eval/accuracy from Neptune: {error}")
            accuracy_dataframe = None

        epochs_executed = 0 if accuracy_dataframe is None else len(accuracy_dataframe)
        run["train/epochs_executed"] = epochs_executed
        run["model_params_count"] = model.count_params()

        # Upload the best model written to "model_best.keras" so it is kept with the run.
        # NOTE(review): a checkpoint left over from an earlier training at the same path
        # would also be picked up here — confirm this cannot happen in your workflow.
        if os.path.exists("model_best.keras"):
            best_model = tf.keras.models.load_model("model_best.keras")
            best_model.save("model_trainer/best_run_model.keras")
            run["best_model"].upload("model_trainer/best_run_model.keras")
        else:
            print("model_best.keras not found; skipping best-model upload.")

        # Training duration = time between the first and the last logged accuracy value.
        # Skipped when nothing was logged (the original code crashed on `.iloc` in that case).
        if epochs_executed > 0:
            first_timestamp = accuracy_dataframe.iloc[0]["timestamp"]
            last_timestamp = accuracy_dataframe.iloc[-1]["timestamp"]
            duration_timedelta = last_timestamp - first_timestamp

            run["train/duration_seconds"] = duration_timedelta.total_seconds()
            run["train/duration_text"] = str(duration_timedelta)
    finally:
        # Wait for Neptune to upload files, then close the run even if a step above failed.
        run.wait()
        run.stop()

Ahora ya pasamos al código

In [6]:
# Project-wide paths: the dataset root folder and the dataset variant used in this notebook
DATASET_BASE_PATH = './Datasets_LSE'
DATASET_PATH = f"{DATASET_BASE_PATH}/Letters192"
# TRAINED_MODELS_PATH = '../TrainedModels'


In [7]:
# Execute hidden_neptune.py to get api key
# The token is read from a local text file; the context manager closes the file and
# strip() removes the trailing newline that would otherwise end up in the token.
with open("./neptune_api_token_LSE.txt") as token_file:
    neptune_api_token = token_file.read().strip()
using_neptune = True

# %pip install neptune
import neptune
os.environ["NEPTUNE_API_TOKEN"] = neptune_api_token


Ahora debo cargar los datasets:

In [8]:
# Load the image folders as batched tf.data.Dataset objects for training and evaluation.

bs = 32 # Batch size: images loaded per batch. Previously 64, but that caused GPU errors.
image_side = 192 # Images are resized to image_side x image_side pixels. NOTE(review): probably redundant if the Letters192 images are already 192x192 — confirm.


def _load_image_split(split_suffix):
    """Build the shuffled, batched dataset for one sub-folder of DATASET_PATH (one folder per class)."""
    return tf.keras.utils.image_dataset_from_directory(
        DATASET_PATH + split_suffix,
        label_mode = "categorical",
        shuffle = True,
        image_size = (image_side, image_side),
        batch_size = bs)


# The dataset folder must contain the train and validation splits as categorical folders
with tf.device('/cpu:0'):
    raw_train_ds = _load_image_split("/TrainV")
    raw_validation_ds = _load_image_split("/Validation")

2025-04-10 12:21:41.230816: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13764 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:98:00.0, compute capability: 7.5


Found 57967 files belonging to 23 classes.
Found 6654 files belonging to 23 classes.


In [9]:
# Keep the class labels found in the training folders and rescale the images of both datasets.
class_names = raw_train_ds.class_names
print(class_names)

# Rescale pixel values from 0-255 to 0-1
normalization_layer = tf.keras.layers.Rescaling(1./255)


def _normalize_batch(images, labels):
    """Rescale the images of one batch; the labels pass through unchanged."""
    return normalization_layer(images), labels


train_ds = raw_train_ds.map(_normalize_batch)
validation_ds = raw_validation_ds.map(_normalize_batch)


['A', 'B', 'C', 'CH', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'W', 'X']


In [10]:
# Random, on-the-fly variations of the input images (flip, rotation, zoom, shift, brightness)
# intended to make the model more robust and less prone to overfitting the training data.

augmentation_layers = [
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.01),
    tf.keras.layers.RandomZoom(0.02),
    # NOTE(review): fill_value is presumably only used with fill_mode='constant' — confirm in the Keras docs.
    tf.keras.layers.RandomTranslation(height_factor=0.08, width_factor=0.08, fill_mode='nearest', fill_value=0.5),
    # value_range matches images already rescaled to 0-1
    tf.keras.layers.RandomBrightness(factor=[-0.15,0.1], value_range=(0, 1)),
    #tf.keras.layers.RandomCrop(25,25),
    #tf.keras.layers.RandomContrast(0.02)
]
data_augmentation = tf.keras.Sequential(augmentation_layers)

# Build with the expected input shape; per the original note this is meant to keep
# the augmentation block from changing the image shape.
data_augmentation.build((None, image_side, image_side, 3))


In [11]:
# Hyper-parameters of the layers added on top of the base model:
#   "dense"    -> units of the fully connected layer
#   "l2reg"    -> L2 regularization factor of that layer
#   "dropout2" -> rate of the dropout layer that follows it
# NOTE(review): "dropout1" is not read by the model-building cell visible in this notebook — confirm whether it is still needed.
model_params = {"dropout1": 0.3, "dropout2": 0.2, "dense": 74, "l2reg": 0.02} # Pamela's values for DenseNet201

In [12]:
# Open the Neptune project with the API token loaded earlier.
# NOTE(review): `neptune` is already imported in an earlier cell and `project` is not
# referenced again in the visible cells — confirm whether this cell is still required.
import neptune
project = neptune.init_project(project="LSE-Sign-Language2/LSE-Sign-Language", api_token=neptune_api_token)

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/LSE-Sign-Language2/LSE-Sign-Language/


In [13]:
# Expose the Neptune token and the default project through environment variables.
# NOTE(review): NEPTUNE_API_TOKEN is also set in an earlier cell; presumably repeated
# here so that this cell can be run on its own — confirm.
os.environ["NEPTUNE_API_TOKEN"] = neptune_api_token
os.environ["NEPTUNE_PROJECT"] = "LSE-Sign-Language2/LSE-Sign-Language"

In [8]:
# Aqui hago una exploración de los modelos para ver que capas congelar durante el Transfer Learning
# DENSENET201
#base_model = tf.keras.applications.DenseNet201(
#     input_shape=(192, 192, 3),
#     include_top=False,
#     weights='imagenet' )

# MOBILENET
#base_model = tf.keras.applications.MobileNet(
#     input_shape=(192, 192, 3), 
#     include_top=False, 
#     weights='imagenet'
#     )

# MOBILENETV2
#base_model = tf.keras.applications.MobileNetV2(
#    input_shape=(192, 192, 3), 
#    include_top=False, 
#    weights='imagenet'
# )

# InceptionV3
#base_model = tf.keras.applications.InceptionV3(
#    input_shape=(192, 192, 3), 
#    include_top=False, 
#    weights='imagenet'
#)

# ResNet152
base_model = tf.keras.applications.ResNet152(
    input_shape=(192,192,3),
    include_top=False,
    weights='imagenet'
)

print(len(base_model.layers))
# base_model.summary()


#for i, layer in enumerate(base_model.layers):
#    output_shape = getattr(layer, 'output_shape', 'N/A')
#    kernel_size = getattr(layer, 'kernel_size', '')  # Only applies to Conv layers
#    kernel_str = f"{kernel_size[0]}x{kernel_size[1]}" if kernel_size else "N/A"

#    print(f"{i:03d} | Name: {layer.name:<25} | Type: {layer.__class__.__name__:<20} | Kernel: {kernel_str:<5} | Output Shape: {output_shape}")


# Index of the layer to inspect. It defaults to the last layer (514 for ResNet152, which
# has 515 layers) so the cell keeps working when another base model is selected above.
# Candidate freeze points noted by the author: 44 = end of block 4, 80 = end of block 8,
# 115 = end of block 12, 153 = freeze everything (block 16 and the last layers too).
# NOTE(review): those indices presumably refer to one of the other base models — confirm.
layer_index_to_inspect = len(base_model.layers) - 1
print(base_model.layers[layer_index_to_inspect])

# Cuando ejecute algo aquí, revisar nvidia-smi para ver si la GPU se ha quedado colgada
# ps -o user= -p 203699
# kill -9 203699



515
<Activation name=conv5_block3_out, built=True>


In [15]:
# Pre-trained feature extractor (ImageNet weights, without its classification top)
base_model = tf.keras.applications.DenseNet201(
    input_shape=(image_side, image_side, 3),
    include_top=False,
    weights='imagenet',
)
#base_model = tf.keras.applications.ResNet152(input_shape=(image_side,image_side,3), include_top=False, weights='imagenet')

# Freeze the first num_layers_to_freeze layers of the base model (0 leaves every layer trainable)
num_layers_to_freeze = 0
for layer in base_model.layers[:num_layers_to_freeze]:
    layer.trainable = False

# Layers stacked on top of the feature extractor
classifier_head = [
    tf.keras.layers.Flatten(),  # feature map -> vector
    tf.keras.layers.Dense(  # fully connected layer with L2 regularization
        model_params["dense"],
        activation='relu',
        kernel_regularizer=regularizers.l2(model_params["l2reg"]),
    ),
    tf.keras.layers.Dropout(model_params["dropout2"]),  # dropout against overfitting
    tf.keras.layers.Dense(len(class_names), activation='softmax'),  # one output unit per class
]

# Full model: augmentation -> feature extractor -> classifier head
model = tf.keras.Sequential([data_augmentation, base_model, *classifier_head])

model.build((None, image_side, image_side, 3))
model.summary()


In [None]:
# Compile the model, start a Neptune run to track the training, define the callbacks
# (best-model checkpoint, early stopping, per-epoch Neptune logging), train with model.fit(...)
# and run the post-training logging, also after an interruption or an out-of-memory error.

learning_rate = 0.001 # step size of the SGD optimizer
epochs = 50 # maximum number of passes over the training set (early stopping may end sooner)

# Start a new Neptune run to track the training session.
run = None # stays None when Neptune tracking is disabled
if using_neptune:
    run = neptune.init_run(project="LSE-Sign-Language2/LSE-Sign-Language", api_token=neptune_api_token, capture_hardware_metrics=True, capture_stdout=True, capture_stderr=False)
    params = {
        "learning_rate": learning_rate,
        "optimizer": "SGD",
        "base_model": base_model.name,
        "image_side": image_side,
        "epochs": epochs,
        "batch_size": bs
        }

    run["sys/name"] = f"{base_model.name}_freeze_{num_layers_to_freeze}_on_{os.path.basename(DATASET_PATH)}"
    run["parameters"] = params
    run["status"] = "running"


# categorical_crossentropy: multi-class classification with one-hot labels; SGD: Stochastic Gradient Descent optimizer.
model.compile(loss="categorical_crossentropy", optimizer= SGD(learning_rate=learning_rate), metrics=['accuracy'])

# Keeps on disk the model with the highest validation accuracy seen so far
checkpointer = ModelCheckpoint(filepath='model_best.keras', verbose=1, save_best_only=True, monitor = 'val_accuracy', mode = 'max')

#reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.000001) # this reduce learning rate when val_loss is not improving

# Stops training early if the validation loss doesn't improve for 5 epochs
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

def epochCallback(epoch, logs):
    """Log the training and validation loss/accuracy of the finished epoch to Neptune.

    Parameters:
        epoch: index of the finished epoch (unused).
        logs: Keras metrics dict; must contain "loss", "accuracy", "val_loss" and "val_accuracy".
    """
    if using_neptune:
        # NOTE(review): newer Neptune clients presumably prefer .append() over .log() — confirm.
        run["train/loss"].log(logs["loss"])
        run["train/accuracy"].log(logs["accuracy"])
        run["eval/loss"].log(logs["val_loss"])
        run["eval/accuracy"].log(logs["val_accuracy"])

keyinterrupt = False
history = None

# Actual training. Callbacks: save the best model, log every epoch to Neptune, stop early.
try:
    history = model.fit(train_ds, validation_data = validation_ds, epochs=epochs,
                                callbacks = [
    #                                reduce_lr,
                                    checkpointer,
                                    tf.keras.callbacks.LambdaCallback(on_epoch_end=epochCallback),
                                    early_stop
                                ])
    if using_neptune:
        run["status"] = "finished"
except tf.errors.ResourceExhaustedError:
    if using_neptune:
        run["status"] = "crashed-ResourceExhausted"
except KeyboardInterrupt:
    # Remember the interruption so it can be re-raised after the post-training logging.
    keyinterrupt=True
    if using_neptune:
        run["status"] = "Interrupted keyboard"
except Exception:
    # Any other failure: do not leave the run open with status "running".
    if using_neptune:
        run["status"] = "crashed"
        run.stop()
    raise

# The post-training logging needs a Neptune run; without one `run` does not exist.
if using_neptune:
    logModelTrainFinished(run, model, validation_ds, class_names)

if keyinterrupt:
    raise KeyboardInterrupt

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/LSE-Sign-Language2/LSE-Sign-Language/e/LSES-21
Epoch 1/50


2025-03-31 22:20:33.934503: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.6439 - loss: 3.1318
Epoch 1: val_accuracy improved from -inf to 0.69206, saving model to model_best.keras
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1371s[0m 1s/step - accuracy: 0.6441 - loss: 3.1310 - val_accuracy: 0.6921 - val_loss: 2.9519
Epoch 2/50
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.9791 - loss: 1.8796
Epoch 2: val_accuracy improved from 0.69206 to 0.73144, saving model to model_best.keras
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1290s[0m 1s/step - accuracy: 0.9791 - loss: 1.8796 - val_accuracy: 0.7314 - val_loss: 2.8885
Epoch 3/50
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.9889 - loss: 1.7478
Epoch 3: val_accuracy improved from 0.73144 to 0.74767, saving model to model_best.keras
[1m906/906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1283s[0m 1s/step - accur

In [None]:
# Manually stop the Neptune run. logModelTrainFinished already calls run.stop() as its
# last step, so this is presumably a safety net for when that function did not finish.
run.stop()