### **0. Prepare environment**

In [1]:
# Import libraries
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.models import load_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalMaxPooling2D

2025-07-10 12:36:26.160722: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### **1. Load data**

#### **1.1. Load images**

In [2]:
# Data Generator from Keras Sequence

from tensorflow.keras.utils import Sequence, to_categorical

class DataGenerator(Sequence):
    """Keras ``Sequence`` that serves batches of preprocessed ``.npy`` images with labels.

    Each item is an ``(X, y)`` pair. ``X`` is a float32 array of shape
    ``(n, img_size[0], img_size[1], 3)``; ``y`` holds the integer labels and is
    one-hot encoded only when more than two distinct labels were supplied.
    """

    def __init__(self, image_paths, labels, batch_size, img_size, shuffle=True, seed=42):
        """
        Initialization

        Parameters:
        -----------
        image_paths : list
            List of image paths (each one is loaded with ``np.load``).
        labels : array-like
            Array of integer labels (e.g., 0, 1, 2, ...), aligned with ``image_paths``.
        batch_size : int
            Size of the batch.
        img_size : tuple
            Image size (height, width).
        shuffle : bool, optional
            Shuffle after each epoch.
        seed : int, optional
            Random seed for the generator's own shuffling stream.

        Raises:
        -------
        ValueError
            If ``image_paths`` and ``labels`` do not have the same length.
        """
        # Harmless on older tf.keras and expected by newer Keras dataset base classes.
        super().__init__()

        if len(image_paths) != len(labels):
            raise ValueError(
                f'image_paths and labels must have the same length '
                f'({len(image_paths)} != {len(labels)})'
            )

        self.image_paths = image_paths
        self.labels = np.array(labels)
        self.batch_size = batch_size
        self.img_size = img_size

        # Binary problems use a single output unit, so they are treated as "1 class"
        # and their labels are never one-hot encoded.
        self.n_classes = len(np.unique(self.labels))
        if self.n_classes == 2:
            self.n_classes = 1

        self.shuffle = shuffle
        self.seed = seed
        # Private RNG: shuffling must not reseed the global NumPy generator every
        # epoch. The first permutation is the same one np.random.seed(seed) followed
        # by np.random.shuffle would produce.
        self._rng = np.random.RandomState(seed)
        self.indexes = np.arange(len(self.image_paths))
        self.on_epoch_end()

    def __len__(self):
        # Number of batches per epoch (the last batch may be smaller)
        return int(np.ceil(len(self.image_paths) / self.batch_size))

    def __getitem__(self, index):
        # Positions (into image_paths / labels) that make up batch number `index`
        start = index * self.batch_size
        end = (index + 1) * self.batch_size
        batch_indexes = self.indexes[start:end]

        # Batch creation
        batch_image_paths = [self.image_paths[i] for i in batch_indexes]
        batch_labels = [self.labels[i] for i in batch_indexes]

        X, y = self.__data_generation(batch_image_paths, batch_labels)

        return X, y

    def on_epoch_end(self):
        # Shuffle indexes in place after each epoch using the generator's own RNG
        if self.shuffle:
            self._rng.shuffle(self.indexes)

    def __data_generation(self, batch_image_paths, batch_labels):
        # Initialize arrays to store batch data
        X = np.empty((len(batch_image_paths), self.img_size[0], self.img_size[1], 3), dtype=np.float32)
        y = np.empty((len(batch_image_paths),), dtype=int)

        # Iterate over each .npy image in the batch
        for i, path in enumerate(batch_image_paths):
            # Load preprocessed image from .npy file; the assignment fails if the
            # stored array cannot be broadcast to (img_size[0], img_size[1], 3)
            X[i] = np.load(path)
            y[i] = batch_labels[i]

        # Convert labels to one-hot encoding if there are more than two classes
        if self.n_classes > 2:
            y = to_categorical(y, num_classes=self.n_classes)

        return X, y

In [None]:
# Subset names: one folder/CSV base name per middle token, for each split
subset_train = [f'TC_Canon_{token}_HURJC_train' for token in ('RAD', 'CZC', 'MIN')]
subset_test = [f'TC_Canon_{token}_HURJC_test' for token in ('RAD', 'CZC', 'MIN')]

# Names used for the pooled train / test collections
subsets_train_name = 'TC_Canon_CZC_RAD_MIN_HURJC_train'
subsets_test_name = 'TC_Canon_CZC_RAD_MIN_HURJC_test'

# Base path
subsets_path = 'big_volume/subsets_v0/'

In [4]:
# Collect image paths and labels separately for the train and test splits
def _collect_paths_and_labels(subset_names, base_path):
    """Return (image paths, labels) for every image file found in the given subsets.

    For each subset name, images are read from the folder ``base_path/<name>`` and
    labels from the ``label_CalTend`` column of ``base_path/<name>.csv``, matched on
    the file name without extension against the ``rx_cod`` column.

    Raises
    ------
    KeyError
        If an image file has no matching ``rx_cod`` row in its subset CSV.
    """
    paths, labels = [], []
    for subset in subset_names:
        subset_dir = os.path.join(base_path, subset)
        metadata = pd.read_csv(os.path.join(base_path, subset + '.csv'))
        label_by_code = dict(zip(metadata['rx_cod'], metadata['label_CalTend']))

        # os.listdir returns entries in arbitrary order; sort so the sample order
        # (and therefore batch composition) is reproducible between runs.
        for img_name in sorted(os.listdir(subset_dir)):
            img_path = os.path.join(subset_dir, img_name)
            # Skip anything that is not a regular file (e.g. nested folders)
            if not os.path.isfile(img_path):
                continue
            code = os.path.splitext(img_name)[0]
            if code not in label_by_code:
                raise KeyError(f"No 'label_CalTend' entry for image '{img_path}' in '{subset}.csv'")
            paths.append(img_path)
            labels.append(label_by_code[code])
    return paths, labels


images_train_path, labels_train = _collect_paths_and_labels(subset_train, subsets_path)
images_test_path, labels_test = _collect_paths_and_labels(subset_test, subsets_path)


In [5]:
# Generator parameters
batch_size = 32
img_size = (512, 512)  # <-- size used for the two spatial dimensions of each batch array

# Build the generators: shuffled order for train, fixed order for test
train_generator = DataGenerator(
    image_paths=images_train_path,
    labels=labels_train,
    batch_size=batch_size,
    img_size=img_size,
    shuffle=True,
)
test_generator = DataGenerator(
    image_paths=images_test_path,
    labels=labels_test,
    batch_size=batch_size,
    img_size=img_size,
    shuffle=False,
)



In [6]:
# Sanity check: pull the first training batch and report shapes and a few labels
X_batch, y_batch = train_generator[0]
for tag, batch_array in (('X_batch', X_batch), ('y_batch', y_batch)):
    print(f'{tag} shape: {batch_array.shape}')
print(f'Example labels: {y_batch[:5]}')


X_batch shape: (32, 512, 512, 3)
y_batch shape: (32,)
Example labels: [1 0 0 1 1]


In [None]:
import os
import numpy as np
import pandas as pd

# Subsets
subset_train = ['TC_Canon_RAD_HURJC_train', 'TC_Canon_CZC_HURJC_train', 'TC_Canon_MIN_HURJC_train']
subsets_train_name = 'TC_Canon_CZC_RAD_MIN_HURJC_train'
subset_test  = ['TC_Canon_RAD_HURJC_test', 'TC_Canon_CZC_HURJC_test', 'TC_Canon_MIN_HURJC_test']
subsets_test_name = 'TC_Canon_CZC_RAD_MIN_HURJC_test'

# Base path
subsets_path = 'big_volume/subsets_v0/'

# Accumulators for every image path and its label
images_path = []
labels = []

# Process all subsets (train + test)
subsets = subset_train + subset_test

for s in subsets:
    # Image folder; os.listdir order is arbitrary, so sort for a reproducible order
    subset_path = os.path.join(subsets_path, s)
    images_subset = [
        os.path.join(subset_path, image_name)
        for image_name in sorted(os.listdir(subset_path))
    ]

    # Keep regular files only (skips nested folders that listdir may return)
    images_subset = [img_path for img_path in images_subset if os.path.isfile(img_path)]
    images_path.extend(images_subset)

    # Metadata
    df = pd.read_csv(os.path.join(subsets_path, s + '.csv'))

    # Map rx_cod -> label and fail with a readable message when an image has no row
    label_dict = dict(zip(df['rx_cod'], df['label_CalTend']))
    codes_subset = [os.path.splitext(os.path.basename(img_path))[0] for img_path in images_subset]
    missing_codes = [code for code in codes_subset if code not in label_dict]
    if missing_codes:
        raise KeyError(f"{len(missing_codes)} image(s) in '{s}' have no row in '{s}.csv', e.g. '{missing_codes[0]}'")

    labels_subset = [label_dict[code] for code in codes_subset]
    labels.extend(labels_subset)

# Convert to arrays
images_path = np.array(images_path)
y = np.array(labels)

# Summary
print('Images:', len(images_path))
print('Labels:', y.shape)


Images: 4268
Labels: (4268,)


#### **1.2. Load metadata**

In [8]:

def _read_subset_metadata(subset_names):
    """Read ``<subsets_path>/<name>.csv`` for each name and stack them into one frame."""
    frames = [pd.read_csv(os.path.join(subsets_path, name + '.csv')) for name in subset_names]
    return pd.concat(frames, ignore_index=True)


# Load and concatenate the train CSVs and the test CSVs
df_train = _read_subset_metadata(subset_train)
df_test = _read_subset_metadata(subset_test)

# Optional: quick summary (shape and class balance) of each split
for title, split_frame in (("Train set:", df_train), ("Test set:", df_test)):
    print(title, split_frame.shape)
    print(split_frame['label_CalTend'].value_counts())

Train set: (3768, 20)
label_CalTend
0    1884
1    1884
Name: count, dtype: int64
Test set: (500, 20)
label_CalTend
1    250
0    250
Name: count, dtype: int64


In [9]:
# Extract labels and metadata aligned with a list of image paths
def extract_metadata (df, images_path):
    """Look up label and metadata for each image path, in the order of ``images_path``.

    The lookup key is the file name without directory and without extension, matched
    against the ``rx_cod`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``rx_cod``, ``label_CalTend``, ``sex``,
        ``birthdate`` and ``date_rx``.
    images_path : iterable of str
        Image file paths.

    Returns
    -------
    tuple of numpy.ndarray
        ``(rx_cod, y, sex, birthdate, date_rx)``, one entry per image path.

    Raises
    ------
    KeyError
        If an image's code is not present in ``df['rx_cod']``.
    """
    # Derive each key once; splitext handles any extension length, unlike a fixed [:-4]
    codes = [os.path.splitext(os.path.basename(path))[0] for path in images_path]

    def _lookup(column):
        # rx_cod -> value mapping for one column (last row wins on duplicate rx_cod)
        mapping = dict(zip(df['rx_cod'], df[column]))
        return np.array([mapping[code] for code in codes])

    rx_cod    = np.array(codes)
    y         = _lookup('label_CalTend')
    sex       = _lookup('sex')
    birthdate = _lookup('birthdate')
    date_rx   = _lookup('date_rx')

    return rx_cod, y, sex, birthdate, date_rx

In [10]:
# Per-image metadata arrays for each split, aligned with the image path lists
train_metadata = extract_metadata(df_train, images_train_path)
test_metadata = extract_metadata(df_test, images_test_path)

rx_cod_train, y_train, sex_train, birthdate_train, date_rx_train = train_metadata
rx_cod_test, y_test, sex_test, birthdate_test, date_rx_test = test_metadata

print('y_shape_train: {}'.format(y_train.shape))
print('y_shape_test:  {}'.format(y_test.shape))

y_shape_train: (3768,)
y_shape_test:  (500,)


### **2. Load model**

In [None]:
# EXAMPLE: MODEL WITH A "NESTED BASE_MODEL"
# Path to the saved checkpoint (adjacent literals are concatenated into one string)
model_path = (
    'big_volume/Models_v0/EXP1/TC_Canon_CZC_RAD_MIN_HURJC_train/'
    'model_conv_VGG19_freeze_False_top_m_GMP_neurons_last_layer_1_optimizer_SGD'
    '_loss_function_binary_crossentropy_activation_s_initializer_seed_42_lr_0.005'
    '_epochs_50_batch_size_32_patience_15_subset_TC_Canon_CZC_RAD_MIN_HURJC_train/'
    'FOLD_4/best_model_epoch_22_val_loss_0.5731_val_acc_0.8486.h5'
)

# Load the model and show its architecture
model = load_model(model_path)
model.summary()

2025-07-10 12:36:32.180396: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-07-10 12:36:32.241127: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-07-10 12:36:32.244082: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vgg19 (Functional)          (None, 16, 16, 512)       20024384  
                                                                 
 global_max_pooling2d (Glob  (None, 512)               0         
 alMaxPooling2D)                                                 
                                                                 
 dense (Dense)               (None, 1)                 513       
                                                                 
Total params: 20024897 (76.39 MB)
Trainable params: 20024897 (76.39 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### **3. Extract features of last conv + GMP**

In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalMaxPooling2D
from datetime import datetime


# Index the metadata by rx_cod for fast lookups. Guarded so that re-running this
# cell does not raise KeyError once 'rx_cod' has already been moved into the index.
if 'rx_cod' in df_train.columns:
    df_train = df_train.set_index('rx_cod')
if 'rx_cod' in df_test.columns:
    df_test = df_test.set_index('rx_cod')

# Build metadata dictionaries keyed by rx_cod
# (to_dict(orient='index') raises ValueError if rx_cod values are not unique)
metadata_train = df_train[['sex', 'birthdate', 'date_rx', 'station_name']].to_dict(orient='index')
metadata_test  = df_test[['sex', 'birthdate', 'date_rx', 'station_name']].to_dict(orient='index')

# Build the feature extractor from the nested VGG19 sub-model
conv_model = model.get_layer('vgg19')  # raises ValueError if the loaded model has no such layer
# NOTE(review): Keras' VGG19 appears to define 'block5_conv4' after this layer, so
# 'block5_conv3' is probably not the last convolution despite the section title.
# Confirm which layer is intended before relying on the exported features.
last_conv_layer = conv_model.get_layer('block5_conv3')
feature_extractor = Model(inputs=conv_model.input, outputs=last_conv_layer.output)

# Batch-wise feature extraction
def extract_features(generator, metadata_dict, extractor=None):
    """Run the feature extractor over every batch and collect pooled features and metadata.

    Parameters
    ----------
    generator : sequence-like
        Supports ``len()`` and integer indexing returning ``(X_batch, y_batch)``, and
        exposes ``indexes`` (NumPy array), ``batch_size`` and ``image_paths``.
    metadata_dict : dict
        Maps an image code (file name without extension) to a dict with the keys
        ``sex``, ``birthdate``, ``date_rx`` and ``station_name``. Codes that are
        missing get ``None`` for every field.
    extractor : object with ``predict(X, verbose=0)``, optional
        Model producing 4-D feature maps whose axes 1 and 2 are reduced by a max.
        Defaults to the module-level ``feature_extractor``.

    Returns
    -------
    tuple
        ``(features, labels, rx_cod_list, sex_list, birthdate_list, date_rx_list,
        station_rx_list)`` where ``features`` has one row per image and ``labels``
        is the flattened concatenation of the batch labels.

    Raises
    ------
    ValueError
        If the generator yields no batches.
    """
    if extractor is None:
        extractor = feature_extractor

    n_batches = len(generator)
    if n_batches == 0:
        raise ValueError('generator yields no batches; nothing to extract')

    feature_list = []
    label_list = []
    rx_cod_list = []
    sex_list = []
    birthdate_list = []
    date_rx_list = []
    station_rx_list = []

    missing_info = {'sex': None, 'birthdate': None, 'date_rx': None, 'station_name': None}

    for i in range(n_batches):
        X_batch, y_batch = generator[i]

        # Feature maps for the batch; verbose=0 avoids one progress bar per batch
        feature_maps = np.asarray(extractor.predict(X_batch, verbose=0))
        # Global max pooling: max over axes 1 and 2, done in NumPy so no new Keras
        # layer has to be created on every iteration
        pooled_features = feature_maps.max(axis=(1, 2))

        feature_list.append(pooled_features)
        label_list.append(y_batch)

        # Recover the image codes of this batch from the generator's index order
        batch_indexes = generator.indexes[i * generator.batch_size : (i + 1) * generator.batch_size].tolist()
        batch_rx_cod = [os.path.splitext(os.path.basename(generator.image_paths[j]))[0] for j in batch_indexes]
        rx_cod_list.extend(batch_rx_cod)

        # Append metadata, falling back to None fields for unknown codes
        for cod in batch_rx_cod:
            info = metadata_dict.get(cod, missing_info)
            sex_list.append(info['sex'])
            birthdate_list.append(info['birthdate'])
            date_rx_list.append(info['date_rx'])
            station_rx_list.append(info['station_name'])

    features = np.vstack(feature_list)
    labels = np.concatenate(label_list).flatten()
    return features, labels, rx_cod_list, sex_list, birthdate_list, date_rx_list, station_rx_list

# Run the extraction on the train and test generators
(features_train, y_train, rx_cod_train, sex_train,
 birthdate_train, date_rx_train, station_rx_train) = extract_features(train_generator, metadata_train)
(features_test, y_test, rx_cod_test, sex_test,
 birthdate_test, date_rx_test, station_rx_test) = extract_features(test_generator, metadata_test)

# One column per pooled feature, numbered from 1; rows are indexed by rx_cod
column_names = ['feature_%d' % k for k in range(1, features_train.shape[1] + 1)]
df_features_train = pd.DataFrame(features_train, columns=column_names, index=rx_cod_train)
df_features_test = pd.DataFrame(features_test, columns=column_names, index=rx_cod_test)

# Append label and metadata columns in the same order for both frames
for feature_frame, extra_columns in (
    (df_features_train, {
        'label': y_train,
        'sex': sex_train,
        'birthdate': birthdate_train,
        'date_rx': date_rx_train,
        'station_name': station_rx_train,
    }),
    (df_features_test, {
        'label': y_test,
        'sex': sex_test,
        'birthdate': birthdate_test,
        'date_rx': date_rx_test,
        'station_name': station_rx_test,
    }),
):
    for column, values in extra_columns.items():
        feature_frame[column] = values


# Compute age
def calcular_edad(birthdate_str, date_rx_str):
    """Return the age in whole years on ``date_rx_str`` for someone born on ``birthdate_str``.

    Both arguments are expected as ``'%Y-%m-%d'`` strings. The age is computed from
    the calendar (year difference, minus one if the birthday has not yet occurred in
    the exam year) rather than from ``days // 365``, which over-counts by one just
    before a birthday because it ignores leap days.

    Returns
    -------
    int or float
        The age in years, or ``np.nan`` when either value is not a string or does
        not match the expected date format.
    """
    try:
        birth = datetime.strptime(birthdate_str, '%Y-%m-%d')
        rx    = datetime.strptime(date_rx_str, '%Y-%m-%d')
    except (TypeError, ValueError):
        # TypeError: non-string input (None, NaN); ValueError: malformed date text
        return np.nan

    birthday_passed = (rx.month, rx.day) >= (birth.month, birth.day)
    return rx.year - birth.year - (0 if birthday_passed else 1)

# Age at exam time for every row, derived from the birthdate and date_rx columns
df_features_train['age'] = [
    calcular_edad(bd, drx) for bd, drx in zip(df_features_train['birthdate'], df_features_train['date_rx'])
]
df_features_test['age'] = [
    calcular_edad(bd, drx) for bd, drx in zip(df_features_test['birthdate'], df_features_test['date_rx'])
]

# Save to CSV. Create the output folder first so the write does not fail on a
# missing directory after the feature extraction has already been paid for.
os.makedirs('Machine_Learning', exist_ok=True)
df_features_train.to_csv('Machine_Learning/TC_Canon_CZC_RAD_MIN_HURJC_train_features.csv')
df_features_test.to_csv('Machine_Learning/TC_Canon_CZC_RAD_MIN_HURJC_test_features.csv')


2025-07-10 12:36:34.296751: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600


