 # Librerías

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import pickle
import os

from collections import defaultdict
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import tensorflow
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

2024-12-15 11:24:01.197502: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-15 11:24:01.218914: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-15 11:24:01.322807: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-15 11:24:01.322848: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-15 11:24:01.341182: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

# Funciones

In [3]:
def remove_correlated_features(X, threshold):
    """Drop constant columns and redundant members of highly correlated groups.

    Columns are scanned from left to right. A column is dropped when its
    absolute correlation with any column that has already been kept is above
    ``threshold``; otherwise it is kept. The first column of every correlated
    group therefore survives, and no two surviving columns that took part in
    the correlation matrix are correlated above ``threshold``.

    The previous implementation kept the current column *and* the first
    column correlated with it, so a simple pair of perfectly correlated
    features was never reduced.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. The correlation is computed with ``DataFrame.corr()``,
        so the columns are expected to be numeric.
    threshold : float
        Absolute correlation above which a column is considered redundant.

    Returns
    -------
    pandas.DataFrame
        New frame without the constant and the redundant columns. ``X``
        itself is not modified.
    """
    # Columns with a single distinct value carry no information and have an
    # undefined correlation, so they are removed first.
    X = X.loc[:, X.nunique() > 1]

    corr_matrix = X.corr().abs()
    corr_values = corr_matrix.values

    kept_positions = []
    to_drop = []
    for position, column in enumerate(corr_matrix.columns):
        # NaN correlations compare as False, so they never trigger a drop.
        if kept_positions and (corr_values[kept_positions, position] > threshold).any():
            to_drop.append(column)
        else:
            kept_positions.append(position)

    return X.drop(columns=to_drop)

In [4]:
def get_frequent_features(features_array):
    """Return the features selected most often across several runs.

    Each element of ``features_array`` is the collection of feature
    identifiers selected in one run. A feature is counted at most once per
    run. The number of features returned is the mean run length rounded up.

    Ties between equally frequent features are broken by ascending feature
    identifier (stable sort), so the result does not depend on the sorting
    algorithm NumPy happens to pick.

    Parameters
    ----------
    features_array : sequence of sequences
        One collection of feature identifiers per run.

    Returns
    -------
    list
        The most frequent feature identifiers, most frequent first.

    Raises
    ------
    ValueError
        If ``features_array`` is empty.
    """
    if len(features_array) == 0:
        raise ValueError('features_array must contain at least one run')

    # Deduplicate inside every run so a feature counts once per run.
    per_run = [np.unique(row) for row in features_array]
    unique_feats, counts = np.unique(np.concatenate(per_run), return_counts=True)

    mean_feats = sum(len(row) for row in features_array) / len(features_array)
    print('Mean features:', mean_feats)
    n_selected = int(np.ceil(mean_feats))

    # Descending frequency; kind='stable' keeps the ascending order of
    # unique_feats among ties, which makes the selection reproducible.
    order = np.argsort(-counts, kind='stable')
    return unique_feats[order][:n_selected].tolist()

In [5]:
def dnn_model(num_entradas, num_clases):
    """Build and compile a fully connected classifier.

    The network has an input of ``num_entradas`` features followed by three
    hidden ReLU layers of 200 units. With exactly two classes the output is a
    single sigmoid unit trained with binary cross-entropy; otherwise it is a
    softmax over ``num_clases`` units trained with sparse categorical
    cross-entropy. The model is compiled with the RMSprop optimizer and the
    accuracy metric.

    Returns the compiled ``Sequential`` model.
    """
    # Pick the output head and the matching loss up front.
    if num_clases == 2:
        out_units, out_activation, loss = 1, 'sigmoid', 'binary_crossentropy'
    else:
        out_units, out_activation, loss = (
            num_clases, 'softmax', 'sparse_categorical_crossentropy'
        )

    network = Sequential()
    network.add(InputLayer(input_shape=(num_entradas,)))

    # Three identical hidden layers.
    for _ in range(3):
        network.add(Dense(200, activation='relu'))

    network.add(Dense(out_units, activation=out_activation))
    network.compile(optimizer='RMSprop', loss=loss, metrics=['accuracy'])
    return network

In [6]:
def cnn_model_1d(n_features, num_clases):
    """Build and compile a 1D convolutional classifier.

    The network takes inputs of shape ``(n_features, 1)`` and stacks three
    Conv1D layers (32, 64 and 128 filters, kernel size 3, ReLU) with no
    pooling, a Flatten layer and two dense ReLU layers of 200 units. With
    exactly two classes the output is a single sigmoid unit trained with
    binary cross-entropy; otherwise it is a softmax over ``num_clases`` units
    trained with sparse categorical cross-entropy. The model is compiled with
    the RMSprop optimizer and the accuracy metric.

    Returns the compiled ``Sequential`` model.
    """
    # Pick the output head and the matching loss up front.
    if num_clases == 2:
        out_units, out_activation, loss = 1, 'sigmoid', 'binary_crossentropy'
    else:
        out_units, out_activation, loss = (
            num_clases, 'softmax', 'sparse_categorical_crossentropy'
        )

    network = Sequential()

    # Convolutional feature extractor; the first layer also declares the input shape.
    network.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(n_features, 1)))
    for n_filters in (64, 128):
        network.add(Conv1D(n_filters, kernel_size=3, activation='relu'))

    # Flatten the convolutional output so it can feed the dense layers.
    network.add(Flatten())
    for _ in range(2):
        network.add(Dense(200, activation='relu'))

    network.add(Dense(out_units, activation=out_activation))
    network.compile(optimizer='RMSprop', loss=loss, metrics=['accuracy'])
    return network

In [7]:
def show_metrics(y_true, y_pred):
    """Print classification scores and plot the confusion matrix.

    Prints accuracy plus macro-averaged precision, recall and F1 for the given
    true and predicted labels, then displays the confusion matrix with a blue
    colour map.
    """
    print('Accuracy:  ', accuracy_score(y_true, y_pred))

    # The remaining scores share the same call signature and averaging mode.
    macro_scores = (
        ('Precision: ', precision_score),
        ('Recall:    ', recall_score),
        ('F1:        ', f1_score),
    )
    for label, scorer in macro_scores:
        print(label, scorer(y_true, y_pred, average='macro'))

    matrix = confusion_matrix(y_true, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=None).plot(cmap=plt.cm.Blues)
    plt.show()

# Cargar datos

In [8]:
# Load the dataset from a CSV file located in the working directory
# (presumably the already-cleaned X-IIoTID data, going by the file name).
data = pd.read_csv('X-IIoTID_clean.csv')

In [10]:
# Sanity check: number of missing values in the 'class1' label column.
data['class1'].isna().sum()

0

# Experimento

In [None]:
# Experiment driver.
#
# For every (experiment, label column) pair this cell selects the features,
# trains the matching classifier, exports it, times inference on the held-out
# split, stores the scaled test set together with the true and predicted
# labels, and prints the metrics.
#
# Experiment names follow the pattern '<family>.<variant>__results.pkl':
#   family  E1 -> decision tree, E2 -> dense network, E3 -> 1D CNN
#   variant 1  -> all features
#           2  -> all features minus highly correlated ones
#           3/4 -> most frequent features stored in the experiment results
SEED = 42              # shared by the train/test split and the decision tree
TEST_SIZE = 0.2
CORR_THRESHOLD = 0.99
BATCH_SIZE = 250
EPOCHS = 10
# NOTE(review): the Keras models are not seeded, so E2/E3 runs are not
# bit-for-bit reproducible.

scaler = MinMaxScaler()

exp_names = ['E1.1__results.pkl','E1.2__results.pkl','E1.3__results.pkl','E1.4__results.pkl',
        'E2.1__results.pkl','E2.2__results.pkl','E2.3__results.pkl','E2.4__results.pkl',
        'E3.1__results.pkl','E3.2__results.pkl','E3.3__results.pkl','E3.4__results.pkl']

keys = ['class1', 'class2', 'class3']
exclude_cols = ['class1', 'class2', 'class3', 'Date', 'Timestamp', 'Scr_IP', 'Des_IP']

X_orig = data.drop(columns=exclude_cols)

# Iterate over experiments
for exp in exp_names:
    print('Experiment: ', exp)

    # 'E2.3__results.pkl' -> family 'E2', variant '3'
    family, variant = exp.split('__')[0].split('.')
    if family not in ('E1', 'E2', 'E3') or variant not in ('1', '2', '3', '4'):
        # Fail loudly instead of silently reusing state from a previous iteration.
        raise ValueError(f'Unrecognised experiment name: {exp}')
    is_neural = family in ('E2', 'E3')

    # Load the stored results of the experiment.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('resultados_exp/' + exp, 'rb') as file:
        exp_results = pickle.load(file)

    # Iterate over label columns
    for key in keys:
        print('Classes: ', key)

        ### Feature selection ###
        if variant in ('3', '4'):
            # Most frequent features of this experiment/label; they are used as
            # positional column indices into X_orig (assumes they were stored
            # that way - TODO confirm).
            freq_feat = get_frequent_features(exp_results[key]['features'])
            X_mod = X_orig.iloc[:, freq_feat]
        else:
            # Variants 1 and 2 start from every feature. Variant 2 previously
            # had no branch here and reused X_mod from the preceding iteration.
            X_mod = X_orig

        # Stratified train/test split; the label index is reset so it is contiguous
        x_tr, x_te, y_tr, y_te = train_test_split(
            X_mod, data[key].reset_index(drop=True),
            test_size=TEST_SIZE,
            random_state=SEED,
            stratify=data[key]
        )

        ### Variant 2: drop correlated columns, decided on the training split only ###
        if variant == '2':
            x_tr = remove_correlated_features(x_tr, CORR_THRESHOLD)
            x_te = x_te[x_tr.columns]

        ### Scaling: fitted on train, applied to test ###
        x_tr = scaler.fit_transform(x_tr)
        x_te = scaler.transform(x_te)

        ### Model selection ###
        n_classes = y_tr.nunique()
        if family == 'E1':
            classifier = DecisionTreeClassifier(random_state=SEED)
        elif family == 'E2':
            classifier = dnn_model(x_tr.shape[1], n_classes)
        else:
            classifier = cnn_model_1d(x_tr.shape[1], n_classes)

        ### Training ###
        print(f'Training | Exp. {exp} | Classes {key}')
        if is_neural:
            classifier.fit(x_tr, y_tr, batch_size=BATCH_SIZE, epochs=EPOCHS)
        else:
            classifier.fit(x_tr, y_tr)

        ### Export the trained model ###
        if is_neural:
            classifier.save(f'model_{exp}_{key}.h5')
        else:
            with open(f'model_{exp}_{key}.pkl', 'wb') as file:
                pickle.dump(classifier, file)

        ### Testing (timed, to compare against the Jetson) ###
        print(f'Testing | Exp. {exp} | Classes {key}')
        start_time = time.time()
        raw_pred = classifier.predict(x_te)
        if not is_neural:
            y_pred = raw_pred
        elif n_classes == 2:
            # The networks use a single sigmoid unit when there are two
            # classes, so the decoding must follow the class count rather
            # than the label column name.
            y_pred = (raw_pred > 0.5).astype(int).flatten()
        else:
            y_pred = np.argmax(raw_pred, axis=-1)
        test_time = time.time() - start_time
        print('Tiempo test: ', test_time)

        ### Save for the tests on the Jetson ###
        # Columns: scaled features, true label, predicted label.
        test_data = np.column_stack((x_te, y_te, y_pred))
        np.save(f'{exp}_{key}_test.npy', test_data)

        show_metrics(y_te, y_pred)

Experiment:  E1.1__results.pkl
Classes:  class1
Training | Exp. E1.1__results.pkl | Classes class1


KeyboardInterrupt: 