# TFG: Título del TFG

## Hugo López Álvarez

In [1]:
import math
from pathlib import Path

import matplotlib.pyplot as plt
import numpy
import pandas
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import wandb
from imblearn.combine import SMOTETomek
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss, fbeta_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset, DataLoader, Subset, TensorDataset

## Clases

Definición de la clase DatasetTFG que se usará para entrenar al modelo

In [2]:
class DatasetTFG(Dataset):
    """Map-style dataset that pairs a feature container with its labels.

    ``X`` and ``Y`` are stored exactly as received and are indexed with the
    same position, so sample ``i`` is ``(X[i], Y[i])``.
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        # The feature container determines how many samples there are.
        return len(self.X)

    def __getitem__(self, idx):
        muestra = self.X[idx]
        etiqueta = self.Y[idx]
        return muestra, etiqueta

Definición de la clase Modelo
- La capa1 transforma la dimensión de entrada a `ventanaOculta` neuronas
- La capa2 pasa de las neuronas de la capa1, a tantas salidas como clases

In [None]:
class ModeloMulticlase(nn.Module):
    """Single-hidden-layer classifier: Linear -> BatchNorm1d -> ReLU -> Linear.

    Args:
        input_dim: number of input features.
        ventanaOculta: width of the hidden layer.
        numClases: number of output units (one per class).

    ``forward`` returns the raw output of the last linear layer; no softmax is
    applied inside the model.
    """

    def __init__(self, input_dim, ventanaOculta, numClases):
        super().__init__()
        self.capa1 = nn.Linear(in_features=input_dim, out_features=ventanaOculta)
        self.bn1 = nn.BatchNorm1d(num_features=ventanaOculta, momentum=0.01)
        self.capa2 = nn.Linear(in_features=ventanaOculta, out_features=numClases)

    def forward(self, X):
        ocultas = self.capa1(X)
        normalizadas = self.bn1(ocultas)
        activadas = torch.relu(normalizadas)
        return self.capa2(activadas)

# Funciones

In [4]:

def split_ip_column(df, ip_column_name):
    """Replace a dotted-string IP column with four numeric part columns.

    Args:
        df: DataFrame holding the column to split. It is not modified; a new
            frame is returned.
        ip_column_name: name of the string column with values like
            ``"192.168.1.10"``.

    Returns:
        A new DataFrame without ``ip_column_name`` and with the columns
        ``<ip_column_name>_part1`` .. ``<ip_column_name>_part4`` appended.
        Parts missing from an address (fewer than four dot-separated fields)
        become NaN; fields beyond the fourth are ignored.

    Raises:
        ValueError: if a part cannot be parsed as a number
            (raised by ``pandas.to_numeric``).
    """
    # One positional column (0, 1, 2, ...) per dot-separated field.
    ip_parts = df[ip_column_name].str.split('.', expand=True)

    # Guarantee that positional columns 0..3 exist. Without this, a column in
    # which no row has four fields yields fewer than four columns and the
    # lookup below fails with KeyError.
    ip_parts = ip_parts.reindex(columns=range(4))

    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    df = df.drop(columns=[ip_column_name])

    # Append the parts under names derived from the original column name.
    for part in range(4):
        df[f"{ip_column_name}_part{part + 1}"] = pandas.to_numeric(ip_parts[part])

    return df

## Cargar datos

In [5]:
# Load the flow dataset; the path is relative to the notebook's working directory.
fileData = pandas.read_csv(filepath_or_buffer='../Datasets/modUQ.csv')

### Comprobación de la obtención correcta del csv

In [6]:
# Show the first five rows as a quick sanity check of the load.
fileData.head(n=5)

Unnamed: 0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,...,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label,Attack
0,1424242193040,1424242193043,59.166.0.2,4894,149.171.126.3,53,17,5.0,146,2,...,0,0,0,0,0,0,0,0,0,Benign
1,1424242192744,1424242193079,59.166.0.4,52671,149.171.126.6,31992,6,11.0,4704,28,...,0,91,12,19,0,90,12,19,0,Benign
2,1424242190649,1424242193109,59.166.0.0,47290,149.171.126.9,6881,6,37.0,13662,238,...,0,1843,10,119,0,1843,5,88,0,Benign
3,1424242193145,1424242193146,59.166.0.8,43310,149.171.126.7,53,17,5.0,146,2,...,0,0,0,0,0,0,0,0,0,Benign
4,1424242193239,1424242193241,59.166.0.1,45870,149.171.126.1,53,17,5.0,130,2,...,0,0,0,0,0,0,0,0,0,Benign


### Se filtran los ataques (`Label != 0`) y se codifica la columna `Attack` como entero para poder utilizarla con PyTorch

In [7]:
# Keep only the rows whose Label is not 0; copy() makes the result independent
# of fileData so the column assignment below does not touch the original frame.
ataquesData = fileData[fileData['Label'] != 0].copy()

# Encode the attack name as an integer class id. The fitted encoder is kept
# (instead of being discarded) so that ids can be mapped back to attack names
# later via codificador_ataques.classes_ / inverse_transform.
codificador_ataques = LabelEncoder()
ataquesData['Attack'] = codificador_ataques.fit_transform(ataquesData['Attack'])

### Se comprueba que los datos se han transformado correctamente

In [8]:
# Print the dtype of every column to confirm which ones are still non-numeric.
tipos_columnas = ataquesData.dtypes
print(tipos_columnas)

FLOW_START_MILLISECONDS          int64
FLOW_END_MILLISECONDS            int64
IPV4_SRC_ADDR                   object
L4_SRC_PORT                      int64
IPV4_DST_ADDR                   object
L4_DST_PORT                      int64
PROTOCOL                         int64
L7_PROTO                       float64
IN_BYTES                         int64
IN_PKTS                          int64
OUT_BYTES                        int64
OUT_PKTS                         int64
TCP_FLAGS                        int64
CLIENT_TCP_FLAGS                 int64
SERVER_TCP_FLAGS                 int64
FLOW_DURATION_MILLISECONDS       int64
DURATION_IN                      int64
DURATION_OUT                     int64
MIN_TTL                          int64
MAX_TTL                          int64
LONGEST_FLOW_PKT                 int64
SHORTEST_FLOW_PKT                int64
MIN_IP_PKT_LEN                   int64
MAX_IP_PKT_LEN                   int64
SRC_TO_DST_SECOND_BYTES        float64
DST_TO_SRC_SECOND_BYTES  

## Se eliminan las filas con valores infinitos o nulos

In [9]:
# Turn +/-inf into NaN first, so that a single dropna() discards every row
# that holds an infinite or missing value.
valores_infinitos = [numpy.inf, -numpy.inf]
ataquesData = ataquesData.replace(valores_infinitos, numpy.nan)
ataquesData = ataquesData.dropna()

### Se separan las características (X) de la etiqueta (Y)

In [10]:
# Columns left out of the feature matrix: both label columns, the flow
# timestamps and the two IP address columns.
columnas_descartadas = [
    'Label',
    'Attack',
    'FLOW_START_MILLISECONDS',
    'FLOW_END_MILLISECONDS',
    'IPV4_SRC_ADDR',
    'IPV4_DST_ADDR',
]

# Target: the encoded attack class. Features: every remaining column.
Y = ataquesData['Attack'].values
X = ataquesData.drop(columns=columnas_descartadas).values

### Se elimina fileData que contiene el csv con los datos para liberar memoria

In [11]:
# Remove the name bound to the full CSV DataFrame so its memory can be
# reclaimed; the later cells shown in this notebook no longer reference it.
del fileData

## Se separan los datos del entrenamiento de los datos de prueba
El entrenamiento tendrá el 80% de los datos

La prueba tendrá el 20% de los datos

In [12]:
# 80/20 hold-out split with a fixed seed; stratify=Y keeps the class
# proportions of Y in both partitions.
# NOTE: the training-label variable is spelled `Y_entrana`; later cells rely
# on that exact name.
particion = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)
X_entrena, X_prueba, Y_entrana, Y_prueba = particion

## Se normalizan los datos

In [13]:
# Fit the [0, 1] min-max scaler on the training features only, then apply it
# to those same features.
escalador = MinMaxScaler(feature_range=(0, 1))
escalador.fit(X_entrena)
X_entrena_normalizado = escalador.transform(X_entrena)

### Se convierten los datos a tensores de Pytorch

In [14]:
# Features as float32 for the linear layers.
X_entrena_tensor = torch.tensor(X_entrena_normalizado, dtype=torch.float32)
# Labels are class indices, so they are stored as int64 (torch.long), which is
# what nn.CrossEntropyLoss expects for index targets. Existing `.long()` calls
# on these labels keep working (they become no-ops).
Y_entrena_tensor = torch.tensor(Y_entrana, dtype=torch.long)

## Creación del Dataset personalizado

In [15]:
# Wrap the training tensors so a DataLoader can index (features, label) pairs.
dataset_entrena = DatasetTFG(X=X_entrena_tensor, Y=Y_entrena_tensor)

## Se configura la función de pérdida (el optimizador se crea dentro del bucle de entrenamiento)

In [16]:
# Number of training samples per encoded class id (position i -> class i).
frec_clases = numpy.bincount(Y_entrana)
print(frec_clases)

# Weight of a class = (samples of all other classes) / (samples of that class),
# so rarer classes receive larger weights.
total_muestras = len(Y_entrana)
weight_clases = (total_muestras - frec_clases) / frec_clases
weight_clases_tensor = torch.tensor(weight_clases, dtype=torch.float32)

# Class-weighted cross-entropy. Note that the training-loop cell rebinds
# `perdida` with its own weights before training.
perdida = nn.CrossEntropyLoss(weight=weight_clases_tensor)


[  981  2761  4040 31052 20470  3808  9033  1269   109]


## Definición de hiperparámetros

In [17]:
# Hyperparameter grid: the training loop tries every combination of these lists.
epochs = [30, 50, 80]                                  # training epochs per fold
hidden_factor = [25, 49, 98]                           # hidden-layer widths
learning_rate = [1e-2, 1e-3, 3e-4, 1e-4, 3e-5, 1e-5]   # AdamW learning rates
batch_size = [32, 64, 128, 256, 512]                   # DataLoader batch sizes

## Se define el objeto StratifiedKFold que se utilizará para la validación cruzada

In [18]:
# 5-fold stratified splitter; shuffling with a fixed seed makes the folds
# identical every time split() is called.
kf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

## Bucle de entrenamiento o épocas

In [None]:
# Grid search with stratified K-fold cross-validation, tracked in wandb.
#
# For every (batch size, learning rate, hidden size, epochs) combination:
#   1. train a fresh ModeloMulticlase on each fold and log that fold's
#      validation metrics as its own wandb run (project TFG_MUL_CV_FOLDS);
#   2. average the fold metrics and log them, together with the mean confusion
#      matrix, as a summary run (project TFG_MUL_CV_AVG).

SEMILLA_TORCH = 42  # base seed so weight init and batch shuffling are reproducible


def _entrenar_fold(modelo, train_loader, optimizador, perdida, epocas):
    """Run `epocas` passes over `train_loader`, updating `modelo` in place."""
    for _ in range(epocas):
        modelo.train()
        for batch_X, batch_Y in train_loader:
            optimizador.zero_grad()
            salidas = modelo(batch_X)
            # CrossEntropyLoss needs integer class indices as targets.
            loss = perdida(salidas, batch_Y.long())
            loss.backward()
            optimizador.step()


def _evaluar_fold(modelo, val_loader, perdida):
    """Evaluate `modelo` on `val_loader` without tracking gradients.

    Returns:
        (mean of the per-batch losses, targets, predictions, probabilities),
        where targets/predictions are lists of ints and probabilities is a
        list with one softmax row per sample.
    """
    modelo.eval()
    val_preds, val_probs, val_targets = [], [], []
    val_loss = 0.0

    with torch.no_grad():
        for batch_X_val, batch_Y_val in val_loader:
            etiquetas = batch_Y_val.long()
            salidas_val = modelo(batch_X_val)
            val_loss += perdida(salidas_val, etiquetas).item()

            probs = torch.softmax(salidas_val, dim=1)
            preds = torch.argmax(probs, dim=1)

            val_probs.extend(probs.cpu().numpy())
            val_preds.extend(preds.cpu().tolist())
            val_targets.extend(etiquetas.cpu().tolist())

    return val_loss / len(val_loader), val_targets, val_preds, val_probs


def _metricas_clasificacion(val_targets, val_preds, val_probs):
    """Compute the multiclass metrics of one fold and return them as a dict."""
    metricas = {'accuracy': accuracy_score(val_targets, val_preds)}
    for promedio in ('macro', 'weighted'):
        metricas[f'precision_{promedio}'] = precision_score(val_targets, val_preds, average=promedio)
        metricas[f'recall_{promedio}'] = recall_score(val_targets, val_preds, average=promedio)
        metricas[f'f1_{promedio}'] = f1_score(val_targets, val_preds, average=promedio)

    # Both ROC AUC values are computed before anything is stored, so a failure
    # in the second one cannot leave the two metrics out of step. Only the
    # ValueError that roc_auc_score raises for unusable inputs is handled; any
    # other error propagates. NaN marks "not available" and is skipped when
    # the fold metrics are averaged.
    try:
        roc_ovo = roc_auc_score(val_targets, val_probs, multi_class='ovo', average='macro')
        roc_ovr = roc_auc_score(val_targets, val_probs, multi_class='ovr', average='macro')
    except ValueError as error:
        print(f"ROC AUC no disponible en este fold: {error}")
        roc_ovo = roc_ovr = float('nan')
    metricas['roc_auc_ovo'] = roc_ovo
    metricas['roc_auc_ovr'] = roc_ovr
    return metricas


# ---- Loop-invariant setup (identical for every combination and fold) ----
num_classes = len(torch.unique(Y_entrena_tensor))
input_dim = X_entrena_tensor.shape[1]
dataset_cv = DatasetTFG(X_entrena_tensor, Y_entrena_tensor)

# 'balanced' class weights computed from all training labels.
etiquetas_cv = Y_entrena_tensor.numpy()
class_weights = compute_class_weight(
    'balanced',
    classes=numpy.unique(etiquetas_cv),
    y=etiquetas_cv
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float32)
perdida = nn.CrossEntropyLoss(weight=weights_tensor)

# kf has a fixed integer random_state, so the folds are the same on every
# call; they are materialised once and reused for all combinations.
particiones = list(kf.split(X_entrena_tensor, Y_entrena_tensor))
nombres_clases = [str(i) for i in range(num_classes)]

for bs in batch_size:
    for lr in learning_rate:
        for hs in hidden_factor:
            for e in epochs:
                # Per-fold metric lists for this combination.
                fold_metrics = {
                    'loss': [], 'accuracy': [],
                    'precision_macro': [], 'recall_macro': [], 'f1_macro': [],
                    'precision_weighted': [], 'recall_weighted': [], 'f1_weighted': [],
                    'roc_auc_ovo': [], 'roc_auc_ovr': [],
                    'confusion_matrix': []
                }

                # Cross-validation
                for fold, (train_idx, val_idx) in enumerate(particiones):
                    print(f"\n--- Fold {fold+1} ---")
                    # One wandb run per fold.
                    nombreExperimento = f'TFG_MUL_bs({bs})_lr({lr})_hs({hs})_e({e})_fold({fold+1})'
                    wandb.init(
                        project="TFG_MUL_CV_FOLDS",
                        name=nombreExperimento,
                        config={
                            "batch_size": bs,
                            "learning_rate": lr,
                            "hidden_size": hs,
                            "epochs": e,
                            "fold": fold+1
                        }
                    )
                    # try/finally closes the wandb run even if training is
                    # interrupted or fails.
                    try:
                        torch.manual_seed(SEMILLA_TORCH + fold)

                        # BatchNorm1d cannot process a one-sample batch in
                        # training mode, so the trailing batch is dropped only
                        # when it would contain exactly one sample.
                        descartar_ultimo = len(train_idx) % bs == 1
                        train_loader = DataLoader(
                            Subset(dataset_cv, train_idx),
                            batch_size=bs,
                            shuffle=True,
                            drop_last=descartar_ultimo
                        )
                        val_loader = DataLoader(Subset(dataset_cv, val_idx), batch_size=bs)

                        # Fresh model and optimizer for every fold.
                        modelo = ModeloMulticlase(input_dim=input_dim, ventanaOculta=hs, numClases=num_classes)
                        optimizador = optim.AdamW(modelo.parameters(), lr=lr)

                        _entrenar_fold(modelo, train_loader, optimizador, perdida, e)
                        val_loss, val_targets, val_preds, val_probs = _evaluar_fold(modelo, val_loader, perdida)

                        metricas = _metricas_clasificacion(val_targets, val_preds, val_probs)
                        metricas['loss'] = val_loss
                        # Fixing `labels` keeps every fold's matrix at
                        # num_classes x num_classes so they can be averaged.
                        cm = confusion_matrix(val_targets, val_preds, labels=numpy.arange(num_classes))

                        for clave, valor in metricas.items():
                            fold_metrics[clave].append(valor)
                        fold_metrics['confusion_matrix'].append(cm)

                        # Log this fold's metrics.
                        wandb.log({
                            "fold": fold+1,
                            **metricas,
                            "confusion_matrix": wandb.plot.confusion_matrix(
                                probs=None,
                                y_true=val_targets,
                                preds=val_preds,
                                class_names=nombres_clases
                            )
                        })
                    finally:
                        wandb.finish()

                # Mean over the K folds; nanmean skips folds without ROC AUC.
                avg_metrics = {k: numpy.nanmean(v) for k, v in fold_metrics.items() if k != 'confusion_matrix'}
                avg_cm = numpy.mean(fold_metrics['confusion_matrix'], axis=0)

                # Summary run holding the averaged metrics.
                wandb.init(
                    project="TFG_MUL_CV_AVG",
                    name=f"AVG_bs({bs})_lr({lr})_hs({hs})_e({e})",
                    config={
                        "batch_size": bs,
                        "learning_rate": lr,
                        "hidden_size": hs,
                        "epochs": e
                    }
                )
                fig, ax = plt.subplots(figsize=(10, 8))
                try:
                    sns.heatmap(avg_cm, annot=True, fmt=".1f", ax=ax)
                    ax.set_title(f"Avg Confusion Matrix (bs={bs}, lr={lr})")

                    resumen = {f"avg_{k}": v for k, v in avg_metrics.items()}
                    resumen["avg_confusion_matrix"] = wandb.Table(
                        dataframe=pandas.DataFrame(
                            avg_cm,
                            index=[f"True {i}" for i in range(num_classes)],
                            columns=[f"Pred {i}" for i in range(num_classes)]
                        )
                    )
                    resumen["avg_confusion_matrix_image"] = wandb.Image(fig)
                    wandb.log(resumen)

                    print(f"\nResultados promedio para bs={bs}, lr={lr}, hs={hs}, e={e}:")
                    print(f"  Loss: {avg_metrics['loss']:.4f} | Accuracy: {avg_metrics['accuracy']:.4f} | F1 Macro: {avg_metrics['f1_macro']:.4f}")
                finally:
                    # Release the figure and close the run on any exit path.
                    plt.close(fig)
                    wandb.finish()


--- Fold 1 ---


[34m[1mwandb[0m: Currently logged in as: [33malv-lop-hugo[0m ([33mTFG_Hugo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
accuracy,▁
f1_macro,▁
f1_weighted,▁
fold,▁
loss,▁
precision_macro,▁
precision_weighted,▁
recall_macro,▁
recall_weighted,▁
roc_auc_ovo,▁

0,1
accuracy,0.50806
f1_macro,0.34497
f1_weighted,0.52579
fold,1.0
loss,1.87785
precision_macro,0.33389
precision_weighted,0.61762
recall_macro,0.52165
recall_weighted,0.50806
roc_auc_ovo,0.77268



--- Fold 2 ---


0,1
accuracy,▁
f1_macro,▁
f1_weighted,▁
fold,▁
loss,▁
precision_macro,▁
precision_weighted,▁
recall_macro,▁
recall_weighted,▁
roc_auc_ovo,▁

0,1
accuracy,0.49929
f1_macro,0.32785
f1_weighted,0.511
fold,2.0
loss,1.88555
precision_macro,0.32152
precision_weighted,0.6344
recall_macro,0.505
recall_weighted,0.49929
roc_auc_ovo,0.77929



--- Fold 3 ---


0,1
accuracy,▁
f1_macro,▁
f1_weighted,▁
fold,▁
loss,▁
precision_macro,▁
precision_weighted,▁
recall_macro,▁
recall_weighted,▁
roc_auc_ovo,▁

0,1
accuracy,0.49147
f1_macro,0.35112
f1_weighted,0.5246
fold,3.0
loss,1.8859
precision_macro,0.33789
precision_weighted,0.62916
recall_macro,0.51255
recall_weighted,0.49147
roc_auc_ovo,0.76556



--- Fold 4 ---


0,1
accuracy,▁
f1_macro,▁
f1_weighted,▁
fold,▁
loss,▁
precision_macro,▁
precision_weighted,▁
recall_macro,▁
recall_weighted,▁
roc_auc_ovo,▁

0,1
accuracy,0.54584
f1_macro,0.34683
f1_weighted,0.54408
fold,4.0
loss,1.8837
precision_macro,0.34579
precision_weighted,0.61588
recall_macro,0.50007
recall_weighted,0.54584
roc_auc_ovo,0.75786



--- Fold 5 ---


0,1
accuracy,▁
f1_macro,▁
f1_weighted,▁
fold,▁
loss,▁
precision_macro,▁
precision_weighted,▁
recall_macro,▁
recall_weighted,▁
roc_auc_ovo,▁

0,1
accuracy,0.47368
f1_macro,0.31474
f1_weighted,0.50036
fold,5.0
loss,1.89942
precision_macro,0.31272
precision_weighted,0.62742
recall_macro,0.4844
recall_weighted,0.47368
roc_auc_ovo,0.76402



Resultados promedio para bs=32, lr=0.01, hs=25, e=30:
  Loss: 1.8865 | Accuracy: 0.5037 | F1 Macro: 0.3371


0,1
avg_accuracy,▁
avg_f1_macro,▁
avg_f1_weighted,▁
avg_loss,▁
avg_precision_macro,▁
avg_precision_weighted,▁
avg_recall_macro,▁
avg_recall_weighted,▁
avg_roc_auc_ovo,▁
avg_roc_auc_ovr,▁

0,1
avg_accuracy,0.50367
avg_f1_macro,0.3371
avg_f1_weighted,0.52116
avg_loss,1.88648
avg_precision_macro,0.33036
avg_precision_weighted,0.6249
avg_recall_macro,0.50474
avg_recall_weighted,0.50367
avg_roc_auc_ovo,0.76788
avg_roc_auc_ovr,0.7467



--- Fold 1 ---


KeyboardInterrupt: 

## Se preparan los datos de prueba

# Scale the held-out features with the scaler already fitted on the training
# data (transform only, no refit).
X_prueba_normalizado = escalador.transform(X_prueba)

X_prueba_tensor = torch.tensor(X_prueba_normalizado, dtype=torch.float32)
# Class indices are stored as int64 (torch.long), the dtype that
# nn.CrossEntropyLoss expects for index targets.
Y_prueba_tensor = torch.tensor(Y_prueba, dtype=torch.long)

dataset_prueba = DatasetTFG(X_prueba_tensor, Y_prueba_tensor)


## Se guarda el modelo en un fichero

# Persist only the parameters (state_dict) of the model.
# NOTE(review): `modelo` is whichever model the training loop built last (one
# fold of one hyperparameter combination), not a model refit on the whole
# training set - confirm this is the model intended for saving.
ruta_modelo = Path('Modelos/TFG_HUGO_MUL.pth')
# Create the target directory if needed; torch.save does not create it.
ruta_modelo.parent.mkdir(parents=True, exist_ok=True)
torch.save(modelo.state_dict(), ruta_modelo)

### BCEWithLogitsLoss

Época: 1, Pérdida: 0.020396
Época: 2, Pérdida: 0.009877
Época: 3, Pérdida: 0.004355
Época: 4, Pérdida: 0.00114
Época: 5, Pérdida: 0.002756
Época: 6, Pérdida: 0.001659
Época: 7, Pérdida: 0.004422
Época: 8, Pérdida: 0.000846
Época: 9, Pérdida: 0.000706
Época: 10, Pérdida: 0.000514

Pérdida durante la prueba: 0.0075, Exactitud:  95.90%
