# TFG: Título del TFG

## Hugo López Álvarez

In [1]:
import math
import numpy    
import pandas   
import wandb
import torch    
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss, fbeta_score
from sklearn.model_selection import KFold, StratifiedKFold

## Clases

Definición de la clase DatasetTFG que se usará para entrenar al modelo

In [2]:
class DatasetTFG(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

Definición de la clase Modelo
- La capa1 transforma la dimensión de entrada a 64 neuronas
- La capa2 pasa de las 64 neuronas a 1 neurona

In [3]:
class Modelo(nn.Module):
    def __init__(self, input_dim, ventanaOculta):
        super().__init__()
        self.capa1 = nn.Linear(input_dim, ventanaOculta)
        self.capa2 =  nn.Linear(ventanaOculta, 1) 
        
    def forward(self,  X):
        X = self.capa1(X)
        X = self.capa2(X)
        return X

# Funciones

In [4]:

def split_ip_column(df, ip_column_name):
    
    # Divide la IP en cuatro partes
    ip_parts = df[ip_column_name].str.split('.', expand=True)
    
    # Crea nombres de columnas basados en el nombre original
    new_columns = {
        0: f"{ip_column_name}_part1",
        1: f"{ip_column_name}_part2", 
        2: f"{ip_column_name}_part3",
        3: f"{ip_column_name}_part4"
    }
    
    # Se elimina la columna de ip_column_name
    df = df.drop(columns=[ip_column_name]) 
    
    # Añade las nuevas columnas al DataFrame
    for part, col_name in new_columns.items():
        df[col_name] = pandas.to_numeric(ip_parts[part])  # Convierte a numérico
    
    return df

## Cargar datos

In [5]:
fileData = pandas.read_csv('../Datasets/modUQ.csv')

### Comprobación de la obtención correcta del csv

In [6]:
fileData.head()

Unnamed: 0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,...,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label,Attack
0,1424242193040,1424242193043,59.166.0.2,4894,149.171.126.3,53,17,5.0,146,2,...,0,0,0,0,0,0,0,0,0,Benign
1,1424242192744,1424242193079,59.166.0.4,52671,149.171.126.6,31992,6,11.0,4704,28,...,0,91,12,19,0,90,12,19,0,Benign
2,1424242190649,1424242193109,59.166.0.0,47290,149.171.126.9,6881,6,37.0,13662,238,...,0,1843,10,119,0,1843,5,88,0,Benign
3,1424242193145,1424242193146,59.166.0.8,43310,149.171.126.7,53,17,5.0,146,2,...,0,0,0,0,0,0,0,0,0,Benign
4,1424242193239,1424242193241,59.166.0.1,45870,149.171.126.1,53,17,5.0,130,2,...,0,0,0,0,0,0,0,0,0,Benign


### Se convierten las columnas no numéricas para poder utilizarlas con pytorch

In [7]:
#fileData = split_ip_column(fileData, 'IPV4_SRC_ADDR')
#fileData = split_ip_column(fileData, 'IPV4_DST_ADDR')
fileData['Attack'] = LabelEncoder().fit_transform(fileData['Attack'])

### Se comprueba que los datos se han transformado correctamente

In [8]:
print(fileData.dtypes)

FLOW_START_MILLISECONDS          int64
FLOW_END_MILLISECONDS            int64
IPV4_SRC_ADDR                   object
L4_SRC_PORT                      int64
IPV4_DST_ADDR                   object
L4_DST_PORT                      int64
PROTOCOL                         int64
L7_PROTO                       float64
IN_BYTES                         int64
IN_PKTS                          int64
OUT_BYTES                        int64
OUT_PKTS                         int64
TCP_FLAGS                        int64
CLIENT_TCP_FLAGS                 int64
SERVER_TCP_FLAGS                 int64
FLOW_DURATION_MILLISECONDS       int64
DURATION_IN                      int64
DURATION_OUT                     int64
MIN_TTL                          int64
MAX_TTL                          int64
LONGEST_FLOW_PKT                 int64
SHORTEST_FLOW_PKT                int64
MIN_IP_PKT_LEN                   int64
MAX_IP_PKT_LEN                   int64
SRC_TO_DST_SECOND_BYTES        float64
DST_TO_SRC_SECOND_BYTES  

## Se eliminan los datos con valores infinitos

In [9]:
#print("¿Existen valores infinitos en X?: ", numpy.isinf(fileData.values).any())
fileData = fileData.replace([numpy.inf, -numpy.inf], numpy.nan).dropna()
#print("¿Siguen existiendo valores infinitos en X?: ", numpy.isinf(fileData.values).any())

### Se separan las características (X) de la etiqueta (Y)

In [10]:
X = fileData.drop(columns=['Label', 'Attack', 'FLOW_START_MILLISECONDS', 'FLOW_END_MILLISECONDS', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR']).values
Y = fileData['Label'].values

### Se elimina fileData que contiene el csv con los datos para liberar memoria

In [11]:
del fileData

## Se separan los datos del entrenamiento de los datos de prueba
El entrenamiento tendrá el 80% de los datos

La prueba tendrá el 20% de los datos

In [12]:
X_entrena, X_prueba, Y_entrana, Y_prueba = train_test_split(
    X, Y, test_size=0.2, random_state=42,  stratify=Y
)

## Se normalizan los datos

In [13]:
escalador = MinMaxScaler(feature_range=(0,1))
X_entrena_normalizado = escalador.fit_transform(X_entrena)

### Se convierten los datos a tensores de Pytorch

In [14]:
X_entrena_tensor = torch.tensor(X_entrena_normalizado, dtype=torch.float32)
Y_entrena_tensor = torch.tensor(Y_entrana, dtype=torch.float32)

## Creación del Dataset personalizado

In [15]:
dataset_entrena = DatasetTFG(X_entrena_tensor, Y_entrena_tensor)

## Mejores combinaciones de hiperparámetros encontrados

In [16]:
combinaciones = [
    {'batch_size': 15000, 'learning_rate': 1e-2, 'hidden_size': 25, 'epochs': 10},
    {'batch_size': 10000, 'learning_rate': 1e-3, 'hidden_size': 25, 'epochs': 30},
    {'batch_size': 20000, 'learning_rate': 1e-2, 'hidden_size': 25, 'epochs': 10},
    {'batch_size': 20000, 'learning_rate': 1e-2, 'hidden_size': 25, 'epochs': 20},
    {'batch_size': 2000, 'learning_rate': 1e-3, 'hidden_size': 25, 'epochs': 10},
    
    
    {'batch_size': 20000, 'learning_rate': 1e-2, 'hidden_size': 49, 'epochs': 10},
    {'batch_size': 10000, 'learning_rate': 1e-3, 'hidden_size': 49, 'epochs': 30},
    {'batch_size': 15000, 'learning_rate': 1e-2, 'hidden_size': 49, 'epochs': 10},
    {'batch_size': 2000, 'learning_rate': 1e-3, 'hidden_size': 49, 'epochs': 10},
    {'batch_size': 2000, 'learning_rate': 1e-4, 'hidden_size': 49, 'epochs': 30},
    
    
    {'batch_size': 20000, 'learning_rate': 1e-2, 'hidden_size': 98, 'epochs': 10},
    {'batch_size': 10000, 'learning_rate': 1e-3, 'hidden_size': 98, 'epochs': 30},
    {'batch_size': 15000, 'learning_rate': 1e-3, 'hidden_size': 98, 'epochs': 30},
    {'batch_size': 15000, 'learning_rate': 1e-2, 'hidden_size': 98, 'epochs': 10},
    {'batch_size': 10000, 'learning_rate': 1e-3, 'hidden_size': 98, 'epochs': 20},
]

## Se preparan los datos de prueba

In [17]:
X_prueba_normalizado = escalador.transform(X_prueba)
X_prueba_tensor = torch.tensor(X_prueba_normalizado, dtype=torch.float32)
Y_prueba_tensor = torch.tensor(Y_prueba, dtype=torch.float32)
test_dataset = DatasetTFG(X_prueba_tensor, Y_prueba_tensor)

## Bucle de entrenamiento o épocas

In [None]:
for config in combinaciones:
    # Se configura wandb para esta combinación
    wandb.init(
        project="TFG_BIN_EVAL",
        name=f"bs({config['batch_size']})_lr({config['learning_rate']})_hs({config['hidden_size']})_e({config['epochs']})",
        config=config
    )
    
    # Se entrena el modelo con todos los datos de entrenamiento y no con validacion cruzada
    modelo = Modelo(input_dim=X_entrena_tensor.shape[1], ventanaOculta=config['hidden_size'])
    optimizador = optim.AdamW(modelo.parameters(), lr=config['learning_rate'])
    perdida = nn.BCEWithLogitsLoss()
    
    train_loader = DataLoader(DatasetTFG(X_entrena_tensor, Y_entrena_tensor), 
                           batch_size=config['batch_size'], 
                           shuffle=True)
    
    print(f"\nEntrenando con: bs={config['batch_size']}, lr={config['learning_rate']}, "
          f"hs={config['hidden_size']}, e={config['epochs']}")
    
    for epoch in range(config['epochs']):
        modelo.train()
        total_loss = 0
        for batch_X, batch_Y in train_loader:
            optimizador.zero_grad()
            salidas = modelo(batch_X)
            loss = perdida(salidas, batch_Y.unsqueeze(1))
            loss.backward()
            optimizador.step()
            total_loss += loss.item()
        
        wandb.log({"train_loss": total_loss/len(train_loader), "epoch": epoch})
    
    # Se guarda el modelo
    nombre_modelo = f"TFG_BIN_bs({config['batch_size']})_lr({config['learning_rate']})_hs({config['hidden_size']})_e({config['epochs']}).pth"
    torch.save(modelo.state_dict(), f'ModelosBIN/{nombre_modelo}')
    wandb.save(f'ModelosBIN/{nombre_modelo}')
    
    # Se evalua el modelo en el conjunto de prueba
    test_loader = DataLoader(test_dataset, batch_size=config['batch_size'])
    
    modelo.eval()
    test_preds, test_probs, test_targets = [], [], []
    test_loss = 0.0
    
    with torch.no_grad():
        for batch_X_test, batch_Y_test in test_loader:
            salidas_test = modelo(batch_X_test)
            test_loss += perdida(salidas_test, batch_Y_test.unsqueeze(1)).item()
            probs = torch.sigmoid(salidas_test)
            preds = (probs > 0.5).int()
            test_probs.extend(probs.cpu().numpy())
            test_preds.extend(preds.cpu().numpy())
            test_targets.extend(batch_Y_test.cpu().numpy())
    
    # Se calculan las metricas
    test_loss /= len(test_loader)
    tn, fp, fn, tp = confusion_matrix(test_targets, test_preds).ravel()
    
    # Se registran los resultados de la evaluacion en wandb
    wandb.log({
        "test_loss": test_loss,
        "test_accuracy": accuracy_score(test_targets, test_preds),
        "test_precision": precision_score(test_targets, test_preds, zero_division=0),
        "test_recall": recall_score(test_targets, test_preds, zero_division=0),
        "test_f1": f1_score(test_targets, test_preds),
        "test_f2": fbeta_score(test_targets, test_preds, beta=2),
        "test_roc_auc": roc_auc_score(test_targets, test_probs),
        "test_specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "test_true_positives": tp,
        "test_false_positives": fp,
        "test_true_negatives": tn,
        "test_false_negatives": fn
    })
    
    print(f"Evaluación de bs={config['batch_size']}, lr={config['learning_rate']} finalizada")
    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33malv-lop-hugo[0m ([33mTFG_Hugo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



Entrenando con: bs=15000, lr=0.01, hs=25, e=10
Evaluación de bs=15000, lr=0.01 finalizada


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,9.0
test_accuracy,0.99982
test_f1,0.99785
test_f2,0.99829
test_false_negatives,26.0
test_false_positives,53.0
test_loss,0.00407
test_precision,0.99712
test_recall,0.99859
test_roc_auc,0.99984



Entrenando con: bs=10000, lr=0.001, hs=25, e=30
Evaluación de bs=10000, lr=0.001 finalizada


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,29.0
test_accuracy,0.99981
test_f1,0.99772
test_f2,0.99827
test_false_negatives,25.0
test_false_positives,59.0
test_loss,0.0043
test_precision,0.9968
test_recall,0.99864
test_roc_auc,0.99977



Entrenando con: bs=20000, lr=0.01, hs=25, e=10
Evaluación de bs=20000, lr=0.01 finalizada


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,9.0
test_accuracy,0.99983
test_f1,0.99793
test_f2,0.99842
test_false_negatives,23.0
test_false_positives,53.0
test_loss,0.00444
test_precision,0.99712
test_recall,0.99875
test_roc_auc,0.99977



Entrenando con: bs=20000, lr=0.01, hs=25, e=20
Evaluación de bs=20000, lr=0.01 finalizada


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,19.0
test_accuracy,0.99983
test_f1,0.99788
test_f2,0.99817
test_false_negatives,30.0
test_false_positives,48.0
test_loss,0.00394
test_precision,0.99739
test_recall,0.99837
test_roc_auc,0.9999



Entrenando con: bs=2000, lr=0.001, hs=25, e=10
Evaluación de bs=2000, lr=0.001 finalizada


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,9.0
test_accuracy,0.99983
test_f1,0.99788
test_f2,0.99821
test_false_negatives,29.0
test_false_positives,49.0
test_loss,0.0041
test_precision,0.99734
test_recall,0.99842
test_roc_auc,0.99982



Entrenando con: bs=20000, lr=0.01, hs=49, e=10
Evaluación de bs=20000, lr=0.01 finalizada


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,9.0
test_accuracy,0.99984
test_f1,0.99799
test_f2,0.99838
test_false_negatives,25.0
test_false_positives,49.0
test_loss,0.0042
test_precision,0.99734
test_recall,0.99864
test_roc_auc,0.99984



Entrenando con: bs=10000, lr=0.001, hs=49, e=30
Evaluación de bs=10000, lr=0.001 finalizada


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,29.0
test_accuracy,0.99983
test_f1,0.99788
test_f2,0.99834
test_false_negatives,25.0
test_false_positives,53.0
test_loss,0.00415
test_precision,0.99712
test_recall,0.99864
test_roc_auc,0.99981



Entrenando con: bs=15000, lr=0.01, hs=49, e=10
Evaluación de bs=15000, lr=0.01 finalizada


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,9.0
test_accuracy,0.99984
test_f1,0.99799
test_f2,0.99828
test_false_negatives,28.0
test_false_positives,46.0
test_loss,0.00394
test_precision,0.9975
test_recall,0.99848
test_roc_auc,0.9999



Entrenando con: bs=2000, lr=0.001, hs=49, e=10
Evaluación de bs=2000, lr=0.001 finalizada


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,9.0
test_accuracy,0.99982
test_f1,0.99785
test_f2,0.9981
test_false_negatives,32.0
test_false_positives,47.0
test_loss,0.00394
test_precision,0.99745
test_recall,0.99826
test_roc_auc,0.99986



Entrenando con: bs=2000, lr=0.0001, hs=49, e=30
Evaluación de bs=2000, lr=0.0001 finalizada


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,29.0
test_accuracy,0.9998
test_f1,0.99755
test_f2,0.99811
test_false_negatives,28.0
test_false_positives,62.0
test_loss,0.00425
test_precision,0.99663
test_recall,0.99848
test_roc_auc,0.99981



Entrenando con: bs=20000, lr=0.01, hs=98, e=10
Evaluación de bs=20000, lr=0.01 finalizada


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,9.0
test_accuracy,0.99984
test_f1,0.99807
test_f2,0.99844
test_false_negatives,24.0
test_false_positives,47.0
test_loss,0.00408
test_precision,0.99745
test_recall,0.99869
test_roc_auc,0.9999



Entrenando con: bs=10000, lr=0.001, hs=98, e=30
Evaluación de bs=10000, lr=0.001 finalizada


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,29.0
test_accuracy,0.99982
test_f1,0.99777
test_f2,0.99797
test_false_negatives,35.0
test_false_positives,47.0
test_loss,0.00402
test_precision,0.99744
test_recall,0.9981
test_roc_auc,0.99987



Entrenando con: bs=15000, lr=0.001, hs=98, e=30
Evaluación de bs=15000, lr=0.001 finalizada


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,29.0
test_accuracy,0.99983
test_f1,0.99791
test_f2,0.99841
test_false_negatives,23.0
test_false_positives,54.0
test_loss,0.00425
test_precision,0.99707
test_recall,0.99875
test_roc_auc,0.99978



Entrenando con: bs=15000, lr=0.01, hs=98, e=10
Evaluación de bs=15000, lr=0.01 finalizada


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
test_f1,▁
test_f2,▁
test_false_negatives,▁
test_false_positives,▁
test_loss,▁
test_precision,▁
test_recall,▁
test_roc_auc,▁

0,1
epoch,9.0
test_accuracy,0.99983
test_f1,0.99796
test_f2,0.99827
test_false_negatives,28.0
test_false_positives,47.0
test_loss,0.00383
test_precision,0.99745
test_recall,0.99848
test_roc_auc,0.9999



Entrenando con: bs=10000, lr=0.001, hs=98, e=20
Evaluación de bs=10000, lr=0.001 finalizada
