# TFG: Título del TFG

## Hugo López Álvarez

In [1]:
import math
import numpy    
import pandas   
import wandb
import torch    
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss, fbeta_score
from sklearn.model_selection import KFold, StratifiedKFold

## Clases

Definición de la clase DatasetTFG que se usará para entrenar al modelo

In [2]:
class DatasetTFG(Dataset):
    """Map-style dataset that pairs each sample in ``X`` with its label in ``Y``.

    Args:
        X: Sized, indexable container of samples (for example a 2-D tensor).
        Y: Sized, indexable container of labels, aligned position by position
            with ``X``.

    Raises:
        ValueError: If ``X`` and ``Y`` do not contain the same number of items.
    """

    def __init__(self, X, Y):
        # Fail fast: a length mismatch would otherwise go unnoticed (extra
        # labels) or only surface as an indexing error in the middle of an epoch.
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same length, got {len(X)} and {len(Y)}"
            )
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.Y[idx]

Definición de la clase Modelo
- La capa1 transforma la dimensión de entrada a `ventanaOculta` neuronas (tamaño de la capa oculta, configurable)
- La capa2 pasa de las `ventanaOculta` neuronas a 1 neurona (el logit de salida)

In [None]:
class Modelo(nn.Module):
    """Feed-forward binary classifier with one hidden layer.

    The network returns one raw logit per sample; no sigmoid is applied here.

    Args:
        input_dim: Number of input features per sample.
        ventanaOculta: Number of units in the hidden layer.
    """

    def __init__(self, input_dim, ventanaOculta):
        super().__init__()
        self.capa1 = nn.Linear(input_dim, ventanaOculta)
        self.capa2 = nn.Linear(ventanaOculta, 1)

    def forward(self, X):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` logits."""
        # Without a non-linearity, capa2(capa1(X)) collapses into a single
        # affine map and the hidden size would not add any capacity.
        X = torch.relu(self.capa1(X))
        return self.capa2(X)

# Funciones

In [4]:

def split_ip_column(df, ip_column_name):
    """Replace a dotted IPv4 string column with four numeric octet columns.

    Args:
        df: DataFrame that contains ``ip_column_name`` as a string column.
        ip_column_name: Name of the column holding addresses such as
            ``"59.166.0.2"``.

    Returns:
        A new DataFrame without ``ip_column_name`` and with the columns
        ``<ip_column_name>_part1`` .. ``<ip_column_name>_part4`` appended.
        The input DataFrame is not modified.
    """
    # One column per dot-separated piece of the address.
    octets = df[ip_column_name].str.split('.', expand=True)

    # drop() returns a new frame, so the caller's frame stays untouched.
    result = df.drop(columns=[ip_column_name])

    for position in range(4):
        target_name = f"{ip_column_name}_part{position + 1}"
        result[target_name] = pandas.to_numeric(octets[position])

    return result

## Cargar datos

In [5]:
# Location of the input CSV, relative to the notebook's working directory.
ruta_csv = '../Datasets/modUQ.csv'
fileData = pandas.read_csv(ruta_csv)

### Comprobación de la obtención correcta del csv

In [6]:
# Show the first rows to visually confirm that the CSV was parsed as expected.
fileData.head()

Unnamed: 0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,...,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label,Attack
0,1424242193040,1424242193043,59.166.0.2,4894,149.171.126.3,53,17,5.0,146,2,...,0,0,0,0,0,0,0,0,0,Benign
1,1424242192744,1424242193079,59.166.0.4,52671,149.171.126.6,31992,6,11.0,4704,28,...,0,91,12,19,0,90,12,19,0,Benign
2,1424242190649,1424242193109,59.166.0.0,47290,149.171.126.9,6881,6,37.0,13662,238,...,0,1843,10,119,0,1843,5,88,0,Benign
3,1424242193145,1424242193146,59.166.0.8,43310,149.171.126.7,53,17,5.0,146,2,...,0,0,0,0,0,0,0,0,0,Benign
4,1424242193239,1424242193241,59.166.0.1,45870,149.171.126.1,53,17,5.0,130,2,...,0,0,0,0,0,0,0,0,0,Benign


### Se convierten las columnas no numéricas para poder utilizarlas con pytorch

In [None]:
# IP octet expansion is left disabled: the raw address columns are dropped
# when the feature matrix is built, so the split is not needed.
#fileData = split_ip_column(fileData, 'IPV4_SRC_ADDR')
#fileData = split_ip_column(fileData, 'IPV4_DST_ADDR')

# Replace the textual attack category with integer class ids.
codificador_ataque = LabelEncoder()
fileData['Attack'] = codificador_ataque.fit_transform(fileData['Attack'])

### Se comprueba que los datos se han transformado correctamente

In [8]:
# Print the dtype of every column to see which ones are still non-numeric.
print(fileData.dtypes)

FLOW_START_MILLISECONDS          int64
FLOW_END_MILLISECONDS            int64
IPV4_SRC_ADDR                   object
L4_SRC_PORT                      int64
IPV4_DST_ADDR                   object
L4_DST_PORT                      int64
PROTOCOL                         int64
L7_PROTO                       float64
IN_BYTES                         int64
IN_PKTS                          int64
OUT_BYTES                        int64
OUT_PKTS                         int64
TCP_FLAGS                        int64
CLIENT_TCP_FLAGS                 int64
SERVER_TCP_FLAGS                 int64
FLOW_DURATION_MILLISECONDS       int64
DURATION_IN                      int64
DURATION_OUT                     int64
MIN_TTL                          int64
MAX_TTL                          int64
LONGEST_FLOW_PKT                 int64
SHORTEST_FLOW_PKT                int64
MIN_IP_PKT_LEN                   int64
MAX_IP_PKT_LEN                   int64
SRC_TO_DST_SECOND_BYTES        float64
DST_TO_SRC_SECOND_BYTES  

## Se eliminan las filas con valores infinitos o nulos (NaN)

In [None]:
# Map +/-inf to NaN first so that one dropna() call removes those rows.
# Rows that already contained NaN are removed as well.
fileData = (
    fileData
    .replace([numpy.inf, -numpy.inf], numpy.nan)
    .dropna()
)

### Se separan las características (X) de la etiqueta (Y)

In [10]:
# Columns kept out of the feature matrix: both target columns, the flow
# timestamps and the raw IP address strings.
columnas_excluidas = [
    'Label', 'Attack',
    'FLOW_START_MILLISECONDS', 'FLOW_END_MILLISECONDS',
    'IPV4_SRC_ADDR', 'IPV4_DST_ADDR',
]
X = fileData.drop(columns=columnas_excluidas).values

# Target vector taken from the 'Label' column.
Y = fileData['Label'].values

### Se elimina fileData que contiene el csv con los datos para liberar memoria

In [11]:
# X and Y have already been extracted; drop the DataFrame reference so that
# it can be garbage-collected.
del fileData

## Se separan los datos del entrenamiento de los datos de prueba
El entrenamiento tendrá el 80% de los datos

La prueba tendrá el 20% de los datos

In [12]:
# Stratified 80/20 hold-out split with a fixed random_state.
# NOTE(review): `Y_entrana` is a misspelling of `Y_entrena`; the name is kept
# as-is because later cells reference it.
X_entrena, X_prueba, Y_entrana, Y_prueba = train_test_split(
    X, Y, test_size=0.2, random_state=42,  stratify=Y
)

## Se normalizan los datos

In [13]:
# Learn the per-feature min/max on the training split only, then rescale the
# training split to the [0, 1] range.
escalador = MinMaxScaler(feature_range=(0, 1))
escalador.fit(X_entrena)
X_entrena_normalizado = escalador.transform(X_entrena)

### Se convierten los datos a tensores de Pytorch

In [14]:
# Convert the scaled features and the labels to float32 tensors.
# NOTE(review): the labels are presumably cast to float because
# BCEWithLogitsLoss expects floating-point targets; confirm.
X_entrena_tensor = torch.tensor(X_entrena_normalizado, dtype=torch.float32)
Y_entrena_tensor = torch.tensor(Y_entrana, dtype=torch.float32)

## Creación del Dataset personalizado

In [15]:
# Wrap the training tensors so they can be indexed as (features, label) pairs.
dataset_entrena = DatasetTFG(X_entrena_tensor, Y_entrena_tensor)

## Se configura la función de pérdida

In [None]:
# Weight for the positive class = negatives / positives, so that the rarer
# class contributes proportionally more to the loss.
# ndarray.sum() is vectorised; the builtin sum() would iterate the array
# element by element in Python.
n_positivos = Y_entrana.sum()
n_negativos = len(Y_entrana) - n_positivos
if n_positivos == 0:
    raise ValueError("Y_entrana contains no positive samples; pos_weight is undefined")

# float32 keeps the weight in the same dtype as the model outputs and targets.
pos_weight = torch.tensor([n_negativos / n_positivos], dtype=torch.float32)
perdida = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

## Definición de hiperparámetros

In [None]:
# Hyper-parameter grid explored by the search loop.
num_caracteristicas = X_entrena_tensor.size(1)

batch_size = [2000, 10000, 15000, 20000]
learning_rate = [0.01, 0.001, 0.0001]
epochs = [10, 20, 30]
# Hidden-layer widths: half the number of input columns (rounded up), the
# number of columns, and twice that number.
hidden_factor = [
    math.ceil(num_caracteristicas / 2),
    num_caracteristicas,
    num_caracteristicas * 2,
]

## Se define el objeto StratifiedKFold que se utilizará para la validación cruzada

In [None]:
# Five stratified folds, shuffled with a fixed random_state so that the same
# folds are produced on every run.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Bucle de búsqueda de hiperparámetros: entrenamiento por épocas con validación cruzada

In [None]:
# Grid search: every (batch size, learning rate, hidden size, epochs)
# combination is trained from scratch on each stratified fold, evaluated on
# the held-out fold, and logged to Weights & Biases (one run per fold plus one
# summary run with the fold averages).

# The dataset is identical for every fold; only the index subsets change.
dataset_cv = DatasetTFG(X_entrena_tensor, Y_entrena_tensor)
num_entradas = X_entrena_tensor.shape[1]

for bs in batch_size:
    for lr in learning_rate:
        for hs in hidden_factor:
            for e in epochs:
                # Per-fold metric values for this hyper-parameter combination
                fold_metrics = {
                    'loss': [], 'accuracy': [], 'precision': [],
                    'recall': [], 'f1': [], 'f2': [], 'roc_auc': [],
                    'specificity': [], 'tp': [], 'fp': [], 'tn': [], 'fn': []
                }

                # Cross-validation
                for fold, (train_idx, val_idx) in enumerate(kf.split(X_entrena_tensor, Y_entrena_tensor)):
                    print(f"\n--- Fold {fold+1} ---")
                    # One wandb run per fold
                    nombreExperimento = f'TFG_BIN_bs({bs})_lr({lr})_hs({hs})_e({e})_fold({fold+1})'
                    wandb.init(
                        project="TFG_BIN_CV_FOLDS",
                        name=nombreExperimento,
                        config={
                            "batch_size": bs,
                            "learning_rate": lr,
                            "hidden_size": hs,
                            "epochs": e,
                            "fold": fold+1
                        }
                    )

                    # Seed weight initialisation and batch shuffling so that a
                    # given (configuration, fold) pair is reproducible.
                    torch.manual_seed(42 + fold)

                    # Train/validation split for this fold
                    train_loader = DataLoader(Subset(dataset_cv, train_idx), batch_size=bs, shuffle=True)
                    val_loader = DataLoader(Subset(dataset_cv, val_idx), batch_size=bs)

                    # Model and optimizer
                    modelo = Modelo(input_dim=num_entradas, ventanaOculta=hs)
                    optimizador = optim.AdamW(modelo.parameters(), lr=lr)

                    # Class-imbalance weight (negatives / positives) computed
                    # from this fold's training labels only. Re-creating the
                    # loss without pos_weight would silently drop the class
                    # weighting.
                    etiquetas_fold = Y_entrena_tensor[torch.as_tensor(train_idx)]
                    # float64 accumulation keeps the count exact for large folds
                    n_pos = etiquetas_fold.sum(dtype=torch.float64).clamp(min=1.0)
                    n_neg = etiquetas_fold.numel() - n_pos
                    peso_positivos = (n_neg / n_pos).to(torch.float32).reshape(1)
                    perdida = nn.BCEWithLogitsLoss(pos_weight=peso_positivos)

                    # Training
                    for epoch in range(e):
                        modelo.train()
                        for batch_X, batch_Y in train_loader:
                            optimizador.zero_grad()
                            salidas = modelo(batch_X)
                            # Targets need the same (batch, 1) shape as the logits
                            loss = perdida(salidas, batch_Y.unsqueeze(1))
                            loss.backward()
                            optimizador.step()

                    # Evaluation on the held-out fold
                    modelo.eval()
                    val_preds, val_probs, val_targets = [], [], []
                    val_loss = 0.0

                    with torch.no_grad():
                        for batch_X_val, batch_Y_val in val_loader:
                            salidas_val = modelo(batch_X_val)
                            val_loss += perdida(salidas_val, batch_Y_val.unsqueeze(1)).item()
                            # Flatten (batch, 1) -> (batch,) so that the metric
                            # functions receive plain 1-D sequences.
                            probs = torch.sigmoid(salidas_val).squeeze(1)
                            preds = (probs > 0.5).long()
                            val_probs.extend(probs.cpu().tolist())
                            val_preds.extend(preds.cpu().tolist())
                            val_targets.extend(batch_Y_val.long().cpu().tolist())

                    # Metrics for this fold (loss is the mean over batches)
                    val_loss /= len(val_loader)
                    # labels=[0, 1] guarantees a 2x2 matrix even when the fold's
                    # targets and predictions contain a single class.
                    tn, fp, fn, tp = (
                        int(v) for v in confusion_matrix(val_targets, val_preds, labels=[0, 1]).ravel()
                    )

                    fold_metrics['loss'].append(val_loss)
                    fold_metrics['accuracy'].append(accuracy_score(val_targets, val_preds))
                    fold_metrics['precision'].append(precision_score(val_targets, val_preds, zero_division=0))
                    fold_metrics['recall'].append(recall_score(val_targets, val_preds, zero_division=0))
                    fold_metrics['f1'].append(f1_score(val_targets, val_preds, zero_division=0))
                    fold_metrics['f2'].append(fbeta_score(val_targets, val_preds, beta=2, zero_division=0))
                    fold_metrics['roc_auc'].append(roc_auc_score(val_targets, val_probs))
                    fold_metrics['specificity'].append(tn / (tn + fp) if (tn + fp) > 0 else 0)
                    fold_metrics['tp'].append(tp)
                    fold_metrics['fp'].append(fp)
                    fold_metrics['tn'].append(tn)
                    fold_metrics['fn'].append(fn)

                    # Log this fold's metrics
                    wandb.log({
                        "fold": fold+1,
                        "loss": val_loss,
                        "accuracy": fold_metrics['accuracy'][-1],
                        "precision": fold_metrics['precision'][-1],
                        "recall": fold_metrics['recall'][-1],
                        "f1": fold_metrics['f1'][-1],
                        "f2": fold_metrics['f2'][-1],
                        "roc_auc": fold_metrics['roc_auc'][-1],
                        "specificity": fold_metrics['specificity'][-1],
                        "true_positives": tp,
                        "false_positives": fp,
                        "true_negatives": tn,
                        "false_negatives": fn
                    })

                    wandb.finish()

                # Mean of each metric over the K folds
                avg_metrics = {k: numpy.mean(v) for k, v in fold_metrics.items()}

                # Summary run holding the fold averages for this combination
                wandb.init(
                    project="TFG_BIN_CV_AVG",
                    name=f"AVG_bs({bs})_lr({lr})_hs({hs})_e({e})",
                    config={
                        "batch_size": bs,
                        "learning_rate": lr,
                        "hidden_size": hs,
                        "epochs": e
                    }
                )

                wandb.log({
                    "avg_loss": avg_metrics['loss'],
                    "avg_accuracy": avg_metrics['accuracy'],
                    "avg_precision": avg_metrics['precision'],
                    "avg_recall": avg_metrics['recall'],
                    "avg_f1": avg_metrics['f1'],
                    "avg_f2": avg_metrics['f2'],
                    "avg_roc_auc": avg_metrics['roc_auc'],
                    "avg_specificity": avg_metrics['specificity'],
                    "avg_tp": avg_metrics['tp'],
                    "avg_fp": avg_metrics['fp'],
                    "avg_tn": avg_metrics['tn'],
                    "avg_fn": avg_metrics['fn']
                })

                print(f"\nResultados promedio para bs={bs}, lr={lr}, hs={hs}, e={e}:")
                print(f"  Loss: {avg_metrics['loss']:.4f} | Accuracy: {avg_metrics['accuracy']:.4f} | F1: {avg_metrics['f1']:.4f}")

                wandb.finish()


--- Fold 1 ---


[34m[1mwandb[0m: Currently logged in as: [33malv-lop-hugo[0m ([33mTFG_Hugo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
