# APOSEMAT IOT-23

## Primer Fichero

### Carga y limpieza

In [1]:
from DataLoader import DataLoader
from DataProcessor import DataProcessor
import pandas as pd

Loader = DataLoader()

In [2]:
# Load the Zeek connection log with labels
df = Loader.load_dataset(file_path="C:/Users/PcVip/deteccion-botnets-tfm/data/conn.log.labeled", file_type="zeek")

In [3]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
0,1526756000.0,C9YvmJ3zxtuqxWxLW5,192.168.2.5,38792,200.168.87.203,59353,tcp,,2.998333,0.0,...,,0,S,3,180,0,0,(empty),Malicious,PartOfAHorizontalPortScan
1,1526756000.0,CGsZqZ3UiQexLzPRVb,192.168.2.5,38792,200.168.87.203,59353,tcp,,,,...,,0,S,1,60,0,0,(empty),Malicious,PartOfAHorizontalPortScan
2,1526756000.0,C0LkBW2VEa292Nvet8,192.168.2.5,38793,200.168.87.203,59353,tcp,,2.997182,0.0,...,,0,S,3,180,0,0,(empty),Malicious,PartOfAHorizontalPortScan
3,1526756000.0,CMDLrn2cVhrqvW8gKa,192.168.2.5,38793,200.168.87.203,59353,tcp,,,,...,,0,S,1,60,0,0,(empty),Malicious,PartOfAHorizontalPortScan
4,1526756000.0,C2UM8f4knuL5Vnvp3h,192.168.2.5,38794,200.168.87.203,59353,tcp,,2.996286,0.0,...,,0,S,3,180,0,0,(empty),Malicious,PartOfAHorizontalPortScan


In [4]:
# Observe the DataFrame structure and types to check for any inconsistencies
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156103 entries, 0 to 156102
Data columns (total 23 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ts              156103 non-null  float64
 1   uid             156103 non-null  object 
 2   id.orig_h       156103 non-null  object 
 3   id.orig_p       156103 non-null  Int64  
 4   id.resp_h       156103 non-null  object 
 5   id.resp_p       156103 non-null  Int64  
 6   proto           156103 non-null  object 
 7   service         5908 non-null    object 
 8   duration        82159 non-null   float64
 9   orig_bytes      82159 non-null   Int64  
 10  resp_bytes      82159 non-null   Int64  
 11  conn_state      156103 non-null  object 
 12  local_orig      0 non-null       boolean
 13  local_resp      0 non-null       boolean
 14  missed_bytes    156103 non-null  Int64  
 15  history         154896 non-null  object 
 16  orig_pkts       156103 non-null  Int64  
 17  orig_ip_by

In [5]:
# Clean the dataset using the DataLoader's clean_dataset method that removes duplicates
df = Loader.clean_dataset(df)

2025-08-23 14:21:45,562 - INFO - Dataset cleaned successfully. 0 rows have been removed.


In [6]:
# Drop columns that could provide artificial patterns and prepare the features and labels
drop_cols = ["uid", "id.orig_h", "id.resp_h", "id.orig_p", "id.resp_p",
    "tunnel_parents", "service", "history", "local_orig", "local_resp"
]

X = df.drop(columns=drop_cols + ["label","detailed-label"])
y = df["label"].map({"Benign":0, "Malicious":1}).astype("Int64")

In [7]:
# Check the first few rows of the features DataFrame
X.head()

Unnamed: 0,ts,proto,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes
0,1526756000.0,tcp,2.998333,0.0,0.0,S0,0,3,180,0,0
1,1526756000.0,tcp,,,,S0,0,1,60,0,0
2,1526756000.0,tcp,2.997182,0.0,0.0,S0,0,3,180,0,0
3,1526756000.0,tcp,,,,S0,0,1,60,0,0
4,1526756000.0,tcp,2.996286,0.0,0.0,S0,0,3,180,0,0


In [8]:
# Check the first few rows of the labels Series
y.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: Int64

In [9]:
# Look at the distribution of the labels to check for class imbalance
print(y.value_counts(normalize=True))

label
1    0.970942
0    0.029058
Name: proportion, dtype: Float64


### Nested Stratified KFold with SearchGrid

In [10]:
# Declare the DataProcessor instance
Processor = DataProcessor()

# Get numerical and categorical columns from the DataFrame manually
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

In [11]:
from sklearn.feature_selection import SelectKBest
from scipy.stats import kruskal
import numpy as np

# Define the Kruskal-Wallis score function for feature selection
# This function computes the Kruskal-Wallis H statistic for each feature
def kruskal_wallis_score(X, y):
    scores = []
    pvalues = []
    for i in range(X.shape[1]):
        groups = [X[y == cls, i] for cls in np.unique(y)]
        try:
            stat, p = kruskal(*groups)
        except ValueError:
            stat, p = 0, 1
        scores.append(stat)
        pvalues.append(p)
    return np.array(scores), np.array(pvalues)

In [12]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score

In [13]:
# Define the outer and inner cross-validation strategies
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [14]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier  
from sklearn.metrics import matthews_corrcoef, brier_score_loss, confusion_matrix, classification_report
from sklearn.base import clone

# Define the feature selectors
feature_selectors = {
    "anova": SelectKBest(score_func=f_classif),
    "kruskal": SelectKBest(score_func=kruskal_wallis_score)
}

# Define the models and their hyperparameter grids
models = {
    "XGBoost": {
        "estimator": XGBClassifier(eval_metric="logloss"),
        "param_grid": {
            "select__k": [4, 5],
            "clf__n_estimators": [100, 200],
            "clf__max_depth": [3, 5, 7],
            "clf__learning_rate": [0.01, 0.1, 0.2]
        }
    },
    "RandomForest": {
        "estimator": RandomForestClassifier(),
        "param_grid": {
            "select__k": [4, 5],
            "clf__n_estimators": [100],
            "clf__max_depth": [None, 10, 20],
            "clf__min_samples_split": [2, 5]
        }
    },
    "MlpClassifier": {
        "estimator": MLPClassifier(max_iter=500, random_state=42),
        "param_grid": {
            "select__k": [4, 5],
            "clf__hidden_layer_sizes": [(50,), (100,), (50, 50)],
            "clf__activation": ["relu", "tanh"],
            "clf__alpha": [0.0001, 0.001]
        }
    }
}

results = []
for selector in feature_selectors.keys():
    print(f"\nTesting Feature Selector: {selector}...")
    selector_instance = feature_selectors[selector]
    for model, configuration in models.items():
        estimator = configuration["estimator"]
        param_grid = configuration["param_grid"]

        print(f"\n Testing Model: {model}...")
        outer_scores = {"f1": [], "roc_auc": [], "auprc": [], "mcc": [], "brier": [], "fnr": []}
        best_params_folds = []
        best_features_folds = []
        confusion_matrixes = []
        classification_reports = []

        for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), 1):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            pipe = Pipeline([
                ("pre", DataProcessor(num_cols, cat_cols)),
                ("select", clone(selector_instance)),
                ("clf", estimator)
            ])

            search = GridSearchCV(
                pipe,
                param_grid,
                scoring="f1",
                cv=inner_cv,
                n_jobs=-1
            )
            search.fit(X_train, y_train)

            best_model = search.best_estimator_
            best_params_folds.append(search.best_params_)

            y_pred = best_model.predict(X_test)
            y_proba = best_model.predict_proba(X_test)[:, 1]

            # Metrics
            outer_scores["f1"].append(f1_score(y_test, y_pred))
            outer_scores["roc_auc"].append(roc_auc_score(y_test, y_proba))
            outer_scores["auprc"].append(average_precision_score(y_test, y_proba))
            outer_scores["mcc"].append(matthews_corrcoef(y_test, y_pred))
            outer_scores["brier"].append(brier_score_loss(y_test, y_proba))

            # Best Features
            select_step = best_model.named_steps["select"]
            feature_names = best_model.named_steps["pre"].get_feature_names_out()
            selected_mask = select_step.get_support()
            selected_features = feature_names[selected_mask]
            best_features_folds.append(list(selected_features))

            # Confussion matrixes and classification reports
            cm = confusion_matrix(y_test, y_pred, labels=[0,1])
            tn, fp, fn, tp = cm.ravel()
            fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
            outer_scores["fnr"].append(fnr)
            confusion_matrixes.append(cm)

            report = classification_report(y_test, y_pred, target_names=["Benigno (0)", "Malicioso (1)"], output_dict=True)
            classification_reports.append(report)

            #print(f"[Fold {fold}] Best params: {search.best_params_}")
            #print(f"[Fold {fold}] Features seleccionadas: {selected_features}")
            #print(f"[Fold {fold}] Confusion matrix:\n{cm}")
            #print(f"[Fold {fold}] Classification report:\n{classification_report(y_test, y_pred, target_names=['Benigno (0)','Malicioso (1)'])}")

        results.append({
            "Modelo": f"{model}_{selector}",
            "F1": f"{np.mean(outer_scores['f1']):.3f} ± {np.std(outer_scores['f1']):.3f}",
            "ROC-AUC": f"{np.mean(outer_scores['roc_auc']):.3f} ± {np.std(outer_scores['roc_auc']):.3f}",
            "AUPRC": f"{np.mean(outer_scores['auprc']):.3f} ± {np.std(outer_scores['auprc']):.3f}",
            "MCC": f"{np.mean(outer_scores['mcc']):.3f} ± {np.std(outer_scores['mcc']):.3f}",
            "Brier": f"{np.mean(outer_scores['brier']):.3f} ± {np.std(outer_scores['brier']):.3f}",
            "FNR": f"{np.mean(outer_scores['fnr']):.3f} ± {np.std(outer_scores['fnr']):.3f}",
            "FeatureSelector": selector,
            "BestParams_por_fold": best_params_folds,
            "BestFeatures_por_fold": best_features_folds,
            "ConfusionMatrix_por_fold": confusion_matrixes,
            "ClassificationReport_por_fold": classification_reports
        })

df_results = pd.DataFrame(results)
print("\nFinal Results:")
print(df_results[["Modelo", "F1", "ROC-AUC", "AUPRC", "MCC", "Brier", "FNR"]])

for _, row in df_results.iterrows():
    modelo = row["Modelo"]
    selector = row["FeatureSelector"]
    features_folds = row["BestFeatures_por_fold"]

    all_feats = [feat for fold_feats in features_folds for feat in fold_feats]
    feat_counts = pd.Series(all_feats).value_counts()

    print(f"\n{modelo} ({selector})")
    print("Features más seleccionadas:")
    print(feat_counts)


Testing Feature Selector: anova...

 Testing Model: XGBoost...

 Testing Model: RandomForest...

 Testing Model: MlpClassifier...

Testing Feature Selector: kruskal...

 Testing Model: XGBoost...

 Testing Model: RandomForest...

 Testing Model: MlpClassifier...

Final Results:
                  Modelo             F1        ROC-AUC          AUPRC  \
0          XGBoost_anova  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   
1     RandomForest_anova  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   
2    MlpClassifier_anova  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   
3        XGBoost_kruskal  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   
4   RandomForest_kruskal  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   
5  MlpClassifier_kruskal  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   

             MCC          Brier            FNR  
0  0.970 ± 0.003  0.002 ± 0.000  0.000 ± 0.000  
1  0.970 ± 0.003  0.002 ± 0.000  0.000 ± 0.000  
2  0.970 ± 0.003  0.002 ± 0.000  0.000 ± 0.000  
3  0.970 ± 0

In [15]:
print("\nFinal Results:")
print(df_results[["Modelo", "F1", "ROC-AUC", "AUPRC", "MCC", "Brier", "FNR"]])


Final Results:
                  Modelo             F1        ROC-AUC          AUPRC  \
0          XGBoost_anova  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   
1     RandomForest_anova  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   
2    MlpClassifier_anova  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   
3        XGBoost_kruskal  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   
4   RandomForest_kruskal  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   
5  MlpClassifier_kruskal  0.999 ± 0.000  0.999 ± 0.000  1.000 ± 0.000   

             MCC          Brier            FNR  
0  0.970 ± 0.003  0.002 ± 0.000  0.000 ± 0.000  
1  0.970 ± 0.003  0.002 ± 0.000  0.000 ± 0.000  
2  0.970 ± 0.003  0.002 ± 0.000  0.000 ± 0.000  
3  0.970 ± 0.003  0.002 ± 0.000  0.000 ± 0.000  
4  0.970 ± 0.003  0.002 ± 0.000  0.000 ± 0.000  
5  0.970 ± 0.003  0.002 ± 0.000  0.000 ± 0.000  


In [16]:
for _, row in df_results.iterrows():
    modelo = row["Modelo"]
    best_params_list = row["BestParams_por_fold"]

    params_tuples = [tuple(sorted(d.items())) for d in best_params_list]
    params_counts = pd.Series(params_tuples).value_counts()

    print(f"\n{modelo}")


    best_overall = dict(params_counts.index[0])
    print("=> Mejor configuración final:", best_overall)


XGBoost_anova
=> Mejor configuración final: {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 100, 'select__k': 4}

RandomForest_anova
=> Mejor configuración final: {'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 100, 'select__k': 4}

MlpClassifier_anova
=> Mejor configuración final: {'clf__activation': 'relu', 'clf__alpha': 0.0001, 'clf__hidden_layer_sizes': (50,), 'select__k': 5}

XGBoost_kruskal
=> Mejor configuración final: {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 100, 'select__k': 4}

RandomForest_kruskal
=> Mejor configuración final: {'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 100, 'select__k': 4}

MlpClassifier_kruskal
=> Mejor configuración final: {'clf__activation': 'relu', 'clf__alpha': 0.0001, 'clf__hidden_layer_sizes': (50,), 'select__k': 5}


In [17]:
# Try with simple logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
pipe = Pipeline([
    ("pre", DataProcessor(num_cols, cat_cols)),
    ("clf", LogisticRegression(max_iter=500))
])
param_grid = {
    "clf__C": [0.01, 0.1, 1, 10, 100],
    "clf__penalty": ["l2"],
    "clf__solver": ["lbfgs", "saga"]
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipe, param_grid, scoring="f1", cv=cv, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]
print("Best parameters:", grid_search.best_params_)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

print("Classification Report:\n", classification_report(y_test, y_pred, target_names=["Benigno (0)", "Malicioso (1)"]))



Best parameters: {'clf__C': 1, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}
Confusion Matrix:
 [[  907     0]
 [    1 30313]]
Classification Report:
                precision    recall  f1-score   support

  Benigno (0)       1.00      1.00      1.00       907
Malicioso (1)       1.00      1.00      1.00     30314

     accuracy                           1.00     31221
    macro avg       1.00      1.00      1.00     31221
 weighted avg       1.00      1.00      1.00     31221



In [None]:
X.drop(columns=["conn_state", "missed_bytes", "proto"], inplace=True)


In [None]:
# Drop conn state, missed bytes and proto
#cat_cols.remove("conn_state")
#num_cols.remove("missed_bytes")
cat_cols.remove("proto")

ValueError: list.remove(x): x not in list

In [19]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    f1_score, roc_auc_score, average_precision_score,
    matthews_corrcoef, brier_score_loss, confusion_matrix,
    classification_report
)
from sklearn.base import clone
import numpy as np
import pandas as pd

best_models = {
    "XGBoost": Pipeline([
        ("pre", DataProcessor(num_cols, cat_cols)),
        ("select", SelectKBest(score_func=f_classif, k=4)),
        ("clf", XGBClassifier(eval_metric="logloss", n_estimators=100, max_depth=5, learning_rate=0.1))
    ]),
    "RandomForest": Pipeline([
        ("pre", DataProcessor(num_cols, cat_cols)),
        ("select", SelectKBest(score_func=f_classif, k=4)),
        ("clf", RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2))
    ]),
    "MLPClassifier": Pipeline([
        ("pre", DataProcessor(num_cols, cat_cols)),
        ("select", SelectKBest(score_func=f_classif, k=5)),
        ("clf", MLPClassifier(hidden_layer_sizes=(50,), activation="relu", alpha=0.0001, max_iter=500, random_state=42))
    ])
}

results = []
for model_name, pipeline in best_models.items():
    print(f"\nEvaluando {model_name}...")

    outer_scores = {"f1": [], "roc_auc": [], "auprc": [], "mcc": [], "brier": [], "fnr": []}
    confusion_matrixes, classification_reports = [], []

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(pipeline)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        outer_scores["f1"].append(f1_score(y_test, y_pred))
        outer_scores["roc_auc"].append(roc_auc_score(y_test, y_proba))
        outer_scores["auprc"].append(average_precision_score(y_test, y_proba))
        outer_scores["mcc"].append(matthews_corrcoef(y_test, y_pred))
        outer_scores["brier"].append(brier_score_loss(y_test, y_proba))

        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
        outer_scores["fnr"].append(fnr)
        confusion_matrixes.append(cm)

        report = classification_report(
            y_test, y_pred,
            target_names=["Benigno (0)", "Malicioso (1)"],
            output_dict=True
        )
        classification_reports.append(report)

    results.append({
        "Modelo": model_name,
        "F1": f"{np.mean(outer_scores['f1']):.3f} ± {np.std(outer_scores['f1']):.3f}",
        "ROC-AUC": f"{np.mean(outer_scores['roc_auc']):.3f} ± {np.std(outer_scores['roc_auc']):.3f}",
        "AUPRC": f"{np.mean(outer_scores['auprc']):.3f} ± {np.std(outer_scores['auprc']):.3f}",
        "MCC": f"{np.mean(outer_scores['mcc']):.3f} ± {np.std(outer_scores['mcc']):.3f}",
        "Brier": f"{np.mean(outer_scores['brier']):.3f} ± {np.std(outer_scores['brier']):.3f}",
        "FNR": f"{np.mean(outer_scores['fnr']):.3f} ± {np.std(outer_scores['fnr']):.3f}",
        "ConfusionMatrix_por_fold": confusion_matrixes,
        "ClassificationReport_por_fold": classification_reports
    })

df_results = pd.DataFrame(results)
print("\nResultados finales:")
print(df_results[["Modelo", "F1", "ROC-AUC", "AUPRC", "MCC", "Brier", "FNR"]])



Evaluando XGBoost...


ValueError: A given column is not a column of the dataframe