In [None]:
# Import required libraries, the data loader, and the data processor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from DataLoader import DataLoader
from DataProcessor import DataProcessor

# Shared DataLoader instance; later cells use it to read and clean the dataset.
Loader = DataLoader()

In [None]:
# Resolve the dataset location from the NETWORK_DATASET_PATH environment variable so the
# notebook is not tied to a single machine; the original absolute path is kept as the fallback.
DATASET_PATH = os.environ.get(
    "NETWORK_DATASET_PATH",
    "C:/Users/PcVip/Downloads/Network_dataset_2.csv",
)
df = Loader.load_dataset(DATASET_PATH, file_type="csv")

In [None]:
# Preview the first rows of the freshly loaded dataset.
df.head()

In [None]:
# Distribution of the "type" column before cleaning.
df["type"].value_counts()

In [None]:
# Replace df with the output of Loader.clean_dataset.
# NOTE(review): df is overwritten here, so the raw frame is no longer available afterwards
# and re-running this cell cleans already-cleaned data.
df = Loader.clean_dataset(df)

In [None]:
# Distribution of the "type" column after cleaning, for comparison with the earlier count.
df["type"].value_counts()

In [None]:
# Show the columns available in the cleaned dataframe
df.columns

In [None]:
# We take only the following columns as model inputs
features = [
    "duration",
    "src_bytes", "dst_bytes",
    "src_pkts", "dst_pkts",
    "src_ip_bytes", "dst_ip_bytes"
]

# Take an explicit copy so that later column assignments on X neither touch df nor
# trigger pandas' SettingWithCopyWarning.
X = df[features].copy()
# Target column cast to pandas' nullable integer dtype.
y = df["label"].astype("Int64")

In [None]:
# Inspect the first rows of the feature matrix.
X.head()

In [None]:
# Inspect the first values of the target vector.
y.head()

In [None]:
# Coerce every column of X to a numeric dtype; values that cannot be parsed become NaN.
# Building a new frame (instead of assigning column-by-column into X) avoids writing into
# a slice of df, and iterating over X's own columns avoids repeating the feature list.
X = X.apply(pd.to_numeric, errors="coerce")

# Divide the columns into categorical and numerical groups
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Report which columns ended up in each group
print(f"Numerical columns: {num_cols}")
print(f"Categorical columns: {cat_cols}")

In [None]:
# Instantiate the data processor with the numerical and categorical column lists
processor = DataProcessor(num_cols=num_cols, cat_cols=cat_cols)

In [None]:
from sklearn.model_selection import StratifiedKFold
# Define the outer and inner cross-validation strategies (stratified, shuffled, seeded).
# NOTE(review): inner_cv is not referenced by any other cell visible in this notebook.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    f1_score, roc_auc_score, average_precision_score,
    matthews_corrcoef, brier_score_loss, confusion_matrix,
    classification_report
)
from sklearn.base import clone
import numpy as np
import pandas as pd

def _mean_std(values):
    """Format a list of per-fold scores as 'mean ± std' with three decimals."""
    return f"{np.mean(values):.3f} ± {np.std(values):.3f}"


# Pipelines under evaluation: preprocessing -> univariate feature selection -> classifier.
# Every classifier is seeded so that re-running the notebook reproduces the same scores.
best_models = {
    "XGBoost": Pipeline([
        ("pre", DataProcessor(num_cols, cat_cols)),
        ("select", SelectKBest(score_func=f_classif, k=4)),
        ("clf", XGBClassifier(eval_metric="logloss", n_estimators=100, max_depth=5,
                              learning_rate=0.1, random_state=42))
    ]),
    "RandomForest": Pipeline([
        ("pre", DataProcessor(num_cols, cat_cols)),
        ("select", SelectKBest(score_func=f_classif, k=4)),
        ("clf", RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2,
                                       random_state=42))
    ]),
    "MLPClassifier": Pipeline([
        ("pre", DataProcessor(num_cols, cat_cols)),
        ("select", SelectKBest(score_func=f_classif, k=5)),
        ("clf", MLPClassifier(hidden_layer_sizes=(50,), activation="relu", alpha=0.0001, max_iter=500, random_state=42))
    ])
}

results = []
for model_name, pipeline in best_models.items():
    print(f"\nEvaluando {model_name}...")

    # Per-fold metric values and per-fold artefacts collected for this model
    outer_scores = {"f1": [], "roc_auc": [], "auprc": [], "mcc": [], "brier": [], "fnr": []}
    confusion_matrixes, classification_reports = [], []
    selected_features_folds = []

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit a fresh, unfitted copy so that no state carries over between folds
        model = clone(pipeline)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        # Hard predictions feed F1/MCC; class-1 probabilities feed ROC-AUC, AUPRC and Brier
        outer_scores["f1"].append(f1_score(y_test, y_pred))
        outer_scores["roc_auc"].append(roc_auc_score(y_test, y_proba))
        outer_scores["auprc"].append(average_precision_score(y_test, y_proba))
        outer_scores["mcc"].append(matthews_corrcoef(y_test, y_pred))
        outer_scores["brier"].append(brier_score_loss(y_test, y_proba))

        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        # False-negative rate; reported as 0 when the fold contains no positive samples
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
        outer_scores["fnr"].append(fnr)
        confusion_matrixes.append(cm)

        # Pin the label order (as done for the confusion matrix) so target_names always
        # line up with classes 0 and 1, even if a class is missing from a fold.
        report = classification_report(
            y_test, y_pred,
            labels=[0, 1],
            target_names=["Benigno (0)", "Malicioso (1)"],
            output_dict=True
        )
        classification_reports.append(report)

        # === Store the features selected in this fold ===
        select_step = model.named_steps["select"]
        # Convert to an ndarray so boolean-mask indexing works even if a plain list is returned
        feature_names = np.asarray(model.named_steps["pre"].get_feature_names_out())
        selected_mask = select_step.get_support()
        selected_features = feature_names[selected_mask]
        selected_features_folds.append(list(selected_features))

    results.append({
        "Modelo": model_name,
        "F1": _mean_std(outer_scores["f1"]),
        "ROC-AUC": _mean_std(outer_scores["roc_auc"]),
        "AUPRC": _mean_std(outer_scores["auprc"]),
        "MCC": _mean_std(outer_scores["mcc"]),
        "Brier": _mean_std(outer_scores["brier"]),
        "FNR": _mean_std(outer_scores["fnr"]),
        "ConfusionMatrix_por_fold": confusion_matrixes,
        "ClassificationReport_por_fold": classification_reports,
        "BestFeatures_por_fold": selected_features_folds
    })

df_results = pd.DataFrame(results)
print("\nResultados finales:")
print(df_results[["Modelo", "F1", "ROC-AUC", "AUPRC", "MCC", "Brier", "FNR"]])

# === Count which features are selected most often per model ===
for _, row in df_results.iterrows():
    modelo = row["Modelo"]
    features_folds = row["BestFeatures_por_fold"]
    all_feats = [feat for fold_feats in features_folds for feat in fold_feats]
    feat_counts = pd.Series(all_feats).value_counts()
    print(f"\n{modelo} - Features más seleccionadas:")
    print(feat_counts)


In [None]:
# Work on an independent copy so the preprocessing for the sequence model does not modify df.
df_prep = df.copy()

In [None]:
# Column groups for the sequence model: the numeric columns are standardized and the
# categorical columns are one-hot encoded in the cells that follow.
num_features = ["duration","src_bytes","dst_bytes",
                "src_pkts","dst_pkts","src_ip_bytes","dst_ip_bytes"]
cat_features = ["proto","conn_state","service"]

In [None]:
# Coerce all numeric feature columns in one pass: unparseable entries become NaN
# and are then replaced by 0.
df_prep[num_features] = (
    df_prep[num_features]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)

In [None]:
# Fit the scaler only on the leading 80% of rows -- approximately the rows that feed the
# training windows of the ordered 80/20 split performed further down -- so that statistics
# of the test portion do not leak into the scaling. The fitted scaler is then applied to all rows.
TRAIN_FRACTION = 0.8  # keep in sync with the train/test split fraction used later
n_scaler_fit_rows = max(1, int(TRAIN_FRACTION * len(df_prep)))
scaler = StandardScaler()
scaler.fit(df_prep[num_features].iloc[:n_scaler_fit_rows])
X_num = scaler.transform(df_prep[num_features])


In [None]:
# One-hot encode the categorical columns (cast to str first) into a dense array.
# NOTE(review): the encoder is fitted on the full dataset, before any train/test split.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_cat = ohe.fit_transform(df_prep[cat_features].astype(str))



In [None]:
# Show the shapes of the numeric and one-hot blocks (they are stacked column-wise later).
print(X_num.shape, X_cat.shape)
# e.g. (10000, 7) (10000, 25)


In [None]:
# Make sure both feature blocks are plain NumPy arrays
X_num, X_cat = np.array(X_num), np.array(X_cat)

print(X_num.shape, X_cat.shape, type(X_num), type(X_cat))

# Join the numeric and one-hot columns side by side into a single feature matrix
X_all = np.concatenate((X_num, X_cat), axis=1)

# Label vector taken from the same frame, one entry per row of X_all
y_all = df_prep["label"].values

In [None]:
def create_sequences(X, y, window_size=10):
    """Build sliding windows of consecutive rows, each labelled with its last row's label.

    Args:
        X: array-like of shape (n_rows, n_features), rows in their original order.
        y: array-like with one label per row of X.
        window_size: number of consecutive rows per window (must be >= 1).

    Returns:
        Tuple (Xs, ys). Xs has shape (n_windows, window_size, n_features) and ys has
        shape (n_windows,), with n_windows = max(n_rows - window_size + 1, 0).

    Raises:
        ValueError: if window_size < 1 or y has fewer entries than X.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if window_size < 1:
        raise ValueError("window_size must be >= 1")
    if len(y) < len(X):
        raise ValueError("y must provide one label per row of X")

    n_windows = len(X) - window_size + 1
    if n_windows <= 0:
        # Not enough rows for a single window: return correctly shaped empty arrays so
        # callers can still read .shape[1] / .shape[2].
        empty_X = np.empty((0, window_size) + X.shape[1:], dtype=X.dtype)
        return empty_X, np.empty((0,), dtype=y.dtype)

    Xs = np.stack([X[i:i + window_size] for i in range(n_windows)])
    # Window i covers rows i .. i + window_size - 1, so the label of its LAST element is
    # y[i + window_size - 1]; the previous y[i + window_size] pointed one row past the window.
    ys = y[window_size - 1:window_size - 1 + n_windows].copy()
    return Xs, ys

# Build fixed-length windows (10 rows each) from the full feature matrix and label vector.
X_seq, y_seq = create_sequences(X_all, y_all, window_size=10)

# Report the resulting array shapes ("Shape secuencias" = "sequence shapes").
print("Shape secuencias:", X_seq.shape, y_seq.shape)

In [None]:
import numpy as np
# Count how many windows carry each label value and print the counts as a dict.
unique, counts = np.unique(y_seq, return_counts=True)
print(dict(zip(unique, counts)))


In [None]:
# Ordered (non-shuffled) 80/20 split: the first 80% of windows train, the remainder tests.
# NOTE(review): consecutive windows overlap, so windows on either side of the cut share rows.
split = int(0.8 * len(X_seq))
X_train, X_test = X_seq[:split], X_seq[split:]
y_train, y_test = y_seq[:split], y_seq[split:]


In [None]:
!pip install tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.optimizers import Adam

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# "Balanced" class weights computed from the training labels only.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
print(class_weights)




# One GRU layer (64 units) over windows of shape (timesteps, features), followed by a
# single sigmoid output unit.
model = Sequential([
    GRU(64, input_shape=(X_seq.shape[1], X_seq.shape[2])),
    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# NOTE(review): the test split is passed as validation data, so the per-epoch validation
# metrics are computed on the same data used for the final evaluation.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=256,
    class_weight=class_weights
)


In [None]:
# Model outputs (sigmoid probabilities) for the test windows
y_proba = model.predict(X_test, batch_size=256)

# Convert to 0/1 with a 0.75 decision threshold
# NOTE(review): the later GRU cells in this notebook threshold at 0.5 instead.
y_pred = (y_proba > 0.75).astype("int32").flatten()


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from sklearn.metrics import ConfusionMatrixDisplay

# Compute the confusion matrix once and reuse it for the text output and the plot
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred, digits=4))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# Render the confusion matrix with ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Benigno (0)", "Malicioso (1)"],
)
disp.plot(cmap=plt.cm.Blues)

In [None]:
import numpy as np
import pandas as pd
from sklearn.utils import resample, class_weight
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score, matthews_corrcoef
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# ============================
# Dataset balancing
# ============================
# Flatten each window to a single row so it can be oversampled as a DataFrame
X_train_flat = X_train.reshape(X_train.shape[0], -1)
df_train = pd.DataFrame(X_train_flat)
df_train["label"] = y_train

# Separate the classes
# NOTE(review): label 1 is treated as the majority class and label 0 as the minority -- confirm
# against the label counts printed earlier.
df_majority = df_train[df_train.label == 1]
df_minority = df_train[df_train.label == 0]

print("Antes del balanceo:", len(df_majority), "ataques,", len(df_minority), "normales")

# Oversample class 0 (with replacement) up to the size of class 1
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)

# Balanced dataset
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Rebuild the arrays, restoring the (samples, timesteps, features) layout
y_train_bal = df_balanced["label"].values
X_train_bal = df_balanced.drop(columns=["label"]).values.reshape(-1, X_train.shape[1], X_train.shape[2])

print("Después del balanceo:", np.bincount(y_train_bal))

# ============================
# Class weights (in case some imbalance remains)
# ============================
classes = np.unique(y_train_bal)
weights = class_weight.compute_class_weight(class_weight="balanced", classes=classes, y=y_train_bal)
class_weights = dict(zip(classes, weights))
print("Class weights:", class_weights)

# ============================
# GRU model
# ============================
model = Sequential([
    GRU(64, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer=Adam(1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# NOTE(review): the test split doubles as validation data here.
history = model.fit(
    X_train_bal, y_train_bal,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=256,
    class_weight=class_weights,
    verbose=1
)

# ============================
# Evaluation
# ============================
y_proba = model.predict(X_test, batch_size=256)
# Binarize with a 0.5 decision threshold
y_pred = (y_proba > 0.5).astype("int32").flatten()

print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, digits=4))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("F1:", f1_score(y_test, y_pred))
print("MCC:", matthews_corrcoef(y_test, y_pred))



In [None]:
# Confusion matrix display for y_pred against y_test
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Compute the confusion matrix from the most recent predictions and render it
# with ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Benigno (0)", "Malicioso (1)"])
disp.plot(cmap=plt.cm.Blues)

In [None]:
! pip install imbalanced-learn
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score, matthews_corrcoef
import seaborn as sns
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# ============================
# SMOTE to balance the dataset
# ============================
print("Original balance:", np.bincount(y_train))

# Flatten each window to one row because SMOTE is applied to a 2D matrix here
n_samples, timesteps, n_features = X_train.shape
X_train_flat = X_train.reshape((n_samples, timesteps * n_features))

smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_flat, y_train)

# Restore the 3D (samples, timesteps, features) layout for the GRU
X_train_bal = X_train_bal.reshape((-1, timesteps, n_features))
print("Balance después de SMOTE:", np.bincount(y_train_bal))

# ============================
# Focal Loss
# ============================
def focal_loss(gamma=2., alpha=0.25):
    """Create a focal-style binary loss function for Keras.

    The returned callable computes binary cross-entropy (bce), derives pt = exp(-bce)
    and returns alpha * (1 - pt) ** gamma * bce.

    NOTE(review): alpha multiplies every sample identically; it is a uniform scale
    factor, not a class-dependent weight.

    Args:
        gamma: exponent applied to (1 - pt).
        alpha: constant multiplier applied to the loss.

    Returns:
        A function loss(y_true, y_pred) suitable for model.compile(loss=...).
    """
    def loss(y_true, y_pred):
        targets = tf.cast(y_true, tf.float32)
        cross_entropy = tf.keras.losses.binary_crossentropy(targets, y_pred)
        confidence = tf.exp(-cross_entropy)
        scaled_modulation = alpha * (1 - confidence) ** gamma
        return scaled_modulation * cross_entropy
    return loss

# ============================
# GRU model
# ============================
model = Sequential([
    GRU(64, input_shape=(timesteps, n_features)),
    Dense(1, activation="sigmoid")
])

# Same architecture and optimizer as the previous GRU cells; only the loss differs.
model.compile(
    optimizer=Adam(1e-3),
    loss=focal_loss(gamma=2., alpha=0.75),
    metrics=["accuracy"]
)

# Train on the SMOTE-balanced windows.
# NOTE(review): the test split doubles as validation data here.
history = model.fit(
    X_train_bal, y_train_bal,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=256,
    verbose=1
)

# ============================
# Evaluation
# ============================
y_proba = model.predict(X_test, batch_size=256)
# Binarize with a 0.5 decision threshold
y_pred = (y_proba > 0.5).astype("int32").flatten()

# The confusion matrix is kept in cm because the plotting code below reuses it
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix:\n", cm)
print("\nClassification report:\n", classification_report(y_test, y_pred, digits=4))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("F1:", f1_score(y_test, y_pred))
print("MCC:", matthews_corrcoef(y_test, y_pred))

# ============================
# Plot the confusion matrix
# ============================
class_names = ["Normal (0)", "Ataque (1)"]
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix - GRU + SMOTE + Focal Loss")
plt.ylabel("True")
plt.xlabel("Predicted")
plt.show()
