<a href="https://colab.research.google.com/github/iacopoooo/Tesi/blob/main/Casodistuio1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ==============================================================
# 1. Installazione librerie necessarie
# (alcune sono già presenti in Colab, ma per sicurezza)
# ==============================================================
!pip install pandas numpy scikit-learn matplotlib seaborn

# ==============================================================
# 2. Import librerie
# ==============================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================================================
# 3. Dataset loading
# ==============================================================
# ⚠️ cicids2017.csv must be reachable from the notebook's working directory:
# Colab → left sidebar → "Files" → "Upload" → pick the CSV
# or mount Google Drive:
# from google.colab import drive
# drive.mount('/content/drive')
# and then point DATA_PATH at it, e.g. '/content/drive/MyDrive/cicids2017.csv'

DATA_PATH = "cicids2017.csv"
CHUNK_SIZE = 10000   # rows read per chunk
SAMPLE_FRAC = 0.1    # fraction of every chunk that is kept

try:
    # Read the CSV chunk by chunk and keep a random subset of each chunk.
    # The context manager closes the file handle even if sampling or
    # concatenation fails midway.
    with pd.read_csv(DATA_PATH, chunksize=CHUNK_SIZE) as df_iter:
        df = pd.concat(
            [chunk.sample(frac=SAMPLE_FRAC, random_state=42) for chunk in df_iter]
        )
except FileNotFoundError:
    print(f"❌ Errore: '{DATA_PATH}' non trovato.")
    raise
else:
    print("✅ Dataset caricato con successo.")

# Normalise column names: trim whitespace, lower-case, spaces -> underscores,
# then drop every character that is not a letter, digit or underscore.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("[^a-zA-Z0-9_]", "", regex=True)
)

print(f"Shape dataset: {df.shape}")
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; here more code follows, so the preview is displayed explicitly.
display(df.head())

# ==============================================================
# 4. Preprocessing
# ==============================================================
# Treat +/- infinity as missing values, then drop every incomplete row.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

if "label" not in df.columns:
    raise KeyError("Colonna 'label' non trovata, controlla il dataset!")

X = df.drop(columns="label")
y = df["label"]

print("Distribuzione classi:")
class_counts = y.value_counts()
print(class_counts)

# A stratified split needs at least two rows per class, otherwise
# train_test_split raises ValueError. Classes that ended up with fewer rows
# are removed, and reported so the omission is visible.
rare_classes = class_counts[class_counts < 2].index
if len(rare_classes) > 0:
    print(f"⚠️ Classi con meno di 2 campioni rimosse: {list(rare_classes)}")
    keep_mask = ~y.isin(rare_classes)
    X = X[keep_mask]
    y = y[keep_mask]

# Split train/test BEFORE scaling so that no test-set statistics reach training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply the same
# transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# ==============================================================
# 5. Random Forest training
# ==============================================================
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Accuracy RF:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

# Use one explicit, sorted label list for both the matrix and the tick labels
# so every row/column of the heatmap can be matched to its class name.
cm_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=cm_labels,
    yticklabels=cm_labels,
    ax=ax,
)
ax.set_xlabel("Classe predetta")
ax.set_ylabel("Classe reale")
ax.set_title("Matrice di Confusione - Random Forest")
plt.show()

# Feature importances, highest first, indexed by the feature column names.
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Top 10 feature importanti:")
print(importances.head(10))
# ==============================================================
# 6. MLP training
# ==============================================================
from sklearn.neural_network import MLPClassifier
from sklearn.exceptions import ConvergenceWarning
import warnings

# Two hidden layers (50 and 25 units). Early stopping holds out 10% of the
# training data as a validation set; verbose=True logs the training progress.
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 25),
    max_iter=300,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    verbose=True
)

# Silence ConvergenceWarning only while fitting; other warnings still show.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    mlp.fit(X_train, y_train)

y_pred_mlp = mlp.predict(X_test)

print("Accuracy MLP:", accuracy_score(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp, zero_division=0))

# Use one explicit, sorted label list for both the matrix and the tick labels
# so every row/column of the heatmap can be matched to its class name.
cm_mlp_labels = np.union1d(y_test, y_pred_mlp)
cm_mlp = confusion_matrix(y_test, y_pred_mlp, labels=cm_mlp_labels)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    cm_mlp,
    annot=True,
    fmt="d",
    cmap="Greens",
    xticklabels=cm_mlp_labels,
    yticklabels=cm_mlp_labels,
    ax=ax,
)
ax.set_xlabel("Classe predetta")
ax.set_ylabel("Classe reale")
ax.set_title("Matrice di Confusione - MLP")
plt.show()
# ==============================================================
# 7. Per-class metric comparison
# ==============================================================
from sklearn.metrics import precision_recall_fscore_support

# A single label array drives both the metric computation and the tick
# labels, so the i-th bar always belongs to the i-th tick. (Sorting the
# stringified labels separately would put "10" before "2" for numeric classes
# and mislabel the bars.)
metric_labels = np.unique(y_test)
class_labels = metric_labels.astype(str)

precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(
    y_test, y_pred, labels=metric_labels, zero_division=0
)
precision_mlp, recall_mlp, f1_mlp, _ = precision_recall_fscore_support(
    y_test, y_pred_mlp, labels=metric_labels, zero_division=0
)

x = np.arange(len(class_labels))
width = 0.35  # bar width; the RF and MLP bars sit side by side around each tick

fig, axs = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

# One panel per metric: (title, RF scores, MLP scores).
panels = [
    ("Precision", precision_rf, precision_mlp),
    ("Recall", recall_rf, recall_mlp),
    ("F1-score", f1_rf, f1_mlp),
]

for ax, (title, rf_scores, mlp_scores) in zip(axs, panels):
    ax.bar(x - width/2, rf_scores, width, label="RF", color="orange")
    ax.bar(x + width/2, mlp_scores, width, label="MLP", color="red")
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(class_labels, rotation=45)
    ax.set_ylim(0, 1.1)
    ax.grid(True, axis="y")

axs[2].legend()
plt.tight_layout()
plt.show()


❌ Errore: 'cicids2017.csv' non trovato.


FileNotFoundError: [Errno 2] No such file or directory: 'cicids2017.csv'