In [10]:
import os
import re

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from scapy.all import rdpcap
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

In [11]:
# Folder holding the per-capture CSV files to load. The path is relative, so it
# is resolved against the kernel's current working directory.
DATA_FOLDER = "MachineLearningCVE"

# Function to load all CSV files and combine into a single DataFrame
def load_csv_files(data_folder):
    """Load every ``.csv`` file found directly in ``data_folder`` into one frame.

    Files are read in sorted filename order so the row order of the result
    (and therefore any seeded shuffle or split applied to it afterwards) does
    not depend on the arbitrary order in which ``os.listdir`` returns entries.

    Parameters
    ----------
    data_folder : str or os.PathLike
        Directory to scan. Only names ending in ``.csv`` are loaded;
        sub-directories are not searched.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` files.
    """
    csv_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.csv'))
    if not csv_files:
        # pd.concat([]) would raise a ValueError anyway; fail earlier with a
        # message that names the folder that was searched.
        raise ValueError(f"No CSV files found in {data_folder!r}")

    dataframes = []
    for csv_file in csv_files:
        file_path = os.path.join(data_folder, csv_file)
        print(f"Loading {file_path}...")
        dataframes.append(pd.read_csv(file_path))

    return pd.concat(dataframes, ignore_index=True)

# Read every CSV under DATA_FOLDER into a single combined frame
data = load_csv_files(DATA_FOLDER)

# Quick look at the schema and the first few records
print("Data Columns:", data.columns)
print("Sample Rows:", data.head(), sep="\n")

Loading MachineLearningCVE\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
Loading MachineLearningCVE\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv...
Loading MachineLearningCVE\Friday-WorkingHours-Morning.pcap_ISCX.csv...
Loading MachineLearningCVE\Monday-WorkingHours.pcap_ISCX.csv...
Loading MachineLearningCVE\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv...
Loading MachineLearningCVE\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv...
Loading MachineLearningCVE\Tuesday-WorkingHours.pcap_ISCX.csv...
Loading MachineLearningCVE\Wednesday-workingHours.pcap_ISCX.csv...
Data Columns: Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
     

In [12]:
# Display the column index as the cell result. Several raw header names carry a
# leading space (e.g. ' Destination Port'); later cells strip them.
data.columns

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [None]:
# Data preprocessing
# Strip leading/trailing spaces from the raw CSV headers first (' Label' ->
# 'Label'). Doing this before any column is addressed by name keeps the cell
# re-runnable: looking up ' Label' raises KeyError once `data.columns` has
# been stripped, which the training cell further down does in place.
data.columns = data.columns.str.strip()

# Drop rows that contain missing values
data = data.dropna()

# Select features and labels
# 'Label' column contains the attack type or "BENIGN"
X = data.drop(columns=['Label'])  # Features (column names already stripped)
y = data['Label']  # Target labels

In [13]:
# Remove leading/trailing spaces in column names (a no-op if already stripped)
data.columns = data.columns.str.strip()

# Encode the string labels as integers; label_encoder.classes_ keeps the
# original names in code order for reporting.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Pick the train/test rows BEFORE computing any statistics, so that imputation
# means and scaling parameters are learned from training rows only (no test
# leakage). Splitting row positions with the same seed and sample count selects
# exactly the rows that splitting the feature matrix itself would select.
row_positions = np.arange(len(X))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_positions, y_encoded, test_size=0.2, random_state=42
)

# Replace infinities with NaN, then fill NaN with the training-row column means.
# X is rebound instead of mutated in place so re-running the cell is harmless.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.iloc[train_idx].mean())

# Scale the data to ensure values are within a reasonable range; the scaler is
# fitted on training rows only and then applied to every row.
scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])
X_scaled = scaler.transform(X)
X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]

# Train a Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
print("Training the model...")
model.fit(X_train, y_train)

# Evaluate the model
print("Evaluating the model...")
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# Pass `labels` explicitly: with target_names alone the report raises if a rare
# class happens to be absent from both y_test and y_pred.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0,
))


Training the model...
Evaluating the model...
Accuracy: 0.9987152685124152
                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00    454315
                       Bot       0.90      0.72      0.80       382
                      DDoS       1.00      1.00      1.00     25596
             DoS GoldenEye       1.00      0.99      1.00      2062
                  DoS Hulk       1.00      1.00      1.00     46327
          DoS Slowhttptest       0.99      0.99      0.99      1084
             DoS slowloris       1.00      1.00      1.00      1142
               FTP-Patator       1.00      1.00      1.00      1603
                Heartbleed       1.00      0.75      0.86         4
              Infiltration       1.00      0.43      0.60         7
                  PortScan       0.99      1.00      1.00     31704
               SSH-Patator       1.00      1.00      1.00      1198
  Web Attack � Brute Force       0.72   

In [14]:
# Row count per class label. Uses 'Label' (no leading space), so it relies on
# the column names having been stripped by an earlier cell.
print(data['Label'].value_counts())

Label
BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64
