<a href="https://colab.research.google.com/github/hkokin/ekpa/blob/main/IDSIPS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
#εισαγωγή βιβλιοθηκών - πακέτων
import pandas as pd # add this line to import the pandas library
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
import requests
from io import BytesIO
import gzip
import numpy as np

In [29]:
# Data source for the notebook.
# The original KDD Cup 1999 download/decompress steps are kept below, disabled,
# for reference; the data is read from a CSV hosted on GitHub instead.
# url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
# response = requests.get(url, stream=True)
# compressed_file = BytesIO(response.content)
# NOTE: despite its name, this variable holds a URL string, not decompressed bytes.
decompressed_file = "https://github.com/hkokin/EKPA/raw/main/pcap_data.csv"

In [30]:
# Load the dataset.
# The CSV is read without a header row, so the 41 feature names plus the
# "target" label column are supplied explicitly, in file order.
columns = """
    duration protocol_type service flag src_bytes dst_bytes land
    wrong_fragment urgent hot num_failed_logins logged_in num_compromised
    root_shell su_attempted num_root num_file_creations num_shells
    num_access_files num_outbound_cmds is_host_login is_guest_login
    count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate dst_host_count
    dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate target
""".split()

df = pd.read_csv(decompressed_file, names=columns, header=None)

In [31]:
# Build a two-class problem: Normal (0) versus Attack (1).
X = df.drop("target", axis=1)

# Compare labels after normalising them (surrounding whitespace, a trailing
# dot and letter case are ignored) so that "normal.", "normal" and "Normal"
# are all recognised as normal traffic. An exact match against "normal."
# alone labels every row as an attack when the file uses another spelling.
is_normal = (
    df["target"]
    .astype(str)
    .str.strip()
    .str.rstrip(".")
    .str.lower()
    .eq("normal")
)
y = (~is_normal).astype("int64")

In [32]:
# Sanity check: show how many rows fall into each class of y.
class_counts = y.value_counts()
print("Κατανομή κλάσεων y:", class_counts, sep="\n")

Κατανομή κλάσεων y:
target
1    14968
Name: count, dtype: int64


In [33]:
# Columns that hold categorical (non-numeric) values.
categorical_features = [
    "protocol_type",
    "service",
    "flag",
]

In [34]:
# Preprocessing step: standardise the numeric columns and one-hot encode the
# categorical ones.
# numeric_features is derived here, before it is used, so that this cell runs
# on a fresh kernel; the notebook otherwise defines it only in a later cell,
# which raises NameError under Restart & Run All.
numeric_features = X.columns.difference(categorical_features)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # Categories unseen during fit are encoded as all-zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [35]:
# Separate numeric from categorical variables: every column of X that is not
# listed in categorical_features is treated as numeric.
all_feature_names = X.columns
numeric_features = all_feature_names.difference(categorical_features)

In [None]:
# Split the data into training and test sets: 30% of the rows are held out
# for testing, with a fixed seed so the split is reproducible.
split_parts = train_test_split(X, y, random_state=42, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

In [36]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible. train_test_split is already imported in the notebook's import
# cell, so it is not imported again here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [37]:
# Resampling + preprocessing + classifier pipeline.
# SMOTENC runs FIRST, on the raw feature table: its categorical_features
# indices are positions in X's original column order. Placed after the
# ColumnTransformer (which puts the numeric columns first and expands the
# categoricals into one-hot columns) those indices would point at scaled
# numeric columns instead of the categorical ones.
categorical_idx = [X.columns.get_loc(col) for col in categorical_features]

pipeline = ImbPipeline([
    ('smote', SMOTENC(categorical_features=categorical_idx, random_state=42)),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [38]:
# Probability threshold above which network traffic is blocked: a test
# instance is blocked when its predicted attack probability exceeds this value.
blocking_threshold = 9 / 10

In [39]:
# Batch-wise training loop with evaluation and traffic blocking.
# NOTE: pipeline.fit() refits every step from scratch, so after each batch the
# model reflects that batch only; nothing is accumulated between batches or
# epochs. True incremental learning would need an estimator that supports
# partial_fit / warm_start.
batch_size = 10000
for epoch in range(1, 3):  # two passes; widen the range to run more epochs
    for i in range(0, len(X_train), batch_size):
        X_batch = X_train.iloc[i:i + batch_size]
        y_batch = y_train.iloc[i:i + batch_size]

        # Both the sampler and the classifier need two classes in the batch.
        if y_batch.nunique() < 2:
            print(f"Skipping batch {i} due to only one class present.")
            continue

        # Refit the pipeline on the current batch.
        pipeline.fit(X_batch, y_batch)

        # i is always a multiple of batch_size, so the only batch excluded
        # from evaluation is the first one (i == 0).
        if i == 0:
            continue

        # Predicted probability of the attack class (label 1) on the test set.
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

        # Block traffic whose attack probability exceeds the threshold; the
        # same decision mask is reused for the count and for the accuracy.
        is_blocked = y_pred_proba > blocking_threshold
        blocked_indices = np.flatnonzero(is_blocked)
        if len(blocked_indices) > 0:
            print(f"Blocking {len(blocked_indices)} malicious traffic instances.")

        accuracy = accuracy_score(y_test, is_blocked)
        print(f"Epoch {epoch}, Iteration {i}, Test Accuracy: {accuracy}")

Skipping batch 0 due to only one class present.
Skipping batch 10000 due to only one class present.
Skipping batch 0 due to only one class present.
Skipping batch 10000 due to only one class present.


In [40]:
# Rebuild the preprocessor and the pipeline from explicit column lists.
# numerical_features is every column of X except the categorical ones,
# in X's original column order.
numerical_features = X.columns.drop(categorical_features).tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that was absent from the data
        # used for fitting is encoded as all-zeros at predict time instead of
        # raising an error.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# SMOTENC runs FIRST, on the raw feature table: its categorical_features
# indices are positions in X's original column order, which no longer hold
# once the ColumnTransformer has reordered and one-hot expanded the columns.
categorical_idx = [X.columns.get_loc(col) for col in categorical_features]

pipeline = ImbPipeline([
    ('smote', SMOTENC(categorical_features=categorical_idx, random_state=42)),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])