<a href="https://colab.research.google.com/github/kdemertzis/EKPA/blob/main/OnlineLearning_IPS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
import requests
from io import BytesIO
import gzip
import numpy as np

# Download the gzip-compressed 10% subset of the KDD Cup 1999 dataset.
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
# The whole body is buffered in memory via `.content` below, so streaming mode
# is not needed. The timeout keeps a stalled connection from hanging the
# notebook indefinitely.
response = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of handing an error page to gzip, which
# would only surface later as a confusing "not a gzipped file" error.
response.raise_for_status()
compressed_file = BytesIO(response.content)
# File-like wrapper that yields the decompressed bytes to whoever reads it.
decompressed_file = gzip.GzipFile(fileobj=compressed_file)

# Names for the 42 fields of each record, in file order: 41 feature columns
# followed by the label column ("target"). The file has no header row, so the
# names are assigned positionally by read_csv.
columns = """
    duration protocol_type service flag src_bytes dst_bytes land
    wrong_fragment urgent hot num_failed_logins logged_in num_compromised
    root_shell su_attempted num_root num_file_creations num_shells
    num_access_files num_outbound_cmds is_host_login is_guest_login
    count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate dst_host_count
    dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate target
""".split()

# Parse the decompressed CSV stream into a DataFrame.
df = pd.read_csv(decompressed_file, names=columns, header=None)

# Feature matrix: every column except the label.
X = df.drop(columns="target")
# Binary label: 1 for rows labelled "normal.", 0 for every other label.
y = (df["target"] == "normal.").astype("int64")

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The three string-valued columns are treated as categorical; every other
# column is treated as numeric.
categorical_features = ["protocol_type", "service", "flag"]
numeric_features = X.columns.difference(categorical_features)

# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones. Categories not seen during fit are ignored
# (encoded as all zeros) rather than raising at transform time.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Build the training pipeline: oversample the minority class on the raw
# columns, then scale / one-hot encode, then classify.
#
# SMOTENC has to run *before* the ColumnTransformer. Its categorical indices
# are positions in the original frame (X.columns.get_loc), whereas the
# transformer's output puts the scaled numeric columns first and expands each
# categorical column into several one-hot columns -- there, the same indices
# would point at unrelated numeric features.
# NOTE: the sampler therefore measures neighbour distances on unscaled numeric
# values; that is the documented way to use SMOTENC with mixed-type data.
categorical_indices = [X.columns.get_loc(col) for col in categorical_features]
pipeline = ImbPipeline([
    ('smote', SMOTENC(random_state=42, categorical_features=categorical_indices)),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Online learning loop.
#
# RandomForestClassifier has no partial_fit, and calling pipeline.fit() on each
# batch would retrain every step from scratch: whatever was learned from the
# earlier batches is discarded and the final model only reflects the last
# (partial) batch. Instead, the forest is grown incrementally with warm_start:
# each batch contributes `trees_per_batch` new trees and existing trees are kept.
batch_size = 1000
trees_per_batch = 1  # trees added to the forest for every batch
eval_every = 50      # evaluate on the test set once per this many batches

# Work on the pipeline's own step objects (looked up by name, so the order of
# the steps inside the pipeline does not matter here). They are updated in
# place, so pipeline.predict() afterwards uses the incrementally grown forest.
sampler = pipeline.named_steps['smote']
feature_transformer = pipeline.named_steps['preprocessor']
classifier = pipeline.named_steps['classifier']

# Fit the scaler / encoder once so that every batch -- and therefore every
# tree -- is expressed in the same feature space. Only training rows are used.
feature_transformer.fit(X_train)
# The encoding is now fixed, so the test set only needs to be transformed once.
X_test_encoded = feature_transformer.transform(X_test)

n_trees = 0
for epoch in range(1, 11):  # Modify the number of epochs as needed
    for batch_number, i in enumerate(range(0, len(X_train), batch_size)):
        X_batch = X_train.iloc[i:i + batch_size]
        y_batch = y_train.iloc[i:i + batch_size]

        # Oversample on the raw columns: the sampler's categorical indices
        # refer to positions in the original frame.
        X_resampled, y_resampled = sampler.fit_resample(X_batch, y_batch)

        # Grow the forest. The very first fit runs with warm_start=False so a
        # re-run of this cell starts from an empty forest; every later fit
        # keeps the existing trees and only trains the newly requested ones.
        n_trees += trees_per_batch
        classifier.set_params(n_estimators=n_trees,
                              warm_start=n_trees > trees_per_batch)
        classifier.fit(feature_transformer.transform(X_resampled), y_resampled)

        # Periodically evaluate the model on the test set. Prediction cost
        # grows with the number of trees, hence not after every batch.
        if batch_number > 0 and batch_number % eval_every == 0:
            y_pred = classifier.predict(X_test_encoded)
            accuracy = accuracy_score(y_test, y_pred)
            print(f"Epoch {epoch}, Iteration {i}, Test Accuracy: {accuracy}")

# Final evaluation: score the pipeline, as left by the training loop, on the
# held-out test set and report overall accuracy plus per-class metrics.
y_pred = pipeline.predict(X_test)
accuracy, classification_rep = (
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
)

print(
    f"Final Test Accuracy: {accuracy}",
    "Classification Report:",
    classification_rep,
    sep="\n",
)




Epoch 1, Iteration 1000, Test Accuracy: 0.9974292798947422
Epoch 1, Iteration 2000, Test Accuracy: 0.9931886038155964
Epoch 1, Iteration 3000, Test Accuracy: 0.9957492029755579
Epoch 1, Iteration 4000, Test Accuracy: 0.9937654976974849
Epoch 1, Iteration 5000, Test Accuracy: 0.9964070644198169
Epoch 1, Iteration 6000, Test Accuracy: 0.991994332270634
Epoch 1, Iteration 7000, Test Accuracy: 0.997004200192298
Epoch 1, Iteration 8000, Test Accuracy: 0.9973584332776682
Epoch 1, Iteration 9000, Test Accuracy: 0.9962248874044836
Epoch 1, Iteration 10000, Test Accuracy: 0.9959111381002985
Epoch 1, Iteration 11000, Test Accuracy: 0.995577146905521
Epoch 1, Iteration 12000, Test Accuracy: 0.9950710996407064
Epoch 1, Iteration 13000, Test Accuracy: 0.9976620616365569
Epoch 1, Iteration 14000, Test Accuracy: 0.9963058549668539
Epoch 1, Iteration 15000, Test Accuracy: 0.9934821112291888
Epoch 1, Iteration 16000, Test Accuracy: 0.9963159759121502
Epoch 1, Iteration 17000, Test Accuracy: 0.996022468