<a href="https://colab.research.google.com/github/ladyTootie/ACE-R-D/blob/main/MPL_Final_Copy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load and Preprocess NSL-KDD Training dataset

### Implementing SMOTE for Class Imbalance

Class imbalance is a common problem in anomaly detection datasets like NSL-KDD, where normal traffic instances far outnumber attack instances. This can lead to models that perform well on the majority class but poorly on minority (attack) classes.

**SMOTE (Synthetic Minority Over-sampling Technique)** addresses this by generating synthetic samples for the minority classes. It works by selecting a sample from the minority class and then finding its k-nearest neighbors. Synthetic samples are then generated at random points along the line segments connecting the chosen sample to its neighbors.

By balancing the dataset, SMOTE can help the model learn more effectively from the minority classes, potentially leading to better detection rates for various attack types and an improved F1-score.

In [2]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential #linear stack of layers
from tensorflow.keras.layers import Dense #connected neural network layer
import numpy as np
from sklearn.metrics import f1_score


#Define column names
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'traffic_type', 'difficulty_level' # Adjusted column names
]

#Load the dataset
df_train = pd.read_csv('/content/KDDTrain+.txt', names=column_names)

#Check if dataset loaded correctly
#display(df_train.head())

# Drop the difficulty_level column
df_train = df_train.drop('difficulty_level', axis=1)

#Separate the features from traffic_type (what the MLP is predicting)
x_train_raw = df_train.drop('traffic_type', axis=1)
y_train_raw = df_train['traffic_type']

#Identify categorical and numerical features
categorical_features_for_x = ['protocol_type', 'service', 'flag']
numerical_features_for_x = [col for col in x_train_raw.columns if col not in categorical_features_for_x]

#Encode categorical and scale numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features_for_x),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_for_x)
    ])

#Apply preprocessing
x_train = preprocessor.fit_transform(x_train_raw)


# Encode 'traffic_type' - ensure all possible labels from both train and test are known
label_encoder = LabelEncoder()

# Temporarily load KDDTest+.txt to get all possible labels for the encoder
# This ensures the label_encoder is fitted with all classes from both datasets
# before transforming either y_train or y_test, preventing 'unseen labels' errors.
df_test_temp = pd.read_csv('/content/KDDTest+.txt', names=column_names)
df_test_temp = df_test_temp.drop('difficulty_level', axis=1)
y_test_raw_temp = df_test_temp['traffic_type']

# Combine unique labels from both training and testing sets for fitting the encoder
all_traffic_types = pd.concat([y_train_raw, y_test_raw_temp]).unique()
label_encoder.fit(all_traffic_types)

# Now transform y_train_raw using the fully fitted encoder
y_train = label_encoder.transform(y_train_raw)

# x_train and y_train are now ready from KDDTrain+.txt, no further splitting for training

In [5]:
from imblearn.over_sampling import SMOTE
from collections import Counter

print("Applying SMOTE to the training data...")

# Count the occurrences of each class
class_counts = Counter(y_train)
print(f"Original class distribution: {class_counts}")

# Identify minority classes (classes with count < default k_neighbors + 1, which is 6)
# SMOTE's default k_neighbors is 5, so it needs at least k_neighbors + 1 samples (6 samples)
# to form neighbors. If a class has fewer than 6 samples, it will cause an error.
# We can find the minimum number of samples in any class for which SMOTE will be applied
# and set k_neighbors accordingly.

# Filter out 'normal' class from consideration for min_samples to avoid issues if 'normal' is huge
# and we only care about minority attack classes.
# If 'normal' is a minority class itself, this logic might need adjustment.
minority_class_counts = {cls: count for cls, count in class_counts.items() if count <= 5}

# Determine the maximum k_neighbors we can use across all classes being oversampled.
# If there are classes with very few samples (e.g., 1 or 2), SMOTE might not be suitable for them.
# The smallest valid k_neighbors is 1.
# If min_samples_in_minority is less than 2, SMOTE cannot be applied even with k_neighbors=1.

# If no minority classes have counts less than or equal to 5, we can use default k_neighbors=5
# Otherwise, find the smallest count among those classes, and set k_neighbors to that count - 1.
# If the smallest count is 1, SMOTE cannot be applied to that class directly with k_neighbors >= 1.

min_samples_for_smote = min(class_counts.values())

# Adjust k_neighbors based on the smallest class size that SMOTE will process
# SMOTE requires at least k_neighbors + 1 samples to work for a given class.
# So, if a class has `N` samples, the maximum k_neighbors we can use is `N - 1`.
# We need to find the smallest `N` across all classes that SMOTE will operate on.
# Let's consider only classes that are not 'normal' and might be subject to oversampling.

# Find the minimum size of a class (excluding the largest classes if there are any that would not be considered minority)
# For simplicity, let's just use the absolute minimum count across all classes for now
# This assumes all classes might need oversampling or that the error comes from the smallest one.

# If there's a class with fewer than 2 samples, SMOTE with k_neighbors=1 is still problematic.
# Let's find the minimum count of any class *after* preprocessing, which `y_train` reflects.

# Re-calculate min_samples_for_smote considering that SMOTE's k_neighbors is for generating samples
# and it needs k_neighbors + 1 samples in the class to find k_neighbors.

# Exclude any class that would lead to k_neighbors < 1 if we try to apply SMOTE.
# The error happened because a class had 3 samples. So, max k_neighbors = 3-1 = 2.
# Let's try k_neighbors = 2 to be safe for classes with at least 3 samples.

# Dynamically set k_neighbors
# Iterate through class_counts to find the minimum number of samples for any class
# that might be a minority and cause the issue.

# A more robust approach might be to determine which classes SMOTE should be applied to
# or to use 'not majority' strategy if specific classes are known to be problematic.

# For the current error, min_samples_fit is 2. So, k_neighbors should be at most 1.
# Let's set it to 1 to handle the class with 2 samples (2-1=1).

k_neighbors_to_use = 1 # Setting to 1 to handle the class with 2 samples (2-1=1)

smote = SMOTE(random_state=42, k_neighbors=k_neighbors_to_use)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

print(f"Original training set shape: {x_train.shape}, {y_train.shape}")
print(f"Resampled training set shape: {x_train_smote.shape}, {y_train_smote.shape}")
print(f"Resampled class distribution: {Counter(y_train_smote)}")

# You would then retrain your model using x_train_smote and y_train_smote

Applying SMOTE to the training data...
Original class distribution: Counter({np.int64(16): 67343, np.int64(14): 41214, np.int64(25): 3633, np.int64(7): 3599, np.int64(20): 2931, np.int64(27): 2646, np.int64(15): 1493, np.int64(1): 956, np.int64(32): 892, np.int64(34): 890, np.int64(19): 201, np.int64(4): 53, np.int64(2): 30, np.int64(35): 20, np.int64(8): 18, np.int64(6): 11, np.int64(23): 10, np.int64(9): 9, np.int64(3): 8, np.int64(12): 7, np.int64(18): 4, np.int64(17): 3, np.int64(30): 2})
Original training set shape: (125973, 122), (125973,)
Resampled training set shape: (1548889, 122), (1548889,)
Resampled class distribution: Counter({np.int64(16): 67343, np.int64(14): 67343, np.int64(34): 67343, np.int64(7): 67343, np.int64(20): 67343, np.int64(32): 67343, np.int64(15): 67343, np.int64(25): 67343, np.int64(27): 67343, np.int64(19): 67343, np.int64(1): 67343, np.int64(4): 67343, np.int64(3): 67343, np.int64(12): 67343, np.int64(23): 67343, np.int64(2): 67343, np.int64(6): 67343, n

#Load and Preprocess Testing set

In [6]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential #linear stack of layers
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization #connected neural network layer
import numpy as np
from sklearn.metrics import f1_score


#Define column names
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'traffic_type', 'difficulty_level' # Adjusted column names
]

#Load the dataset
df_test = pd.read_csv('/content/KDDTest+.txt', names=column_names)

#Check if dataset loaded correctly
#display(df_test.head())

# Drop the difficulty_level column
df_test = df_test.drop('difficulty_level', axis=1)

#Separate the features from traffic_type (what the MLP is predicting)
x_test_raw = df_test.drop('traffic_type', axis=1)
y_test_raw = df_test['traffic_type']

#Identify categorical and numerical features (assuming these are consistent with training set)
# categorical_features_for_x and numerical_features_for_x are already defined in the training cell

#Apply preprocessing using the preprocessor fitted on the training data
x_test = preprocessor.transform(x_test_raw)


#Encode 'traffic_type' using the label_encoder fitted on the training data
y_test = label_encoder.transform(y_test_raw)


# Create the MLP Model


In [20]:
import tensorflow as tf

#Get number of input features (columns)
num_features = x_train.shape[1]

#Get number of traffic types
num_traffic_types = len(label_encoder.classes_)

#Create MLP with three layers
model = Sequential([
    Dense(256, activation='relu', input_shape=(num_features,), kernel_regularizer=tf.keras.regularizers.l2(0.005)), #input layer and hidden layer
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.005)),#hidden layer
    BatchNormalization(),
    Dropout(0.4),
    Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.005)),#hidden layer
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),#hidden layer
    Dropout(0.2),
    Dense(num_traffic_types, activation='softmax')#output layer
])

#Compile the model
model.compile(optimizer='adam', #minimize the loss function, update weights and biases
              loss='sparse_categorical_crossentropy', #measures performance
              metrics=['accuracy'])#calculates accuracy

#Print model summary
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


# Train the Model

Evauluate Accuracy, and F1-Score from Training on Test Set

In [21]:
#Train the model with 10 epochs
model.fit(x_train, y_train, epochs=15, batch_size=32)
print("Training complete. Ready to analyze logs.")

Epoch 1/15
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 5ms/step - accuracy: 0.9010 - loss: 1.1151
Epoch 2/15
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9658 - loss: 0.2805
Epoch 3/15
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9677 - loss: 0.2623
Epoch 4/15
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step - accuracy: 0.9679 - loss: 0.2505
Epoch 5/15
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9679 - loss: 0.2407
Epoch 6/15
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9684 - loss: 0.2334
Epoch 7/15
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9684 - loss: 0.2289
Epoch 8/15
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 6ms/step - accuracy: 0.9682 - loss: 0.2253
Epoch 9/15
[1m3

In [22]:
#Evaluate the model on the test set
loss, accuracy = model.evaluate(x_test, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7033 - loss: 7.5621
Test Loss: 7.6258
Test Accuracy: 0.6993


In [23]:
#Make predictions on the test set
y_pred_probabilities = model.predict(x_test)
y_pred = np.argmax(y_pred_probabilities, axis=1)

#Calculate F1-score
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"F1-score: {f1:.4f}")

[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
F1-score: 0.5977


# Preprocess the conn.log

# Predict Anomalies and Analyze Results

# Retrain the Model with SMOTE-Resampled Data

In [30]:
# Retrain the model with the SMOTE-resampled training data
model.fit(x_train_smote, y_train_smote, epochs=15, batch_size=32)
print("Model retrained with SMOTE data. Ready for re-evaluation.")

Epoch 1/15
[1m48403/48403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 4ms/step - accuracy: 0.8690 - loss: 0.6088
Epoch 2/15
[1m48403/48403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 4ms/step - accuracy: 0.9155 - loss: 0.4373
Epoch 3/15
[1m48403/48403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 4ms/step - accuracy: 0.9170 - loss: 0.4271
Epoch 4/15
[1m48403/48403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 4ms/step - accuracy: 0.9161 - loss: 0.4268
Epoch 5/15
[1m48403/48403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 4ms/step - accuracy: 0.9166 - loss: 0.4244
Epoch 6/15
[1m48403/48403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 4ms/step - accuracy: 0.9166 - loss: 0.4235
Epoch 7/15
[1m48403/48403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 4ms/step - accuracy: 0.9160 - loss: 0.4259
Epoch 8/15
[1m48403/48403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 4ms/step - accuracy: 0.9160 - loss:

# Re-evaluate Accuracy and F1-Score on Test Set (after SMOTE training)

In [31]:
#Evaluate the retrained model on the original test set
loss_smote, accuracy_smote = model.evaluate(x_test, y_test)

print(f"Test Loss (after SMOTE training): {loss_smote:.4f}")
print(f"Test Accuracy (after SMOTE training): {accuracy_smote:.4f}")

#Make predictions on the test set with the retrained model
y_pred_probabilities_smote = model.predict(x_test)
y_pred_smote = np.argmax(y_pred_probabilities_smote, axis=1)

#Calculate F1-score
f1_smote = f1_score(y_test, y_pred_smote, average='weighted')

print(f"F1-score (after SMOTE training): {f1_smote:.4f}")

[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6818 - loss: 6.7206
Test Loss (after SMOTE training): 6.7859
Test Accuracy (after SMOTE training): 0.6808
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
F1-score (after SMOTE training): 0.6287


In [39]:
import pandas as pd
#pd.set_option('future.no_silent_downcasting',True)

#Load the conn.log
log_path = '/content/connNormal.log'

#Read the file and organize the data like the training df
header_lines = []
separator = '\t' # default separator
column_names = []

with open(log_path, 'r') as file:
    for line in file:
        if line.startswith('#separator'):
            # Corrected: split by space, take the second part, strip, and replace
            raw_sep = line.split(' ', 1)[1].strip()
            separator = raw_sep.replace('\x09', '\t')
        elif line.startswith('#fields'):
            # Corrected: extract fields string by splitting on the predetermined separator after '#fields'
            fields_prefix = '#fields'
            if fields_prefix in line:
                # Find the start of the actual field names after '#fields' and the first separator
                start_of_fields = line.find(fields_prefix) + len(fields_prefix)
                fields_str_full = line[start_of_fields:].strip()
                # Now split by the determined separator
                column_names = fields_str_full.split(separator)
                # Filter out any empty strings that might result from splitting (e.g., if there's a trailing separator)
                column_names = [name for name in column_names if name]
            else:
                column_names = [] # Fallback if '#fields' line is malformed
        elif not line.startswith('#'):
            # Stop reading header lines once data lines start
            break
        header_lines.append(line)

# Calculate number of lines to skip
skip_lines = len(header_lines)

#Load the data, skipping the header and using column names
# Corrected: use skiprows instead of skiplines
log_df = pd.read_csv(log_path, sep=separator, skiprows=skip_lines, names=column_names, index_col=False)

#Create new df with the same columns as the training df
log_df_features = pd.DataFrame(index=log_df.index, columns=x_train_raw.columns)

#Map the column names
column_mapping = {
    'duration': 'duration',
    'proto': 'protocol_type',
    'service': 'service',
    'orig_bytes': 'src_bytes',
    'resp_bytes': 'dst_bytes',
}

# Corrected: Iterate through the defined column_mapping
for log_col, kdd_col in column_mapping.items():
    if log_col in log_df.columns and kdd_col in log_df_features.columns:
        log_df_features[kdd_col] = log_df[log_col]

#Map and fill nonexistent features. Fill numerical columns with mean from training df
#Fill categorical columns with the mode from training df or a placeholder
for col in x_train_raw.columns:
  if col not in column_mapping.values():
    if col in numerical_features_for_x:
        # Assign directly, as log_df_features[col] would be all NaNs at this point
        log_df_features[col] = df_train[col].mean()
    elif col in categorical_features_for_x:
        log_df_features[col] = df_train[col].mode()[0] if not df_train[col].mode().empty else 'unknown'

#Handle NaNs in categorical features (for columns that were mapped, or for any other reason)
for col in log_df_features.columns:
    if log_df_features[col].isnull().any():
      if col in numerical_features_for_x:
        # Ensure numerical type before filling with mean
            log_df_features[col] = pd.to_numeric(log_df_features[col], errors='coerce')
            log_df_features[col] = log_df_features[col].fillna(df_train[col].mean())
      elif col in categorical_features_for_x:
            log_df_features[col] = log_df_features[col].fillna(df_train[col].mode()[0] if not df_train[col].mode().empty else 'unknown')

# Ensure correct dtypes for preprocessing. Specifically, numerical features should be numeric.
for col in numerical_features_for_x:
    log_df_features[col] = pd.to_numeric(log_df_features[col], errors='coerce')

# Apply the preprocessor only if log_df_features is not empty
if not log_df_features.empty:
    log_df_features_processed = preprocessor.transform(log_df_features)
    print("Log data preprocessed successfully.")
else:
    log_df_features_processed = None # Or handle as appropriate for downstream steps
    print("No log data found to preprocess after loading conn.log. `log_df_features_processed` is None.")

#Update conn_df to display processed features
conn_df = log_df_features.copy()

  log_df = pd.read_csv(log_path, sep=separator, skiprows=skip_lines, names=column_names, index_col=False)
  log_df = pd.read_csv(log_path, sep=separator, skiprows=skip_lines, names=column_names, index_col=False)


Log data preprocessed successfully.


#Contents of Preprocessed conn.log




In [40]:
display(conn_df.head())

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,287.14465,tcp,http,SF,45566.743,19779.114421,0.000198,0.022687,0.000111,0.204409,...,182.148945,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024
1,287.14465,tcp,http,SF,45566.743,19779.114421,0.000198,0.022687,0.000111,0.204409,...,182.148945,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024
2,287.14465,tcp,http,SF,45566.743,19779.114421,0.000198,0.022687,0.000111,0.204409,...,182.148945,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024
3,287.14465,tcp,http,SF,45566.743,19779.114421,0.000198,0.022687,0.000111,0.204409,...,182.148945,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024
4,287.14465,tcp,http,SF,45566.743,19779.114421,0.000198,0.022687,0.000111,0.204409,...,182.148945,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024


In [41]:
import numpy as np

#Make predictiction on processed conn.log
predictions = model.predict(log_df_features_processed)

#Get index for 'normal' class
normal_class_index = np.where(label_encoder.classes_ == 'normal')[0][0]

#Set threshold for anomaly. If 'normal" probability is less than 50%, flag as an anomaly
anomaly_threshold = 0.20

#Calculate anomaly scores
#anomaly_scores = predictions[:, normal_class_index]

predicted_classes_encoded_thresholded = []
for i in range(len(predictions)):
    normal_prob = predictions[i, normal_class_index]
    if normal_prob < anomaly_threshold:
        # If 'normal' probability is low, classify as the highest non-normal class
        temp_predictions = predictions[i].copy()
        temp_predictions[normal_class_index] = -1 # Temporarily ignore 'normal'
        predicted_classes_encoded_thresholded.append(tf.argmax(temp_predictions).numpy())
    else:
        # Otherwise, stick to the original highest probability (which is likely normal)
        predicted_classes_encoded_thresholded.append(tf.argmax(predictions[i]).numpy())

predicted_classes_encoded = np.array(predicted_classes_encoded_thresholded)

#Decode the predicted classes back to their original labels
predicted_classes = label_encoder.inverse_transform(predicted_classes_encoded)

#Add the predicted traffic type to the conn_df DataFrame
conn_df['predicted_traffic_type'] = predicted_classes

#Define the anomalies
anomalies = conn_df[conn_df['predicted_traffic_type'] != 'normal'].copy() # Added .copy() to avoid SettingWithCopyWarning

# Define a mapping from predicted_traffic_type to known attack categories
attack_mapping = {
    'smurf': 'DDoS Attack',
    'neptune': 'DoS Attack',
    'satan': 'Probing Attack',
    'portsweep': 'Probing Attack',
    'ipsweep': 'Probing Attack',
    'nmap': 'Probing Attack',
    'warezclient': 'U2R Attack',
    'guess_passwd': 'U2R Attack',
    'rootkit': 'U2R Attack',
    'buffer_overflow': 'U2R Attack',
    'loadmodule': 'U2R Attack',
    'perl': 'U2R Attack',
    'spy': 'U2R Attack',
    'phf': 'U2R Attack',
    'multihop': 'R2L Attack',
    'ftp_write': 'R2L Attack',
    'imap': 'R2L Attack',
    'warezmaster': 'R2L Attack',
    'snmpgetattack': 'R2L Attack',
    'snmpguess': 'R2L Attack',
    'xlock': 'R2L Attack',
    'xsnoop': 'R2L Attack',
    'worm': 'DoS Attack',
    'mscan': 'Probing Attack',
    'processtable': 'DoS Attack',
    'apache2': 'DoS Attack',
    'back': 'DoS Attack',
    'teardrop': 'DoS Attack',
    'land': 'DoS Attack',
    'pod': 'DoS Attack',
    'mailbomb': 'DoS Attack',
    'udpstorm': 'DoS Attack',
    'httptunnel': 'Probing Attack',
    'sqlattack': 'U2R Attack',
    'xterm': 'U2R Attack',
    'ps': 'U2R Attack',
    'named': 'R2L Attack',
    'sendmail': 'R2L Attack',
    'secret': 'R2L Attack',
    'syslog': 'R2L Attack',
    'netbus': 'R2L Attack',
    'rsh': 'R2L Attack',
    'finger': 'R2L Attack',
    'eject': 'R2L Attack',
    'sshd': 'R2L Attack',
    'score': 'Probing Attack',
    'tftp_write': 'R2L Attack',
    'irc': 'R2L Attack',
    'lockd': 'R2L Attack',
    'xdmcp': 'R2L Attack',
    'http_tunnel': 'Probing Attack' # This might be the same as httptunnel, keeping for completeness
}

# Create a new column 'attack_category' in the anomalies DataFrame
# Use .get() with a default 'Unknown' category for types not in the mapping
anomalies['attack_category'] = anomalies['predicted_traffic_type'].apply(lambda x: attack_mapping.get(x, 'Unknown'))

#Display the rows that are predicted as anomalies
if not anomalies.empty:
    print("Anomalies detected:")
    display(anomalies[['duration', 'service', 'flag', 'protocol_type', 'predicted_traffic_type', 'attack_category']])
else:
    print("No anomalies detected")

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Anomalies detected:


Unnamed: 0,duration,service,flag,protocol_type,predicted_traffic_type,attack_category
0,287.14465,http,SF,tcp,neptune,DoS Attack
1,287.14465,http,SF,tcp,neptune,DoS Attack
2,287.14465,http,SF,tcp,neptune,DoS Attack
3,287.14465,http,SF,tcp,neptune,DoS Attack
4,287.14465,http,SF,tcp,neptune,DoS Attack
...,...,...,...,...,...,...
3469,287.14465,http,SF,tcp,neptune,DoS Attack
3470,287.14465,http,SF,tcp,neptune,DoS Attack
3471,287.14465,http,SF,tcp,neptune,DoS Attack
3472,287.14465,http,SF,tcp,neptune,DoS Attack
