<a href="https://colab.research.google.com/github/ladyTootie/ACE-R-D/blob/main/MPL_modelv1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import TensorFlow and Check the Version

In [1]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.19.0


# Load and Preprocess NSL-KDD dataset

In [3]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential #linear stack of layers
from tensorflow.keras.layers import Dense #connected neural network layer

#Define column names
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'traffic_type', 'difficulty_level' # Adjusted column names
]

#Load the dataset
df = pd.read_csv('/content/KDDTrain+.txt', names=column_names)

# Drop the difficulty_level column
df = df.drop('difficulty_level', axis=1)

#Check if dataset loaded correctly
#display(df.head())

#Separate features (X) and labels (y)
X = df.drop('traffic_type', axis=1)
y = df['traffic_type']

# Dynamically identify categorical and numerical features in X
# Based on the original dataset, 'protocol_type', 'service', 'flag' are categorical in X.
categorical_features_for_X = ['protocol_type', 'service', 'flag']
# Numerical features are all other columns in X that are not explicitly defined as categorical
numerical_features_for_X = [col for col in X.columns if col not in categorical_features_for_X]

#Preprocessing, One-hot encode categorical features and scale numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features_for_X),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_for_X)])

#Apply preprocessing
X_processed = preprocessor.fit_transform(X)

#Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.2, random_state=42)

# Create the MLP Model


In [5]:
#Get the number of input features
input_shape = X_train.shape[1]

#Get the number of output classes
output_shape = len(label_encoder.classes_)

#Create the MLP model with three layers
model = Sequential([
    Dense(128, activation='relu', input_shape=(input_shape,)), # Input layer and first hidden layer
    Dense(64, activation='relu'), # Second hidden layer
    Dense(output_shape, activation='softmax') # Output layer with softmax for multi-class classification
])

#Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', # Use sparse_categorical_crossentropy for integer labels
              metrics=['accuracy'])

#Print the model summary
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


# Train the Model

In [6]:
#Train the model with 10 epochs
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

print("Model training complete.")

Epoch 1/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9516 - loss: 0.2442 - val_accuracy: 0.9872 - val_loss: 0.0418
Epoch 2/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9916 - loss: 0.0281 - val_accuracy: 0.9925 - val_loss: 0.0252
Epoch 3/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9932 - loss: 0.0211 - val_accuracy: 0.9924 - val_loss: 0.0255
Epoch 4/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9935 - loss: 0.0200 - val_accuracy: 0.9929 - val_loss: 0.0251
Epoch 5/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9944 - loss: 0.0173 - val_accuracy: 0.9916 - val_loss: 0.0285
Epoch 6/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9945 - loss: 0.0167 - val_accuracy: 0.9947 - val_loss: 0.0240
Epoch 7/10
[1m2

#F1-Score, Test Loss, and Test Accuracy


In [8]:
# Make predictions on the test set (X_test)
y_pred_probabilities = model.predict(X_test)
y_pred_encoded = tf.argmax(y_pred_probabilities, axis=1).numpy()

# Calculate F1-score
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred_encoded, average='weighted')

print(f"F1-Score on Test Set: {f1:.4f}")

[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 978us/step
F1-Score on Test Set: 0.9944


In [9]:
#Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)

print("Model evaluation complete.")
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Model evaluation complete.
Test Loss: 0.0268
Test Accuracy: 0.9946


# Preprocess the conn.log

In [None]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)

# Load the conn.log file to define raw_conn_log_df
log_file_path = '/content/conn.log'

# Read the file and organize the data
with open(log_file_path, 'r') as f:
    header_lines = []
    separator = '\t' #Separator in Zeek logs
    column_names = []
    for line in f:
        if line.startswith('#separator'):
            separator = line.split(maxsplit=1)[1].strip().replace('\\x09', '\t')
        elif line.startswith('#fields'):
            column_names = line.split(maxsplit=1)[1].strip().split(separator)
        elif not line.startswith('#'):
            break
        header_lines.append(line)

# Calculate the number of lines to skip
skip_rows = len(header_lines)

# Load the data using pandas, skipping the header and using the extracted column names
raw_conn_log_df = pd.read_csv(log_file_path, sep=separator, skiprows=skip_rows, names=column_names, index_col=False)


# Create a new DataFrame 'conn_df_model_features' with the same columns as the training features 'X'
# Initialize with NaNs to easily track what needs filling
conn_df_model_features = pd.DataFrame(index=raw_conn_log_df.index, columns=X.columns)

# Map available columns from raw_conn_log_df to the model's expected features
# Perform explicit renaming to match NSL-KDD column names
column_mapping = {
    'duration': 'duration',
    'proto': 'protocol_type',
    'service': 'service',
    'orig_bytes': 'src_bytes',
    'resp_bytes': 'dst_bytes'
}

for raw_col, model_col in column_mapping.items():
    if raw_col in raw_conn_log_df.columns and model_col in conn_df_model_features.columns:
        conn_df_model_features[model_col] = raw_conn_log_df[raw_col]

# Handle columns that are in X but not directly mapped from raw_conn_log_df
# Fill numerical columns with the mean from the training data (df)
# Fill categorical columns with the mode from the training data (df) or a placeholder like 'unknown'
for col in X.columns:
    if col not in column_mapping.values(): # If it wasn't directly mapped
        if col in numerical_features_for_X: # Check if it's a numerical feature in the original X
            conn_df_model_features[col] = conn_df_model_features[col].fillna(df[col].mean())
        elif col in categorical_features_for_X: # Check if it's a categorical feature in the original X
            # Use mode from training data or 'unknown' if mode is problematic
            conn_df_model_features[col] = conn_df_model_features[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'unknown')

# After mapping and filling non-existent features, handle any remaining NaNs in the mapped columns
# (e.g., if 'proto' or 'service' had NaNs in raw_conn_log_df before mapping)
for col in conn_df_model_features.columns:
    if conn_df_model_features[col].isnull().any():
        if col in numerical_features_for_X:
            # Ensure numerical type before filling with mean
            conn_df_model_features[col] = pd.to_numeric(conn_df_model_features[col], errors='coerce')
            conn_df_model_features[col] = conn_df_model_features[col].fillna(df[col].mean())
        elif col in categorical_features_for_X:
            conn_df_model_features[col] = conn_df_model_features[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'unknown')

# Ensure correct dtypes for preprocessing. Specifically, numerical features should be numeric.
for col in numerical_features_for_X:
    conn_df_model_features[col] = pd.to_numeric(conn_df_model_features[col], errors='coerce')


# Now apply the preprocessor
X_conn_processed = preprocessor.transform(conn_df_model_features)

# Update conn_df for display purposes to reflect the processed features
# This will be useful for 'display(conn_df.head())' later, showing the aligned data
conn_df = conn_df_model_features.copy()

  raw_conn_log_df = pd.read_csv(log_file_path, sep=separator, skiprows=skip_rows, names=column_names, index_col=False)


# Predict Anomalies and Analyze Results

In [None]:
import numpy as np

#Make predictions on the preprocessed conn.log data
predictions = model.predict(X_conn_processed)

# Get the index for 'normal' class
normal_class_idx = np.where(label_encoder.classes_ == 'normal')[0][0]

anomaly_threshold = 0.60 #if 'normal' probability is < 60%, flag as anomaly

predicted_classes_encoded_thresholded = []
for i in range(len(predictions)):
    normal_prob = predictions[i, normal_class_idx]
    if normal_prob < anomaly_threshold:
        # If 'normal' probability is low, classify as the highest non-normal class
        temp_predictions = predictions[i].copy()
        temp_predictions[normal_class_idx] = -1 # Temporarily ignore 'normal'
        predicted_classes_encoded_thresholded.append(tf.argmax(temp_predictions).numpy())
    else:
        # Otherwise, stick to the original highest probability (which is likely normal)
        predicted_classes_encoded_thresholded.append(tf.argmax(predictions[i]).numpy())

predicted_classes_encoded = np.array(predicted_classes_encoded_thresholded)

#Decode the predicted classes back to their original labels
predicted_classes = label_encoder.inverse_transform(predicted_classes_encoded)

#Add the predicted traffic type to the conn_df DataFrame
conn_df['predicted_traffic_type'] = predicted_classes

#Define what constitutes an "anomaly"
anomalies = conn_df[conn_df['predicted_traffic_type'] != 'normal']

#Display the rows that are predicted as anomalies
if not anomalies.empty:
    print("Anomalies detected:")
    display(anomalies)
else:
    print("No anomalies detected")

[1m1629/1629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Anomalies detected:


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,predicted_traffic_type
32411,287.14465,udp,dns,SF,45566.743,19779.114421,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,satan
32412,287.14465,udp,dns,SF,45566.743,19779.114421,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,satan
35608,287.14465,udp,dns,SF,45566.743,19779.114421,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,satan
35609,287.14465,udp,dns,SF,45566.743,19779.114421,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,satan


#Contents of Preprocessed conn.log




In [None]:
display(conn_df.head())

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,predicted_traffic_type
0,6.6e-05,tcp,-,SF,22.0,0.0,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,normal
1,287.14465,tcp,-,SF,45566.743,19779.11,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,normal
2,287.14465,tcp,-,SF,45566.743,19779.11,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,normal
3,287.14465,tcp,-,SF,45566.743,19779.11,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,normal
4,43.972656,tcp,http,SF,16564.0,1133984.0,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,normal
