<a href="https://colab.research.google.com/github/ladyTootie/ACE-R-D/blob/main/MPL_Final_Copy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import TensorFlow and Check the Version

In [1]:
import tensorflow as tf
# Report which TensorFlow build this runtime is using.
print(f"TensorFlow version: {tf.__version__}")

TensorFlow version: 2.19.0


# Load and Preprocess NSL-KDD dataset

In [17]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential #linear stack of layers
from tensorflow.keras.layers import Dense #connected neural network layer

# Column names for the NSL-KDD file (the file itself has no header row).
# The last two entries are the label and a difficulty score, not features.
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'traffic_type', 'difficulty_level'
]

# Load the dataset and drop the difficulty_level column, which is not a feature.
df = pd.read_csv('/content/KDDTrain+.txt', names=column_names)
df = df.drop('difficulty_level', axis=1)

# Separate the features from traffic_type (what the MLP is predicting).
x = df.drop('traffic_type', axis=1)
y = df['traffic_type']

# Identify categorical and numerical features.
categorical_features_for_x = ['protocol_type', 'service', 'flag']
numerical_features_for_x = [col for col in x.columns if col not in categorical_features_for_x]

# Encode 'traffic_type' as integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the RAW features first. Fitting the scaler/encoder before splitting
# would let statistics from the held-out rows leak into training.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y_encoded, test_size=0.2, random_state=42)

# Scale numerical features and one-hot encode categorical ones.
# handle_unknown='ignore' encodes categories that were not seen during fit
# (in the test split or in later log data) as all zeros instead of raising.
# sparse_threshold=0 makes the transformer always return a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features_for_x),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_for_x)
    ],
    sparse_threshold=0)

# Fit on the training split only, then apply the fitted transform to the test split.
x_train = preprocessor.fit_transform(x_train_raw)
x_test = preprocessor.transform(x_test_raw)

# Full processed matrix, kept under its original name for any cell that still uses it.
x_processed = preprocessor.transform(x)

# Create the MLP Model


In [18]:
# Seed Python, NumPy and TensorFlow RNGs before any weights are created so
# that weight initialisation and batch shuffling repeat from run to run.
tf.keras.utils.set_random_seed(42)

# Number of input features (columns of the processed training matrix).
num_features = x_train.shape[1]

# Number of traffic types the label encoder learned.
num_traffic_types = len(label_encoder.classes_)

# MLP with two hidden layers and a softmax output.
# The input shape is declared with an explicit Input object; passing
# input_shape= to the first Dense layer makes Keras emit a UserWarning.
model = Sequential([
    tf.keras.Input(shape=(num_features,)),
    Dense(128, activation='relu'),  # hidden layer
    Dense(64, activation='relu'),  # hidden layer
    Dense(num_traffic_types, activation='softmax'),  # output layer: one probability per traffic type
])

# Compile the model.
# sparse_categorical_crossentropy is used because the labels are integer
# class ids (from LabelEncoder), not one-hot vectors.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary.
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


# Train the Model

In [19]:
# Fit the MLP: 10 epochs, mini-batches of 32 rows, with validation_split=0.2
# asking Keras to hold out 20% of the supplied training data for validation.
fit_options = {"epochs": 10, "batch_size": 32, "validation_split": 0.2}
model.fit(x_train, y_train, **fit_options)
print("Training complete. Ready to analyze logs.")

Epoch 1/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9535 - loss: 0.2300 - val_accuracy: 0.9902 - val_loss: 0.0383
Epoch 2/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9913 - loss: 0.0295 - val_accuracy: 0.9927 - val_loss: 0.0251
Epoch 3/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9931 - loss: 0.0223 - val_accuracy: 0.9909 - val_loss: 0.0287
Epoch 4/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9933 - loss: 0.0209 - val_accuracy: 0.9944 - val_loss: 0.0230
Epoch 5/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9941 - loss: 0.0180 - val_accuracy: 0.9947 - val_loss: 0.0220
Epoch 6/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9945 - loss: 0.0164 - val_accuracy: 0.9904 - val_loss: 0.0342
Epoch 7/10
[1m2

In [10]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the metrics requested at compile time (accuracy).
loss, accuracy = model.evaluate(x_test, y_test)

for metric_name, metric_value in (("Loss", loss), ("Accuracy", accuracy)):
    print(f"Test {metric_name}: {metric_value:.4f}")

[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9958 - loss: 0.0359
Test Loss: 0.0429
Test Accuracy: 0.9954


# Preprocess the conn.log

In [25]:
import pandas as pd
#pd.set_option('future.no_silent_downcasting',True)

# Load the conn.log
log_path = '/content/conn.log'

# Read the leading '#'-prefixed header lines to discover the field separator
# and the column names, and count them so read_csv can skip them.
header_lines = []
separator = '\t' # default separator
column_names = []

with open(log_path, 'r') as file:
    for line in file:
        if not line.startswith('#'):
            # Stop reading header lines once data lines start
            break
        if line.startswith('#separator'):
            # The separator is written as an escape (e.g. \x09); turn it into a real tab
            raw_sep = line.split(' ', 1)[1].strip()
            separator = raw_sep.replace('\\x09', '\t')
        elif line.startswith('#fields'):
            # Everything after the '#fields' tag is the separator-delimited list of names;
            # empty strings (e.g. from a trailing separator) are discarded
            fields_str_full = line[len('#fields'):].strip()
            column_names = [name for name in fields_str_full.split(separator) if name]
        header_lines.append(line)

# Number of header lines to skip
skip_lines = len(header_lines)

# Load the data, skipping the header and using the column names from '#fields'
log_df = pd.read_csv(log_path, sep=separator, skiprows=skip_lines, names=column_names, index_col=False)

# Only the leading header block was skipped. Any '#'-prefixed line after it
# (for example a trailing '#close' footer) would otherwise be parsed as a data
# row, filled with training means/modes below and scored as if it were a
# connection, so drop such rows here.
if not log_df.empty:
    is_metadata_row = log_df.iloc[:, 0].astype(str).str.startswith('#')
    log_df = log_df.loc[~is_metadata_row].reset_index(drop=True)

# Create new df with the same columns as the training df
log_df_features = pd.DataFrame(index=log_df.index, columns=x.columns)

# Map conn.log column names onto the training feature names.
# NOTE(review): conn.log's conn_state looks like a candidate source for 'flag';
# confirm the value vocabularies match before mapping it.
column_mapping = {
    'duration': 'duration',
    'proto': 'protocol_type',
    'service': 'service',
    'orig_bytes': 'src_bytes',
    'resp_bytes': 'dst_bytes',
}

for log_col, kdd_col in column_mapping.items():
    if log_col in log_df.columns and kdd_col in log_df_features.columns:
        log_df_features[kdd_col] = log_df[log_col]

# Numerical features: coerce to numbers FIRST, then fill the gaps with the
# training mean. Coercing first matters: non-numeric placeholders in the log
# become NaN during coercion, and filling before coercion would leave those
# NaNs in the matrix handed to the model. Features with no counterpart in the
# log are entirely NaN at this point and therefore become the training mean.
for col in numerical_features_for_x:
    training_mean = df[col].mean()
    log_df_features[col] = pd.to_numeric(log_df_features[col], errors='coerce').fillna(training_mean)

# Categorical features: fill missing values with the most frequent training
# value, or 'unknown' if the training column has no mode.
for col in categorical_features_for_x:
    training_mode = df[col].mode()
    fallback_category = training_mode[0] if not training_mode.empty else 'unknown'
    log_df_features[col] = log_df_features[col].fillna(fallback_category)

# Apply the preprocessor only if log_df_features is not empty
if not log_df_features.empty:
    log_df_features_processed = preprocessor.transform(log_df_features)
    print("Log data preprocessed successfully.")
else:
    log_df_features_processed = None # downstream cells must handle this case
    print("No log data found to preprocess after loading conn.log. `log_df_features_processed` is None.")

# Keep a copy of the (unscaled) feature frame; predictions are attached to it later
conn_df = log_df_features.copy()

Log data preprocessed successfully.


# Predict Anomalies and Analyze Results

In [28]:
import numpy as np

# Make predictions on the processed conn.log.
# The preprocessing cell sets log_df_features_processed to None when the log
# has no rows; use an empty probability matrix then so this cell still runs
# and reports "No anomalies detected" instead of failing inside predict().
if log_df_features_processed is None:
    predictions = np.empty((0, len(label_encoder.classes_)))
else:
    predictions = model.predict(log_df_features_processed)

# Get index for 'normal' class
normal_class_index = np.where(label_encoder.classes_ == 'normal')[0][0]

# Anomaly threshold: a row is flagged when its 'normal' probability is below 90%
anomaly_threshold = 0.90

# Probability the model assigns to 'normal' for every row
normal_probabilities = predictions[:, normal_class_index]

# Copy of the probabilities with 'normal' masked out, so argmax picks the
# most likely non-normal class
attack_scores = predictions.copy()
attack_scores[:, normal_class_index] = -1

# Flagged rows get their best non-normal class; all other rows keep the
# model's overall top class
predicted_classes_encoded = np.where(
    normal_probabilities < anomaly_threshold,
    attack_scores.argmax(axis=1),
    predictions.argmax(axis=1),
)

# Decode the predicted classes back to their original labels
predicted_classes = label_encoder.inverse_transform(predicted_classes_encoded)

# Add the predicted traffic type to the conn_df DataFrame
conn_df['predicted_traffic_type'] = predicted_classes

# Define the anomalies
anomalies = conn_df[conn_df['predicted_traffic_type'] != 'normal']

# Display the rows that are predicted as anomalies
if not anomalies.empty:
    print("Anomalies detected:")
    display(anomalies)
else:
    print("No anomalies detected")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step
Anomalies detected:


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,predicted_traffic_type
0,16.403894,icmp,-,SF,0.0,0.0,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,smurf


# Contents of Preprocessed conn.log




In [29]:
# Preview the first rows of the feature frame built from conn.log, including
# the predicted_traffic_type column added by the prediction cell.
conn_preview = conn_df.head()
display(conn_preview)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,predicted_traffic_type
0,16.403894,icmp,-,SF,0.0,0.0,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,smurf
1,287.14465,tcp,http,SF,45566.743,19779.114421,0.000198,0.022687,0.000111,0.204409,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,normal
