<a href="https://colab.research.google.com/github/ladyTootie/ACE-R-D/blob/main/MPL_Final_Copy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import TensorFlow and Check the Version

In [None]:
import tensorflow as tf

# Report the installed TensorFlow release so the run can be reproduced later.
print(f"TensorFlow version: {tf.__version__}")

TensorFlow version: 2.19.0


# Load and Preprocess NSL-KDD dataset

In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential #linear stack of layers
from tensorflow.keras.layers import Dense #connected neural network layer

# Define column names: 41 feature columns followed by the label
# ('traffic_type') and the per-record 'difficulty_level' score.
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'traffic_type', 'difficulty_level' # Adjusted column names
]

# Load the dataset; column names are supplied explicitly via `names=`.
df = pd.read_csv('/content/KDDTrain+.txt', names=column_names)

# Drop the difficulty_level column: it is not used as a model input.
df = df.drop('difficulty_level', axis=1)

# Check if dataset loaded correctly
display(df.head())

# Identify categorical and numerical features. Everything that is neither
# categorical nor the label is treated as numeric.
categorical_features = ['protocol_type', 'service', 'flag']
numerical_features = [col for col in df.columns if col not in categorical_features + ['traffic_type']]

# Convert numerical columns to numeric, coercing unparseable values to NaN,
# then replace NaNs with the column mean.
for col in numerical_features:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col] = df[col].fillna(df[col].mean()) # or .median()

# Separate features (X) and labels (y)
X = df.drop('traffic_type', axis=1)
y = df['traffic_type']

# Encode the target variable as integer class ids. The encoder is fitted on
# every label so that each class keeps a stable id regardless of the split.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the RAW features first. Fitting the scaler / one-hot encoder before
# splitting would let test-set statistics and categories leak into training
# and make the test score optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42)

# Preprocessing: one-hot encode categorical features and scale numerical ones.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Fit on the training rows only, then apply the fitted transform to the test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full dataset in model-input form, kept under its original name for any
# later cell that still refers to it.
X_processed = preprocessor.transform(X)

# Sanity checks: rows and labels must stay aligned, and both splits must
# share the same feature width.
assert X_train.shape[0] == len(y_train)
assert X_test.shape[0] == len(y_test)
assert X_train.shape[1] == X_test.shape[1]



Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,traffic_type
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


# Create the MLP Model


In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Get the number of input features (width of the preprocessed matrix)
input_shape = X_train.shape[1]

# Get the number of output classes (one unit per label known to the encoder)
output_shape = len(label_encoder.classes_)

# Create the MLP model. The input width is declared with an explicit Input
# object; passing `input_shape=` to the first Dense layer of a Sequential
# model makes Keras emit a UserWarning.
model = Sequential([
    tf.keras.Input(shape=(input_shape,)), # Input specification
    Dense(128, activation='relu'), # First hidden layer
    Dense(64, activation='relu'), # Second hidden layer
    Dense(output_shape, activation='softmax') # Output layer with softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', # Use sparse_categorical_crossentropy for integer labels
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


# Train the Model

In [None]:
# Fit the network for 10 epochs in mini-batches of 32, holding out 20% of the
# training rows for per-epoch validation. `history` keeps the returned
# training record.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

print("Model training complete.")

Epoch 1/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.9428 - loss: 0.2613 - val_accuracy: 0.9914 - val_loss: 0.0349
Epoch 2/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - accuracy: 0.9910 - loss: 0.0297 - val_accuracy: 0.9915 - val_loss: 0.0274
Epoch 3/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9928 - loss: 0.0216 - val_accuracy: 0.9925 - val_loss: 0.0277
Epoch 4/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9933 - loss: 0.0219 - val_accuracy: 0.9932 - val_loss: 0.0234
Epoch 5/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9947 - loss: 0.0176 - val_accuracy: 0.9947 - val_loss: 0.0233
Epoch 6/10
[1m2520/2520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9945 - loss: 0.0171 - val_accuracy: 0.9930 - val_loss: 0.0326
Epoch 7/10
[

In [None]:
# Score the trained model on the held-out test split; verbose=0 silences the
# progress bar. The result unpacks into the loss and the accuracy metric.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)

print("Model evaluation complete.")
print("Test Loss: {:.4f}".format(loss))
print("Test Accuracy: {:.4f}".format(accuracy))

Model evaluation complete.
Test Loss: 0.0262
Test Accuracy: 0.9934


# Preprocess the conn.log

In [None]:

# Load the new conn.log data using the column names defined for NSL-KDD.
# NOTE(review): this assumes the file is already a comma-separated export in
# NSL-KDD column order. A raw Zeek/Bro conn.log (tab-separated, '#'-prefixed
# header lines, different fields) would not parse into these features --
# confirm the file format before trusting the predictions.
conn_df = pd.read_csv('/content/conn(2).log', names=column_names)

# Feature columns the fitted preprocessor expects (label and difficulty excluded).
expected_features = [col for col in column_names if col not in ['difficulty_level', 'traffic_type']]

# Re-use the numeric feature list the preprocessor was fitted on. Rebuilding
# it from conn_df.columns would pull in 'difficulty_level', which was dropped
# from the training frame `df`, and the mean lookup below would raise KeyError.
numerical_features_conn = list(numerical_features)

# Convert numerical columns to numeric, coercing unparseable values to NaN.
for col in numerical_features_conn:
    conn_df[col] = pd.to_numeric(conn_df[col], errors='coerce')

# `names=column_names` guarantees every expected column exists, so a
# column-presence check can never fail. Measure instead how much of the
# feature data is missing or non-numeric: if most of it is, the rows are about
# to be replaced by training means and the predictions will be meaningless.
MAX_MISSING_FRACTION = 0.5
missing_mask = conn_df[expected_features].isna().to_numpy()
missing_fraction = float(missing_mask.mean()) if missing_mask.size else 0.0
if missing_fraction > MAX_MISSING_FRACTION:
    print("Warning: Columns in conn.log do not match expected features.")
    print(f"{missing_fraction:.1%} of the expected feature values are missing or non-numeric and will be imputed.")

# Fill numeric NaNs with the per-column mean from the original training df.
training_means = df[numerical_features_conn].mean()
conn_df[numerical_features_conn] = conn_df[numerical_features_conn].fillna(training_means)

# Fill NaN values in categorical features with a string placeholder; the
# one-hot encoder was built with handle_unknown='ignore', so an unseen value
# is encoded as all zeros rather than raising.
for col in categorical_features:
    conn_df[col] = conn_df[col].fillna('missing')

# Select the features in the same column lists used when fitting.
X_conn = conn_df[numerical_features + categorical_features]

# Apply the already-fitted preprocessing transformation (no re-fitting here).
X_conn_processed = preprocessor.transform(X_conn)

print("Preprocessing of conn.log complete.")
print("Shape of preprocessed conn.log data:", X_conn_processed.shape)


Preprocessing of conn.log complete.
Shape of preprocessed conn.log data: (763, 122)


# Predict Anomalies and Analyze Results

In [None]:
# Run the trained model over the preprocessed conn.log rows. Each output row
# holds one softmax probability per class.
predictions = model.predict(X_conn_processed)

# Reduce each probability row to the index of its most likely class.
predicted_classes_encoded = predictions.argmax(axis=1)

# Map the integer class ids back to their original string labels.
predicted_classes = label_encoder.inverse_transform(predicted_classes_encoded)

# Attach the predicted label to every connection record.
conn_df['predicted_traffic_type'] = predicted_classes

# Anything the model does not label 'normal' is treated as an anomaly.
is_anomalous = conn_df['predicted_traffic_type'].ne('normal')
anomalies = conn_df[is_anomalous]

# Report the flagged rows, or state that none were found.
if anomalies.empty:
    print("No anomalies detected in the conn.log data.")
else:
    print("Anomalies detected:")
    display(anomalies)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
No anomalies detected in the conn.log data.
