# RANDOM FOREST MODEL

#STEP 1 : Load the training and test data


In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
import os 

# Step 1: Load the training and test data
current_dir = os.path.dirname(os.path.abspath('__file__'))
project_root = current_dir
while not os.path.exists(os.path.join(project_root, 'system_main')):
    project_root = os.path.dirname(project_root)
    if project_root == os.path.dirname(project_root):  # reached the root directory
        raise FileNotFoundError("Could not find the 'system_main' directory.")

# Set up paths
dataset_dir = os.path.join(project_root, 'system_main', 'dataset')
model_dir = os.path.join(project_root, 'system_main', 'model', 'saved_model')

# Ensure the directories exist
os.makedirs(dataset_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

# Print the paths for verification
print(f"Dataset directory: {dataset_dir}")
print(f"Model directory: {model_dir}")

# Load the data
train_data_path = os.path.join(dataset_dir, 'UNSW_NB15_training-set.csv')
test_data_path = os.path.join(dataset_dir, 'UNSW_NB15_testing-set.csv')

# Check if the files exist
if not os.path.exists(train_data_path):
    raise FileNotFoundError(f"Training dataset not found at {train_data_path}")
if not os.path.exists(test_data_path):
    raise FileNotFoundError(f"Testing dataset not found at {test_data_path}")

# Load the data
train_data_initial = pd.read_csv(train_data_path)
test_data_initial = pd.read_csv(test_data_path)

# Print the shape of the loaded data for verification
print(f"Training data shape: {train_data_initial.shape}")
print(f"Testing data shape: {test_data_initial.shape}")

Dataset directory: c:\Users\kimki\network_anomaly_detect\system_main\dataset
Model directory: c:\Users\kimki\network_anomaly_detect\system_main\model\saved_model
Training data shape: (175341, 45)
Testing data shape: (82332, 45)


#STEP 2: Data Preprocessing

In [34]:
def engineer_features(df):
    # Print available columns for debugging
    print("Available columns:", df.columns.tolist())

    # Create a feature for the total number of bytes
    if 'sbytes' in df.columns and 'dbytes' in df.columns:
        df['total_bytes'] = df['sbytes'] + df['dbytes']
    
    # Create a feature for the ratio of source to destination bytes
    if 'sbytes' in df.columns and 'dbytes' in df.columns:
        df['byte_ratio'] = df['sbytes'] / (df['dbytes'] + 1)  # Adding 1 to avoid division by zero
    
    # Create a feature for the difference in TTL
    if 'sttl' in df.columns and 'dttl' in df.columns:
        df['ttl_diff'] = df['sttl'] - df['dttl']
    
    # Create a binary feature for high port numbers 
    if 'sport' in df.columns and 'dport' in df.columns:
        df['high_port'] = ((df['sport'] > 1024) | (df['dport'] > 1024)).astype(int)
    
    return df



In [35]:
# Step 2: Data-Preprocessing
# Feature Engineering Function to make up the lost of attack_cat feature 


# Drop the 'service' column from both training and test datasets as it contains a lot of missing values
train_data = train_data_initial.drop(columns=['service'])
test_data = test_data_initial.drop(columns=['service'])
# Apply feature engineering
train_data = engineer_features(train_data)
test_data = engineer_features(test_data)

# Define the target column and categorical columns
target_column = 'label'
categorical_columns = ['proto', 'state','attack_cat']
# Separate features and target from the training data
X_train = train_data.drop(columns=[target_column])
y_train = train_data[target_column]

# Separate features and target from the test data
X_test = test_data.drop(columns=[target_column])
y_test = test_data[target_column]

# Initialize label encoders for categorical columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(X_train[col].astype(str))  # Fit on the training data
    X_train[col] = le.transform(X_train[col].astype(str))

    # Handle unseen categories in the test set
    X_test[col] = X_test[col].astype(str)
    X_test[col] = X_test[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)

    label_encoders[col] = le

Available columns: ['id', 'dur', 'proto', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label']
Available columns: ['id', 'dur', 'proto', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_sr

# STEP 3 : Initialize and train the basic Random Forest model

In [36]:
# Step 3: Initialize and train the basic Random Forest model
basic_model = RandomForestClassifier(n_estimators=100, random_state=42)
basic_model.fit(X_train, y_train)

# Make predictions on the test data using the basic model
y_pred = basic_model.predict(X_test)

# Evaluate the basic model
basic_accuracy = accuracy_score(y_test, y_pred)
basic_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Basic Model Accuracy: {basic_accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report for Basic Model:')
print(basic_report)

Basic Model Accuracy: 0.86
Confusion Matrix:
[[36996     4]
 [11697 33635]]
Classification Report for Basic Model:
              precision    recall  f1-score   support

           0       0.76      1.00      0.86     37000
           1       1.00      0.74      0.85     45332

    accuracy                           0.86     82332
   macro avg       0.88      0.87      0.86     82332
weighted avg       0.89      0.86      0.86     82332



# STEP 4: Model Optimization

When we further analysed the UNSW-NB15, we noticed that it is an imbalanced dataset. The SMOTE (Synthetic Minority Oversampling Technique) algorithm is incorporated to create synthetic samples for the minority class to balance out the class distribution of the dataset. Then we optimize the model again over the balanced training dataset.

In [37]:
#Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [38]:
# Step 4: Model Optimization
#Feature selection using SelectKBest after the basic model

selector = SelectKBest(score_func=f_classif, k=25)  # Selecting top 20 features
X_train_selected = selector.fit_transform(X_train_resampled, y_train_resampled)

# Get selected feature names
selected_feature_names = X_train_resampled.columns[selector.get_support()]

# Apply the same selection to test data
X_test_selected = X_test[selected_feature_names]

# Train a new Random Forest model on the selected features
optimized_model = RandomForestClassifier(n_estimators=100, random_state=42)
optimized_model.fit(X_train_selected, y_train_resampled)

# Make predictions on the test data using the optimized model
y_pred_optimized = optimized_model.predict(X_test_selected)

# Evaluate the optimized model
optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
optimized_report = classification_report(y_test, y_pred_optimized)
optimized_conf_matrix = confusion_matrix(y_test, y_pred_optimized)

print(f'Optimized Model Accuracy: {optimized_accuracy:.2f}')
print('Confusion Matrix for Optimized Model:')
print(optimized_conf_matrix)
print('Classification Report for Optimized Model:')
print(optimized_report)



Optimized Model Accuracy: 0.97
Confusion Matrix for Optimized Model:
[[37000     0]
 [ 2551 42781]]
Classification Report for Optimized Model:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     37000
           1       1.00      0.94      0.97     45332

    accuracy                           0.97     82332
   macro avg       0.97      0.97      0.97     82332
weighted avg       0.97      0.97      0.97     82332



In [39]:
import joblib
# Save the model and preprocessor information
model_path = os.path.join(model_dir, 'random_forest_model.joblib')
joblib.dump(optimized_model, model_path)

preprocessor = {
    'label_encoders': label_encoders,
    'selected_features': selected_feature_names,
    'feature_selector': selector,
    'smote': smote
}
preprocessor_path = os.path.join(model_dir, 'rf_preprocessor.joblib')
joblib.dump(preprocessor, preprocessor_path)

print(f"Model saved to: {model_path}")
print(f"Preprocessor saved to: {preprocessor_path}")

Model saved to: c:\Users\kimki\network_anomaly_detect\system_main\model\saved_model\random_forest_model.joblib
Preprocessor saved to: c:\Users\kimki\network_anomaly_detect\system_main\model\saved_model\rf_preprocessor.joblib
