# Neural Networks for Classification

In this notebook I will explore neural networks in a similar manner to how i did in the notebook on logistic regression. 

## Data Preprocessing

### Data Loading

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

from helpers.cross_validation import perform_model_cv
from helpers.feature_selection import (
    perform_rfe,
    pretty_print_rfecv_results,
    select_features_by_correlation,
    select_features_by_chi2,
    select_features_by_hill_climbing,
    select_features_by_mutual_info_categorical,
    select_features_by_mutual_info_numeric,
)
from helpers.param_tuning import tune_hyperparameters
from helpers.preprocessing import preprocess_data
from helpers.evaluate_model_final import train_and_evaluate_model

# Define feature lists for preprocessing
categorical_features = [
    "proto", "state", "service", "is_sm_ips_ports", "is_ftp_login"
]
features_to_transform = [
    "sbytes", "dbytes", "sttl", "dttl", "sload", "dload", "spkts", "dpkts",
    "swin", "dwin", "stcpb", "dtcpb", "smeansz", "dmeansz", "sjit", "djit"
]
features_to_drop = ["attack_cat", "id"]

# Load the training and testing datasets
train_data = pd.read_csv("../data/UNSW_NB15/UNSW_NB15_training-set.csv")
test_data = pd.read_csv("../data/UNSW_NB15/UNSW_NB15_testing-set.csv")

# Preprocess the data
train_data, test_data, categorical_features = preprocess_data(
    train_data, 
    test_data, 
    categorical_features, 
    features_to_transform, 
    features_to_drop
)

# After preprocessing, drop the feature 'state_INT' from both datasets if present
if 'state_INT' in train_data.columns:
    train_data.drop(columns=['state_INT'], inplace=True)
if 'state_INT' in test_data.columns:
    test_data.drop(columns=['state_INT'], inplace=True)

categorical_features.remove('state_INT')

# At the end, print numeric and categorical columns from the training data
numeric_columns = [f for f in train_data.columns if f not in categorical_features]
print("\nNumeric columns in the Training Data:")
print(numeric_columns)
print("Total number of Numeric columns:", len(numeric_columns))
print("\nCategorical columns in the Training Data:")
print(categorical_features)
print("Total number of Categorical columns:", len(categorical_features))

2025-04-03 19:14:10,116 - helpers.preprocessing - INFO - Number of common columns: 43
2025-04-03 19:14:11,186 - helpers.preprocessing - INFO - Processed shapes - Train: (175341, 199), Test: (82332, 199)



Numeric columns in the Training Data:
['sinpkt', 'ct_dst_src_ltm', 'tcprtt', 'ct_srv_src', 'dur', 'ct_dst_ltm', 'dbytes', 'ct_src_ltm', 'sttl', 'sload', 'swin', 'ct_state_ttl', 'sloss', 'ct_flw_http_mthd', 'sbytes', 'sjit', 'spkts', 'trans_depth', 'dinpkt', 'ct_dst_sport_ltm', 'ackdat', 'dpkts', 'dmean', 'dwin', 'ct_src_dport_ltm', 'dloss', 'label', 'response_body_len', 'smean', 'ct_ftp_cmd', 'djit', 'rate', 'dtcpb', 'stcpb', 'dload', 'ct_srv_dst', 'dttl', 'synack']
Total number of Numeric columns: 38

Categorical columns in the Training Data:
['proto_3pc', 'proto_a/n', 'proto_aes-sp3-d', 'proto_any', 'proto_argus', 'proto_aris', 'proto_arp', 'proto_ax.25', 'proto_bbn-rcc', 'proto_bna', 'proto_br-sat-mon', 'proto_cbt', 'proto_cftp', 'proto_chaos', 'proto_compaq-peer', 'proto_cphb', 'proto_cpnx', 'proto_crtp', 'proto_crudp', 'proto_dcn', 'proto_ddp', 'proto_ddx', 'proto_dgp', 'proto_egp', 'proto_eigrp', 'proto_emcon', 'proto_encap', 'proto_etherip', 'proto_fc', 'proto_fire', 'proto_ggp

### Feature Selection

In [2]:
# Use the select_features_by_correlation function from our helpers module to get the features with the highest correlation with the target label
sorted_numeric_features_cor = select_features_by_correlation(
    train_data, 
    train_data.columns, 
    categorical_features, 
    target_column='label'
)

print("\nAll correlated features (in sorted order):")
print(sorted_numeric_features_cor)
print(len(sorted_numeric_features_cor))



Selecting features:   0%|          | 0/37 [00:00<?, ?feature/s]

Excluding 'ct_state_ttl' (target corr: 0.5777) due to 0.8462 correlation with 'dload'
Excluding 'dbytes' (target corr: 0.5131) due to 0.9451 correlation with 'dload'
Excluding 'dpkts' (target corr: 0.4719) due to 0.8792 correlation with 'dload'
Excluding 'dwin' (target corr: 0.3197) due to 0.9902 correlation with 'swin'
Excluding 'stcpb' (target corr: 0.3196) due to 0.9883 correlation with 'swin'
Excluding 'dtcpb' (target corr: 0.3189) due to 0.9882 correlation with 'swin'
Excluding 'ct_src_dport_ltm' (target corr: 0.3056) due to 0.9068 correlation with 'ct_dst_sport_ltm'
Excluding 'ct_dst_src_ltm' (target corr: 0.3039) due to 0.8387 correlation with 'ct_dst_sport_ltm'
Excluding 'sbytes' (target corr: 0.2957) due to 0.9078 correlation with 'spkts'
Excluding 'dttl' (target corr: 0.2773) due to 0.8819 correlation with 'swin'
Excluding 'ct_src_ltm' (target corr: 0.2382) due to 0.8030 correlation with 'ct_dst_sport_ltm'
Excluding 'ct_dst_ltm' (target corr: 0.2299) due to 0.8706 correlation

In [3]:
selected_cat_features_chi2 = select_features_by_chi2(
    df=train_data, 
    categorical_features=categorical_features,
    target_column='label', 
    independence_threshold=0.05,  # Standard statistical significance level
    verbose=True,
    eliminate_dependent=True
)

print("\nFinal selected categorical features:")
print(selected_cat_features_chi2)
print(len(selected_cat_features_chi2))


Chi2 Feature Selection:   0%|          | 0/160 [00:00<?, ?it/s]

Selected feature 'state_CON' (chi2 score: 21821.8537)
Excluding 'proto_tcp' (chi2 score: 10624.4313) due to dependency (p-value: 0.0000) with 'state_CON'
Excluding 'state_FIN' (chi2 score: 8971.2211) due to dependency (p-value: 0.0000) with 'state_CON'
Excluding 'proto_arp' (chi2 score: 6092.7843) due to dependency (p-value: 0.0000) with 'state_CON'
Excluding 'is_sm_ips_ports_1' (chi2 score: 5886.0686) due to dependency (p-value: 0.0000) with 'state_CON'
Excluding 'proto_unas' (chi2 score: 5670.3396) due to dependency (p-value: 0.0000) with 'state_CON'
Excluding 'service_dns' (chi2 score: 5635.6050) due to dependency (p-value: 0.0000) with 'state_CON'
Excluding 'proto_udp' (chi2 score: 2875.3442) due to dependency (p-value: 0.0000) with 'state_CON'
Excluding 'service_ssh' (chi2 score: 2706.2218) due to dependency (p-value: 0.0000) with 'state_CON'
Excluding 'service_-' (chi2 score: 2024.0993) due to dependency (p-value: 0.0000) with 'state_CON'
Excluding 'service_ftp-data' (chi2 score:

In [4]:
selected_numeric_features_mi = select_features_by_mutual_info_numeric(
    df=train_data,
    feature_columns=train_data.columns,
    categorical_features=categorical_features,
    target_column="label"
)
print("\nFinal selected numeric features BY MI:")
print(selected_numeric_features_mi)
print(len(selected_numeric_features_mi))

Excluding 'dbytes' (target MI: 0.3732) due to redundancy (score: 5.7521) with 'sbytes'
Excluding 'sttl' (target MI: 0.3722) due to redundancy (score: 2.0858) with 'sbytes'
Excluding 'rate' (target MI: 0.3534) due to redundancy (score: 4.6249) with 'sbytes'
Excluding 'dttl' (target MI: 0.3493) due to redundancy (score: 2.8004) with 'sbytes'
Excluding 'sload' (target MI: 0.3468) due to redundancy (score: 6.6198) with 'sbytes'
Excluding 'ct_state_ttl' (target MI: 0.3464) due to redundancy (score: 2.9756) with 'sbytes'
Excluding 'dur' (target MI: 0.3393) due to redundancy (score: 4.7829) with 'sbytes'
Excluding 'smean' (target MI: 0.3310) due to redundancy (score: 10.6707) with 'sbytes'
Excluding 'dmean' (target MI: 0.3140) due to redundancy (score: 6.3302) with 'sbytes'
Excluding 'dinpkt' (target MI: 0.2982) due to redundancy (score: 4.4877) with 'sbytes'
Excluding 'dload' (target MI: 0.2805) due to redundancy (score: 5.0258) with 'sbytes'
Excluding 'dpkts' (target MI: 0.2698) due to redu

In [5]:
selected_cat_features_mi = select_features_by_mutual_info_categorical(
    df=train_data,
    categorical_features=categorical_features,
    target_column="label",
    verbose=True,
)

print("\nFinal selected categorical features BY MI:")
print(selected_cat_features_mi)
print(len(selected_cat_features_mi))


Selected feature 'proto_tcp' (mutual info score: 0.0673)
Excluding 'state_FIN' (target MI: 0.0573) due to redundancy (score: 11.0213) with 'proto_tcp'
Excluding 'proto_unas' (target MI: 0.0300) due to redundancy (score: 1.4698) with 'proto_tcp'
Excluding 'service_dns' (target MI: 0.0296) due to redundancy (score: 6.8748) with 'proto_tcp'
Excluding 'proto_udp' (target MI: 0.0208) due to redundancy (score: 14.7001) with 'proto_tcp'
Excluding 'proto_ospf' (target MI: 0.0044) due to redundancy (score: 2.0460) with 'proto_tcp'
Excluding 'service_ftp-data' (target MI: 0.0042) due to redundancy (score: 4.3137) with 'proto_tcp'
Excluding 'proto_sctp' (target MI: 0.0031) due to redundancy (score: 1.3103) with 'proto_tcp'
Excluding 'service_pop3' (target MI: 0.0021) due to redundancy (score: 2.4153) with 'proto_tcp'
Excluding 'is_ftp_login_1' (target MI: 0.0019) due to redundancy (score: 6.0350) with 'proto_tcp'
Excluding 'proto_any' (target MI: 0.0010) due to redundancy (score: 1.0200) with 'pr

In [6]:
combined_features = []
for fl in [selected_cat_features_chi2,  selected_cat_features_mi, selected_numeric_features_mi, sorted_numeric_features_cor]:
    combined_features += fl[:10]
combined_features = list(set(combined_features))
print(combined_features)
print(len(combined_features))

['sinpkt', 'sttl', 'sload', 'proto_qnx', 'swin', 'service_ssh', 'sbytes', 'proto_icmp', 'proto_igmp', 'state_CON', 'spkts', 'service_-', 'is_ftp_login_2', 'is_sm_ips_ports_0', 'ct_dst_sport_ltm', 'ackdat', 'service_ssl', 'dmean', 'rate', 'proto_ipcv', 'service_irc', 'dload', 'proto_secure-vmtp', 'proto_tcp', 'proto_rtp', 'is_ftp_login_0']
26


## Training, feature selection, and tuning

### Hill climbing function for feture selection

In [7]:
# Create a set of kwargs for MLPClassifier
from sklearn.neural_network import MLPClassifier
import random
import time

# Generate a truly random seed based on current time
random_seed = int(time.time()) % 10000
print(f"Using random seed: {random_seed}")

model_params = {
    'hidden_layer_sizes': (5,5),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0001,
    'max_iter': 1000,
    'shuffle': True,
    'tol': 0.0001,
    'verbose': False,
    'warm_start': False,
    'momentum': 0.9,
    'learning_rate': 'adaptive',
    'learning_rate_init': 0.001,
    'power_t': 0.5,
    'batch_size': 'auto',
    'nesterovs_momentum': True,
    'early_stopping': False,
    'validation_fraction': 0.1,
    'random_state': random_seed
}

# Use a sample of the training data to speed up feature selection
tuning_set = train_data.sample(frac=0.1, random_state=random_seed)

# Use hill climbing feature selection from our helpers module
selected_features, results = select_features_by_hill_climbing(
    df=tuning_set,
    feature_columns=combined_features,
    model_class=MLPClassifier,
    model_params=model_params,
    label_column='label',
    min_features=5,
    max_features=20,
    max_iterations=100,
    n_folds=2,
    random_state=random_seed,
    scoring='f1',
    verbose=1
)

print("\nBest features selected by hill climbing:")
print(selected_features)
print(f"Number of features: {len(selected_features)}")
print(f"F1 Score: {results['f1_score']:.4f}")
print(f"Accuracy: {results['accuracy']:.4f}")

# selected_features = ['service_dns', 'is_sm_ips_ports_1', 'sttl', 'sload', 'proto_any', 'proto_sctp', 'service_pop3', 'proto_tcp', 'dload', 'proto_gre', 'spkts', 'proto_udp']

Using random seed: 2433
Initial feature set (12 features) - F1: 0.9513


Hill Climbing:   0%|          | 0/100 [00:00<?, ?it/s]

Iter 0: Removed: is_ftp_login_2 - f1: 0.9523 (improved)
Iter 1: Added: proto_rtp - f1: 0.9526 (improved)
Iter 21: Added: ct_dst_sport_ltm - f1: 0.9527 (improved)
Iter 25: Removed: proto_igmp - f1: 0.9530 (improved)
Iter 27: Swapped: proto_rtp → proto_tcp - f1: 0.9530 (improved)
Iter 60: Removed: dmean - f1: 0.9514 (explored)
Iter 61: Swapped: proto_tcp → proto_secure-vmtp - f1: 0.9516 (explored)
Iter 69: Added: service_irc - f1: 0.9520 (explored)
Iter 82: Added: service_ssh - f1: 0.9521 (explored)
Iter 95: Swapped: service_irc → proto_igmp - f1: 0.9523 (explored)

Best feature set (12 features) - f1: 0.9530

Best features selected by hill climbing:
['dload', 'state_CON', 'is_sm_ips_ports_0', 'swin', 'ackdat', 'is_ftp_login_0', 'spkts', 'service_-', 'dmean', 'service_ssl', 'ct_dst_sport_ltm', 'proto_tcp']
Number of features: 12
F1 Score: 0.9530
Accuracy: 0.9329


In [8]:
best_features = selected_features

### Hyperparameter Tuning

In [9]:
# Generate a truly random seed based on current time
random_seed = int(time.time()) % 10000
print(f"Using random seed: {random_seed}")

# Define parameter grid for grid search with focus on reducing overfitting
param_grid = {
    # Network architecture: try simpler architectures to reduce overfitting
    'hidden_layer_sizes': [(100,100,100)],
    
    # Activation function for hidden layers
    'activation': ['relu', 'tanh'],
    
    # Solver for weight optimization
    'solver': ['adam'],
    
    # Learning rate schedule - adaptive can help with overfitting
    'learning_rate': ['adaptive'],
    
    # Initial learning rate - smaller values can help prevent overfitting
    'learning_rate_init': [0.0001, 0.001],
    
    # Regularization parameter (L2 penalty) - key for controlling overfitting
    'alpha': [0.001, 0.01, 0.1],
    
    # Batch size for mini-batch optimization
    'batch_size': ['auto'],
    
    # Maximum number of iterations
    'max_iter': [5000],
    
    # Early stopping to prevent overfitting - enable this
    'early_stopping': [True, False],
    
    # Validation fraction for early stopping
    'validation_fraction': [0.2],
    
    # Whether to shuffle samples in each iteration
    'shuffle': [True],
    
    # Momentum for gradient descent - can help escape local minima
    'momentum': [0.8, 0.9],
    
    # Random state for reproducibility
    'random_state': [random_seed]
}

# Use a sample of the training data to speed up hyperparameter tuning
tuning_sample = train_data.sample(frac=0.3, random_state=random_seed)

# Run hyperparameter tuning using the function from param_tuning.py
best_params, best_score, cv_results = tune_hyperparameters(
    df=tuning_sample,
    model_class=MLPClassifier,
    param_grid=param_grid,
    feature_columns=best_features,  # Use features found by hill climbing
    label_column='label',
    scoring='f1',
    n_splits=2,  # Increased from 2 to get better cross-validation
    random_state=random_seed,
    verbose=1
)


Using random seed: 2872
Starting grid search with 48 parameter combinations...
Fitting 2 folds for each of 48 candidates, totalling 96 fits

Best Score (f1): 0.9539
Best Parameters:
  activation: tanh
  alpha: 0.001
  batch_size: auto
  early_stopping: False
  hidden_layer_sizes: (100, 100, 100)
  learning_rate: adaptive
  learning_rate_init: 0.001
  max_iter: 5000
  momentum: 0.8
  random_state: 2872
  shuffle: True
  solver: adam
  validation_fraction: 0.2


## Final Battle

In [10]:
from helpers.evaluate_model_final import train_and_evaluate_model
from sklearn.neural_network import MLPClassifier

# Create the model with best parameters
model = MLPClassifier(**best_params)

# Train and evaluate the model
results, best_model = train_and_evaluate_model(
    train_df=train_data,
    test_df=test_data,
    selected_features=best_features,
    label_column='label',
    model=model,
    stratify=True,
    n_splits=5
)

for k, v in results.items():
    print(k, v)

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

accuracy 0.8228027984258854
precision 0.7620395151804435
recall 0.9861025324274243
f1 0.8597117113651881
true_positive_rate 0.9861025324274243
true_negative_rate 0.6227297297297297
false_positive_rate 0.37727027027027027
false_negative_rate 0.013897467572575664
