# Logistic Regression Experiments

In this notebook i will explore the effectiveness of using logistic regression on the UNSW_NB15 intrusion detection dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

from helpers.cross_validation import perform_model_cv
from helpers.feature_selection import (
    perform_rfe,
    pretty_print_rfecv_results,
    select_features_by_correlation,
    select_features_by_chi2,
    select_features_by_hill_climbing,
    select_features_by_mutual_info_categorical,
    select_features_by_mutual_info_numeric,
)
from helpers.param_tuning import tune_hyperparameters
from helpers.preprocessing import preprocess_data
from helpers.evaluate_model_final import train_and_evaluate_model

# Define feature lists for preprocessing
categorical_features = [
    "proto", "state", "service", "is_sm_ips_ports", "is_ftp_login"
]
features_to_transform = [
    "sbytes", "dbytes", "sttl", "dttl", "sload", "dload", "spkts", "dpkts",
    "swin", "dwin", "stcpb", "dtcpb", "smeansz", "dmeansz", "sjit", "djit"
]
features_to_drop = ["attack_cat", "id"]

# Load the training and testing datasets
train_data = pd.read_csv("../data/UNSW_NB15/UNSW_NB15_training-set.csv")
test_data = pd.read_csv("../data/UNSW_NB15/UNSW_NB15_testing-set.csv")

# Preprocess the data
train_data, test_data, categorical_features = preprocess_data(
    train_data, 
    test_data, 
    categorical_features, 
    features_to_transform, 
    features_to_drop
)

# At the end, print numeric and categorical columns from the training data
numeric_columns = [f for f in train_data.columns if f not in categorical_features]
print("\nNumeric columns in the Training Data:")
print(numeric_columns)
print("Total number of Numeric columns:", len(numeric_columns))
print("\nCategorical columns in the Training Data:")
print(categorical_features)
print("Total number of Categorical columns:", len(categorical_features))

2025-04-01 17:52:49,740 - helpers.preprocessing - INFO - Number of common columns: 43
2025-04-01 17:52:50,875 - helpers.preprocessing - INFO - Processed shapes - Train: (175341, 199), Test: (82332, 199)



Numeric columns in the Training Data:
['sttl', 'sload', 'ct_src_ltm', 'label', 'dload', 'dttl', 'dloss', 'dmean', 'dwin', 'ct_dst_ltm', 'spkts', 'ct_srv_dst', 'ct_src_dport_ltm', 'rate', 'trans_depth', 'ct_dst_src_ltm', 'ct_state_ttl', 'synack', 'stcpb', 'ct_srv_src', 'swin', 'sinpkt', 'response_body_len', 'tcprtt', 'sloss', 'dinpkt', 'ct_flw_http_mthd', 'dur', 'dbytes', 'ct_ftp_cmd', 'sjit', 'smean', 'ackdat', 'ct_dst_sport_ltm', 'dpkts', 'sbytes', 'dtcpb', 'djit']
Total number of Numeric columns: 38

Categorical columns in the Training Data:
['proto_3pc', 'proto_a/n', 'proto_aes-sp3-d', 'proto_any', 'proto_argus', 'proto_aris', 'proto_arp', 'proto_ax.25', 'proto_bbn-rcc', 'proto_bna', 'proto_br-sat-mon', 'proto_cbt', 'proto_cftp', 'proto_chaos', 'proto_compaq-peer', 'proto_cphb', 'proto_cpnx', 'proto_crtp', 'proto_crudp', 'proto_dcn', 'proto_ddp', 'proto_ddx', 'proto_dgp', 'proto_egp', 'proto_eigrp', 'proto_emcon', 'proto_encap', 'proto_etherip', 'proto_fc', 'proto_fire', 'proto_ggp

First we will preprocess the data by performing the log transformations and then one hot encoding categorical features

In [2]:
# Use the select_features_by_correlation function from our helpers module to get the features with the highest correlation with the target label
sorted_numeric_features_cor = select_features_by_correlation(
    train_data, 
    train_data.columns, 
    categorical_features, 
    target_column='label'
)

print("\nAll correlated features (in sorted order):")
print(sorted_numeric_features_cor)
print(len(sorted_numeric_features_cor))



Selecting features:   0%|          | 0/37 [00:00<?, ?feature/s]

Excluding 'ct_state_ttl' (target corr: 0.5777) due to 0.8462 correlation with 'dload'
Excluding 'dbytes' (target corr: 0.5131) due to 0.9451 correlation with 'dload'
Excluding 'dpkts' (target corr: 0.4719) due to 0.8792 correlation with 'dload'
Excluding 'dwin' (target corr: 0.3197) due to 0.9902 correlation with 'swin'
Excluding 'stcpb' (target corr: 0.3196) due to 0.9883 correlation with 'swin'
Excluding 'dtcpb' (target corr: 0.3189) due to 0.9882 correlation with 'swin'
Excluding 'ct_src_dport_ltm' (target corr: 0.3056) due to 0.9068 correlation with 'ct_dst_sport_ltm'
Excluding 'ct_dst_src_ltm' (target corr: 0.3039) due to 0.8387 correlation with 'ct_dst_sport_ltm'
Excluding 'sbytes' (target corr: 0.2957) due to 0.9078 correlation with 'spkts'
Excluding 'dttl' (target corr: 0.2773) due to 0.8819 correlation with 'swin'
Excluding 'ct_src_ltm' (target corr: 0.2382) due to 0.8030 correlation with 'ct_dst_sport_ltm'
Excluding 'ct_dst_ltm' (target corr: 0.2299) due to 0.8706 correlation

In [3]:
selected_cat_features_chi2 = select_features_by_chi2(
    df=train_data, 
    categorical_features=categorical_features,
    target_column='label', 
    independence_threshold=0.05,  # Standard statistical significance level
    verbose=True,
    eliminate_dependent=True
)

print("\nFinal selected categorical features:")
print(selected_cat_features_chi2)
print(len(selected_cat_features_chi2))


Chi2 Feature Selection:   0%|          | 0/161 [00:00<?, ?it/s]

Selected feature 'state_INT' (chi2 score: 23639.7857)
Excluding 'state_CON' (chi2 score: 21821.8537) due to dependency (p-value: 0.0000) with 'state_INT'
Excluding 'proto_tcp' (chi2 score: 10624.4313) due to dependency (p-value: 0.0000) with 'state_INT'
Excluding 'state_FIN' (chi2 score: 8971.2211) due to dependency (p-value: 0.0000) with 'state_INT'
Excluding 'proto_arp' (chi2 score: 6092.7843) due to dependency (p-value: 0.0000) with 'state_INT'
Excluding 'is_sm_ips_ports_1' (chi2 score: 5886.0686) due to dependency (p-value: 0.0000) with 'state_INT'
Excluding 'proto_unas' (chi2 score: 5670.3396) due to dependency (p-value: 0.0000) with 'state_INT'
Excluding 'service_dns' (chi2 score: 5635.6050) due to dependency (p-value: 0.0000) with 'state_INT'
Excluding 'proto_udp' (chi2 score: 2875.3442) due to dependency (p-value: 0.0000) with 'state_INT'
Excluding 'service_ssh' (chi2 score: 2706.2218) due to dependency (p-value: 0.0000) with 'state_INT'
Excluding 'service_-' (chi2 score: 2024.

In [4]:
selected_numeric_features_mi = select_features_by_mutual_info_numeric(
    df=train_data,
    feature_columns=train_data.columns,
    categorical_features=categorical_features,
    target_column="label"
)
print("\nFinal selected numeric features BY MI:")
print(selected_numeric_features_mi)
print(len(selected_numeric_features_mi))

Excluding 'dbytes' (target MI: 0.3751) due to redundancy (score: 5.7200) with 'sbytes'
Excluding 'sttl' (target MI: 0.3681) due to redundancy (score: 2.1184) with 'sbytes'
Excluding 'rate' (target MI: 0.3533) due to redundancy (score: 4.6194) with 'sbytes'
Excluding 'dttl' (target MI: 0.3495) due to redundancy (score: 2.7952) with 'sbytes'
Excluding 'sload' (target MI: 0.3470) due to redundancy (score: 6.6040) with 'sbytes'
Excluding 'ct_state_ttl' (target MI: 0.3432) due to redundancy (score: 3.0093) with 'sbytes'
Excluding 'dur' (target MI: 0.3377) due to redundancy (score: 4.8046) with 'sbytes'
Excluding 'smean' (target MI: 0.3320) due to redundancy (score: 10.6304) with 'sbytes'
Excluding 'dmean' (target MI: 0.3139) due to redundancy (score: 6.3305) with 'sbytes'
Excluding 'dinpkt' (target MI: 0.2970) due to redundancy (score: 4.5075) with 'sbytes'
Excluding 'dload' (target MI: 0.2811) due to redundancy (score: 5.0118) with 'sbytes'
Excluding 'dpkts' (target MI: 0.2731) due to redu

In [5]:
selected_cat_features_mi = select_features_by_mutual_info_categorical(
    df=train_data,
    categorical_features=categorical_features,
    target_column="label",
    verbose=True,
)

print("\nFinal selected categorical features BY MI:")
print(selected_cat_features_mi)
print(len(selected_cat_features_mi))


Selected feature 'state_INT' (mutual info score: 0.1640)
Excluding 'proto_tcp' (target MI: 0.0699) due to redundancy (score: 6.7760) with 'state_INT'
Excluding 'state_CON' (target MI: 0.0634) due to redundancy (score: 0.7923) with 'state_INT'
Excluding 'state_FIN' (target MI: 0.0576) due to redundancy (score: 7.8163) with 'state_INT'
Excluding 'service_dns' (target MI: 0.0289) due to redundancy (score: 3.6178) with 'state_INT'
Excluding 'proto_unas' (target MI: 0.0285) due to redundancy (score: 1.9343) with 'state_INT'
Excluding 'proto_udp' (target MI: 0.0236) due to redundancy (score: 6.1059) with 'state_INT'
Excluding 'service_ftp-data' (target MI: 0.0047) due to redundancy (score: 3.1015) with 'state_INT'
Excluding 'service_pop3' (target MI: 0.0022) due to redundancy (score: 1.8581) with 'state_INT'
Excluding 'proto_any' (target MI: 0.0012) due to redundancy (score: 1.1262) with 'state_INT'
Excluding 'proto_rsvp' (target MI: 0.0011) due to redundancy (score: 0.7801) with 'state_INT'

In [6]:
combined_features = []
for fl in [selected_cat_features_chi2,  selected_cat_features_mi, selected_numeric_features_mi, sorted_numeric_features_cor]:
    combined_features += fl[:10]
combined_features = list(set(combined_features))
print(combined_features)
print(len(combined_features))

['is_sm_ips_ports_0', 'service_radius', 'sttl', 'state_PAR', 'proto_pvp', 'dload', 'dmean', 'spkts', 'service_-', 'proto_rtp', 'proto_il', 'rate', 'proto_ipcv', 'is_ftp_login_0', 'swin', 'state_URN', 'proto_crtp', 'sinpkt', 'state_INT', 'is_ftp_login_2', 'ackdat', 'ct_dst_sport_ltm', 'sbytes', 'state_no', 'proto_cpnx', 'sload', 'proto_trunk-2']
27


Here we present a hill climbing function that we will use to select different sets of features based on what we have calculated above

In [7]:
# Create a tuning set for feature selection
import random
import numpy as np
random_seed = random.randint(1, 10000)  # Generate a truly random seed each time
tuning_set = train_data.sample(frac=0.2, random_state=random_seed)

# Set up model parameters with random seed
model_params = {'max_iter': 50000, 'random_state': random_seed, 'solver': 'saga'}

# Perform recursive feature elimination with cross-validation
print("Running RFE with cross-validation...")
rfe_results = perform_rfe(
    df=tuning_set, 
    feature_columns=combined_features, 
    model_params=model_params,
    label_column='label', 
    n_features_to_select=10, 
    step=1, 
    cv=2, 
    scoring='f1', 
    random_state=random_seed, 
    verbose=1
)

# Display the RFE results
pretty_print_rfecv_results(rfe_results)

# Store the results
results = rfe_results

# Get the best features from RFE results
best_features = results[0]

Running RFE with cross-validation...
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.

                    FEATURE SELECTION RESULTS
Optimal number of features: 12

Selected features:
  1. is_sm_ips_ports_0
  2. service_radius
  3. sttl
  4. dload
  5. service_-
  6. proto_il
  7. proto_ipcv
  8. proto_crtp
  9. state_INT
  10. sbytes
  11. sload
  12. proto_trunk-2

Best cross-validation score: 0.9434 ± 0.0008
Cross-validation scoring metric: f1



In [8]:
print("tuning with best features: ", best_features)
tuning_set = train_data.sample(frac=.2, random_state=random_seed)

# Define the parameter grid specific to logistic regression
param_grid = {
    'max_iter': [10000],
    'C': [1e-4, 1e-3, 1e-2, 1e-1, 1.0, 2, 5, 10],
    'tol': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
    'penalty': ['l2', 'l1', None],
    'solver': ['saga']
}

# Directly call tune_hyperparameters instead of using a wrapper function
best_params, best_score, cv_results = tune_hyperparameters(
    df=tuning_set,
    model_class=LogisticRegression,
    param_grid=param_grid,
    feature_columns=best_features,
    label_column='label',
    scoring='f1',
    n_splits=2,
    random_state=random_seed,
    n_jobs=-1,
    verbose=1
)

tuning with best features:  ['is_sm_ips_ports_0', 'service_radius', 'sttl', 'dload', 'service_-', 'proto_il', 'proto_ipcv', 'proto_crtp', 'state_INT', 'sbytes', 'sload', 'proto_trunk-2']
Starting grid search with 120 parameter combinations...
Fitting 2 folds for each of 120 candidates, totalling 240 fits

Best Score (f1): 0.9438
Best Parameters:
  C: 0.1
  max_iter: 10000
  penalty: l2
  solver: saga
  tol: 0.001


In [9]:
# Perform cross-validation on the entire training set using the selected features
print("\nPerforming cross-validation on the entire training set with selected features...")
cv_results = perform_model_cv(
    df=train_data,
    model_class=LogisticRegression,
    model_params=best_params,
    feature_columns=best_features,
    label_column='label',
    n_splits=5,
    random_state=random_seed
)

# Display the cross-validation results
print("\nCross-validation results:")
for k, v in cv_results.items():
    print(f"{k}: {v}")




Performing cross-validation on the entire training set with selected features...

Cross-validation results:
accuracy: {'mean': 0.9174351853716354, 'std': 0.0019786996356914457}
precision: {'mean': 0.9034786979264677, 'std': 0.002402691672910262}
recall: {'mean': 0.983802718827474, 'std': 0.000822505174806522}
f1: {'mean': 0.9419294395019143, 'std': 0.0013177495564159814}
true_positive_rate: {'mean': 0.983802718827474, 'std': 0.000822505174806522}
false_positive_rate: {'mean': 0.22400000000000003, 'std': 0.006199901250036495}
true_negative_rate: {'mean': 0.776, 'std': 0.006199901250036488}
false_negative_rate: {'mean': 0.016197281172525953, 'std': 0.0008225051748065153}


In [10]:
# Train and evaluate the model using the imported function
results, trained_model = train_and_evaluate_model(
    train_df=train_data, 
    test_df=test_data, 
    selected_features=best_features, 
    label_column='label', 
    model_params=best_params,
    stratify=True,
    n_splits=5
)

for k, v in results.items():
    print(f"{k}: {v}")





Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

accuracy: 0.7807049506874605
precision: 0.7278589925653663
recall: 0.9610429718521133
f1: 0.8283533136224058
true_positive_rate: 0.9610429718521133
true_negative_rate: 0.5597567567567567
false_positive_rate: 0.44024324324324327
false_negative_rate: 0.0389570281478867
