# Logistic Regression Experiments

In this notebook I will explore the effectiveness of logistic regression on the UNSW-NB15 intrusion detection dataset.

In [1]:
# Import necessary libraries
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.linear_model import LogisticRegression

from helpers.cross_validation import perform_model_cv, pretty_print_results
from helpers.feature_selection import perform_rfe, pretty_print_rfecv_results, select_features_by_correlation, select_features_by_chi2
from helpers.param_tuning import tune_hyperparameters
from helpers.preprocessing import preprocess_data
from helpers.evaluate_model_final import train_and_evaluate_model


First, we preprocess the data by applying log transformations to the skewed numeric features and then one-hot encoding the categorical features.

In [2]:
# --- Preprocessing configuration -------------------------------------------
# Columns treated as categorical by preprocess_data.
categorical_features = [
    "proto",
    "state",
    "service",
    "is_sm_ips_ports",
    "is_ftp_login",
]

# Numeric columns handed to preprocess_data for transformation.
features_to_transform = [
    'sbytes', 'dbytes',
    'sttl', 'dttl',
    'sload', 'dload',
    'spkts', 'dpkts',
    'swin', 'dwin',
    'stcpb', 'dtcpb',
    'smeansz', 'dmeansz',
    'sjit', 'djit',
]

# Columns removed before modelling.
columns_to_drop = ['attack_cat', 'id']

# --- Load the raw train / test splits --------------------------------------
train_data = pd.read_csv('../data/UNSW_NB15/UNSW_NB15_training-set.csv')
test_data = pd.read_csv('../data/UNSW_NB15/UNSW_NB15_testing-set.csv')

# --- Run the shared preprocessing helper -----------------------------------
# The helper returns both frames plus a replacement list of categorical
# column names, which overwrites the raw list defined above.
train_data, test_data, categorical_features = preprocess_data(
    train_data,
    test_data,
    categorical_features,
    features_to_transform,
    columns_to_drop,
)

# Show the resulting column layout of each split.
print("\nList of columns in the Training Data:")
print(train_data.columns.tolist())
print("\nList of columns in the Testing Data:")
print(test_data.columns.tolist())

Log transformation applied to numeric features (if present) in the dataset
Log transformation applied to numeric features (if present) in the dataset
Z-score normalization applied to 37 numeric features
One-hot encoding applied to categorical features in the dataset
Updated categorical feature columns: ['proto_3pc', 'proto_a/n', 'proto_aes-sp3-d', 'proto_any', 'proto_argus', 'proto_aris', 'proto_arp', 'proto_ax.25', 'proto_bbn-rcc', 'proto_bna', 'proto_br-sat-mon', 'proto_cbt', 'proto_cftp', 'proto_chaos', 'proto_compaq-peer', 'proto_cphb', 'proto_cpnx', 'proto_crtp', 'proto_crudp', 'proto_dcn', 'proto_ddp', 'proto_ddx', 'proto_dgp', 'proto_egp', 'proto_eigrp', 'proto_emcon', 'proto_encap', 'proto_etherip', 'proto_fc', 'proto_fire', 'proto_ggp', 'proto_gmtp', 'proto_gre', 'proto_hmp', 'proto_i-nlsp', 'proto_iatp', 'proto_ib', 'proto_icmp', 'proto_idpr', 'proto_idpr-cmtp', 'proto_idrp', 'proto_ifmp', 'proto_igmp', 'proto_igp', 'proto_il', 'proto_ip', 'proto_ipcomp', 'proto_ipcv', 'proto

Below we will do some experimentation on sets of features

In [3]:
# Rank features by their correlation with the target label using the
# select_features_by_correlation helper; the categorical columns are passed
# separately so the helper can tell them apart from the numeric ones.
sorted_numericfeatures = select_features_by_correlation(
    train_data,
    train_data.columns,
    categorical_features,
    target_column='label',
)

# Report the ranked features and how many were kept.
print("\nAll correlated features (in sorted order):")
print(sorted_numericfeatures)
print(len(sorted_numericfeatures))


Dropping feature 'ct_state_ttl' (target corr: 0.5777) due to high correlation (0.8462) with already selected feature 'dload'
Dropping feature 'dbytes' (target corr: 0.5131) due to high correlation (0.9451) with already selected feature 'dload'
Dropping feature 'dpkts' (target corr: 0.4719) due to high correlation (0.8792) with already selected feature 'dload'
Dropping feature 'dwin' (target corr: 0.3197) due to high correlation (0.9902) with already selected feature 'swin'
Dropping feature 'stcpb' (target corr: 0.3196) due to high correlation (0.9883) with already selected feature 'swin'
Dropping feature 'dtcpb' (target corr: 0.3189) due to high correlation (0.9882) with already selected feature 'swin'
Dropping feature 'ct_src_dport_ltm' (target corr: 0.3056) due to high correlation (0.9068) with already selected feature 'ct_dst_sport_ltm'
Dropping feature 'ct_dst_src_ltm' (target corr: 0.3039) due to high correlation (0.8387) with already selected feature 'ct_dst_sport_ltm'
Dropping f

In [4]:
# Score the one-hot encoded categorical columns against the label with the
# chi-squared helper. Dependent features are kept (eliminate_dependent=False).
selected_cat_features = select_features_by_chi2(
    df=train_data,
    target_column='label',
    categorical_features=categorical_features,
    independence_threshold=0.05,  # standard statistical significance level
    eliminate_dependent=False,
    verbose=True,
)

# Report the selected categorical features and their count.
print("\nFinal selected categorical features:")
print(selected_cat_features)
print(len(selected_cat_features))

Selected feature 'state_INT' (chi2 score: 23639.7857)
Selected feature 'state_CON' (chi2 score: 21821.8537)
Selected feature 'proto_tcp' (chi2 score: 10624.4313)
Selected feature 'state_FIN' (chi2 score: 8971.2211)
Selected feature 'proto_arp' (chi2 score: 6092.7843)
Selected feature 'is_sm_ips_ports_1' (chi2 score: 5886.0686)
Selected feature 'proto_unas' (chi2 score: 5670.3396)
Selected feature 'service_dns' (chi2 score: 5635.6050)
Selected feature 'proto_udp' (chi2 score: 2875.3442)
Selected feature 'service_ssh' (chi2 score: 2706.2218)
Selected feature 'service_-' (chi2 score: 2024.0993)
Selected feature 'service_ftp-data' (chi2 score: 1875.1348)
Selected feature 'proto_ospf' (chi2 score: 1036.8852)
Selected feature 'proto_sctp' (chi2 score: 539.6301)
Selected feature 'service_pop3' (chi2 score: 506.8268)
Selected feature 'state_REQ' (chi2 score: 193.1397)
Selected feature 'proto_any' (chi2 score: 140.7731)
Selected feature 'state_RST' (chi2 score: 109.7156)
Selected feature 'proto

Here we use recursive feature elimination (RFE) with cross-validation to select a subset of features from the candidates ranked above.

In [5]:
# Use a fixed seed so that Restart Kernel -> Run All reproduces the feature
# selection, hyperparameter tuning and cross-validation results below.
# Drawing a fresh seed on every run made the saved outputs impossible to
# regenerate. Change this value deliberately to test seed sensitivity.
random_seed = 42

# Also seed the global RNGs in case any helper falls back on them instead of
# the explicit random_state arguments passed below.
random.seed(random_seed)
np.random.seed(random_seed)

# Tuning set for feature selection: a 20% sample of the training data.
tuning_set = train_data.sample(frac=0.2, random_state=random_seed)

# Candidate pool: the first 20 categorical features from the chi2 selection
# plus the first 10 numeric features from the correlation ranking.
combined_features = selected_cat_features[:20] + sorted_numericfeatures[:10]

# Parameters for the estimator fitted inside RFE; it shares the notebook seed.
model_params = {'max_iter': 50000, 'random_state': random_seed, 'solver': 'saga'}

# Perform recursive feature elimination with cross-validation, removing one
# feature per step until 10 remain.
print("Running RFE with cross-validation...")
rfe_results = perform_rfe(
    df=tuning_set,
    feature_columns=combined_features,
    model_params=model_params,
    label_column='label',
    n_features_to_select=10,
    step=1,
    cv=2,
    scoring='f1',
    random_state=random_seed,
    verbose=1
)

# Display the RFE results
pretty_print_rfecv_results(rfe_results)

# The first element of the RFE results holds the selected feature names
# (presumably a list -- confirm against helpers.feature_selection.perform_rfe).
# Read it directly rather than through a generic `results` alias, because
# `results` is rebound to the final evaluation metrics later in the notebook.
best_features = rfe_results[0]

Running RFE with cross-validation...
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.

                    FEATURE SELECTION RESULTS
Optimal number of features: 10

Selected features:
  1. state_INT
  2. proto_tcp
  3. state_FIN
  4. proto_arp
  5. proto_udp
  6. service_ssh
  7. service_-
  8. service_pop3
  9. sttl
  10. swin

In [6]:
# Tune the logistic regression hyperparameters on the features chosen by RFE.
print("tuning with best features: ", best_features)

# Re-draw the tuning sample with the shared seed and the same 20% fraction.
tuning_set = train_data.sample(frac=.2, random_state=random_seed)

# Search space for logistic regression. The solver is pinned to 'saga' and
# max_iter to a single value; C, tol and penalty are varied.
param_grid = {
    'max_iter': [10000],
    'C': [
        1e-4, 1e-3, 1e-2, 1e-1,
        1.0, 2, 5, 10,
    ],
    'tol': [
        1e-2, 1e-3, 1e-4, 1e-5, 1e-6,
    ],
    'penalty': ['l2', 'l1', None],
    'solver': ['saga'],
}

# Run the grid search helper directly (no wrapper), scoring by F1 over
# 2 folds and using all available cores.
best_params, best_score, cv_results = tune_hyperparameters(
    df=tuning_set,
    feature_columns=best_features,
    label_column='label',
    model_class=LogisticRegression,
    param_grid=param_grid,
    scoring='f1',
    n_splits=2,
    random_state=random_seed,
    n_jobs=-1,
    verbose=1,
)

tuning with best features:  ['state_INT', 'proto_tcp', 'state_FIN', 'proto_arp', 'proto_udp', 'service_ssh', 'service_-', 'service_pop3', 'sttl', 'swin']
Starting grid search with 120 parameter combinations...
Fitting 2 folds for each of 120 candidates, totalling 240 fits

Best Score (f1): 0.9452
Best Parameters:
  C: 5
  max_iter: 10000
  penalty: l2
  solver: saga
  tol: 0.001


In [7]:
# Cross-validate the tuned model on the full training set, restricted to the
# selected features, using 5 folds and the shared seed.
print("\nPerforming cross-validation on the entire training set with selected features...")
cv_results = perform_model_cv(
    df=train_data,
    feature_columns=best_features,
    label_column='label',
    model_class=LogisticRegression,
    model_params=best_params,
    n_splits=5,
    random_state=random_seed,
)

# Print the cross-validation metrics.
print("\nCross-validation results:")
pretty_print_results(cv_results)


Performing cross-validation on the entire training set with selected features...

Cross-validation results:
Accuracy: 0.9230 (±0.0011)
Precision: 0.9105 (±0.0014)
Recall: 0.9836 (±0.0006)
F1: 0.9456 (±0.0007)
True positive rate: 0.9836 (±0.0006)
False positive rate: 0.2061 (±0.0035)
True negative rate: 0.7939 (±0.0035)
False negative rate: 0.0164 (±0.0006)


In [8]:
# Final step: fit on the training data and score on the held-out test data
# with the imported train_and_evaluate_model helper, using the selected
# features and the tuned hyperparameters.
results, trained_model = train_and_evaluate_model(
    train_df=train_data,
    test_df=test_data,
    label_column='label',
    selected_features=best_features,
    model_params=best_params,
    stratify=True,
    n_splits=5,
)

# Print every reported metric, one per line, as "name: value".
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")





accuracy: 0.7873852208132925
precision: 0.735403096184756
recall: 0.9588370246183712
f1: 0.8323869892854066
true_positive_rate: 0.9588370246183712
true_negative_rate: 0.5773243243243243
false_positive_rate: 0.42267567567567566
false_negative_rate: 0.04116297538162887
