# Logistic Regression Experiments

In this notebook i will explore the effectiveness of using logistic regression on the UNSW_NB15 intrusion detection dataset

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt

from helpers.cross_validation import perform_model_cv, pretty_print_results
from helpers.feature_selection import perform_rfe, pretty_print_rfecv_results, select_features_by_correlation
from helpers.param_tuning import tune_hyperparameters
from helpers.preprocessing import preprocess_data


First we will preprocess the data by performing the log transformations and then one hot encoding categorical features

In [2]:
# Define lists of features for preprocessing
categorical_features = ["proto", "state", "service", "is_sm_ips_ports", "is_ftp_login"]
features_to_transform = [
    'sbytes', 'dbytes', 'sttl', 'dttl', 'sload', 'dload', 'spkts', 'dpkts', 
    'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'sjit', 'djit'
]
columns_to_drop = ['attack_cat', 'id']

# Load the training and testing datasets from CSV files
train_data = pd.read_csv('../data/UNSW_NB15/UNSW_NB15_training-set.csv')
test_data = pd.read_csv('../data/UNSW_NB15/UNSW_NB15_testing-set.csv')

# Preprocess the data
train_data, test_data, categorical_cols = preprocess_data(
    train_data, 
    test_data, 
    categorical_features, 
    features_to_transform,
    columns_to_drop
)

print("\nList of columns in the Training Data:")
print(list(train_data.columns))
print("\nList of columns in the Testing Data:")
print(list(test_data.columns))

Log transformation applied to numeric features (if present) in the dataset
Log transformation applied to numeric features (if present) in the dataset
One-hot encoding applied to categorical features in the dataset
Updated categorical feature columns: ['proto_3pc', 'proto_a/n', 'proto_aes-sp3-d', 'proto_any', 'proto_argus', 'proto_aris', 'proto_arp', 'proto_ax.25', 'proto_bbn-rcc', 'proto_bna', 'proto_br-sat-mon', 'proto_cbt', 'proto_cftp', 'proto_chaos', 'proto_compaq-peer', 'proto_cphb', 'proto_cpnx', 'proto_crtp', 'proto_crudp', 'proto_dcn', 'proto_ddp', 'proto_ddx', 'proto_dgp', 'proto_egp', 'proto_eigrp', 'proto_emcon', 'proto_encap', 'proto_etherip', 'proto_fc', 'proto_fire', 'proto_ggp', 'proto_gmtp', 'proto_gre', 'proto_hmp', 'proto_i-nlsp', 'proto_iatp', 'proto_ib', 'proto_icmp', 'proto_idpr', 'proto_idpr-cmtp', 'proto_idrp', 'proto_ifmp', 'proto_igmp', 'proto_igp', 'proto_il', 'proto_ip', 'proto_ipcomp', 'proto_ipcv', 'proto_ipip', 'proto_iplt', 'proto_ipnip', 'proto_ippc', 'p

Below we will do some experimentation on sets of features

In [3]:
# Use the select_features_by_correlation function from our helpers module to get the features with the highest correlation with the target label
sorted_features = select_features_by_correlation(
    train_data, 
    train_data.columns, 
    categorical_cols, 
    target_column='label'
)

print("\nAll correlated features (in sorted order):")
print(sorted_features)


('dur', 0.0361745815868478)
('spkts', 0.3560078954626985)
('dpkts', 0.4719046961015229)
('sbytes', 0.29573453218419876)
('dbytes', 0.5130950539094983)
('rate', 0.33797851137339135)
('sttl', 0.6800970077833289)
('dttl', 0.27726311815965493)
('sload', 0.3222502550875837)
('dload', 0.625018033576283)
('sloss', 0.000640112660587516)
('dloss', 0.09468497046041137)
('sinpkt', 0.17611032213794253)
('dinpkt', 0.02288747771762546)
('sjit', 0.1496356081351764)
('djit', 0.1706961323318906)
('swin', 0.3336962514918778)
('stcpb', 0.3195788296563108)
('dtcpb', 0.3188769911817073)
('dwin', 0.3196600641937193)
('tcprtt', 0.08158406079561766)
('synack', 0.05829892920859638)
('ackdat', 0.09736393873378878)
('smean', 0.010797569662018761)
('dmean', 0.34180630596094996)
('trans_depth', 0.010800959521764388)
('response_body_len', 0.02136134981329707)
('ct_srv_src', 0.22904392983552574)
('ct_state_ttl', 0.5777039809439662)
('ct_dst_ltm', 0.229887287811049)
('ct_src_dport_ltm', 0.3055787534730556)
('ct_dst_s

In [None]:
# Since the categorical features have already been one-hot encoded, we can use them directly.
onehot_cat_features = train_data[categorical_features]

# Step 2: Compute Chi-squared scores for each one-hot encoded categorical feature using the target label.
from sklearn.feature_selection import chi2
chi2_scores, p_values = chi2(onehot_cat_features, train_data['label'])

# Step 3: Pair each categorical feature with its Chi-squared score and sort in descending order.
cat_chi2_scores = list(zip(categorical_features, chi2_scores))
sorted_cat_chi2 = sorted(cat_chi2_scores, key=lambda x: x[1], reverse=True)

# Step 4: Determine indices corresponding to the top percentiles of categorical features.
n_cat_features = len(sorted_cat_chi2)
index_20_cat = int(np.ceil(n_cat_features * 0.20))
index_40_cat = int(np.ceil(n_cat_features * 0.40))
index_60_cat = int(np.ceil(n_cat_features * 0.60))
index_80_cat = int(np.ceil(n_cat_features * 0.80))

# Step 5: Create lists of categorical features for each specified percentile.
top_20_cat_features = [feature for feature, _ in sorted_cat_chi2[:index_20_cat]]
top_40_cat_features = [feature for feature, _ in sorted_cat_chi2[:index_40_cat]]
top_60_cat_features = [feature for feature, _ in sorted_cat_chi2[:index_60_cat]]
top_80_cat_features = [feature for feature, _ in sorted_cat_chi2[:index_80_cat]]
all_chi2_cat_features = [feature for feature, _ in sorted_cat_chi2]

# Step 6: Display the ranked categorical feature groups.
print("\nTop 20 percentile categorical features:")
print(top_20_cat_features)

print("\nTop 40 percentile categorical features:")
print(top_40_cat_features)

print("\nTop 60 percentile categorical features:")
print(top_60_cat_features)

print("\nTop 80 percentile categorical features:")
print(top_80_cat_features)

print("\nAll categorical features sorted by Chi-squared score:")
print(all_chi2_cat_features)


Here we present a hill climbing function that we will use to select different sets of features based on what we have calculated above

In [None]:
# Create a tuning set for feature selection
import random
import numpy as np
random_seed = random.randint(1, 10000)  # Generate a truly random seed each time
tuning_set = train_data.sample(frac=0.3, random_state=random_seed)

# Combine top numeric and categorical features
combined_features = top_20_numeric_features + top_20_cat_features

# Set up model parameters with random seed
model_params = {'max_iter': 10000, 'random_state': random_seed}

# Perform recursive feature elimination with cross-validation
print("Running RFE with cross-validation...")
rfe_results = perform_rfe(
    df=tuning_set, 
    feature_columns=combined_features, 
    model_params=model_params,
    label_column='label', 
    n_features_to_select=10, 
    step=1, 
    cv=3, 
    scoring='f1', 
    random_state=random_seed, 
    verbose=1
)

# Display the RFE results
pretty_print_rfecv_results(rfe_results)

# Store the results
results = rfe_results

In [None]:
# Get the best features from RFE results
best_features = results[0]

# Set up model parameters
model_params = {'max_iter': 10000, 'random_state': random_seed}

# Perform cross-validation on the entire training set using the selected features
print("\nPerforming cross-validation on the entire training set with selected features...")
cv_results = perform_model_cv(
    df=train_data,
    model_class=LogisticRegression,
    model_params=model_params,
    feature_columns=best_features,
    label_column='label',
    n_splits=5,
    random_state=random_seed
)

# Display the cross-validation results
print("\nCross-validation results:")
pretty_print_results(cv_results)

In [None]:
print("tuning with best features: ", best_features)
tuning_set = train_data.sample(frac=1, random_state=42)

# Define the parameter grid specific to logistic regression
param_grid = {
    'max_iter': [10000],
    'C': [1e-3, 1e-2, 1e-1, 1.0, 2, 5, 10],
    'tol': [1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9],
    'penalty': ['l2', None]
}

# Directly call tune_hyperparameters instead of using a wrapper function
best_params, best_score, cv_results = tune_hyperparameters(
    df=tuning_set,
    model_class=LogisticRegression,
    param_grid=param_grid,
    feature_columns=best_features,
    label_column='label',
    scoring='f1',
    n_splits=5,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

In [9]:
def train_and_evaluate_model(train_df, test_df, selected_features, label_column, best_params):
    """
    Trains a logistic regression model on the entire training set using the optimal feature subset and hyperparameters,
    and then evaluates the model performance on the testing set.

    Parameters:
      train_df (pd.DataFrame): The training dataset.
      test_df (pd.DataFrame): The testing dataset.
      selected_features (list): List of optimal features to use for training.
      label_column (str): The name of the target label column.
      best_params (dict): A dictionary of optimal hyperparameters for logistic regression 
                          (e.g., {'max_iter': 5000, 'C': 1.0, 'tol': 1e-4, 'penalty': 'l2', 'solver': 'saga'}).

    Returns:
      results (dict): A dictionary containing evaluation metrics:
                      - accuracy: Accuracy score on the test set.
                      - precision: Precision score.
                      - recall: Recall score.
                      - f1: F1 score.
                      - true_positive_rate: Fraction of positive samples correctly classified.
                      - true_negative_rate: Fraction of negative samples correctly classified.
                      - false_positive_rate: Fraction of negative samples incorrectly classified as positive.
                      - false_negative_rate: Fraction of positive samples incorrectly classified as negative.
      model (LogisticRegression): The trained logistic regression model.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

    # Prepare training features and labels.
    X_train = train_df[selected_features].values
    y_train = train_df[label_column].values

    # Prepare testing features and labels.
    X_test = test_df[selected_features].values
    y_test = test_df[label_column].values

    # Initialize the logistic regression model with optimal parameters.
    # Ensure random_state is set for reproducibility.
    model = LogisticRegression(random_state=42, **best_params)
    
    # Train the model on the entire training set.
    model.fit(X_train, y_train)
    
    # Predict on the testing set.
    y_pred = model.predict(X_test)
    
    # Compute the confusion matrix and derive rates.
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
    false_negative_rate = fn / (tp + fn) if (tp + fn) > 0 else 0
    true_positive_rate = tp / (tp + fn) if (tp + fn) > 0 else 0
    true_negative_rate = tn / (tn + fp) if (tn + fp) > 0 else 0
    
    # Calculate evaluation metrics.
    results = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'true_positive_rate': true_positive_rate,
        'true_negative_rate': true_negative_rate,
        'false_positive_rate': false_positive_rate,
        'false_negative_rate': false_negative_rate,
    }
    
    return results, model

results, trained_model = train_and_evaluate_model(
    train_df=train_data, 
    test_df=test_data, 
    selected_features=best_features, 
    label_column='label', 
    best_params=best_params
)


In [None]:
for k, v in results.items():
    print(f"{k}: {v}")



