# Logistic Regression Experiments

In this notebook i will explore the effectiveness of using logistic regression on the UNSW_NB15 intrusion detection dataset

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


First we will preprocess the data by performing the log transformations and then encoding categorical features as numbers

In [2]:
from typing import List  # Import type hints for better code clarity

# Define lists of features for preprocessing:
# - 'categorical_features' will be one-hot encoded.
# - 'features_to_transform' will undergo a log transformation to reduce skewness.
categorical_features: List[str] = ["proto", "state", "service", "is_sm_ips_ports", "is_ftp_login"]
features_to_transform: List[str] = [
    'sbytes', 'dbytes', 'sttl', 'dttl', 'sload', 'dload', 'spkts', 'dpkts', 
    'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'sjit', 'djit'
]

# Load the training and testing datasets from CSV files.
train_data: pd.DataFrame = pd.read_csv('../data/UNSW_NB15/UNSW_NB15_training-set.csv')
test_data: pd.DataFrame = pd.read_csv('../data/UNSW_NB15/UNSW_NB15_testing-set.csv')

# Clean both datasets by removing columns that are not needed for modeling.
columns_to_drop = ['attack_cat', 'id']
for df in [train_data, test_data]:
    for col in columns_to_drop:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)

def process_numeric_features(df: pd.DataFrame, features: List[str]) -> pd.DataFrame:
    """
    Applies a natural logarithm transformation (ln(x+1)) to each specified numeric feature 
    in the provided dataframe. This helps to normalize the distribution of features and 
    mitigate the effect of extreme values.
    """
    
    # apply the log transformation to the features that were determined in EDA
    for feature in features:
        if feature in df.columns:
            df[feature] = np.log1p(df[feature])
    print("Log transformation applied to numeric features (if present) in the dataset")

    return df

def process_categorical_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
    """
    One-hot encodes specified categorical features in the provided dataframe.
    Processes each feature independently. If a feature is missing, a warning is printed.
    Returns:
        df: Updated dataframe with one-hot encoded categorical features.
        updated_dummy_cols: List of the names of all the new categorical features.
    """
    for feature in cat_features:
        if feature in df.columns:
            dummies = pd.get_dummies(df[feature].astype(str), prefix=feature)
            df = df.drop(columns=[feature])
            df = pd.concat([df, dummies], axis=1)
        else:
            print(f"Warning: '{feature}' not found in the dataframe; skipping one-hot encoding for this feature.")
    print("One-hot encoding applied to categorical features in the dataset")
    updated_dummy_cols = [col for col in df.columns if any(col.startswith(f"{feature}_") for feature in cat_features)]
    print("Updated categorical feature columns:", updated_dummy_cols)
    return df, updated_dummy_cols

# Process numeric features on each dataset independently.
train_data = process_numeric_features(train_data, features_to_transform)
test_data = process_numeric_features(test_data, features_to_transform)

# Process categorical features on each dataset independently.
train_data, train_categorical_features = process_categorical_features(train_data, categorical_features)
test_data, test_categorical_features = process_categorical_features(test_data, categorical_features)

# Calculate the union of the new categorical feature sets from training and testing datasets.
# (The previous code was calculating the intersection.)
categorical_features = [i for i in train_categorical_features if i in test_categorical_features]

# Output the shapes of the processed datasets and confirm that all features are numeric.
print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")
print(f"Any non-numeric columns remaining in Training data: {any(not pd.api.types.is_numeric_dtype(train_data[col]) for col in train_data.columns)}")
print("List of columns in the Training Data:")
print(list(train_data.columns))
print("\nList of columns in the Testing Data:")
print(list(test_data.columns))


Log transformation applied to numeric features (if present) in the dataset
Log transformation applied to numeric features (if present) in the dataset
One-hot encoding applied to categorical features in the dataset
Updated categorical feature columns: ['proto_3pc', 'proto_a/n', 'proto_aes-sp3-d', 'proto_any', 'proto_argus', 'proto_aris', 'proto_arp', 'proto_ax.25', 'proto_bbn-rcc', 'proto_bna', 'proto_br-sat-mon', 'proto_cbt', 'proto_cftp', 'proto_chaos', 'proto_compaq-peer', 'proto_cphb', 'proto_cpnx', 'proto_crtp', 'proto_crudp', 'proto_dcn', 'proto_ddp', 'proto_ddx', 'proto_dgp', 'proto_egp', 'proto_eigrp', 'proto_emcon', 'proto_encap', 'proto_etherip', 'proto_fc', 'proto_fire', 'proto_ggp', 'proto_gmtp', 'proto_gre', 'proto_hmp', 'proto_i-nlsp', 'proto_iatp', 'proto_ib', 'proto_icmp', 'proto_idpr', 'proto_idpr-cmtp', 'proto_idrp', 'proto_ifmp', 'proto_igmp', 'proto_igp', 'proto_il', 'proto_ip', 'proto_ipcomp', 'proto_ipcv', 'proto_ipip', 'proto_iplt', 'proto_ipnip', 'proto_ippc', 'p

In [3]:
def get_class_weights(df, label_column='label'):
    """
    Computes balanced class weights for a given dataframe.
    
    Args:
        df: DataFrame containing the label column
        label_column: Name of the column containing class labels (default: 'label')
        
    Returns:
        dict: Dictionary mapping class indices to their weights
    """
    from sklearn.utils.class_weight import compute_class_weight
    
    # Get class distribution
    class_counts = df[label_column].value_counts()
    print("Class distribution:", class_counts)
    
    # Compute balanced weights
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(df[label_column]),
        y=df[label_column]
    )
    
    # Convert to dictionary mapping class indices to weights
    return {i: weight for i, weight in enumerate(class_weights)}


Below is the function that will do k-fold cross validation and train our model. this willbe used throughout the remainder of the notebook

In [4]:
def perform_logistic_regression_cv(df, feature_columns, label_column='label', n_splits=5, random_state=42,
                                   penalty='l2', C=1.0, solver='saga', max_iter=5000, tol=1e-4, n_jobs=-1, verbose=False,
                                   class_weight=None):
    # This function performs k-fold cross-validation for a logistic regression model.
    # The model uses the hyperparameters provided as additional parameters. The defaults are set as:
    #   penalty: 'l2'
    #   C: 1.0
    #   solver: 'saga'
    #   max_iter: 5000
    #   tol: 1e-4
    #   n_jobs: -1
    #   verbose: False
    #   class_weight: None
    
    # Import necessary libraries for model training and evaluation.
    from sklearn.linear_model import LogisticRegression  # For building the logistic regression model
    from sklearn.model_selection import KFold            # For splitting data into folds for cross-validation
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix  # To compute performance metrics
    import numpy as np                                     # For numerical computations (mean, std, etc.)
    
    # Convert the provided dataframe columns into numpy arrays for efficient computation.
    X = df[feature_columns].values  # Feature matrix constructed from specified columns
    y = df[label_column].values       # Target vector extracted from the label column
    
    # Set up K-Fold cross-validation with shuffling for randomness.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    # Initialize lists to store metrics for each fold.
    accuracies, precisions, recalls, f1_scores = [], [], [], []
    tpr_list, fpr_list, tnr_list, fnr_list = [], [], [], []
    
    # Iterate over each train-test split generated by KFold.
    for train_index, test_index in kf.split(X):
        # Split the dataset into training and test sets for the current fold.
        X_train, X_test = X[train_index], X[test_index]  # Extract training and testing features.
        y_train, y_test = y[train_index], y[test_index]    # Extract corresponding target labels.
        
        # Initialize the Logistic Regression model with provided hyperparameters.
        model = LogisticRegression(
            penalty=penalty,              # Regularization method.
            C=C,                          # Inverse of regularization strength.
            solver=solver,                # Algorithm to use in the optimization problem.
            max_iter=max_iter,            # Maximum number of iterations to ensure convergence.
            tol=tol,                      # Tolerance for convergence.
            random_state=random_state,    # Set random state for reproducibility.
            n_jobs=n_jobs,                # Utilize the specified number of CPU cores.
            verbose=verbose,              # Verbose output mode.
            class_weight=class_weight     # Class weights for imbalanced datasets.
        )
        
        # Fit the model on the training data.
        model.fit(X_train, y_train)
        # Use the trained model to predict target labels on the test set.
        y_pred = model.predict(X_test)
        
        # Compute performance metrics for the current fold and append them to the lists.
        accuracies.append(accuracy_score(y_test, y_pred))               # Overall accuracy of predictions.
        precisions.append(precision_score(y_test, y_pred, zero_division=0))  # Precision; handles division by zero.
        recalls.append(recall_score(y_test, y_pred, zero_division=0))         # Recall; handles division by zero.
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))           # F1 score; harmonic mean of precision and recall.
        
        # Generate confusion matrix and unpack into true negatives, false positives, false negatives, and true positives.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        # Calculate and store the True Positive Rate (Sensitivity) for the fold.
        tpr_list.append(tp / (tp + fn) if (tp + fn) > 0 else 0)
        # Calculate and store the False Positive Rate for the fold.
        fpr_list.append(fp / (fp + tn) if (fp + tn) > 0 else 0)
        # Calculate and store the True Negative Rate (Specificity) for the fold.
        tnr_list.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
        # Calculate and store the False Negative Rate for the fold.
        fnr_list.append(fn / (fn + tp) if (fn + tp) > 0 else 0)
    
    # Compile the computed metrics into a dictionary by calculating the mean and standard deviation across folds.
    results = {
        'accuracy': {'mean': np.mean(accuracies), 'std': np.std(accuracies)},
        'precision': {'mean': np.mean(precisions), 'std': np.std(precisions)},
        'recall': {'mean': np.mean(recalls), 'std': np.std(recalls)},
        'f1': {'mean': np.mean(f1_scores), 'std': np.std(f1_scores)},
        'true_positive_rate': {'mean': np.mean(tpr_list), 'std': np.std(tpr_list)},
        'false_positive_rate': {'mean': np.mean(fpr_list), 'std': np.std(fpr_list)},
        'true_negative_rate': {'mean': np.mean(tnr_list), 'std': np.std(tnr_list)},
        'false_negative_rate': {'mean': np.mean(fnr_list), 'std': np.std(fnr_list)}
    }
    # Return the aggregated cross-validation results.
    return results

def pretty_print_results(results):
    # Iterate through each metric in the results dictionary.
    for metric, values in results.items():
        # Format and print each metric's name, mean, and standard deviation.
        print(f"{metric.replace('_', ' ').capitalize()}: {values['mean']:.4f} (±{values['std']:.4f})")


Below we will do some experimentation on sets of features

In [5]:
# Detailed Numeric Feature Selection Process
# ------------------------------------------------------------
# This section identifies and ranks numeric features based on their absolute Pearson correlation
# with the target label. We exclude both the 'label' column and any categorical features.
#
# The process involves:
# 1. Extracting numeric features from the training dataset.
# 2. Computing the absolute correlation of each feature with the target variable.
# 3. Sorting features in descending order based on their correlation.
# 4. Creating feature subsets corresponding to the top 20%, 40%, 60%, and 80% of features,
#    in addition to a full sorted list of all features.
#
# These feature subsets can help in selecting the most impactful variables for model training.
# ------------------------------------------------------------

# Step 1: Extract numeric features by excluding 'label' and categorical features.
numeric_features = [col for col in train_data.columns if col != 'label' and col not in categorical_features]

# Safety check: Remove 'label' if it is inadvertently included.
if 'label' in numeric_features:
    numeric_features.remove('label')

# Step 2: Compute the absolute Pearson correlation between each numeric feature and the target label.
correlations = []
for feature in numeric_features:
    try:
        corr_value = abs(train_data[feature].corr(train_data['label']))
        correlations.append((feature, corr_value))
    except KeyError:
        print(f"Error: Unable to calculate correlation for feature '{feature}'.")

# Step 3: Sort the features by their correlation strength in descending order.
sorted_correlations = sorted(correlations, key=lambda item: item[1], reverse=True)

# Step 4: Determine indices for the top percentiles.
n_features = len(sorted_correlations)
index_20 = int(np.ceil(n_features * 0.20))
index_40 = int(np.ceil(n_features * 0.40))
index_60 = int(np.ceil(n_features * 0.60))
index_80 = int(np.ceil(n_features * 0.80))

# Create lists of features for each specified percentile.
top_20_numeric_features = [feature for feature, _ in sorted_correlations[:index_20]]
top_40_numeric_features = [feature for feature, _ in sorted_correlations[:index_40]]
top_60_numeric_features = [feature for feature, _ in sorted_correlations[:index_60]]
top_80_numeric_features = [feature for feature, _ in sorted_correlations[:index_80]]
all_correlated_features = [feature for feature, _ in sorted_correlations]

# Display the feature groups.
print("\nTop 20 percentile features:")
print(top_20_numeric_features)

print("\nTop 40 percentile features:")
print(top_40_numeric_features)

print("\nTop 60 percentile features:")
print(top_60_numeric_features)

print("\nTop 80 percentile features:")
print(top_80_numeric_features)

print("\nAll correlated features (in sorted order):")
print(all_correlated_features)



Top 20 percentile features:
['sttl', 'dload', 'ct_state_ttl', 'dbytes', 'dpkts', 'ct_dst_sport_ltm', 'spkts', 'dmean', 'rate']

Top 40 percentile features:
['sttl', 'dload', 'ct_state_ttl', 'dbytes', 'dpkts', 'ct_dst_sport_ltm', 'spkts', 'dmean', 'rate', 'swin', 'sload', 'dwin', 'stcpb', 'dtcpb', 'ct_src_dport_ltm', 'ct_dst_src_ltm', 'sbytes', 'dttl']

Top 60 percentile features:
['sttl', 'dload', 'ct_state_ttl', 'dbytes', 'dpkts', 'ct_dst_sport_ltm', 'spkts', 'dmean', 'rate', 'swin', 'sload', 'dwin', 'stcpb', 'dtcpb', 'ct_src_dport_ltm', 'ct_dst_src_ltm', 'sbytes', 'dttl', 'ct_src_ltm', 'ct_dst_ltm', 'ct_srv_src', 'ct_srv_dst', 'sinpkt', 'djit', 'sjit', 'ackdat', 'dloss']

Top 80 percentile features:
['sttl', 'dload', 'ct_state_ttl', 'dbytes', 'dpkts', 'ct_dst_sport_ltm', 'spkts', 'dmean', 'rate', 'swin', 'sload', 'dwin', 'stcpb', 'dtcpb', 'ct_src_dport_ltm', 'ct_dst_src_ltm', 'sbytes', 'dttl', 'ct_src_ltm', 'ct_dst_ltm', 'ct_srv_src', 'ct_srv_dst', 'sinpkt', 'djit', 'sjit', 'ackdat'

In [6]:
# Since the categorical features have already been one-hot encoded, we can use them directly.
onehot_cat_features = train_data[categorical_features]

# Step 2: Compute Chi-squared scores for each one-hot encoded categorical feature using the target label.
from sklearn.feature_selection import chi2
chi2_scores, p_values = chi2(onehot_cat_features, train_data['label'])

# Step 3: Pair each categorical feature with its Chi-squared score and sort in descending order.
cat_chi2_scores = list(zip(categorical_features, chi2_scores))
sorted_cat_chi2 = sorted(cat_chi2_scores, key=lambda x: x[1], reverse=True)

# Step 4: Determine indices corresponding to the top percentiles of categorical features.
n_cat_features = len(sorted_cat_chi2)
index_20_cat = int(np.ceil(n_cat_features * 0.20))
index_40_cat = int(np.ceil(n_cat_features * 0.40))
index_60_cat = int(np.ceil(n_cat_features * 0.60))
index_80_cat = int(np.ceil(n_cat_features * 0.80))

# Step 5: Create lists of categorical features for each specified percentile.
top_20_cat_features = [feature for feature, _ in sorted_cat_chi2[:index_20_cat]]
top_40_cat_features = [feature for feature, _ in sorted_cat_chi2[:index_40_cat]]
top_60_cat_features = [feature for feature, _ in sorted_cat_chi2[:index_60_cat]]
top_80_cat_features = [feature for feature, _ in sorted_cat_chi2[:index_80_cat]]
all_chi2_cat_features = [feature for feature, _ in sorted_cat_chi2]

# Step 6: Display the ranked categorical feature groups.
print("\nTop 20 percentile categorical features:")
print(top_20_cat_features)

print("\nTop 40 percentile categorical features:")
print(top_40_cat_features)

print("\nTop 60 percentile categorical features:")
print(top_60_cat_features)

print("\nTop 80 percentile categorical features:")
print(top_80_cat_features)

print("\nAll categorical features sorted by Chi-squared score:")
print(all_chi2_cat_features)



Top 20 percentile categorical features:
['state_INT', 'state_CON', 'proto_tcp', 'state_FIN', 'proto_arp', 'is_sm_ips_ports_1', 'proto_unas', 'service_dns', 'proto_udp', 'service_ssh', 'service_-', 'service_ftp-data', 'proto_ospf', 'proto_sctp', 'service_pop3', 'state_REQ', 'proto_any', 'state_RST', 'proto_gre', 'service_http', 'proto_ipv6', 'proto_mobile', 'proto_pim', 'proto_sun-nd', 'proto_swipe', 'is_sm_ips_ports_0', 'proto_rsvp', 'proto_sep', 'proto_ib', 'proto_3pc', 'proto_a/n']

Top 40 percentile categorical features:
['state_INT', 'state_CON', 'proto_tcp', 'state_FIN', 'proto_arp', 'is_sm_ips_ports_1', 'proto_unas', 'service_dns', 'proto_udp', 'service_ssh', 'service_-', 'service_ftp-data', 'proto_ospf', 'proto_sctp', 'service_pop3', 'state_REQ', 'proto_any', 'state_RST', 'proto_gre', 'service_http', 'proto_ipv6', 'proto_mobile', 'proto_pim', 'proto_sun-nd', 'proto_swipe', 'is_sm_ips_ports_0', 'proto_rsvp', 'proto_sep', 'proto_ib', 'proto_3pc', 'proto_a/n', 'proto_aes-sp3-d', '

Here we present a hill climbing function that we will use to select different sets of features based on what we have calculated above

In [7]:
def perform_rfe(df, feature_columns, label_column='label', n_features_to_select=10, 
                step=1, estimator=None, cv=5, scoring='f1', random_state=42, verbose=1, class_weight=None):
    """
    Performs Recursive Feature Elimination (RFE) with cross-validation to select the optimal features.
    
    Parameters:
        df (pandas.DataFrame): The dataset containing features and target variable.
        feature_columns (list): List of feature column names to consider for selection.
        label_column (str): Name of the target variable column.
        n_features_to_select (int or float): Number of features to select. If float between 0 and 1,
                                            it represents the proportion of features to select.
        step (int or float): Number of features to remove at each iteration. If float between 0 and 1,
                             it represents the proportion of features to remove at each iteration.
        estimator (object): A supervised learning estimator with a fit method. If None, uses LogisticRegression.
        cv (int): Number of cross-validation folds.
        scoring (str): Scoring metric to use for feature selection.
        random_state (int): Random seed for reproducibility.
        verbose (int): Controls verbosity of output.
        class_weight (dict or 'balanced'): Weights associated with classes. If None, all classes have weight 1.
        
    Returns:
        selected_features (list): List of selected feature names.
        rfe_cv (RFECV object): The fitted RFECV object for further inspection.
        cv_results (dict): Cross-validation results.
    """
    from sklearn.feature_selection import RFECV
    from sklearn.linear_model import LogisticRegression
    import numpy as np
    import matplotlib.pyplot as plt
    
    # Extract features and target
    X = df[feature_columns]
    y = df[label_column]
    
    # Set default estimator if none provided
    if estimator is None:
        estimator = LogisticRegression(max_iter=5000, random_state=random_state, class_weight=class_weight)
    
    # Initialize RFECV
    rfe_cv = RFECV(
        estimator=estimator,
        step=step,
        min_features_to_select=n_features_to_select,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=verbose
    )
    
    # Fit RFECV
    rfe_cv.fit(X, y)
    
    # Get selected features
    selected_features = [feature for feature, selected in zip(feature_columns, rfe_cv.support_) if selected]
    
    
    # Prepare CV results
    cv_results = {
        'n_features': rfe_cv.n_features_,
        'cv_results_': rfe_cv.cv_results_,  # Using cv_results_ instead of grid_scores_ which is deprecated
        'ranking': rfe_cv.ranking_,
        'support': rfe_cv.support_
    }
    
    return selected_features, rfe_cv, cv_results

def pretty_print_rfecv_results(results):
    """
    Pretty prints the results of the RFECV feature selection process.
    
    Parameters:
        results (tuple): Tuple containing (selected_features, rfe_cv, cv_results)
    """
    selected_features, rfe_cv, cv_results = results
    
    print("\n" + "="*60)
    print(" "*20 + "FEATURE SELECTION RESULTS")
    print("="*60)
    
    # Print optimal number of features
    print(f"Optimal number of features: {cv_results['n_features']}")
    
    # Print selected features
    print("\nSelected features:")
    for i, feature in enumerate(selected_features, 1):
        print(f"  {i}. {feature}")
    
    # Print cross-validation scores
    mean_scores = rfe_cv.cv_results_['mean_test_score']
    std_scores = rfe_cv.cv_results_['std_test_score']
    
    print(f"\nBest cross-validation score: {mean_scores.max():.4f} ± {std_scores[mean_scores.argmax()]:.4f}")
    print(f"Cross-validation scoring metric: {rfe_cv.scoring}")
    
    print("="*60 + "\n")


In [8]:
# Hill climbing feature selection
tuning_set = train_data.sample(frac=0.3, random_state=42)
class_weights = get_class_weights(tuning_set)
results = perform_rfe(tuning_set, top_20_numeric_features + top_20_cat_features, label_column='label', n_features_to_select=10, step=1, estimator=None, cv=3, scoring='f1', random_state=42, verbose=1, class_weight=class_weights)

# best_features = ['proto_3pc', 'proto_sctp', 'sttl', 'proto_tcp', 'state_FIN', 'state_REQ', 'spkts', 'ct_state_ttl', 'state_CON', 'proto_sun-nd', 'proto_arp', 'dpkts']
pretty_print_rfecv_results(results)

Class distribution: label
1    35830
0    16772
Name: count, dtype: int64
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 

In [9]:
best_features = results[0]

Now that we've selected good features, we will do hyperparameter tuning below
```
Best Features = ['proto_3pc', 'proto_sctp', 'sttl', 'proto_tcp', 'state_FIN', 'state_REQ', 'spkts', 'ct_state_ttl', 'state_CON', 'proto_sun-nd', 'proto_arp', 'dpkts']
```

In [10]:
def tune_logistic_regression(df, selected_features, label_column='label', n_splits=5, random_state=42, n_jobs=-1, verbose=1, class_weight=None):
    """
    Performs hyperparameter tuning for logistic regression focusing on iterations, 
    regularization strength, convergence tolerance, and penalty.
    
    Parameters:
        df (pandas.DataFrame): The dataset to use for tuning.
        selected_features (list): List of selected features to use for model training.
        label_column (str): Name of the column containing the target labels.
        n_splits (int): Number of cross-validation splits.
        random_state (int): Random seed for reproducibility.
        n_jobs (int): Number of jobs to run in parallel (-1 means using all processors).
        verbose (int): Verbosity level (0: no output, 1: progress bar, >1: detailed output).
        class_weight (dict or 'balanced', optional): Weights associated with classes. If None, all classes have weight 1.
        
    Returns:
        best_params (dict): Dictionary containing the best hyperparameters.
        best_score (float): The best cross-validated F1 score.
        cv_results (dict): Full results from the grid search cross-validation.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer, f1_score
    import numpy as np
    
    # Extract features and target
    X = df[selected_features].values
    y = df[label_column].values
    
    # Define the parameter grid to search
    param_grid = {
        'max_iter': [10000],
        'C': [0.001, 0.01, 0.1, 1.0],
        'tol': [1e-5, 1e-6, 1e-7],
        'penalty': ['l1', 'l2', 'elasticnet']
    }
    
    # Create a custom parameter grid that respects solver/penalty compatibility
    # We'll use 'saga' solver which supports all penalty types
    compatible_params = []
    
    for max_iter in param_grid['max_iter']:
        for C in param_grid['C']:
            for tol in param_grid['tol']:
                for penalty in param_grid['penalty']:
                    if penalty == 'elasticnet':
                        for l1_ratio in [0.2, 0.5, 0.8]:  # Add some l1_ratio values for elasticnet
                            compatible_params.append({
                                'max_iter': [max_iter],
                                'C': [C],
                                'tol': [tol],
                                'penalty': [penalty],
                                'solver': ['saga'],
                                'l1_ratio': [l1_ratio]
                            })
                    else:
                        compatible_params.append({
                            'max_iter': [max_iter],
                            'C': [C],
                            'tol': [tol],
                            'penalty': [penalty],
                            'solver': ['saga']
                        })
    
    # Define the logistic regression model
    lr = LogisticRegression(random_state=random_state, n_jobs=1, class_weight=class_weight)
    
    # Define the scoring metric (F1 score)
    f1_scorer = make_scorer(f1_score)
    
    # Perform grid search with cross-validation
    grid_search = GridSearchCV(
        estimator=lr,
        param_grid=compatible_params,
        scoring=f1_scorer,
        cv=n_splits,
        n_jobs=n_jobs,
        verbose=verbose,
        return_train_score=True
    )
    
    print(f"Starting grid search with {len(compatible_params)} parameter combinations...")
    grid_search.fit(X, y)
    
    # Get the best parameters and score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    
    print(f"\nBest F1 Score: {best_score:.4f}")
    print("Best Parameters:")
    for param, value in best_params.items():
        print(f"  {param}: {value}")
    
    # Return the best parameters, best score, and full results
    return best_params, best_score, grid_search.cv_results_

print("tuning with best features: ", best_features)
tuning_set = train_data.sample(frac=0.3, random_state=42)
class_weights = get_class_weights(tuning_set)
best_params, best_score, cv_results = tune_logistic_regression(tuning_set, best_features, label_column='label', n_splits=3, random_state=42, n_jobs=-1, verbose=1, class_weight=class_weights)

tuning with best features:  ['ct_state_ttl', 'proto_tcp', 'state_FIN', 'proto_arp', 'is_sm_ips_ports_1', 'service_dns', 'proto_udp', 'proto_sctp', 'service_pop3', 'state_REQ', 'is_sm_ips_ports_0']
Class distribution: label
1    35830
0    16772
Name: count, dtype: int64
Starting grid search with 60 parameter combinations...
Fitting 3 folds for each of 60 candidates, totalling 180 fits

Best F1 Score: 0.9516
Best Parameters:
  C: 1.0
  max_iter: 10000
  penalty: l1
  solver: saga
  tol: 1e-05


In [11]:
# do k fold cross validation with the best features and best params
class_weights = get_class_weights(train_data)
results = perform_logistic_regression_cv(train_data, best_features, label_column='label', n_splits=5, class_weight=class_weights, **best_params)
pretty_print_results(results)



Class distribution: label
1    119341
0     56000
Name: count, dtype: int64
Accuracy: 0.9322 (±0.0013)
Precision: 0.9122 (±0.0016)
Recall: 0.9963 (±0.0003)
F1: 0.9524 (±0.0009)
True positive rate: 0.9963 (±0.0003)
False positive rate: 0.2045 (±0.0036)
True negative rate: 0.7955 (±0.0036)
False negative rate: 0.0037 (±0.0003)


In [12]:
def train_and_evaluate_model(train_df, test_df, selected_features, label_column, best_params, class_weight=None):
    """
    Trains a logistic regression model on the entire training set using the optimal feature subset and hyperparameters,
    and then evaluates the model performance on the testing set.

    Parameters:
      train_df (pd.DataFrame): The training dataset.
      test_df (pd.DataFrame): The testing dataset.
      selected_features (list): List of optimal features to use for training.
      label_column (str): The name of the target label column.
      best_params (dict): A dictionary of optimal hyperparameters for logistic regression 
                          (e.g., {'max_iter': 5000, 'C': 1.0, 'tol': 1e-4, 'penalty': 'l2', 'solver': 'saga'}).
      class_weight (dict or str, optional): Weights associated with classes. If not provided, all classes are 
                                           supposed to have weight one. Can be 'balanced' or a dictionary.

    Returns:
      results (dict): A dictionary containing evaluation metrics:
                      - accuracy: Accuracy score on the test set.
                      - precision: Precision score.
                      - recall: Recall score.
                      - f1: F1 score.
                      - true_positive_rate: Fraction of positive samples correctly classified.
                      - true_negative_rate: Fraction of negative samples correctly classified.
                      - false_positive_rate: Fraction of negative samples incorrectly classified as positive.
                      - false_negative_rate: Fraction of positive samples incorrectly classified as negative.
      model (LogisticRegression): The trained logistic regression model.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

    # Prepare training features and labels.
    X_train = train_df[selected_features].values
    y_train = train_df[label_column].values

    # Prepare testing features and labels.
    X_test = test_df[selected_features].values
    y_test = test_df[label_column].values

    # Initialize the logistic regression model with optimal parameters.
    # Ensure random_state is set for reproducibility.
    model = LogisticRegression(random_state=42, class_weight=class_weight, **best_params)
    
    # Train the model on the entire training set.
    model.fit(X_train, y_train)
    
    # Predict on the testing set.
    y_pred = model.predict(X_test)
    
    # Compute the confusion matrix and derive rates.
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
    false_negative_rate = fn / (tp + fn) if (tp + fn) > 0 else 0
    true_positive_rate = tp / (tp + fn) if (tp + fn) > 0 else 0
    true_negative_rate = tn / (tn + fp) if (tn + fp) > 0 else 0
    
    # Calculate evaluation metrics.
    results = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'true_positive_rate': true_positive_rate,
        'true_negative_rate': true_negative_rate,
        'false_positive_rate': false_positive_rate,
        'false_negative_rate': false_negative_rate,
    }
    
    return results, model

results, trained_model = train_and_evaluate_model(
    train_df=train_data, 
    test_df=test_data, 
    selected_features=best_features, 
    label_column='label', 
    best_params=best_params,
    class_weight=class_weights
)


In [13]:
for k, v in results.items():
    print(f"{k}: {v}")





accuracy: 0.8092722149346548
precision: 0.743575409809112
recall: 0.9976396364598958
f1: 0.8520720092696392
true_positive_rate: 0.9976396364598958
true_negative_rate: 0.5784864864864865
false_positive_rate: 0.4215135135135135
false_negative_rate: 0.0023603635401041206
