<a href="https://colab.research.google.com/github/hkvil/multi-class-unsw-nb15/blob/main/19_1_(GPU).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [72]:
# Install LightGBM, passing a build setting that requests GPU support.
# NOTE(review): no version is pinned; the run captured in this notebook resolved to 4.5.0.
# NOTE(review): that captured output shows pip reusing a prebuilt manylinux wheel, so the
# cmake USE_GPU setting presumably played no part in that install — confirm whether the
# wheel's own GPU support is what this notebook relies on.
%pip install lightgbm --config-settings=cmake.define.USE_GPU=ON

[0mCollecting lightgbm
  Using cached lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Using cached lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[0mInstalling collected packages: lightgbm
[0mSuccessfully installed lightgbm-4.5.0


In [74]:
# Create /etc/OpenCL/vendors (if missing) and write an ICD file that names the NVIDIA
# OpenCL library. Presumably this lets LightGBM's device='gpu' backend locate the NVIDIA
# OpenCL driver on this runtime — TODO confirm.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd

In [66]:
%load_ext cudf.pandas

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import classification_report,roc_auc_score,average_precision_score
from sklearn.preprocessing import LabelEncoder,label_binarize
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import ADASYN
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import class_weight
import time
import joblib
import os
import json

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas


In [67]:
# Single random seed shared by ADASYN, the RFECV LightGBM estimator and the three
# boosted models built in run_scenario.
seed = 28

In [68]:
# List the working directory; data-train.csv and data-test.csv must be present here
# because the next cell reads them by relative path.
%ls

cufile.log  data-test.csv  data-train.csv  rmm_log.txt  [0m[01;34msample_data[0m/


In [69]:
# Read both splits and discard the 'id' and 'label' columns straight away;
# 'attack_cat' is the target used throughout the notebook.
df_train = pd.read_csv("data-train.csv").drop(columns=['id', 'label'])
df_test = pd.read_csv("data-test.csv").drop(columns=['id', 'label'])

# Categorical feature names (handed to CatBoost in run_scenario).
cat_features = ['proto', 'service', 'state']

# Remember where the training rows end so the stacked frame can be split again.
n_train = len(df_train)
combined_df = pd.concat([df_train, df_test], ignore_index=True)

# Fit one LabelEncoder on the stacked target so train and test share the same
# integer code for every attack category.
le = LabelEncoder()
combined_df['attack_cat'] = le.fit_transform(combined_df['attack_cat'])

# Split back into the original train / test partitions with fresh 0..n-1 indexes.
df_train = combined_df.iloc[:n_train].reset_index(drop=True)
df_test = combined_df.iloc[n_train:].reset_index(drop=True)

print("Training set shape:", df_train.shape)
print("Test set shape:", df_test.shape)

Training set shape: (175341, 43)
Test set shape: (82332, 43)


In [75]:
def apply_adasyn(df_train):
    """Oversample the training frame with ADASYN and return it in its original layout.

    ADASYN needs purely numeric input, so object/category columns are one-hot
    encoded (first level dropped) before resampling and decoded back afterwards.

    Parameters
    ----------
    df_train : DataFrame containing the features plus the 'attack_cat' target.

    Returns
    -------
    DataFrame with the feature columns in their original order followed by the
    resampled 'attack_cat' column.
    """
    target = df_train['attack_cat']
    features = df_train.drop(columns=['attack_cat'])
    cat_cols = features.select_dtypes(include=['object', 'category']).columns

    # 1) One-hot encode the categorical columns.
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    onehot_values = encoder.fit_transform(features[cat_cols])
    onehot_names = encoder.get_feature_names_out(cat_cols)
    onehot_df = pd.DataFrame(onehot_values, columns=onehot_names)

    # 2) Put the untouched columns and the one-hot block side by side.
    remaining = features.drop(columns=cat_cols).reset_index(drop=True)
    model_input = pd.concat([remaining, onehot_df], axis=1)

    # 3) Resample with ADASYN (default sampling strategy, seeded).
    sampler = ADASYN(random_state=seed)
    X_resampled, y_resampled = sampler.fit_resample(model_input, target)

    # 4) Decode the one-hot block back to the original categorical columns.
    resampled_df = pd.DataFrame(X_resampled, columns=model_input.columns)
    decoded_values = encoder.inverse_transform(resampled_df[onehot_names])
    decoded_df = pd.DataFrame(decoded_values, columns=cat_cols)

    # 5) Reassemble the features and restore the original column order.
    restored = pd.concat([resampled_df.drop(columns=onehot_names), decoded_df], axis=1)
    restored = restored[features.columns]

    # 6) Append the resampled target.
    return pd.concat([restored, y_resampled.reset_index(drop=True)], axis=1)

def perform_rfe_cv(df_train, min_features_to_select):
    """Select features with cross-validated recursive feature elimination.

    Only int64/float64 columns take part in the elimination; every
    object/category column is appended to the result unconditionally.

    Parameters
    ----------
    df_train : DataFrame containing the features plus the 'attack_cat' target.
    min_features_to_select : int, lower bound forwarded to RFECV.

    Returns
    -------
    list of column names: the numeric features kept by RFECV followed by all
    categorical feature names.
    """
    features = df_train.drop(columns=['attack_cat'])
    target = df_train['attack_cat']

    categorical_names = features.select_dtypes(include=['object', 'category']).columns
    numeric_frame = features.select_dtypes(include=['int64', 'float64'])

    # GPU LightGBM with GOSS sampling is the estimator ranked by RFECV.
    estimator = lgb.LGBMClassifier(
        random_state=seed,
        data_sample_strategy='goss',
        verbose=-1,
        device='gpu',
    )
    # Two-fold cross-validation, folds evaluated in parallel.
    selector = RFECV(
        estimator,
        min_features_to_select=min_features_to_select,
        n_jobs=-1,
        verbose=0,
        cv=2,
    )
    selector.fit(numeric_frame, target)

    return [*selector.get_feature_names_out(), *categorical_names]

def compute_class_weights(y):
    """Return a ``{class_label: weight}`` mapping using sklearn's 'balanced' heuristic.

    Parameters
    ----------
    y : array-like of class labels.

    Returns
    -------
    dict keyed by the unique labels of ``y`` (in ``np.unique`` order).
    """
    labels = np.unique(y)
    weights = compute_class_weight(class_weight='balanced', classes=labels, y=y)
    return dict(zip(labels, weights))

def evaluate_model(model, X_test, y_test, fitting_time):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``, ``predict_proba`` and ``classes_``.
    X_test : test features.
    y_test : true labels for ``X_test``.
    fitting_time : float, seconds spent fitting; passed through unchanged.

    Returns
    -------
    dict with keys
        'classification_report' : dict produced by ``classification_report(output_dict=True)``
        'pr_auc_per_class'      : list of one-vs-rest average-precision scores, one per
                                  column of ``predict_proba``
        'pr_auc_overall'        : macro-averaged average precision
        'fitting_time'          : the value passed in
    """
    # Some estimators hand back predictions as an (n_samples, 1) column
    # (NOTE(review): CatBoost multiclass is believed to do so — confirm); flatten so
    # the report always receives a 1-D label vector.
    y_pred = np.asarray(model.predict(X_test)).ravel()
    y_pred_proba = model.predict_proba(X_test)

    # Column order of predict_proba follows model.classes_.
    classes = model.classes_

    # One indicator column per class for the one-vs-rest PR curves.
    y_test_binarized = label_binarize(y_test, classes=classes)

    # label_binarize collapses a two-class problem to a single column, which would
    # make the per-class indexing below fail; expand it to one column per class.
    if y_test_binarized.shape[1] == 1 and y_pred_proba.shape[1] == 2:
        y_test_binarized = np.hstack([1 - y_test_binarized, y_test_binarized])

    report = classification_report(y_test, y_pred, output_dict=True)

    # PR AUC per class (one-vs-rest), stored as plain floats so the result is
    # JSON-serialisable regardless of the metric's NumPy return type.
    pr_auc_per_class = [
        float(average_precision_score(y_test_binarized[:, i], y_pred_proba[:, i]))
        for i in range(y_pred_proba.shape[1])
    ]

    # PR AUC overall (macro average across classes).
    pr_auc_overall = float(
        average_precision_score(y_test_binarized, y_pred_proba, average='macro')
    )

    return {
        'classification_report': report,
        'pr_auc_per_class': pr_auc_per_class,
        'pr_auc_overall': pr_auc_overall,
        'fitting_time': fitting_time,
    }


def objectToCategory(df):
    """Return a copy of ``df`` with every object-dtype column replaced by integer codes.

    The codes come from ``pd.Categorical`` applied to each column on its own, so
    they reflect only the values present in this particular frame. The input
    frame is left unmodified.
    """
    encoded = df.copy()
    object_columns = encoded.select_dtypes(include=['object']).columns
    for column_name in object_columns:
        as_categorical = pd.Categorical(encoded[column_name])
        encoded[column_name] = as_categorical.codes
    return encoded


def export_all_results(all_results):
    """Write the nested results dictionary to ``output/all_scenarios_results.json``.

    NumPy scalars and arrays (anything exposing ``tolist``) are converted to
    built-in Python values, so metrics returned as NumPy types do not abort the
    export. The document is serialised in memory before the file is opened, so
    a serialisation error cannot leave a truncated file behind.

    Parameters
    ----------
    all_results : dict mapping scenario name -> per-model result dictionaries.

    Raises
    ------
    TypeError
        If a value is neither JSON-native nor convertible through ``tolist``.
    """
    def _to_builtin(value):
        # json calls this only for objects it cannot encode natively.
        if hasattr(value, 'tolist'):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    # Ensure output directory exists
    output_dir = 'output'
    os.makedirs(output_dir, exist_ok=True)

    # Define the output file path
    file_path = f'{output_dir}/all_scenarios_results.json'

    # Serialise first, then write the finished text in one go.
    payload = json.dumps(all_results, indent=4, default=_to_builtin)
    with open(file_path, 'w') as json_file:
        json_file.write(payload)
    print(f"All results have been exported to {file_path}")

def _encode_with_shared_categories(frames):
    """Integer-encode object columns of several frames with one vocabulary per column.

    Encoding each frame on its own gives the same string different codes whenever
    the frames do not contain exactly the same set of values. Here the vocabulary
    of every object column is taken from all frames together, so a given value
    maps to the same code everywhere. Returns encoded copies in the input order;
    the inputs are not modified.
    """
    encoded = [frame.copy() for frame in frames]

    # Every column that is object-typed in at least one frame, in first-seen order.
    object_cols = []
    for frame in encoded:
        for col in frame.select_dtypes(include=['object']).columns:
            if col not in object_cols:
                object_cols.append(col)

    for col in object_cols:
        holders = [frame for frame in encoded if col in frame.columns]
        pooled = pd.concat([frame[col] for frame in holders], ignore_index=True)
        # Same category inference as a per-frame pd.Categorical, but over all values.
        vocabulary = pd.Categorical(pooled).categories
        for frame in holders:
            frame[col] = pd.Categorical(frame[col], categories=vocabulary).codes

    return encoded


def run_scenario(scenario, df_train, df_test, selected_features, df_train_adasyn, class_weights):
    """Train LightGBM, XGBoost and CatBoost for one scenario and evaluate each on the test set.

    Parameters
    ----------
    scenario : list of str; may contain 'ADASYN', 'RFE-CV' and/or 'CLASS WEIGHT'.
    df_train, df_test : frames holding the features plus the 'attack_cat' target.
    selected_features : column names used for train and test when 'RFE-CV' is active.
    df_train_adasyn : oversampled training frame used instead of ``df_train`` when
        'ADASYN' is active.
    class_weights : dict of class -> weight; ignored unless 'CLASS WEIGHT' is active.

    Returns
    -------
    dict mapping model name -> result dictionary returned by ``evaluate_model``.
    """
    # Encode categorical strings with a vocabulary shared by all three frames so
    # that train, test and the oversampled frame agree on every integer code.
    df_train, df_test, df_train_adasyn = _encode_with_shared_categories(
        [df_train, df_test, df_train_adasyn]
    )

    X_test, y_test = df_test.drop(columns=['attack_cat']), df_test['attack_cat']

    # Training data: the oversampled frame when ADASYN is part of the scenario.
    train_frame = df_train_adasyn if 'ADASYN' in scenario else df_train
    X_train, y_train = train_frame.drop(columns=['attack_cat']), train_frame['attack_cat']

    if 'RFE-CV' in scenario:
        X_train = X_train[selected_features]
        X_test = X_test[selected_features]

    use_class_weights = 'CLASS WEIGHT' in scenario
    if not use_class_weights:
        class_weights = None

    # Note: LightGBM is configured with 1000 estimators, the other two with 100.
    models = {
        'LightGBM': lgb.LGBMClassifier(
            random_state=seed,
            objective='multiclass',
            n_estimators=1000,
            learning_rate=0.1,
            class_weight=class_weights,
            data_sample_strategy='goss',
            verbose=-1,
            device="gpu"
        ),
        'XGBoost': xgb.XGBClassifier(
            random_state=seed,
            n_estimators=100,
            learning_rate=0.1,
            enable_categorical=True,
            tree_method="approx",
            device="cuda"
        ),
        'CatBoost': cb.CatBoostClassifier(
            random_seed=seed,
            n_estimators=100,
            learning_rate=0.1,
            class_weights=class_weights,
            verbose=0,
            cat_features=list(cat_features),
            task_type="GPU"
        )
    }

    results = {}

    for model_name, model in models.items():
        fit_kwargs = {}
        if model_name == 'XGBoost' and use_class_weights:
            # XGBoost gets the class weights as per-sample weights at fit time;
            # they are computed before the timer starts, as before.
            fit_kwargs['sample_weight'] = class_weight.compute_sample_weight(
                class_weight=class_weights,
                y=y_train,
            )

        start_time = time.time()
        model.fit(X_train, y_train, **fit_kwargs)
        fitting_time = time.time() - start_time

        results[model_name] = evaluate_model(model, X_test, y_test, fitting_time)

    return results


In [None]:
# One-off preprocessing shared by every scenario in the experiment loop.
# Feature subset: RFECV over the numeric columns, keeping at least 10 of them.
selected_features = perform_rfe_cv(df_train, min_features_to_select=10)
# 'balanced' class weights derived from the original training labels.
class_weights = compute_class_weights(y=df_train['attack_cat'])
# ADASYN-oversampled copy of the training frame.
df_train_adasyn = apply_adasyn(df_train=df_train)

In [None]:
# Timestamp helper (generated from the prompt "print current time"); used to note
# when the long-running cells start.

import time

# NOTE(review): `time` is already imported in the notebook's import cell.

def print_current_time():
    """Print the local wall-clock time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(f"Current time: {current_time}")


# Log the current time at this point of the run.
print_current_time()
# 1038  -- NOTE(review): unexplained marker left by the author (possibly a clock
# time jotted down by hand); confirm its meaning or remove it.


In [None]:
# Every on/off combination of the three techniques; the empty list is the baseline.
scenarios = [
    [],
    ['RFE-CV'], ['ADASYN'], ['CLASS WEIGHT'],
    ['RFE-CV', 'ADASYN'], ['RFE-CV', 'CLASS WEIGHT'], ['ADASYN', 'CLASS WEIGHT'],
    ['RFE-CV', 'ADASYN', 'CLASS WEIGHT'],
]

# scenario name -> {model name -> metrics}
all_results = {}
for idx, scenario in enumerate(scenarios, start=1):
    # Joining an empty scenario gives "", which falls through to the baseline label.
    scenario_name = " + ".join(scenario) or "NO RFE-CV, NO ADASYN, NO CLASS WEIGHT"
    print(f"Running Scenario {idx}: {scenario_name}")
    results = run_scenario(
        scenario,
        df_train,
        df_test,
        selected_features,
        df_train_adasyn,
        class_weights,
    )
    all_results[scenario_name] = results

# Persist everything in a single JSON document once all scenarios have finished.
export_all_results(all_results)

df_train.info()