In [None]:
import os
import warnings

from pathlib import Path

import numpy as np
import optuna
import pandas as pd
import psutil
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier




warnings.filterwarnings("ignore", category=DeprecationWarning)
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

In [None]:
DATA_PATH = Path('..') / 'data'
RAW_DATA_PATH = DATA_PATH / 'raw'
MODELS_PATH = Path('..') / 'models'


In [None]:
columns = (
['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count',
 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack', 'level'])

In [None]:
df_train = pd.read_csv(RAW_DATA_PATH / 'KDDTrain+.txt')
df_test = pd.read_csv(RAW_DATA_PATH / 'KDDTest+.txt')

df_train.columns = columns
df_test.columns = columns

df_train.drop('level', axis=1, inplace=True)
df_test.drop('level', axis=1, inplace=True)

# Optimization for Big data

based on:  https://www.kaggle.com/bextuychiev/how-to-work-w-million-row-datasets-like-a-pro

In [None]:
def cpu_stats():
    pid = os.getpid()
    py = psutil.Process(pid)
    memory_use = py.memory_info()[0] / 2. ** 30
    return 'memory GB:' + str(np.round(memory_use, 2))

def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                        c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                        c_min > np.finfo(np.float32).min
                        and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

# ML Helpers

In [None]:
def score(X, y, model, cv):
    scoring = ["roc_auc"]
    scores = cross_validate(
        model, X, y, scoring=scoring, cv=cv, return_train_score=True
    )
    scores = pd.DataFrame(scores).T
    return scores.assign(
        mean=lambda x: x.mean(axis=1),
        std=lambda x: x.std(axis=1),
    )

In [None]:
df_train = reduce_memory_usage(df_train, verbose=True)
df_test = reduce_memory_usage(df_test, verbose=True)

print(cpu_stats())
print('Memory reduced')

In [None]:
# plt.figure(figsize=(20, 20))
# sns.countplot(df_train['attack'])
# plt.xticks(rotation=45)
# plt.show()

# Data Preprocessing

In [None]:
le = LabelEncoder()
df_train['protocol_type'] = le.fit_transform(df_train['protocol_type'])
df_test['protocol_type'] = le.transform(df_test['protocol_type'])
df_train['service'] = le.fit_transform(df_train['service'])
df_test['service'] = le.transform(df_test['service'])
df_train['flag'] = le.fit_transform(df_train['flag'])
df_test['flag'] = le.transform(df_test['flag'])

In [None]:
label = []
for i in df_train.attack:
    if i == 'normal':
        label.append(0)
    else:
        label.append(1)
df_train['label'] = label

label_test = []
for i in df_test.attack:
    if i == 'normal':
        label_test.append(0)
    else:
        label_test.append(1)
df_test['label'] = label_test

In [None]:
df_train.drop('attack', axis=1, inplace=True, errors='ignore')
df_test.drop('attack', axis=1, inplace=True, errors='ignore')


In [None]:
df_test["kfold"] = -1
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=666)
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X=df_test)):
    df_test.loc[valid_indicies, "kfold"] = fold
features = [x for x in df_test.columns.values if x[0] == "f"]

useful_features = [c for c in df_test.columns if c not in ("label", "kfold")]

# Run XGBoost Classifier

In [None]:
n_estimators = 100 #7000
early_stopping_rounds = 3 #300

n_trials = 5

xgboost_model = None

In [None]:
def run(trial):
    print('running trail')
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = df_test[df_test.kfold != fold].reset_index(drop=True)
    xvalid = df_test[df_test.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.label
    yvalid = xvalid.label

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    xgboost_model = XGBClassifier(
        random_state=42,
        tree_method="auto",
        gpu_id=1,
        predictor="cpu_predictor",
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    xgboost_model.fit(xtrain, ytrain, early_stopping_rounds=early_stopping_rounds, eval_metric="aucpr", eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = xgboost_model.predict(xvalid)
    acc = accuracy_score(yvalid, preds_valid)
    
    if trial.number == (n_trials - 1):
        print(f'saving model after trail: {trial.number}')
        xgboost_model.save_model(MODELS_PATH / f'xgboost_model_{trial.number}.json')
    print('finished running trail')
    return acc

In [74]:
study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=n_trials)

print('saved model !')

running trail
[0]	validation_0-aucpr:0.98620


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)


[64]	validation_0-aucpr:0.99931
finished running trail
running trail
[0]	validation_0-aucpr:0.98551


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)


[5]	validation_0-aucpr:0.99151
finished running trail
running trail
[0]	validation_0-aucpr:0.95243
[15]	validation_0-aucpr:0.98342
finished running trail
running trail


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)


[0]	validation_0-aucpr:0.95293
[57]	validation_0-aucpr:0.99809
finished running trail
running trail
[0]	validation_0-aucpr:0.99113


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)


[45]	validation_0-aucpr:0.99941
saving model after trail: 4
finished running trail
saved model !


In [76]:
study.best_params
print(len(useful_features))


41
