In [None]:
import os
import warnings

from pathlib import Path

import numpy as np
import optuna
import pandas as pd
import psutil
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
#import shap

from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier




warnings.filterwarnings("ignore", category=DeprecationWarning)
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

In [None]:
DATA_PATH = Path('..') / 'data'
RAW_DATA_PATH = DATA_PATH / 'raw'
PROCESSED_DATA_PATH = DATA_PATH / 'processed_v3'

MODELS_PATH = Path('..') / 'models'


In [None]:
columns = (
['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count',
 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack', 'level'])

In [None]:
df_train = pd.read_csv(RAW_DATA_PATH / 'KDDTrain+.txt')
df_test = pd.read_csv(RAW_DATA_PATH / 'KDDTest+.txt')

df_train.columns = columns
df_test.columns = columns

df_train.drop('level', axis=1, inplace=True)
df_test.drop('level', axis=1, inplace=True)

# Optimization for Big data

based on:  https://www.kaggle.com/bextuychiev/how-to-work-w-million-row-datasets-like-a-pro

In [None]:
def cpu_stats():
    pid = os.getpid()
    py = psutil.Process(pid)
    memory_use = py.memory_info()[0] / 2. ** 30
    return 'memory GB:' + str(np.round(memory_use, 2))

def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                        c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                        c_min > np.finfo(np.float32).min
                        and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [None]:
df_train = reduce_memory_usage(df_train, verbose=True)
df_test = reduce_memory_usage(df_test, verbose=True)

print(cpu_stats())
print('Memory reduced')

# Data Preprocessing

In [None]:
le = LabelEncoder()
df_train['protocol_type'] = le.fit_transform(df_train['protocol_type'])
df_test['protocol_type'] = le.transform(df_test['protocol_type'])
df_train['service'] = le.fit_transform(df_train['service'])
df_test['service'] = le.transform(df_test['service'])
df_train['flag'] = le.fit_transform(df_train['flag'])
df_test['flag'] = le.transform(df_test['flag'])

In [None]:
label = []
for i in df_train.attack:
    if i == 'normal':
        label.append(0)
    else:
        label.append(1)
df_train['label'] = label

label_test = []
for i in df_test.attack:
    if i == 'normal':
        label_test.append(0)
    else:
        label_test.append(1)
df_test['label'] = label_test

In [None]:
df_train.drop('attack', axis=1, inplace=True, errors='ignore')
df_test.drop('attack', axis=1, inplace=True, errors='ignore')


In [None]:
y_train_df = df_train.label

X_train_df = df_train.drop('label', axis=1, inplace=False, errors='ignore')

X_train, X_val, y_train, y_val = train_test_split(X_train_df, y_train_df, test_size=0.2)

In [None]:
X_test = df_test.drop('label', axis=1, inplace=False, errors='ignore')
y_test = df_test['label']


In [None]:
# save data

X_train.to_csv(PROCESSED_DATA_PATH / 'X_train.csv', index=False)
X_val.to_csv(PROCESSED_DATA_PATH / 'X_val.csv', index=False)
y_train.to_csv(PROCESSED_DATA_PATH / 'y_train.csv', index=False)
y_val.to_csv(PROCESSED_DATA_PATH / 'y_val.csv', index=False)
X_test.to_csv(PROCESSED_DATA_PATH / 'X_test.csv', index=False)
y_test.to_csv(PROCESSED_DATA_PATH / 'y_test.csv', index=False)

# Run XGBoost Classifier

In [25]:
   
xgboost_model = XGBClassifier()

xgboost_model.fit(X_train, y_train, verbose=False)
y_pred = xgboost_model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)




              precision    recall  f1-score   support

           0       0.69      0.97      0.81      9711
           1       0.97      0.67      0.79     12832

    accuracy                           0.80     22543
   macro avg       0.83      0.82      0.80     22543
weighted avg       0.85      0.80      0.80     22543



# Save Model And Predictions

In [None]:
xgboost_model.save_model(MODELS_PATH / f'xgboost_model.json')
np.save(PROCESSED_DATA_PATH / 'y_pred.npy', y_pred[:22500], allow_pickle=True)