In [31]:
import os
import warnings

from pathlib import Path

import numpy as np
import optuna
import pandas as pd
import psutil
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle


from xgboost import XGBClassifier


warnings.filterwarnings("ignore", category=DeprecationWarning)
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

In [32]:
DATA_PATH = Path('..') / 'data'
RAW_DATA_PATH = DATA_PATH / 'raw'
PROCESSED_DATA_PATH = DATA_PATH / 'processed_v3'

MODELS_PATH = Path('..') / 'models'


In [33]:
columns = (
['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count',
 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack', 'level'])

In [34]:
df_train = pd.read_csv(RAW_DATA_PATH / 'KDDTrain+.csv')
df_test = pd.read_csv(RAW_DATA_PATH / 'KDDTest+.csv')

df_train.columns = columns
df_test.columns = columns

df_train.drop('level', axis=1, inplace=True)
df_test.drop('level', axis=1, inplace=True)

# Optimization for Big data

based on:  https://www.kaggle.com/bextuychiev/how-to-work-w-million-row-datasets-like-a-pro

In [35]:
def cpu_stats():
    pid = os.getpid()
    py = psutil.Process(pid)
    memory_use = py.memory_info()[0] / 2. ** 30
    return 'memory GB:' + str(np.round(memory_use, 2))

def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                        c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                        c_min > np.finfo(np.float32).min
                        and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [36]:
df_train = reduce_memory_usage(df_train, verbose=True)
df_test = reduce_memory_usage(df_test, verbose=True)

print(cpu_stats())
print('Memory reduced')

Mem. usage decreased to 12.01 Mb (70.2% reduction)
Mem. usage decreased to 2.15 Mb (70.2% reduction)
memory GB:0.26
Memory reduced


# Data Preprocessing

In [37]:
le = LabelEncoder()
df_train['protocol_type'] = le.fit_transform(df_train['protocol_type'])
df_test['protocol_type'] = le.transform(df_test['protocol_type'])
df_train['service'] = le.fit_transform(df_train['service'])
df_test['service'] = le.transform(df_test['service'])
df_train['flag'] = le.fit_transform(df_train['flag'])
df_test['flag'] = le.transform(df_test['flag'])

In [38]:
label = []
for i in df_train.attack:
    if i == 'normal':
        label.append(0)
    else:
        label.append(1)
df_train['label'] = label

label_test = []
for i in df_test.attack:
    if i == 'normal':
        label_test.append(0)
    else:
        label_test.append(1)
df_test['label'] = label_test

### Prepare Test And Train Validation Sets

In [39]:
df_train.drop('attack', axis=1, inplace=True, errors='ignore')
df_test.drop('attack', axis=1, inplace=True, errors='ignore')


df_train = shuffle(df_train)
df_test = shuffle(df_test)


y_train = df_train.label

X_train = df_train.drop('label', axis=1, inplace=False, errors='ignore')

X_test = df_test.drop('label', axis=1, inplace=False, errors='ignore')
y_test = df_test['label']

### Validation Set

we use 25% of the test set as validation since we get overfitting if we do cross validaiotn on traning set

In [40]:
SIZE_VAL = 5000 

X_test = X_test.iloc[SIZE_VAL:]
y_test = y_test.iloc[SIZE_VAL:]
X_val  = X_test.iloc[:SIZE_VAL]
y_val  = y_test.iloc[:SIZE_VAL]

In [41]:
# save data

X_train.to_csv(PROCESSED_DATA_PATH / 'X_train.csv', index=False)
X_val.to_csv(PROCESSED_DATA_PATH / 'X_val.csv', index=False)
y_train.to_csv(PROCESSED_DATA_PATH / 'y_train.csv', index=False)
y_val.to_csv(PROCESSED_DATA_PATH / 'y_val.csv', index=False)
X_test.to_csv(PROCESSED_DATA_PATH / 'X_test.csv', index=False)
y_test.to_csv(PROCESSED_DATA_PATH / 'y_test.csv', index=False)

# Run XGBoost Classifier

#### trainning and validation set

In [42]:
xgboost_model = XGBClassifier()

xgboost_model.fit(X_train, y_train, verbose=False)
y_pred_val = xgboost_model.predict(X_val)
report = classification_report(y_val, y_pred_val)
print(report)

              precision    recall  f1-score   support

           0       0.70      0.97      0.81      2215
           1       0.96      0.67      0.79      2785

    accuracy                           0.80      5000
   macro avg       0.83      0.82      0.80      5000
weighted avg       0.84      0.80      0.80      5000



In [47]:
print(len(X_test))
print(len(y_test))
print(len(y_pred_test))

17543
17543
5000


#### test set

In [48]:
y_pred_test = xgboost_model.predict(X_test)
report_ = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

           0       0.70      0.97      0.81      2215
           1       0.96      0.67      0.79      2785

    accuracy                           0.80      5000
   macro avg       0.83      0.82      0.80      5000
weighted avg       0.84      0.80      0.80      5000



# Save Model And Predictions

In [49]:
xgboost_model.save_model(MODELS_PATH / f'xgboost_model.json')
np.save(PROCESSED_DATA_PATH / 'y_pred_val_xgboost.npy', y_pred_val, allow_pickle=True)
np.save(PROCESSED_DATA_PATH / 'y_pred_test_xgboost.npy', y_pred_test, allow_pickle=True)