# Import libraries

In [45]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_validate
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import random

# Single seed reused by every splitter, sampler and model in the notebook.
seed = 2022
random.seed(seed)
# NumPy keeps its own global RNG, separate from Python's `random`; scikit-learn
# and imbalanced-learn objects created with random_state=None draw from it,
# so it has to be seeded as well for runs to be repeatable.
np.random.seed(seed)

# Input file

In [3]:
# Read the flow-level CSV, report its (rows, columns) shape, then preview
# the first five records as the cell output.
df = pd.read_csv(filepath_or_buffer='../ALLFLOWMETER_HIKARI2021.csv')
print(df.shape)
df.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,uid,originh,originp,responh,responp,flow_duration,fwd_pkts_tot,bwd_pkts_tot,...,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,traffic_category,Label
0,0,0,Cg61Jch3vdz9DBptj,103.255.15.23,13316,128.199.242.104,443,2.207588,15,14,...,0.0,0.0,0.0,0.0,0.0,29200,65160,0,Bruteforce-XML,1
1,1,1,CdRIlqLWdj35Y9vW9,103.255.15.23,13318,128.199.242.104,443,15.624266,15,14,...,15343000.0,15343000.0,15343000.0,15343000.0,0.0,29200,65160,0,Bruteforce-XML,1
2,2,2,CLzp9Khd0Y09Qkgrg,103.255.15.23,13320,128.199.242.104,443,12.203357,14,13,...,11968140.0,11968140.0,11968140.0,11968140.0,0.0,29200,65160,0,Bruteforce-XML,1
3,3,3,Cnf1YA4iLB4CSNWB88,103.255.15.23,13322,128.199.242.104,443,9.992448,14,13,...,9759205.0,9759205.0,9759205.0,9759205.0,0.0,29200,65160,0,Bruteforce-XML,1
4,4,4,C4ZKvv3fpO72EAOsJ6,103.255.15.23,13324,128.199.242.104,443,7.780611,14,14,...,7545305.0,7545305.0,7545305.0,7545305.0,0.0,29200,65160,0,Bruteforce-XML,1


# Setup columns
There are 
* 88 columns in total. Some of them are `Unnamed` columns, which are dropped below.
* 79 feature columns. All of them are numerical columns, either float64 or int64.
* No categorical feature columns.
* Two target columns
  * Label: Benign or attack
  * traffic_category: Benign has two categories (Benign or Background). Attack has four (Bruteforce, Bruteforce-XML, Probing, and XMRIGCC CryptoMiner)

In [4]:
# Flow identifiers (uid plus origin/responder host and port) and the two
# target columns; neither group is used as a model feature.
id_columns = ['uid', 'originh', 'originp', 'responh', 'responp']
target_columns = ['traffic_category', 'Label']

# Drop the `Unnamed:` columns from the frame in place.
unnamed_columns = [name for name in df.columns if 'Unnamed:' in name]
df.drop(columns=unnamed_columns, inplace=True)

In [25]:
# A feature is any column that is neither an identifier nor a target.
non_feature_columns = set(id_columns) | set(target_columns)
feature_columns = [name for name in df.columns if name not in non_feature_columns]

# Partition the features by dtype: integer/float columns are numerical,
# everything else is treated as categorical.
numeric_dtypes = [int, np.int64, float, np.float64]
numerical_columns = [name for name in feature_columns if df[name].dtype in numeric_dtypes]
categorical_columns = [name for name in feature_columns if name not in numerical_columns]

print(len(numerical_columns), len(categorical_columns))

# Preprocessing

In [30]:
# Class balance of the binary target: attacks (1) are far rarer than benign flows (0).
df[target_columns[-1]].value_counts()

0    517582
1     37696
Name: Label, dtype: int64

# Training

In [29]:
# Re-check the class counts of the `Label` target before building the training data.
df.loc[:, target_columns[1]].value_counts()

0    517582
1     37696
Name: Label, dtype: int64

In [32]:
# Design matrix (all feature columns) and the binary `Label` target.
label_column = target_columns[1]
X, Y = df[feature_columns], df[label_column]

In [41]:
def result(y_true, y_prob, threshold=0.5):
    """Print and return ROC AUC plus the F1 score of each class.

    Args:
        y_true: Ground-truth binary labels (1 = attack, 0 = benign).
        y_prob: Predicted score/probability of class 1 for every sample.
        threshold: Samples whose score is strictly greater than this value
            are predicted as class 1. The default of 0.5 gives the same
            predictions as rounding probabilities in [0, 1] with
            ``np.round`` (which sends exactly 0.5 to 0).

    Returns:
        Tuple ``(auc, f1_pos, f1_neg)``: ROC AUC computed on the raw scores,
        F1 with class 1 as the positive label, and F1 with class 0 as the
        positive label.
    """
    y_prob = np.asarray(y_prob)

    # AUC is threshold-free, so it is computed on the raw scores.
    auc = roc_auc_score(y_true, y_prob)

    # Hard predictions are only needed for the F1 scores.
    y_pred = (y_prob > threshold).astype(int)
    f1_pos = f1_score(y_true, y_pred, pos_label=1)
    f1_neg = f1_score(y_true, y_pred, pos_label=0)

    print(f'AUC {np.round(auc, 2)}, F1 for attacks {np.round(f1_pos, 2)}, F1 for benigns {np.round(f1_neg, 2)}')

    return auc, f1_pos, f1_neg

## Train test validation

In [None]:
# Shuffle the rows and hold out 20% of them as the test split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=seed,
)

### Unweighted learning

In [43]:
# Baseline forest: no class weighting (class_weight=None).
model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    class_weight=None,
    random_state=seed,
).fit(x_train, y_train)
model

RandomForestClassifier(random_state=2022)

In [44]:
# Score the fitted model on the data it was trained on; the last
# predict_proba column is the probability of the attack class.
print('Training performance')
result(y_train, model.predict_proba(x_train)[:, -1])

# Score the held-out split; the returned metrics tuple is the cell output.
print('Testing performance')
y_prob = model.predict_proba(x_test)[:, -1]
result(y_test, y_prob)

Training performance
AUC 0.98, F1 for attacks 0.58, F1 for benigns 0.97
Testing performance
AUC 0.89, F1 for attacks 0.1, F1 for benigns 0.94


(0.8899262759666244, 0.10494856825132055, 0.938004274903237)

### Weighted learning

In [35]:
# Same forest as the unweighted run, but with class_weight='balanced'.
model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    class_weight='balanced',
    random_state=seed,
).fit(x_train, y_train)
model

RandomForestClassifier(class_weight='balanced', random_state=2022)

In [42]:
# Score the class-weighted model on its own training data; the last
# predict_proba column is the probability of the attack class.
print('Training performance')
result(y_train, model.predict_proba(x_train)[:, -1])

# Score the held-out split; the returned metrics tuple is the cell output.
print('Testing performance')
y_prob = model.predict_proba(x_test)[:, -1]
result(y_test, y_prob)

Training performance
AUC 0.98, F1 for attacks 0.73, F1 for benigns 0.97
Testing performance
AUC 0.89, F1 for attacks 0.13, F1 for benigns 0.94


(0.8905138235652679, 0.13458879304668464, 0.936484233417905)

## Cross validation

In [None]:
# Shuffled 10-fold splitter used by the cross-validation runs.
folds = 10
kf = KFold(
    n_splits=folds,
    shuffle=True,
    random_state=seed,
)

### Weighted learning

In [47]:
# Cross-validate the class-weighted forest on the full data set.
model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    class_weight='balanced',
    random_state=seed,
)

scores = cross_validate(estimator=model, X=X, y=Y, cv=kf, scoring=["accuracy", "f1"])

# Summarise every returned array (timings and test scores) across the folds.
for key, values in scores.items():
    mean = values.mean()
    std = values.std()
    print(f"{key}, mean {mean}, std {std}")

fit_time, mean 171.3735337972641, std 5.079239326246589
score_time, mean 0.9671569108963013, std 0.0858398812749709
test_accuracy, mean 0.8753291185510298, std 0.0011769176742212166
test_f1, mean 0.09538201442208605, std 0.0030848086348229794


### Unweighted learning

In [None]:
# Cross-validate the forest without class weighting on the full data set.
model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    class_weight=None,
    random_state=seed,
)

scores = cross_validate(estimator=model, X=X, y=Y, cv=kf, scoring=["accuracy", "f1"])

# Summarise every returned array (timings and test scores) across the folds.
for key, values in scores.items():
    mean = values.mean()
    std = values.std()
    print(f"{key}, mean {mean}, std {std}")

### Sampling

In [None]:
import time
import tqdm
# imblearn's Pipeline is used instead of sklearn.pipeline.Pipeline because
# it accepts sampler steps such as RandomUnderSampler.
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold

# time.clock() was removed in Python 3.8; perf_counter() is its replacement
# for measuring elapsed time.
start = time.perf_counter()
y_probs = []
y_vals = []

steps = [
    # Under-sample the majority class; seeded so that every run (including
    # the n_jobs worker processes) draws the same subsample.
    ('under', RandomUnderSampler(sampling_strategy='majority', random_state=seed)),
    (
        'model', RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None,
                class_weight='balanced', random_state=seed
            )
    )
]
pipeline = Pipeline(steps=steps)

# Use the stratified splitter built here (the original passed the plain
# KFold `kf`, leaving `cv` unused).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=seed)
# NOTE(review): for a binary target, micro-averaged F1 equals accuracy;
# confirm 'f1_micro' is the intended metric rather than 'f1'.
scores = cross_validate(pipeline, X, Y, scoring='f1_micro', cv=cv, n_jobs=-1)

# cross_validate returns a dict of arrays; iterate key/value pairs
# (iterating the dict directly yields only the key strings).
for key, value in scores.items():
    print(key, value)