In [1]:
import os
import sys
# Put the parent of the current working directory on the import path (once),
# presumably so that the project-local `src.*` package can be imported.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, f1_score, accuracy_score, multilabel_confusion_matrix, ConfusionMatrixDisplay, confusion_matrix
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from src.data_processor.phase_3.prob2.v1 import  Phase3Prob2FeatureProcessor
import numpy as np
import gc
from collections import Counter

In [3]:
# Anonymised column ids ("featureN") -> descriptive column names.
# Insertion order is feature1..feature41 followed by the target column.
feat_to_name = {
    'feature1': 'dur',
    'feature2': 'proto',
    'feature3': 'service',
    'feature4': 'state',
    'feature5': 'spkts',
    'feature6': 'dpkts',
    'feature7': 'sbytes',
    'feature8': 'dbytes',
    'feature9': 'sttl',
    'feature10': 'dttl',
    'feature11': 'sload',
    'feature12': 'dload',
    'feature13': 'sloss',
    'feature14': 'dloss',
    'feature15': 'sinpkt',
    'feature16': 'dinpkt',
    'feature17': 'sjit',
    'feature18': 'djit',
    'feature19': 'swin',
    'feature20': 'stcpb',
    'feature21': 'dtcpb',
    'feature22': 'dwin',
    'feature23': 'tcprtt',
    'feature24': 'synack',
    'feature25': 'ackdat',
    'feature26': 'smean',
    'feature27': 'dmean',
    'feature28': 'trans_depth',
    'feature29': 'response_body_len',
    'feature30': 'ct_srv_src',
    'feature31': 'ct_state_ttl',
    'feature32': 'ct_dst_ltm',
    'feature33': 'ct_src_dport_ltm',
    'feature34': 'ct_dst_sport_ltm',
    'feature35': 'ct_dst_src_ltm',
    'feature36': 'is_ftp_login',
    'feature37': 'ct_ftp_cmd',
    'feature38': 'ct_flw_http_mthd',
    'feature39': 'ct_src_ltm',
    'feature40': 'ct_srv_dst',
    'feature41': 'is_sm_ips_ports',
    'label': 'label',
}

# Reverse lookup: descriptive column name -> anonymised column id.
name_to_feat = {name: feat for feat, name in feat_to_name.items()}

In [4]:
# Load the raw training parquet and clean it in two passes, printing the
# frame shape after each step:
#   1. keep a single copy of rows that are identical in every column;
#   2. drop *every* row (keep=False) whose values in all-but-the-last column
#      also occur in another row. After pass 1 such rows can only differ in
#      the last column (presumably the label — TODO confirm column order).
df = pd.read_parquet("F:/Data/MLOPS_2023/data_phase-3/phase-3/prob-2/raw_train.parquet")
print(df.shape)
df = df.drop_duplicates().reset_index(drop=True)
print(df.shape)
# The index is intentionally left as-is after this second pass (it is not reset).
df = df.drop_duplicates(subset=df.columns[:-1], keep=False)
print(df.shape)

(92762, 42)
(65191, 42)
(59742, 42)


In [6]:
# Read the two UNSW_NB15 CSV files and stack them into one frame; the second
# frame is released right after the concat.
df2 = pd.read_csv("F:/Data/MLOPS_2023/UNSW_NB15_training-set.csv")
df3 = pd.read_csv("F:/Data/MLOPS_2023/UNSW_NB15_testing-set.csv")
df2 = pd.concat([df2, df3])
del df3
gc.collect()

# Collapse the `attack_cat` values into the six classes stored in `label`.
# Any category missing from this table becomes NaN after .map(); the
# value_counts(dropna=False) print below makes such cases visible.
mapping = {
    "Normal": "Normal",
    "DoS": "Denial of Service",
    "Reconnaissance": "Information Gathering",
    "Analysis": "Information Gathering",
    "Exploits": "Exploits",
    "Shellcode": "Malware",
    "Worms": "Malware",
    "Backdoor": "Malware",
    "Generic": "Other",
    "Fuzzers": "Denial of Service",
}
df2["label"] = df2["attack_cat"].map(mapping)
print(df2["label"].value_counts(dropna=False))
print(df2.shape)

# Remove the columns that have no entry in `name_to_feat`, then keep one copy
# of each fully identical row.
df2 = df2.drop(columns=['id', 'rate', 'attack_cat']).drop_duplicates()
print(df2["label"].value_counts(dropna=False))
print(df2.shape)

# Drop every row whose all-but-last-column values also occur in another row
# (keep=False), i.e. rows that now differ only in the last column.
df2 = df2.drop_duplicates(subset=df2.columns[:-1], keep=False)
print(df2.shape)

# Rename descriptive column names to the anonymised "featureN" ids; an
# unknown column name raises KeyError.
df2.columns = list(map(name_to_feat.__getitem__, df2.columns))


Normal                   93000
Other                    58871
Exploits                 44525
Denial of Service        40599
Information Gathering    16664
Malware                   4014
Name: label, dtype: int64
(257673, 45)
Normal                   85722
Exploits                 27434
Denial of Service        24718
Information Gathering    10686
Other                     7599
Malware                   3507
Name: label, dtype: int64
(159666, 42)
(151246, 42)


In [7]:
# Stack both cleaned sources and keep a single copy of each fully identical
# row, then renumber the rows 0..n-1.
# NOTE(review): only exact duplicates are removed here. Rows that share all
# feature values but carry different labels in `df` and `df2` survive, unlike
# the keep=False pass applied to each source on its own — confirm this is
# intended.
df3 = pd.concat([df, df2]).drop_duplicates()
print(df3.shape)
df3 = df3.reset_index(drop=True)

(151614, 42)


In [40]:
# Hyper-parameters for the LGBMClassifier runs (those runs are currently
# commented out in the training cells).
HyperParameters = {
    'objective': 'multiclass',
    'n_estimators': 200,
    'learning_rate': 0.7,
    'max_depth': 32,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    # LightGBM ignores `subsample` (bagging_fraction) unless bagging is
    # switched on with a non-zero `subsample_freq` (bagging_freq); without
    # this entry the 0.8 row sampling above silently never happens.
    'subsample_freq': 1,
    'reg_alpha': 1.2,
    'reg_lambda': 10,
    'random_state': 42,
}

# Working copy, so experiments can tweak values without touching the baseline.
hyper_parameters = HyperParameters.copy()

In [11]:
# Name of the target column in `df3`.
TARGET = 'label'

# Build the model input frame with the project's feature processor.
# `data_features` is read only after transform() has run, in case the
# processor fills it in during the transform — TODO confirm.
processor = Phase3Prob2FeatureProcessor()
new_df = processor.transform(df3)
feature_spec = processor.data_features
FEATURES = feature_spec['features']
categorical = feature_spec['categorical_features']

In [49]:
# 5-fold stratified cross-validation of a CatBoost classifier.
# Every fold's fitted model is kept in `models`, its validation accuracy in
# `scores`, and its validation predictions are written into `oofs` at the
# validation row positions, so `oofs` ends up holding one out-of-fold
# prediction per row of `df3`.
# (An LGBMClassifier(**hyper_parameters) variant with logloss evaluation was
# tried here previously and is currently disabled.)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models, scores = [], []
oofs = np.empty(len(df3), dtype=object)

for i, (train_idx, valid_idx) in enumerate(kfold.split(new_df, df3[TARGET])):
    # Positional split: `new_df` and `df3` are indexed with the same row
    # positions, so they are assumed to be row-aligned.
    X_train, X_valid = new_df.iloc[train_idx], new_df.iloc[valid_idx]
    y_train, y_valid = df3[TARGET].iloc[train_idx], df3[TARGET].iloc[valid_idx]
    print(X_train.shape, X_valid.shape)
    print(y_train.shape, y_valid.shape)

    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.1,
        max_depth=8,
        random_state=42,
        eval_metric='Accuracy',
        thread_count=-1,
    )
    # Both the training fold and the validation fold are tracked as eval sets.
    model.fit(
        X_train, y_train,
        cat_features=categorical,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
    )
    models.append(model)

    # predict() output is indexed as 2-D; column 0 is taken as the label.
    y_pred = model.predict(X_valid)[:, 0]
    print(y_pred)
    oofs[valid_idx] = y_pred

    fold_accuracy = accuracy_score(y_valid, y_pred)
    print(f"fold {i} : {fold_accuracy}")
    print(classification_report(y_valid, y_pred))
    scores.append(fold_accuracy)

# Summary: mean/std of the fold accuracies, then metrics over all
# out-of-fold predictions at once.
print(np.mean(scores), np.std(scores))
print(classification_report(df3[TARGET], oofs))
print(accuracy_score(df3[TARGET], oofs))

(121291, 41) (30323, 41)
(121291,) (30323,)
0:	learn: 0.7804454	test: 0.7804454	test1: 0.7772318	best: 0.7772318 (0)	total: 278ms	remaining: 2m 18s
1:	learn: 0.7920950	test: 0.7920950	test1: 0.7914784	best: 0.7914784 (1)	total: 555ms	remaining: 2m 18s
2:	learn: 0.8034397	test: 0.8034397	test1: 0.8015698	best: 0.8015698 (2)	total: 808ms	remaining: 2m 13s
3:	learn: 0.8063335	test: 0.8063335	test1: 0.8056591	best: 0.8056591 (3)	total: 1.07s	remaining: 2m 12s
4:	learn: 0.8055338	test: 0.8055338	test1: 0.8033506	best: 0.8056591 (3)	total: 1.33s	remaining: 2m 11s
5:	learn: 0.8048825	test: 0.8048825	test1: 0.8029878	best: 0.8056591 (3)	total: 1.58s	remaining: 2m 10s
6:	learn: 0.8145947	test: 0.8145947	test1: 0.8123537	best: 0.8123537 (6)	total: 1.84s	remaining: 2m 9s
7:	learn: 0.8147348	test: 0.8147348	test1: 0.8140026	best: 0.8140026 (7)	total: 2.11s	remaining: 2m 9s
8:	learn: 0.8140917	test: 0.8140917	test1: 0.8126834	best: 0.8140026 (7)	total: 2.39s	remaining: 2m 10s
9:	learn: 0.8156994	te

In [47]:
# Disabled alternative: fit an LGBMClassifier on the full data set and pickle
# it. Everything in this cell is commented out, so running it does nothing;
# the multi_logloss lines shown as this cell's output come from an earlier
# execution. Note that it targets the same checkpoint path
# (../checkpoints/phase-3/prob-2/v1.pkl) as the CatBoost cell, so enabling
# both would make the later one overwrite the earlier one's file.
# gmodel = LGBMClassifier(**hyper_parameters)
# gmodel.fit(new_df, df3[TARGET],
#            eval_set=[(new_df,df3[TARGET])],
#            eval_metric=["logloss"],
#            categorical_feature=categorical,
#            verbose=50)

# with open(f'../checkpoints/phase-3/prob-2/v1.pkl','wb') as file:
#     pickle.dump(gmodel, file)

[50]	training's multi_logloss: 0.233132
[100]	training's multi_logloss: 0.174968
[150]	training's multi_logloss: 0.139594
[200]	training's multi_logloss: 0.116658


In [13]:
# Train the final CatBoost model on the complete data set and pickle it.
checkpoint_path = '../checkpoints/phase-3/prob-2/v1.pkl'
# Make sure the target directory exists *before* the multi-minute fit, so a
# missing folder cannot cost us the trained model at dump time.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# NOTE(review): learning_rate here is 0.5 while the cross-validation cell
# uses 0.1, so the CV scores do not describe exactly this configuration —
# confirm which value is intended.
gmodel = CatBoostClassifier(iterations=500,
                        learning_rate=0.5, max_depth=8,
                        random_state=42, eval_metric='Accuracy', thread_count=-1,
                        )
# The eval set is the training data itself, so the "test" metric in the log
# is an in-sample figure, not a held-out one.
gmodel.fit(new_df, df3[TARGET],
        cat_features=categorical,
        eval_set=[(new_df, df3[TARGET])]
        )

with open(checkpoint_path, 'wb') as file:
    pickle.dump(gmodel, file)

0:	learn: 0.7841096	test: 0.7841096	best: 0.7841096 (0)	total: 339ms	remaining: 2m 49s
1:	learn: 0.8182226	test: 0.8184337	best: 0.8184337 (1)	total: 679ms	remaining: 2m 49s
2:	learn: 0.8252074	test: 0.8252932	best: 0.8252932 (2)	total: 1.05s	remaining: 2m 53s
3:	learn: 0.8304642	test: 0.8305368	best: 0.8305368 (3)	total: 1.37s	remaining: 2m 49s
4:	learn: 0.8345865	test: 0.8346657	best: 0.8346657 (4)	total: 1.72s	remaining: 2m 49s
5:	learn: 0.8365257	test: 0.8365388	best: 0.8365388 (5)	total: 2.05s	remaining: 2m 48s
6:	learn: 0.8392827	test: 0.8393750	best: 0.8393750 (6)	total: 2.38s	remaining: 2m 47s
7:	learn: 0.8420858	test: 0.8421782	best: 0.8421782 (7)	total: 2.68s	remaining: 2m 45s
8:	learn: 0.8440052	test: 0.8441041	best: 0.8441041 (8)	total: 2.99s	remaining: 2m 43s
9:	learn: 0.8445724	test: 0.8446252	best: 0.8446252 (9)	total: 3.3s	remaining: 2m 41s
10:	learn: 0.8460169	test: 0.8460630	best: 0.8460630 (10)	total: 3.62s	remaining: 2m 40s
11:	learn: 0.8467226	test: 0.8467886	best:

In [14]:
# In-sample predictions: `new_df` is the same frame `gmodel` was fitted on.
# The predict() output is indexed as 2-D; column 0 is taken as the label.
train_predictions = gmodel.predict(new_df)
y_pred = train_predictions[:, 0]

In [15]:
# Per-class precision / recall / F1 of `y_pred` against the true labels.
training_report = classification_report(df3[TARGET], y_pred)
print(training_report)

                       precision    recall  f1-score   support

    Denial of Service       0.89      0.79      0.84     22522
             Exploits       0.92      0.96      0.94     25542
Information Gathering       0.95      0.93      0.94      8891
              Malware       0.88      0.92      0.90      1961
               Normal       0.96      0.97      0.97     85410
                Other       0.99      0.98      0.99      7288

             accuracy                           0.94    151614
            macro avg       0.93      0.93      0.93    151614
         weighted avg       0.94      0.94      0.94    151614



In [45]:
# Rank the model's input columns by importance score, highest first.
# `gmodel` is a CatBoostClassifier in this notebook, and CatBoost exposes the
# column names as `feature_names_`; `feature_name_` is the LightGBM spelling
# and raises AttributeError on a CatBoost model. Try the CatBoost attribute
# first and fall back to the LightGBM one so either estimator works.
feature_names = getattr(gmodel, 'feature_names_', None)
if feature_names is None:
    feature_names = gmodel.feature_name_
feats = pd.DataFrame({'feature': list(feature_names), 'score': gmodel.feature_importances_}).sort_values('score', ascending=False)

In [46]:
# Names of the 15 highest-scoring features (`feats` is sorted by descending score).
feats['feature'].to_numpy()[:15]

array(['feature26', 'feature21', 'feature7', 'feature20', 'feature25',
       'feature24', 'feature11', 'feature23', 'feature1', 'feature18',
       'feature12', 'feature17', 'feature15', 'feature16', 'feature30'],
      dtype=object)