In [1]:
import os
import sys
# Make the parent of the current working directory importable so that the
# project-local `src` package (imported in the next cell) can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    # Append only once, so re-running the cell does not add duplicates.
    sys.path.append(module_path)

In [2]:

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, f1_score, accuracy_score, multilabel_confusion_matrix, ConfusionMatrixDisplay, confusion_matrix
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from src.data_processor.phase_3.prob1.v1 import  Phase3Prob1FeatureProcessor
import numpy as np
import gc
from collections import Counter

In [3]:
# Readable column names in positional order: the i-th entry (1-based) is the
# readable name of the anonymised column `feature{i}`.
column_names = [
    'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes',
    'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt',
    'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack',
    'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len',
    'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm',
    'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd',
    'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports',
]

# Anonymised name -> readable name; the target column keeps its own name.
feat_to_name = {
    f'feature{position}': readable
    for position, readable in enumerate(column_names, start=1)
}
feat_to_name['label'] = 'label'

# Inverse lookup: readable name -> anonymised name.
name_to_feat = dict(zip(feat_to_name.values(), feat_to_name.keys()))

In [13]:
# Load the competition training set, remove exact duplicate rows and
# renumber the remaining rows from zero.
df = (
    pd.read_parquet("F:/Data/MLOPS_2023/data_phase-3/phase-3/prob-1/raw_train.parquet")
    .drop_duplicates()
    .reset_index(drop=True)
)
df.shape

(61748, 42)

In [5]:
# Load both UNSW-NB15 CSV partitions and stack them into one frame.
# The partitions are held in a throwaway list instead of being bound to `df3`:
# `df3` is the name the notebook uses for the combined training frame, and
# deleting it here would silently destroy that frame whenever this cell is
# re-run after the combined frame has been built.
unsw_frames = [
    pd.read_csv("F:/Data/MLOPS_2023/UNSW_NB15_training-set.csv"),
    pd.read_csv("F:/Data/MLOPS_2023/UNSW_NB15_testing-set.csv"),
]
df2 = pd.concat(unsw_frames)
del unsw_frames
gc.collect()  # release the two source frames before further processing

# Drop the columns that have no entry in `name_to_feat`, then remove exact
# duplicate rows.
df2 = df2.drop(columns=['id', 'rate', 'attack_cat']).drop_duplicates()

# Rename to the anonymised `featureN` schema. The explicit lookup is
# intentionally strict: an unmapped column raises KeyError instead of being
# passed through under its original name.
df2.columns = [name_to_feat[c] for c in df2.columns]

# Shuffle all rows with a fixed seed so the order is reproducible.
df2 = df2.sample(frac=1, random_state=42, replace=False)


In [19]:
# Stack the competition data (`df`) on top of the external data (`df2`).
# `ignore_index=True` gives the result a fresh 0..n-1 index; without it the
# combined frame carries repeated index labels from its two sources, which is
# a hazard for any label-based alignment done downstream.
df3 = pd.concat([df, df2], ignore_index=True)

In [30]:
# Baseline hyper-parameters, later unpacked into LGBMClassifier.
HyperParameters = dict(
    n_estimators=100,
    learning_rate=0.3,
    max_depth=16,
    colsample_bytree=0.5,
    subsample=0.8,
    reg_alpha=1,
    reg_lambda=0,
    random_state=42,
)

# Independent shallow copy, so tweaks to the working set leave the baseline intact.
hyper_parameters = dict(HyperParameters)

In [42]:
# Build the model inputs from the combined frame with the project feature
# processor, and read the feature lists it exposes.
processor = Phase3Prob1FeatureProcessor()
new_df = processor.fit_transform(df3)
FEATURES = processor.data_features['features']
categorical = processor.data_features['categorical_features']
TARGET = 'label'

# Features are taken from `new_df` and labels from `df3` using the same
# positional fold indices, so both must have the same number of rows.
# NOTE(review): this also assumes the processor preserves row order — confirm.
assert len(new_df) == len(df3), \
    "feature processor changed the row count; fold indices would be misaligned"

# 5-fold stratified CV with a fixed seed (the splitter is created exactly once).
kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
models = []   # one fitted CatBoost model per fold
scores = []   # validation ROC AUC per fold
oofs = np.zeros(len(df3))  # out-of-fold positive-class probabilities
for i, (train_idx, valid_idx) in enumerate(kfold.split(new_df, df3[TARGET])):
    X_train = new_df.iloc[train_idx][FEATURES]
    X_valid = new_df.iloc[valid_idx][FEATURES]
    y_train = df3.iloc[train_idx][TARGET]
    y_valid = df3.iloc[valid_idx][TARGET]

    model = CatBoostClassifier(iterations=1500,
                        learning_rate=0.7, max_depth=8,
                        random_state=42,eval_metric='AUC', thread_count=-1,
                        )
    # Both the training and the validation fold are passed as eval sets so
    # the log reports AUC on each.
    # NOTE(review): the captured log suggests `best` follows the last eval set
    # (the validation fold) — confirm against the CatBoost documentation.
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              cat_features=categorical,
              early_stopping_rounds=50,
              verbose=50)
    models.append(model)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_proba = model.predict_proba(X_valid)[:,1]
    oofs[valid_idx] = y_pred_proba

    # Score the fold once and reuse the value for both logging and bookkeeping.
    fold_auc = roc_auc_score(y_valid, y_pred_proba)
    print(f"fold {i} : {fold_auc}")
    scores.append(fold_auc)

# Mean and standard deviation of the per-fold validation AUC.
print(np.mean(scores), np.std(scores))

0:	test: 0.9587922	test1: 0.9586963	best: 0.9586963 (0)	total: 138ms	remaining: 3m 27s
50:	test: 0.9869185	test1: 0.9826260	best: 0.9826260 (50)	total: 6.42s	remaining: 3m 2s
100:	test: 0.9913707	test1: 0.9840501	best: 0.9840501 (100)	total: 12.6s	remaining: 2m 55s
150:	test: 0.9940703	test1: 0.9852094	best: 0.9852136 (149)	total: 18.8s	remaining: 2m 48s
200:	test: 0.9958544	test1: 0.9859897	best: 0.9859897 (200)	total: 25.2s	remaining: 2m 42s
250:	test: 0.9970979	test1: 0.9867276	best: 0.9867276 (250)	total: 31.4s	remaining: 2m 36s
300:	test: 0.9979475	test1: 0.9872576	best: 0.9872690 (297)	total: 37.6s	remaining: 2m 29s
350:	test: 0.9985135	test1: 0.9876094	best: 0.9876178 (345)	total: 43.9s	remaining: 2m 23s
400:	test: 0.9989173	test1: 0.9878170	best: 0.9878245 (398)	total: 50.5s	remaining: 2m 18s
450:	test: 0.9991532	test1: 0.9880325	best: 0.9880325 (450)	total: 57.1s	remaining: 2m 12s
500:	test: 0.9993730	test1: 0.9883236	best: 0.9883348 (498)	total: 1m 3s	remaining: 2m 7s
550:	te

In [32]:
# Train a LightGBM model on the full data set with the working hyper-parameters.
train_X = new_df[FEATURES]
train_y = df3[TARGET]

gmodel = LGBMClassifier(**hyper_parameters)
# The eval set is the training data itself, so the logged logloss/AUC are
# in-sample figures rather than a validation estimate.
# NOTE(review): newer LightGBM releases no longer accept `verbose` in fit() —
# confirm the installed version before upgrading.
gmodel.fit(
    train_X,
    train_y,
    eval_set=[(train_X, train_y)],
    eval_metric=["logloss", "auc"],
    categorical_feature=categorical,
    verbose=50,
)

# Persist the fitted model.
# NOTE(review): a later cell pickles a different model to this same path.
checkpoint_path = '../checkpoints/phase-3/prob-1/v1.pkl'
with open(checkpoint_path, 'wb') as file:
    pickle.dump(gmodel, file)

[50]	valid_0's binary_logloss: 0.156405	valid_0's auc: 0.984428
[100]	valid_0's binary_logloss: 0.142359	valid_0's auc: 0.987729


In [33]:
# Final CatBoost model trained on every available row; rebinds `gmodel`.
final_catboost_params = dict(
    iterations=1500,
    learning_rate=0.7,
    max_depth=8,
    random_state=42,
    eval_metric='Accuracy',
    thread_count=-1,
)
gmodel = CatBoostClassifier(**final_catboost_params)

# The eval set is the training data itself, so the logged accuracy is in-sample.
# NOTE(review): this fits on all columns of `new_df`, whereas the
# cross-validation loop selects `new_df[FEATURES]` — confirm the two column
# sets are identical before relying on the CV scores for this model.
gmodel.fit(
    new_df,
    df3[TARGET],
    cat_features=categorical,
    eval_set=[(new_df, df3[TARGET])],
)

0:	learn: 0.8734005	test: 0.8734005	best: 0.8734005 (0)	total: 307ms	remaining: 7m 40s
1:	learn: 0.8926966	test: 0.8926966	best: 0.8926966 (1)	total: 441ms	remaining: 5m 30s
2:	learn: 0.8982608	test: 0.8982608	best: 0.8982608 (2)	total: 578ms	remaining: 4m 48s
3:	learn: 0.9010359	test: 0.9010359	best: 0.9010359 (3)	total: 708ms	remaining: 4m 24s
4:	learn: 0.9021200	test: 0.9021293	best: 0.9021293 (4)	total: 863ms	remaining: 4m 18s
5:	learn: 0.9044087	test: 0.9044180	best: 0.9044180 (5)	total: 1.01s	remaining: 4m 10s
6:	learn: 0.9056550	test: 0.9056689	best: 0.9056689 (6)	total: 1.14s	remaining: 4m 2s
7:	learn: 0.9070448	test: 0.9070541	best: 0.9070541 (7)	total: 1.26s	remaining: 3m 55s
8:	learn: 0.9091760	test: 0.9091899	best: 0.9091899 (8)	total: 1.4s	remaining: 3m 51s
9:	learn: 0.9100331	test: 0.9100470	best: 0.9100470 (9)	total: 1.54s	remaining: 3m 48s
10:	learn: 0.9108299	test: 0.9108392	best: 0.9108392 (10)	total: 1.68s	remaining: 3m 47s
11:	learn: 0.9114971	test: 0.9115064	best: 

<catboost.core.CatBoostClassifier at 0x21cbf02ead0>

In [34]:

# Serialise whichever model `gmodel` currently refers to into the v1 checkpoint
# file, overwriting any existing file at that path.
checkpoint_path = '../checkpoints/phase-3/prob-1/v1.pkl'
with open(checkpoint_path, 'wb') as file:
    pickle.dump(gmodel, file)

In [37]:
# Predict class probabilities for every row of `new_df` and keep column 1,
# the probability of the positive class.
class_probabilities = gmodel.predict_proba(new_df)
y_pred_prob = class_probabilities[:, 1]

In [39]:
# ROC AUC of the predictions against the labels in `df3`. The predictions were
# made on the data the model was fitted on, so this is an in-sample score and
# not an estimate of generalisation performance.
train_auc = roc_auc_score(df3[TARGET], y_pred_prob)
print(train_auc)

0.9998979037820548
