In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, f1_score
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from src.data_processor.phase_2.prob1.v1 import  Phase2Prob1FeatureProcessor
import numpy as np
from collections import Counter

In [2]:
# Location of the raw phase-2 / prob-1 training parquet. The original local
# path remains the default so existing runs are unaffected; set the
# PHASE2_PROB1_RAW_TRAIN environment variable to run on another machine.
RAW_TRAIN_PATH = os.environ.get(
    "PHASE2_PROB1_RAW_TRAIN",
    "D:/Data/MLOPS_2023/data_phase-2/phase-2/prob-1/raw_train.parquet",
)
df = pd.read_parquet(RAW_TRAIN_PATH)

In [18]:
# Dump the current contents of `df` to a CSV file in the working directory,
# without writing the row index as a column.
csv_path = 'output.csv'
df.to_csv(csv_path, index=False)

In [20]:
# For every feature3 value, count how often each feature4 value occurs.
feature4_by_feature3 = df.groupby('feature3')['feature4']
feature4_by_feature3.value_counts()

feature3  feature4
-         FIN         16622
          INT          5461
          CON          2158
          REQ           631
          RST            17
          ECO             3
          ACC             1
dhcp      INT            23
          CON             8
dns       INT          4558
          CON          2515
          REQ            33
          FIN            17
ftp       FIN          1065
          CON             6
ftp-data  FIN          1215
          CON             1
http      FIN          6448
          CON             5
          RST             1
irc       FIN             8
pop3      FIN           377
          CON             3
radius    CON             3
          INT             3
smtp      FIN          1687
          CON            11
snmp      INT            18
ssh       FIN           344
          RST             2
ssl       FIN            23
Name: count, dtype: int64

In [4]:
# Remove rows that are identical across every column, then renumber the rows
# 0..n-1. Written as one chained assignment instead of two `inplace=True`
# mutations so the step reads as a single transformation of `df`.
df = df.drop_duplicates().reset_index(drop=True)
df.shape

(43405, 42)

In [5]:
# Drop every row whose 41 feature values are shared with at least one other
# row (all members of such a group are removed, none is kept).
# NOTE(review): if exact duplicates were dropped beforehand, rows that still
# collide on all features presumably carry conflicting labels - confirm.
FEATURE_COLUMNS = [f'feature{n}' for n in range(1, 42)]  # feature1 .. feature41

# Row labels must equal row positions because `.iloc` is used below.
df = df.reset_index(drop=True)

# One row per distinct feature combination; column 0 holds the list of row
# labels that share that combination.
x = df.groupby(FEATURE_COLUMNS).apply(lambda group: list(group.index)).reset_index()
a = x[x[0].apply(len) > 1]

# Flatten the colliding groups into one list of row labels. The array is built
# once, after the flattening, so `index` is defined (as an empty array) even
# when no collisions exist; previously it was assigned inside the loop and
# stayed undefined in that case.
blacklist = [row for rows in a[0].values for row in rows]
index = np.array(blacklist, dtype=np.int64)

not_index = np.setdiff1d(df.index.to_numpy(), index)
df = df.iloc[not_index].reset_index(drop=True)
df['label'].value_counts()

label
1    21787
0    21480
Name: count, dtype: int64

In [6]:
df.shape

(43267, 42)

In [16]:
# Keyword arguments handed to LGBMClassifier in the training cells.
HyperParameters = dict(
    n_estimators=100,
    learning_rate=0.09,
    max_depth=11,
    colsample_bytree=0.7,
    subsample=0.8,
    reg_alpha=1,
    reg_lambda=0,
    random_state=42,
    scale_pos_weight=1.5,
)

# Independent shallow copy, so edits to `hyper_parameters` leave the
# reference dictionary above untouched.
hyper_parameters = dict(HyperParameters)

In [17]:
# Fit the project feature processor on `df` and keep the transformed frame.
processor = Phase2Prob1FeatureProcessor()
new_df = processor.fit_transform(df)

# Column lists are read from the processor after it has been fitted.
TARGET = 'label'
FEATURES = processor.data_features['features']
categorical = processor.data_features['categorical_features']
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print(new_df.shape, df.shape)

# Per-fold models and AUCs, plus out-of-fold probabilities for every row.
models, scores = [], []
oofs = np.zeros(df.shape[0])
for i, (train_idx, valid_idx) in enumerate(kfold.split(new_df, df['label'])):
    # Features come from the transformed frame, labels from the original one;
    # both are addressed by position.
    X_train, X_valid = new_df.iloc[train_idx][FEATURES], new_df.iloc[valid_idx][FEATURES]
    y_train, y_valid = df.iloc[train_idx][TARGET], df.iloc[valid_idx][TARGET]

    model = LGBMClassifier(**hyper_parameters)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric=["logloss", "auc"],
        categorical_feature=categorical,
        early_stopping_rounds=50,
        verbose=50,
    )
    # Alternative kept for reference (not executed):
    #   CatBoostClassifier(iterations=100, learning_rate=0.05, max_depth=7,
    #                      subsample=0.8, random_state=42, eval_metric='AUC',
    #                      class_weights={1: 1.5, 0: 1})
    #   fitted with cat_features=categorical and the same eval_set.
    models.append(model)

    # Probability of class 1 on the held-out fold; 0.5 cut-off for hard labels.
    y_pred_proba = model.predict_proba(X_valid)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)
    oofs[valid_idx] = y_pred_proba

    fold_auc = roc_auc_score(y_valid, y_pred_proba)
    print(f"fold {i} : {fold_auc}")
    print(classification_report(y_valid, y_pred))
    scores.append(fold_auc)

# Mean and standard deviation of the per-fold validation AUCs.
print(np.mean(scores), np.std(scores))

(43267, 41) (43267, 42)
[50]	training's binary_logloss: 0.169206	training's auc: 0.984138	valid_1's binary_logloss: 0.180825	valid_1's auc: 0.981019
[100]	training's binary_logloss: 0.14625	training's auc: 0.98837	valid_1's binary_logloss: 0.169337	valid_1's auc: 0.982325
fold 0 : 0.9823246145917993
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      4296
           1       0.90      0.94      0.92      4358

    accuracy                           0.92      8654
   macro avg       0.92      0.92      0.92      8654
weighted avg       0.92      0.92      0.92      8654

[50]	training's binary_logloss: 0.169644	training's auc: 0.98411	valid_1's binary_logloss: 0.183136	valid_1's auc: 0.981022
[100]	training's binary_logloss: 0.147335	training's auc: 0.988319	valid_1's binary_logloss: 0.173341	valid_1's auc: 0.981985
fold 1 : 0.9819845061160237
              precision    recall  f1-score   support

           0       0.94      0.87      

Index([  130,   897,  5672,  7777, 10787, 11591, 15383, 17095, 18394, 21144,
       24061, 25118, 27524, 28062, 29804, 30090, 30568, 31403, 31653, 31756,
       33579, 36286, 41564],
      dtype='int64')

In [26]:
# Baseline: AUC of the raw out-of-fold probabilities.
print(roc_auc_score(df['label'], oofs))

# What-if: force every row with feature3 == 'ssl' to a score of 1.
# The override is applied to a copy so `oofs` keeps the raw model output and
# re-running this cell always prints the same baseline. A boolean mask is used
# so the assignment is positional and does not depend on `df`'s index labels.
oofs_ssl = oofs.copy()
oofs_ssl[(df['feature3'] == 'ssl').to_numpy()] = 1
print(roc_auc_score(df['label'], oofs_ssl))

0.9821918292809365
0.9821921925406287


0.9821918292809365


In [9]:
# Refit a single model on the full training frame, reusing the parameters of
# the first cross-validation model. The eval_set is the training data itself,
# so the logged metrics are training metrics, not a validation estimate.
gmodel = LGBMClassifier(**models[0].get_params())
gmodel.fit(new_df[FEATURES], df[TARGET],
           eval_set=[(new_df[FEATURES],df[TARGET])],
           eval_metric=["logloss", "auc"],
           categorical_feature=categorical,
           verbose=50)

# Create the checkpoint directory first so a fresh checkout does not fail with
# FileNotFoundError after the fit has already completed.
model_checkpoint_path = '../checkpoints/phase-2/prob-1/v1.pkl'
os.makedirs(os.path.dirname(model_checkpoint_path), exist_ok=True)
with open(model_checkpoint_path, 'wb') as file:
    pickle.dump(gmodel, file)

[50]	valid_0's binary_logloss: 0.182324	valid_0's auc: 0.980857
[100]	valid_0's binary_logloss: 0.163481	valid_0's auc: 0.983564


In [10]:
# Pair each feature name of the final model with its importance score and list
# the features from most to least important.
importance_table = pd.DataFrame({
    'feature': gmodel.feature_name_,
    'score': gmodel.feature_importances_,
})
importance_table.sort_values(by='score', ascending=False)

Unnamed: 0,feature,score
25,feature26,302
6,feature7,283
10,feature11,165
34,feature35,133
26,feature27,128
7,feature8,114
29,feature30,99
14,feature15,96
11,feature12,92
0,feature1,83


In [14]:
# Count how often each feature3 value occurs in the training frame and persist
# the counts next to the model checkpoint.
var_count_train = Counter(df['feature3'])

# Create the target directory first so the dump does not fail with
# FileNotFoundError on a fresh checkout.
var_count_path = '../checkpoints/phase-2/prob-1/var_count.pkl'
os.makedirs(os.path.dirname(var_count_path), exist_ok=True)
with open(var_count_path, 'wb') as file:
    pickle.dump(var_count_train, file)