In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Creating a empty dict, where I will save all parameters required for test data transformation

saved_dict = {}

In [None]:
# Reading datasets
dfs = []
for i in range(1,5):
    path = '/content/drive/MyDrive/UNSW/CSV Files/UNSW-NB15_{}.csv'  # There are 4 input csv files
    dfs.append(pd.read_csv(path.format(i), header = None))
all_data = pd.concat(dfs).reset_index(drop=True)  # Concat all to a single df

  dfs.append(pd.read_csv(path.format(i), header = None))
  dfs.append(pd.read_csv(path.format(i), header = None))


In [None]:
# This csv file contains names of all the features
df_col = pd.read_csv('/content/drive/MyDrive/UNSW/CSV Files/NUSW-NB15_features.csv', encoding='ISO-8859-1')

In [None]:
# Making column names lower case, removing spaces
df_col['Name'] = df_col['Name'].apply(lambda x: x.strip().replace(' ', '').lower())

In [None]:
# Renaming our dataframe with proper column names
all_data.columns = df_col['Name']

In [None]:
# Saving useful info, later this will be used to transform raw test data
saved_dict['columns'] = df_col['Name'][df_col['Name']!='label'].tolist()

In [None]:
del df_col

In [None]:
all_data.shape

(2540047, 49)

In [None]:
all_data.head()

Name,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0


In [None]:
all_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2540047 entries, 0 to 2540046
Data columns (total 49 columns):
 #   Column            Dtype  
---  ------            -----  
 0   srcip             object 
 1   sport             object 
 2   dstip             object 
 3   dsport            object 
 4   proto             object 
 5   state             object 
 6   dur               float64
 7   sbytes            int64  
 8   dbytes            int64  
 9   sttl              int64  
 10  dttl              int64  
 11  sloss             int64  
 12  dloss             int64  
 13  service           object 
 14  sload             float64
 15  dload             float64
 16  spkts             int64  
 17  dpkts             int64  
 18  swin              int64  
 19  dwin              int64  
 20  stcpb             int64  
 21  dtcpb             int64  
 22  smeansz           int64  
 23  dmeansz           int64  
 24  trans_depth       int64  
 25  res_bdy_len       int64  
 26  sjit          

In [None]:
# attack_cat 열에서 NaN 값을 "Normal"로 대체
all_data['attack_cat'] = all_data['attack_cat'].fillna('Normal').str.strip()

In [None]:
class_names  = all_data['attack_cat'].unique()

In [None]:
class_names

array(['Normal', 'Exploits', 'Reconnaissance', 'DoS', 'Generic',
       'Shellcode', 'Fuzzers', 'Worms', 'Backdoors', 'Analysis',
       'Backdoor'], dtype=object)

## Data cleaning and pre-processing

In [None]:
# Missing Data Handling Module

def myPreprocessor(dataframe, handling_method, target_label_name):
    df = dataframe
    method = handling_method
    label = target_label_name

    #importing libraries
    import numpy as np
    import pandas as pd
    from sklearn import preprocessing

    for i in range(df.shape[1]):
        n = df.iloc[:,i].isnull().sum()
        if n > 0:
            print("There is some missing values")
            print(df.iloc[:,i].name)
            if df.iloc[:,i].dtype == 'object':
                print('object')
                df.iloc[:,i]=df.iloc[:,i].fillna(df.iloc[:,i].mode()[0])
            else:
                print('numerical')

                if method == 'mean':
                    df.iloc[:,i]=df.iloc[:,i].fillna(df.iloc[:,i].mean())
                elif method == 'median':
                    df.iloc[:,i]=df.iloc[:,i].fillna(df.iloc[:,i].median())
                elif method == 'ffill':
                    df.iloc[:,i]=df.iloc[:,i].fillna(method = 'ffill')

# Label Encoder for converting categorical values to numerical values

    obj_features = []

    # Decide which categorical variables you want to use in model
    for col_name in df.columns:
        if df[col_name].dtypes == 'object':
            unique_cat = len(df[col_name].unique())
            print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))
            obj_features.append(col_name)

    label_encoder = preprocessing.LabelEncoder()

    k = df[obj_features].shape[1]

    for i in range(k):
        name=obj_features
        name[i]
        df[name[i]]= label_encoder.fit_transform(df[name[i]])


# Feature scaling - Normalization

    features_data=df.drop(label,axis=1)
    class_data=df[label]
    ###
    cols_name = features_data.columns

    minmax_scaler=preprocessing.MinMaxScaler()
    scaled =minmax_scaler.fit_transform(features_data)

    normalize_data = pd.DataFrame(scaled, columns = cols_name)

    return normalize_data, class_data, label_encoder

In [None]:
# Missing Data Handling Module

def myBinaryPreprocessor(dataframe, handling_method, target_label_name):
    df = dataframe
    method = handling_method
    label = target_label_name

    #importing libraries
    import numpy as np
    import pandas as pd
    from sklearn import preprocessing

    for i in range(df.shape[1]):
        n = df.iloc[:,i].isnull().sum()
        if n > 0:
            print("There is some missing values")
            print(df.iloc[:,i].name)
            if df.iloc[:,i].dtype == 'object':
                print('object')
                df.iloc[:,i]=df.iloc[:,i].fillna(df.iloc[:,i].mode()[0])
            else:
                print('numerical')

                if method == 'mean':
                    df.iloc[:,i]=df.iloc[:,i].fillna(df.iloc[:,i].mean())
                elif method == 'median':
                    df.iloc[:,i]=df.iloc[:,i].fillna(df.iloc[:,i].median())
                elif method == 'ffill':
                    df.iloc[:,i]=df.iloc[:,i].fillna(method = 'ffill')

# Label Encoder for converting categorical values to numerical values

    obj_features = []

    # Decide which categorical variables you want to use in model
    for col_name in df.columns:
        if df[col_name].dtypes == 'object':
            unique_cat = len(df[col_name].unique())
            print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))
            obj_features.append(col_name)

    label_encoder = preprocessing.LabelEncoder()

    k = df[obj_features].shape[1]

    for i in range(k):
        name=obj_features
        name[i]
        df[name[i]]= label_encoder.fit_transform(df[name[i]])


# Feature scaling - Normalization

    features_data=df.drop(label,axis=1)
    class_data=df[label]
    ###
    cols_name = features_data.columns

    minmax_scaler=preprocessing.MinMaxScaler()
    scaled =minmax_scaler.fit_transform(features_data)

    normalize_data = pd.DataFrame(scaled, columns = cols_name)

    return normalize_data, class_data, label_encoder

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers, initializers
from tensorflow.keras.utils import to_categorical
#from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from imblearn.over_sampling import SMOTE
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [None]:
def Remove_dump_values(data, cols):
    for col in cols:
        data[col] = np.where(data[col] == '-', np.nan, data[col])
    return data

In [None]:
cols = all_data.columns
df = Remove_dump_values(all_data, cols)

In [None]:
df

Name,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132.0,164.0,31.0,...,0,3.0,7.0,1.0,3.0,1.0,1.0,1.0,Normal,0.0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528.0,304.0,31.0,...,0,2.0,4.0,2.0,3.0,1.0,1.0,2.0,Normal,0.0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146.0,178.0,31.0,...,0,12.0,8.0,1.0,2.0,2.0,1.0,1.0,Normal,0.0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132.0,164.0,31.0,...,0,6.0,9.0,1.0,1.0,1.0,1.0,1.0,Normal,0.0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146.0,178.0,31.0,...,0,7.0,9.0,1.0,1.0,1.0,1.0,1.0,Normal,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540042,59.166.0.5,33094,149.171.126.7,43433,tcp,FIN,0.087306,320.0,1828.0,31.0,...,,1.0,2.0,3.0,3.0,1.0,1.0,3.0,Normal,0.0
2540043,59.166.0.7,20848,149.171.126.4,21,tcp,CON,0.365058,456.0,346.0,31.0,...,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,Normal,0.0
2540044,59.166.0.3,21511,149.171.126.9,21,tcp,CON,6.335154,1802.0,2088.0,31.0,...,2,2.0,2.0,4.0,2.0,2.0,2.0,2.0,Normal,0.0
2540045,59.166.0.9,35433,149.171.126.0,80,tcp,CON,2.200934,3498.0,166054.0,31.0,...,,1.0,1.0,2.0,4.0,2.0,2.0,2.0,Normal,0.0


In [None]:
df_multi = df.copy()

In [None]:
df_binary = df.copy()

In [None]:
# attack_cat 열에서 Backdoor 값을 Backdoors로 변경
df_multi['attack_cat'] = df_multi['attack_cat'].replace('Backdoor', 'Backdoors')

In [None]:
# 'attack_cat' 열이 'Normal'인 데이터 삭제
df_multi = df_multi[df_multi['attack_cat'] != 'Normal']

In [None]:
df_multi['attack_cat'].unique()

array(['Exploits', 'Reconnaissance', 'DoS', 'Generic', 'Shellcode',
       'Fuzzers', 'Worms', 'Backdoors', 'Analysis'], dtype=object)

In [None]:
df_multi['ct_flw_http_mthd'] = df_multi.ct_flw_http_mthd.fillna(value=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_multi['ct_flw_http_mthd'] = df_multi.ct_flw_http_mthd.fillna(value=0)


In [None]:
df_multi['is_ftp_login'] = (df_multi.is_ftp_login.fillna(value=0)).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_multi['is_ftp_login'] = (df_multi.is_ftp_login.fillna(value=0)).astype(int)


In [None]:
df_multi.drop(['sport', 'dsport', 'label','service', 'ct_ftp_cmd', 'srcip', 'dstip'], axis=1, inplace = True)

In [None]:
df_multi.isnull().sum()

Unnamed: 0_level_0,0
Name,Unnamed: 1_level_1
proto,0
state,0
dur,0
sbytes,0
dbytes,0
sttl,0
dttl,0
sloss,0
dloss,0
sload,0


In [None]:
x, y, multi_label_encoder = myPreprocessor(df_multi, 'mean', 'attack_cat')

Feature 'proto' has 129 unique categories
Feature 'state' has 7 unique categories
Feature 'attack_cat' has 9 unique categories


In [None]:
# 'attack_cat' 열을 다중 클래스 타겟 라벨로 설정
y_multi = df_multi['attack_cat'].values

In [None]:
unique_classes, class_counts = np.unique(y_multi, return_counts=True)
for cls, count in zip(unique_classes, class_counts):
    print(f"Class {cls}: {count}")

Class 0: 2677
Class 1: 2329
Class 2: 16353
Class 3: 44525
Class 4: 24246
Class 5: 215481
Class 6: 13987
Class 7: 1511
Class 8: 174


In [None]:
# 각 인코딩된 값과 원래 클래스의 매핑 출력
for index, class_label in enumerate(multi_label_encoder.classes_):
    print(f"Value {index}: {class_label}")


Value 0: Analysis
Value 1: Backdoors
Value 2: DoS
Value 3: Exploits
Value 4: Fuzzers
Value 5: Generic
Value 6: Reconnaissance
Value 7: Shellcode
Value 8: Worms


In [None]:
x

Name,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,sload,...,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
0,0.898438,0.666667,3.500028e-07,0.000051,0.000000,0.996078,0.000000,0.000000,0.000000,2.315743e-02,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.851562,0.500000,4.002348e-03,0.000064,0.001743,0.243137,0.996047,0.000376,0.001816,4.684439e-06,...,0.166667,0.033333,0.0,0.030303,0.015152,0.015152,0.000000,0.000000,0.000000,0.000000
2,0.851562,0.500000,3.984015e-02,0.000095,0.000018,0.996078,0.996047,0.001128,0.000182,7.070172e-07,...,0.166667,0.033333,0.0,0.060606,0.015152,0.015152,0.000000,0.000000,0.000000,0.000000
3,0.851562,0.500000,2.919856e-03,0.000569,0.000018,0.996078,0.996047,0.000752,0.000182,5.784349e-05,...,0.166667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.851562,0.500000,3.176692e-03,0.000059,0.000018,0.996078,0.996047,0.000376,0.000182,5.327199e-06,...,0.166667,0.033333,0.0,0.030303,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321278,0.898438,0.666667,1.666680e-08,0.000008,0.000000,0.996078,0.000000,0.000000,0.000000,7.615230e-02,...,0.333333,0.000000,0.0,0.212121,0.212121,0.212121,0.212121,0.212121,0.378378,0.212121
321279,0.851562,0.333333,1.571652e-02,0.000040,0.000046,0.243137,0.996047,0.000940,0.001090,7.466455e-07,...,0.500000,0.000000,0.0,0.015152,0.000000,0.015152,0.045455,0.015152,0.027027,0.015152
321280,0.851562,0.333333,4.299042e-02,0.018800,0.000089,0.243137,0.996047,0.019365,0.000182,1.391144e-04,...,0.500000,0.066667,0.0,0.015152,0.000000,0.015152,0.045455,0.015152,0.027027,0.015152
321281,0.851562,0.333333,4.299042e-02,0.018800,0.000089,0.243137,0.996047,0.019365,0.000182,1.391144e-04,...,0.500000,0.066667,0.0,0.000000,0.000000,0.015152,0.045455,0.015152,0.027027,0.015152


In [None]:
print(x.head())

Name     proto     state           dur    sbytes    dbytes      sttl  \
0     0.898438  0.666667  3.500028e-07  0.000051  0.000000  0.996078   
1     0.851562  0.500000  4.002348e-03  0.000064  0.001743  0.243137   
2     0.851562  0.500000  3.984015e-02  0.000095  0.000018  0.996078   
3     0.851562  0.500000  2.919856e-03  0.000569  0.000018  0.996078   
4     0.851562  0.500000  3.176692e-03  0.000059  0.000018  0.996078   

Name      dttl     sloss     dloss         sload  ...  ct_state_ttl  \
0     0.000000  0.000000  0.000000  2.315743e-02  ...      0.333333   
1     0.996047  0.000376  0.001816  4.684439e-06  ...      0.166667   
2     0.996047  0.001128  0.000182  7.070172e-07  ...      0.166667   
3     0.996047  0.000752  0.000182  5.784349e-05  ...      0.166667   
4     0.996047  0.000376  0.000182  5.327199e-06  ...      0.166667   

Name  ct_flw_http_mthd  is_ftp_login  ct_srv_src  ct_srv_dst  ct_dst_ltm  \
0             0.000000           0.0    0.000000    0.000000    

In [None]:
y.value_counts()

Unnamed: 0_level_0,count
attack_cat,Unnamed: 1_level_1
5,215481
3,44525
4,24246
2,16353
6,13987
0,2677
1,2329
7,1511
8,174


## 이진분류용

In [None]:
# attack_cat 열에서 Backdoor 값을 Backdoors로 변경
df_binary['attack_cat'] = df_binary['attack_cat'].replace('Backdoor', 'Backdoors')

In [None]:
df_binary['ct_flw_http_mthd'] = df_binary.ct_flw_http_mthd.fillna(value=0)

In [None]:
df_binary['is_ftp_login'] = (df_binary.is_ftp_login.fillna(value=0)).astype(int)

In [None]:
df_binary.drop(['sport', 'dsport', 'attack_cat','service', 'ct_ftp_cmd', 'srcip', 'dstip'], axis=1, inplace = True)

In [None]:
df_binary['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0.0,2218764
1.0,321283


In [None]:
x_binary, y_binary, binary_label_encoder = myBinaryPreprocessor(df_binary, 'mean', 'label')

Feature 'proto' has 135 unique categories
Feature 'state' has 16 unique categories


In [None]:
x_binary

Name,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,sload,...,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
0,0.895522,0.133333,1.200687e-07,0.000009,0.000011,0.121569,0.114173,0.000000,0.000000,8.357948e-05,...,0.0,0.000000,0.0,0.030303,0.090909,0.000000,0.030303,0.000000,0.000000,0.000000
1,0.895522,0.133333,4.112267e-06,0.000037,0.000021,0.121569,0.114173,0.000000,0.000000,1.464196e-05,...,0.0,0.000000,0.0,0.015152,0.045455,0.015152,0.030303,0.000000,0.000000,0.015152
2,0.895522,0.133333,1.273525e-07,0.000010,0.000012,0.121569,0.114173,0.000000,0.000000,8.715673e-05,...,0.0,0.000000,0.0,0.166667,0.106061,0.000000,0.015152,0.015152,0.000000,0.000000
3,0.895522,0.133333,1.375953e-07,0.000009,0.000011,0.121569,0.114173,0.000000,0.000000,7.293329e-05,...,0.0,0.000000,0.0,0.075758,0.121212,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.895522,0.133333,1.330429e-07,0.000010,0.000012,0.121569,0.114173,0.000000,0.000000,8.342890e-05,...,0.0,0.000000,0.0,0.090909,0.121212,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540042,0.850746,0.333333,9.936224e-06,0.000022,0.000125,0.121569,0.114173,0.000188,0.000363,4.085783e-06,...,0.0,0.000000,0.0,0.000000,0.015152,0.030303,0.030303,0.000000,0.000000,0.030303
2540043,0.850746,0.133333,4.154695e-05,0.000032,0.000024,0.121569,0.114173,0.000376,0.000363,1.460223e-06,...,0.0,0.000000,0.5,0.015152,0.015152,0.015152,0.015152,0.015152,0.016949,0.015152
2540044,0.850746,0.133333,7.209987e-04,0.000126,0.000142,0.121569,0.114173,0.001316,0.001634,3.682097e-07,...,0.0,0.000000,0.5,0.015152,0.015152,0.045455,0.015152,0.015152,0.016949,0.015152
2540045,0.850746,0.133333,2.504865e-04,0.000244,0.011329,0.121569,0.114173,0.000376,0.010350,2.086926e-06,...,0.0,0.055556,0.0,0.000000,0.000000,0.015152,0.045455,0.015152,0.016949,0.015152


In [None]:
y_binary.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0.0,2218764
1.0,321283


In [None]:
y.value_counts()

Unnamed: 0_level_0,count
attack_cat,Unnamed: 1_level_1
5,215481
3,44525
4,24246
2,16353
6,13987
0,2677
1,2329
7,1511
8,174


## Data Visualization

In [60]:
x_train_attack_multi = np.array(x)
y_train_attack_multi = np.array(y)

In [61]:
# Step 1: 언더샘플링
# 가장 많은 클래스(7)를 두 번째로 많은 클래스(6)의 수에 맞추기
undersample = RandomUnderSampler(sampling_strategy={3: 44525}, random_state=42)
x_under, y_under = undersample.fit_resample(x_train_attack_multi, y_train_attack_multi)


NameError: name 'RandomUnderSampler' is not defined

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE 오버샘플링 적용
smote = SMOTE(sampling_strategy='auto', random_state=42)
x_train_attack_multi_resampled, y_train_attack_multi_resampled = smote.fit_resample(x_under, y_under)

# 클래스 분포 확인
from collections import Counter
print("After SMOTE: ", Counter(y_train_attack_multi_resampled))


Before SMOTE:  Counter({5: 215481, 3: 44525, 4: 24246, 2: 16353, 6: 13987, 0: 2677, 1: 2329, 7: 1511, 8: 174})
After SMOTE:  Counter({3: 215481, 6: 215481, 2: 215481, 5: 215481, 7: 215481, 4: 215481, 8: 215481, 1: 215481, 0: 215481})


In [None]:
x_array_binary = np.array(x_binary)
y_array_binary = np.array(y_binary)

In [None]:
# 데이터를 80:10:10 비율로 나누기
x_train_binary, x_temp_binary, y_train_binary, y_temp_binary = train_test_split(
    x_array_binary, y_array_binary, test_size=0.2, random_state=42, stratify=y_binary
)

x_val_binary, x_test_binary, y_val_binary, y_test_binary = train_test_split(
    x_temp_binary, y_temp_binary, test_size=0.5, random_state=42, stratify=y_temp_binary
)

# CNN 모델이 필요로 하는 형태로 데이터 차원 확장
x_train_binary = np.expand_dims(x_train_binary, axis=2)
x_test_binary = np.expand_dims(x_test_binary, axis=2)
x_val_binary = np.expand_dims(x_val_binary, axis=2)

In [None]:
# 데이터를 80:10:10 비율로 나누기
x_train_multi, x_temp_multi, y_train_multi, y_temp_multi = train_test_split(
    x_train_attack_multi_resampled, y_train_attack_multi_resampled, test_size=0.2, random_state=42, stratify=y_train_attack_multi_resampled
)

x_val_multi, x_test_multi, y_val_multi, y_test_multi = train_test_split(
    x_temp_multi, y_temp_multi, test_size=0.5, random_state=42, stratify=y_temp_multi
)

# CNN 모델이 필요로 하는 형태로 데이터 차원 확장
x_train_multi = np.expand_dims(x_train_multi, axis=2)
x_test_multi = np.expand_dims(x_test_multi, axis=2)
x_val_multi = np.expand_dims(x_val_multi, axis=2)

## Model

In [None]:
# 첫 번째 모델 정의
cnn_binary = models.Sequential([
    layers.Conv1D(32, 3, activation='relu', padding='same', kernel_initializer='he_uniform', input_shape=(41, 1)),
    layers.Conv1D(32, 3, activation='relu', padding='same', kernel_initializer='he_uniform'),
    layers.MaxPooling1D(pool_size=2, strides=2),
    layers.Dropout(0.2),
    layers.BatchNormalization(),

    layers.Conv1D(64, 3, activation='relu', padding='same', kernel_initializer='he_uniform'),
    layers.Conv1D(64, 3, activation='relu', padding='same', kernel_initializer='he_uniform'),
    layers.MaxPooling1D(pool_size=2, strides=2),
    layers.Dropout(0.2),
    layers.BatchNormalization(),

    layers.Flatten(),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # 이진분류를 위한 출력층
])

# 모델 컴파일
cnn_binary.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])

# 모델 학습
cnn_binary.fit(x_train_binary, y_train_binary,
               batch_size=256,
               epochs=15,
               validation_data=(x_val_binary, y_val_binary))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m7938/7938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 5ms/step - accuracy: 0.9869 - loss: 0.0287 - val_accuracy: 0.9904 - val_loss: 0.0198
Epoch 2/15
[1m7938/7938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 4ms/step - accuracy: 0.9906 - loss: 0.0185 - val_accuracy: 0.9908 - val_loss: 0.0174
Epoch 3/15
[1m7938/7938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.9907 - loss: 0.0178 - val_accuracy: 0.9908 - val_loss: 0.0176
Epoch 4/15
[1m7938/7938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 4ms/step - accuracy: 0.9908 - loss: 0.0178 - val_accuracy: 0.9911 - val_loss: 0.0170
Epoch 5/15
[1m7938/7938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 4ms/step - accuracy: 0.9912 - loss: 0.0172 - val_accuracy: 0.9918 - val_loss: 0.0163
Epoch 6/15
[1m7938/7938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.9918 - loss: 0.0166 - val_accuracy: 0.9919 - val_loss: 0.0159
Epoch 7/15

<keras.src.callbacks.history.History at 0x799108b1c3a0>

In [None]:
# 두 번째 모델 정의
cnn_multi = models.Sequential([
    layers.Conv1D(128, 3, activation='relu', padding='same', input_shape=(41, 1)),
    layers.Conv1D(128, 3, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=2),
    layers.Dropout(0.3),

    layers.Conv1D(64, 3, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=2),
    layers.Dropout(0.3),

    layers.LSTM(128, return_sequences=True),
    layers.LSTM(64),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(9, activation='softmax')  # 9개의 Attack 클래스를 위한 출력층
])

# 모델 컴파일
cnn_multi.compile(optimizer='nadam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
# 첫 번째 모델로 테스트 데이터셋 예측
y_pred_binary = cnn_binary.predict(x_test_binary)
y_pred_binary = (y_pred_binary > 0.5).astype(int)  # Threshold 적용

# Attack으로 분류된 데이터 추출
attack_indices = np.where(y_pred_binary == 1)[0]
x_attack = x_test_binary[attack_indices]
y_attack_multi = y_multi[attack_indices]  # 다중분류 라벨


[1m7938/7938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step


In [None]:
# 다중분류 모델 학습
cnn_multi.fit(x_train_multi, y_train_multi,
              batch_size=256,
              epochs=15,
              validation_data=(x_val_multi, y_val_multi))


Epoch 1/15
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 12ms/step - accuracy: 0.5149 - loss: 1.2332 - val_accuracy: 0.6824 - val_loss: 0.7783
Epoch 2/15
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 11ms/step - accuracy: 0.6765 - loss: 0.8017 - val_accuracy: 0.6983 - val_loss: 0.7294
Epoch 3/15
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 11ms/step - accuracy: 0.6920 - loss: 0.7550 - val_accuracy: 0.7057 - val_loss: 0.7081
Epoch 4/15
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 12ms/step - accuracy: 0.6991 - loss: 0.7336 - val_accuracy: 0.7160 - val_loss: 0.6864
Epoch 5/15
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 12ms/step - accuracy: 0.7057 - loss: 0.7159 - val_accuracy: 0.7162 - val_loss: 0.6841
Epoch 6/15
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 12ms/step - accuracy: 0.7101 - loss: 0.7041 - val_accuracy: 0.7200 - val_loss: 0.6731
Epoc

<keras.src.callbacks.history.History at 0x7990f3ad0520>

In [None]:
# attack_cat을 Label Encoding
original_label_encoder = preprocessing.LabelEncoder()
df['encoded_attack_cat'] = original_label_encoder.fit_transform(df['attack_cat'])

# y_test_multi: 원본 테스트 데이터의 10개 클래스 라벨 (0~9)
y_test_multi = df['encoded_attack_cat'].iloc[x_test_indices].values  # 테스트셋 인덱스에 맞게 선택

NameError: name 'x_test_indices' is not defined

In [None]:
# Multi label 매핑
multi_mapping = {index: label for index, label in enumerate(multi_label_encoder.classes_)}
print("Multi-class Label Mapping:", multi_mapping)

In [None]:
# 이진분류 모델 예측
y_pred_binary = cnn_binary.predict(x_test_binary)  # x_test_binary: 테스트용 입력 데이터
y_pred_binary = (y_pred_binary > 0.5).astype(int)

# Attack으로 예측된 데이터 인덱스
attack_indices = np.where(y_pred_binary.flatten() == 1)[0]

# 다중분류 모델 예측
x_attack = x_test_binary[attack_indices]  # Attack으로 분류된 데이터만 추출
y_pred_multi = cnn_multi.predict(x_attack)
y_pred_multi = np.argmax(y_pred_multi, axis=1)  # 다중분류 결과 (0~8)


In [None]:
# 최종 예측값 배열 생성
final_predictions = np.full_like(y_test_multi, -1)  # 테스트 데이터 크기에 맞게 -1로 초기화

# 0 (Normal) → 다중분류 라벨의 6으로 매핑
final_predictions[y_pred_binary.flatten() == 0] = 6

# Attack으로 예측된 값들 매핑 (0~8 → 0~5, 7~9)
for i, idx in enumerate(attack_indices):
    if y_pred_multi[i] <= 5:  # 0~5는 그대로
        final_predictions[idx] = y_pred_multi[i]
    else:  # 6, 7, 8은 각각 7, 8, 9로 매핑
        final_predictions[idx] = y_pred_multi[i] + 1
