### EDA 02

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, FactorAnalysis
from factor_analyzer import FactorAnalyzer
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from config import data_setting, predict_setting
from dataFactory import Read_DataList, CIC_Dataset

##### 1. 데이터 불러오기

In [2]:
# set data path & level 
filePath = data_setting["filePath"]
level = 1 # data_setting["level"]
scale = data_setting["scale"]
val_size = data_setting["val_size"]
pred_filePath = predict_setting["pred_filePath"]

# set train with dataset
num_workers = data_setting["num_workers"]
batch_size = data_setting["batch_size"]
drop_last = data_setting["drop_last"]
shuffle_flag = data_setting["shuffle_flag"]       
add_test = True

dataloader = Read_DataList(filePath, val_size, level, scale, add_test)


 - [ Start Read Data-List And Load Data-files ] - 

Now Loading........TCP_IP-DoS-SYN1_train.pcap.csv...Check.left/Total_file..(48/48)
Skip:..Recon-OS_Scan_train.pcap.csv..is..in...the..List..of..skip..Category..Class...(47/48)
Now Loading........TCP_IP-DDoS-UDP8_train.pcap.csv...Check.left/Total_file..(46/48)
Now Loading........TCP_IP-DoS-ICMP4_train.pcap.csv...Check.left/Total_file..(45/48)
Now Loading........TCP_IP-DDoS-SYN4_train.pcap.csv...Check.left/Total_file..(44/48)
Now Loading........TCP_IP-DoS-TCP3_train.pcap.csv...Check.left/Total_file..(43/48)
Skip:..Recon-VulScan_train.pcap.csv..is..in...the..List..of..skip..Category..Class...(42/48)
Now Loading........TCP_IP-DDoS-ICMP8_train.pcap.csv...Check.left/Total_file..(41/48)
Now Loading........TCP_IP-DDoS-TCP4_train.pcap.csv...Check.left/Total_file..(40/48)
Now Loading........TCP_IP-DDoS-UDP5_train.pcap.csv...Check.left/Total_file..(39/48)
Now Loading........TCP_IP-DoS-ICMP2_train.pcap.csv...Check.left/Total_file..(38/48)
Now Lo

In [3]:
# 데이터를 읽어 들이고 데이터의 모양을 확인 (클라스로 부터 데이터 불러오기)
train,train_label = dataloader.get_train_data()
val, val_label = dataloader.get_val_data()
test, test_label = dataloader.get_test_data()
print (f"train input shape: {train.shape} \ntrain_label shape: {train_label.shape}\n")
print (f"val input shape: {val.shape} \nval_label shape: {val_label.shape}\n")
print (f"test input shape: {test.shape} \ntest_label shape: {test_label.shape}\n \n")

oneHot = dataloader.get_oneHot()   # 학습된 원핫 인코더 불러오기 
print(f"label_oneHot_categories: {oneHot.categories_}")

colnames = dataloader.colnames
# colnames = colnames.remove('class_1')
print(f"feature column name's: {colnames}")
print(f"fueature columns length: {len(colnames)}")

train input shape: (5178508, 45) 
train_label shape: (5178508, 3)

val input shape: (1294628, 45) 
val_label shape: (1294628, 3)

test input shape: (1583015, 45) 
test_label shape: (1583015, 3)
 

label_oneHot_categories: [array(['Benign', 'DDoS', 'DoS'], dtype=object)]
feature column name's: ['Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IGMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'class_1']
fueature columns length: 46


##### 2. 데이터 리샘프링

In [4]:
# trasform to DataFrame
df_train_label = pd.DataFrame(train_label, columns=[oneHot.categories_[0]])
df_val_label = pd.DataFrame(val_label, columns=[oneHot.categories_[0]])
df_test_label = pd.DataFrame(test_label, columns=[oneHot.categories_[0]])

# label count plot : 라벨 데이터의 클라스별 비율 확인 
def plot_column_sums(df, flag:str):
    column_sums = df.sum()
    print(f"{flag.upper} columns count : \n{column_sums}")
    plt.figure(figsize=(10, 6))
    column_sums.plot(kind='bar')
    plt.title(f'Sum of {flag.upper()}\'s Each Column')
    plt.xlabel('Columns')
    plt.ylabel('Sum')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.legend(loc=1)
    plt.show()

In [6]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import OneSidedSelection

tlk = TomekLinks(sampling_strategy='auto')
train_tmk, train_label_tmk = tlk.fit_resample(train, train_label)

df_train_tmk = pd.DataFrame(train_tmk)
df_train_label_tmk = pd.DataFrame(train_label_tmk)
df_train_tmk.to_csv('./data/CIC_2024/balanced/df_train_tmk.csv', index=False)
df_train_label_tmk.to_csv('./data/CIC_2024/balanced/df_train_label_tmk.csv', index=False)


In [7]:
val_tmk, val_label_tmk = tlk.fit_resample(val, val_label)

df_val_tmk = pd.DataFrame(val_tmk)
df_val_label_tmk = pd.DataFrame(val_label_tmk)
df_val_tmk.to_csv('./data/CIC_2024/balanced/df_val_tmk.csv', index=False)
df_val_label_tmk.to_csv('./data/CIC_2024/balanced/df_val_label_tmk.csv', index=False)

In [8]:
test_tmk, test_label_tmk = tlk.fit_resample(test, test_label)

df_test_tmk = pd.DataFrame(test_tmk)
df_test_label_tmk = pd.DataFrame(test_label_tmk)
df_test_tmk.to_csv('./data/CIC_2024/balanced/df_test_tmk.csv', index=False)
df_test_label_tmk.to_csv('./data/CIC_2024/balanced/df_test_label_tmk.csv', index=False)

##### 3. 군집분석 및 차원축소 

In [None]:
# train = './data/CIC_2024/balanced/df_train_rus.csv'
# train_label = './data/CIC_2024/balanced/df_train_label_rus.csv'

val = './data/CIC_2024/balanced/df_val_rus.csv'
val_label = './data/CIC_2024/balanced/df_val_label_rus.csv'

# df_test = './data/CIC_2024/balanced/df_test_rus.csv'
# df_train = pd.read_csv(train)
df_val = pd.read_csv(val)

scaler = StandardScaler()

df_val_scaled = scaler.fit_transform(df_val)
print(f"{df_val_scaled.shape}")

df_val_label = pd.read_csv(val_label)
df_val_label.head()
print(f"{df_val_label.shape}")
df_val_rvs = oneHot.inverse_transform(df_val_label)
df_val_rvs = pd.DataFrame(df_val_rvs, columns=['class'])
df_val_rvs['class'] = df_val_rvs['class'].astype('category')

In [None]:
plot_column_sums(df_val_label, 'val')

##### 4. T-sne 차원감소

In [None]:
perplexity = 30.
n_iter = 300
n_components_3d = 3
tsne_3d = TSNE(n_components=n_components_3d, perplexity=perplexity, n_iter=n_iter, random_state=42)
val_embedded = tsne_3d.fit_transform(df_val_scaled)

In [None]:
print("\nt-SNE 3차원 임베딩 후 데이터 형태:", val_embedded.shape)

# 5. t-SNE 결과 시각화 (3차원으로 임베딩했을 경우)
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(val_embedded[:, 0], val_embedded[:, 1], val_embedded[:, 2], c=df_val_rvs['class'].cat.codes, cmap='prism', s=1, alpha=0.6)
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
ax.set_title(f't-SNE 3D visualization of digits dataset (perplexity={perplexity}, n_iter={n_iter})')
# fig.colorbar(scatter, label='Digit')
ax.view_init(elev=50, azim=-50, roll=0)
ax.legend()
plt.show()

##### PCA

In [None]:
pca = PCA(n_components=3)
pca.fit(df_val_scaled)

In [None]:
# 3. 주성분 분석 결과 확인
print("고유값 (설명된 분산):", pca.explained_variance_)
print("설명된 분산 비율:", pca.explained_variance_ratio_)
print("주성분 (고유 벡터):")
for i, component in enumerate(pca.components_):
    print(f"  주성분 {i+1}: {component}")

In [None]:
X = StandardScaler().fit_transform(df_val)  # Standardize the data
factors = 2
#  a list of 2 tuples containing titles for and instances of or class
fas = [
    ("FA no rotation", FactorAnalysis(n_components = factors)),
    ("FA varimax", FactorAnalysis(n_components = factors, rotation="varimax")),
]  

#  Let's prepare some plots on one canvas (subplots)
fig, axes = plt.subplots(ncols=len(fas), figsize=(10, 8))

'''
And loop over the variants of our analysis `fas`, zipped with the 
plot axes `axes`
'''
for ax, (title, fa) in zip(axes, fas):
    #  Fit the model to the standardized food data
    fa = fa.fit(X)
    #  and transpose the component (loading) matrix
    factor_matrix = fa.components_.T
    #  Plot the data as a heat map
    im = ax.imshow(factor_matrix, cmap="RdBu_r", vmax=1, vmin=-1)
    #  and add the corresponding value to the center of each cell
    for (i,j), z in np.ndenumerate(factor_matrix):
        ax.text(j, i, str(z.round(2)), ha="center", va="center")
    #  Tell matplotlib about the metadata of the plot
    ax.set_yticks(np.arange(len(df_val.columns)))
    if ax.get_subplotspec().is_first_col():
        ax.set_yticklabels(df_val.columns)
    else:
        ax.set_yticklabels([])
    ax.set_title(title)
    ax.set_xticks([0, 1])
    ax.set_xticklabels(["Factor 1", "Factor 2"])
    #  and squeeze the axes tight, to save space
    plt.tight_layout()
    
#  and add a colorbar
cb = fig.colorbar(im, ax=axes, location='right', label="loadings")
#  show us the plot
plt.show()

In [None]:

# Apply Bartlett's test
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
chi_square_value, p_value = calculate_bartlett_sphericity(df_val_scaled)
print(f'Chi-square value: {chi_square_value}\nP-value: {p_value}')

# Apply KMO test
from factor_analyzer.factor_analyzer import calculate_kmo
kmo_all, kmo_model = calculate_kmo(df_val_scaled)
print(f'KMO Model: {kmo_model}')

# Create factor analysis object and perform factor analysis
fa = FactorAnalyzer(rotation="varimax")
fa.fit(df_val_scaled)

# Check Eigenvalues
eigen_values, vectors = fa.get_eigenvalues()
plt.scatter(range(1, df_val_scaled.shape[1]+1), eigen_values)
plt.plot(range(1, df_val_scaled.shape[1]+1), eigen_values)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()