In [46]:
import pandas as pd
# from dnslib import DNSRecord
import ast
import math
import numpy as np
import matplotlib.pyplot as plt
import os

In [47]:
'''DNS_DATA = pd.read_csv('payload_csv/dns_payloads.csv')
HTTP_DATA = pd.read_csv('payload_csv/http_payloads.csv')
MQTT_DATA = pd.read_csv('payload_csv/mqtt_payloads.csv')'''

"DNS_DATA = pd.read_csv('payload_csv/dns_payloads.csv')\nHTTP_DATA = pd.read_csv('payload_csv/http_payloads.csv')\nMQTT_DATA = pd.read_csv('payload_csv/mqtt_payloads.csv')"

In [48]:
PROTOCOL_LIST = ['dns', 'http', 'mqtt']

In [49]:
fixed_image_size = 5
max_data_length = 25

In [50]:
def grayscale_to_rgb(value):
    r = value % 256
    g = (value * 2) % 256
    b = (value * 3) % 256
    return r, g, b

In [51]:
def sliding_window_to2D(data, window_size=4, step_size=4):

    windows = [data[i:i+window_size] for i in range(0, len(data), step_size)]
    
    # 각 윈도우를 1D 배열로 평탄화 (패딩 포함)
    flat_values = [val for window in windows for val in window]
    
    # 고정된 이미지 크기로 패딩 추가
    padded_values = flat_values + [0] * (fixed_image_size ** 2 - len(flat_values))
    
    # 고정된 크기의 2D 배열로 변환 (10x10)
    pixel_matrix = np.array(padded_values).reshape(fixed_image_size, fixed_image_size)

    rgb_matrix = np.zeros((fixed_image_size, fixed_image_size, 3), dtype=np.uint8)  # (R, G, B) 각각 8비트
    
    for i in range(fixed_image_size):
        for j in range(fixed_image_size):
            r, g, b = grayscale_to_rgb(pixel_matrix[i, j])
            rgb_matrix[i, j] = [r, g, b]
    
    return rgb_matrix

In [52]:
def data_read_pre(data_df, idx):
    data = data_df['Payload(Bytes)'][i]

    if len(data) > max_data_length:
        data = data[:max_data_length]
    else:
        data = data + ' ' * (max_data_length - len(data))

    ascii_values = [ord(c) for c in data]

    return ascii_values

In [53]:
# 현재 찐 코드

import os
import pandas as pd
import matplotlib.pyplot as plt

start_idx = 0
MAX_IMAGES_PER_PROTOCOL = 10000  # 프로토콜당 최대 처리할 이미지 개수
MAX_IMAGES_PER_FILE = 0

all_image_data = []

for class_idx, protocol in enumerate(PROTOCOL_LIST):
    # 스플릿된 CSV 파일을 처리하는 코드
    chunk_idx = 1  # 청크 인덱스 초기화
    
    # 프로토콜 별 이미지 데이터 저장 리스트 초기화
    image_data = []
    
    total_images_processed = 0
    
    while True:
        input_file = f'payload_csv/split/{protocol}/{protocol}_{chunk_idx}.csv'  # 스플릿된 CSV 파일 경로
        
        if not os.path.exists(input_file):  # 파일이 존재하지 않으면 종료
            print(f"File not found: {input_file}")
            break

        data_df = pd.read_csv(input_file)  # CSV 파일 읽기
        print(f"Processing {input_file}, Total rows: {len(data_df)}")

        # 폴더 생성 (클래스별 폴더)
        folder_name = f"train_data/protocol_{class_idx}"
        os.makedirs(folder_name, exist_ok=True)

        # 프로토콜마다 이미지 처리
        images_in_this_file = 0  # 이 파일에서 추출한 이미지 개수
        
        # 프로토콜이 'dns'일 때 모든 이미지를 처리
        # 프로토콜이 'dns'가 아닐 때는 5번째 파일만 처리
        if protocol == 'dns':
            # 모든 이미지 처리
            sample_indices = range(len(data_df))  # 모든 인덱스를 선택

            for i in sample_indices:
                if total_images_processed >= MAX_IMAGES_PER_PROTOCOL: 
                    print(f"Reached {MAX_IMAGES_PER_PROTOCOL} images for {protocol}")
                    break
                
                # 이미지 데이터 변환 및 저장 과정
                ascii_values = data_read_pre(data_df, i)
                rgb_matrix = sliding_window_to2D(ascii_values, window_size=8, step_size=8)

                # 새로운 figure 생성 (서브플롯을 사용하지 않음)
                fig, ax = plt.subplots(figsize=(3, 3))  # 각 이미지를 3x3 크기로 설정

                # 그레이스케일 이미지로 시각화
                im = ax.imshow(rgb_matrix, interpolation='nearest')
                ax.set_title(f"Payload {i+1}")  # 각 이미지에 타이틀 설정

                ax.set_xticks([])  # x축 눈금 제거
                ax.set_yticks([])

                # 이미지 경로 설정 및 저장
                image_path = os.path.join(folder_name, f"payload_image_{total_images_processed + 1}.png")
                plt.savefig(image_path, bbox_inches='tight', pad_inches=0.1)

                # figure 닫기 (메모리 절약)
                plt.close(fig)

                # 이미지 경로 및 클래스 번호 저장
                image_data.append({"image_path": image_path, "class": class_idx})
                
                # 원본 데이터 저장 (이미지 생성 시)
                payload_data = data_df.iloc[i].to_dict()  # 원본 데이터 가져오기
                payload_data["class"] = class_idx  # 클래스 추가

                # 이미지 경로와 원본 데이터 저장
                all_image_data.append(payload_data)
                
                total_images_processed += 1  # 이미지 처리 개수 증가
                images_in_this_file += 1  # 파일에서 추출한 이미지 개수 증가

        elif protocol != 'dns' and chunk_idx % 5 == 0:
            # 랜덤으로 100개 샘플링
            if protocol == 'http':
                MAX_IMAGES_PER_FILE = 500
            elif protocol == 'mqtt':
                MAX_IMAGES_PER_FILE = 125
            
            sample_indices = data_df.sample(n=min(MAX_IMAGES_PER_PROTOCOL - total_images_processed, MAX_IMAGES_PER_FILE, len(data_df)), random_state=42).index

            for i in sample_indices:
                if total_images_processed >= MAX_IMAGES_PER_PROTOCOL: 
                    print(f"Reached {MAX_IMAGES_PER_PROTOCOL} images for {protocol}")
                    break
                
                # 이미지 데이터 변환 및 저장 과정
                ascii_values = data_read_pre(data_df, i)
                rgb_matrix = sliding_window_to2D(ascii_values, window_size=8, step_size=8)

                # 새로운 figure 생성 (서브플롯을 사용하지 않음)
                fig, ax = plt.subplots(figsize=(3, 3))  # 각 이미지를 3x3 크기로 설정

                # 그레이스케일 이미지로 시각화
                im = ax.imshow(rgb_matrix, interpolation='nearest')
                ax.set_title(f"Payload {i+1}")  # 각 이미지에 타이틀 설정

                ax.set_xticks([])  # x축 눈금 제거
                ax.set_yticks([])

                # 이미지 경로 설정 및 저장
                image_path = os.path.join(folder_name, f"payload_image_{total_images_processed + 1}.png")
                plt.savefig(image_path, bbox_inches='tight', pad_inches=0.1)

                # figure 닫기 (메모리 절약)
                plt.close(fig)
                
                # 원본 데이터 저장 (이미지 생성 시)
                payload_data = data_df.iloc[i].to_dict()  # 원본 데이터 가져오기
                payload_data["class"] = class_idx  # 클래스 추가

                # 이미지 경로와 원본 데이터 저장
                all_image_data.append(payload_data)

                # 이미지 경로 및 클래스 번호 저장
                image_data.append({"image_path": image_path, "class": class_idx})
                
                total_images_processed += 1  # 이미지 처리 개수 증가
                images_in_this_file += 1  # 파일에서 추출한 이미지 개수 증가

        # 프로토콜별 CSV 파일로 저장
        if image_data:  # 이미지 데이터가 있을 경우에만 저장
            image_df = pd.DataFrame(image_data)
            csv_file_path = f"train_data/protocol_{class_idx}_image_class_info.csv"
            image_df.to_csv(csv_file_path, index=False)
            print(f"Saved {csv_file_path} with {len(image_data)} entries.")

        chunk_idx += 1  # 다음 청크로 이동

    # 각 프로토콜별 image_class_info CSV 파일의 행 수 출력
    csv_file_path = f"train_data/protocol_{class_idx}_image_class_info.csv"
    if os.path.exists(csv_file_path):
        saved_df = pd.read_csv(csv_file_path)
        print(f"{csv_file_path} has {len(saved_df)} entries.")

        
# 전체 원본 데이터와 클래스 정보를 하나의 CSV 파일로 저장
all_image_df = pd.DataFrame(all_image_data)
all_image_csv_path = 'train_data/all_payload_data_class_info.csv'
all_image_df.to_csv(all_image_csv_path, index=False)
print(f"Saved {all_image_csv_path} with {len(all_image_data)} entries.")

Processing payload_csv/split/dns/dns_1.csv, Total rows: 5000
Saved train_data/protocol_0_image_class_info.csv with 5000 entries.
Processing payload_csv/split/dns/dns_2.csv, Total rows: 5000
Saved train_data/protocol_0_image_class_info.csv with 10000 entries.
Processing payload_csv/split/dns/dns_3.csv, Total rows: 158
Reached 10000 images for dns
Saved train_data/protocol_0_image_class_info.csv with 10000 entries.
File not found: payload_csv/split/dns/dns_4.csv
train_data/protocol_0_image_class_info.csv has 10000 entries.
Processing payload_csv/split/http/http_1.csv, Total rows: 5000
Processing payload_csv/split/http/http_2.csv, Total rows: 5000
Processing payload_csv/split/http/http_3.csv, Total rows: 5000
Processing payload_csv/split/http/http_4.csv, Total rows: 5000
Processing payload_csv/split/http/http_5.csv, Total rows: 5000
Saved train_data/protocol_1_image_class_info.csv with 500 entries.
Processing payload_csv/split/http/http_6.csv, Total rows: 5000
Saved train_data/protocol_1_

Processing payload_csv/split/http/http_63.csv, Total rows: 5000
Saved train_data/protocol_1_image_class_info.csv with 6000 entries.
Processing payload_csv/split/http/http_64.csv, Total rows: 5000
Saved train_data/protocol_1_image_class_info.csv with 6000 entries.
Processing payload_csv/split/http/http_65.csv, Total rows: 5000
Saved train_data/protocol_1_image_class_info.csv with 6500 entries.
Processing payload_csv/split/http/http_66.csv, Total rows: 5000
Saved train_data/protocol_1_image_class_info.csv with 6500 entries.
Processing payload_csv/split/http/http_67.csv, Total rows: 5000
Saved train_data/protocol_1_image_class_info.csv with 6500 entries.
Processing payload_csv/split/http/http_68.csv, Total rows: 5000
Saved train_data/protocol_1_image_class_info.csv with 6500 entries.
Processing payload_csv/split/http/http_69.csv, Total rows: 5000
Saved train_data/protocol_1_image_class_info.csv with 6500 entries.
Processing payload_csv/split/http/http_70.csv, Total rows: 5000
Saved train_

Saved train_data/protocol_2_image_class_info.csv with 625 entries.
Processing payload_csv/split/mqtt/mqtt_26.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 625 entries.
Processing payload_csv/split/mqtt/mqtt_27.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 625 entries.
Processing payload_csv/split/mqtt/mqtt_28.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 625 entries.
Processing payload_csv/split/mqtt/mqtt_29.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 625 entries.
Processing payload_csv/split/mqtt/mqtt_30.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 750 entries.
Processing payload_csv/split/mqtt/mqtt_31.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 750 entries.
Processing payload_csv/split/mqtt/mqtt_32.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 750 entries.
Processing paylo

Saved train_data/protocol_2_image_class_info.csv with 2250 entries.
Processing payload_csv/split/mqtt/mqtt_91.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 2250 entries.
Processing payload_csv/split/mqtt/mqtt_92.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 2250 entries.
Processing payload_csv/split/mqtt/mqtt_93.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 2250 entries.
Processing payload_csv/split/mqtt/mqtt_94.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 2250 entries.
Processing payload_csv/split/mqtt/mqtt_95.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 2375 entries.
Processing payload_csv/split/mqtt/mqtt_96.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 2375 entries.
Processing payload_csv/split/mqtt/mqtt_97.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 2375 entries.
Processi

Saved train_data/protocol_2_image_class_info.csv with 3875 entries.
Processing payload_csv/split/mqtt/mqtt_156.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 3875 entries.
Processing payload_csv/split/mqtt/mqtt_157.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 3875 entries.
Processing payload_csv/split/mqtt/mqtt_158.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 3875 entries.
Processing payload_csv/split/mqtt/mqtt_159.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 3875 entries.
Processing payload_csv/split/mqtt/mqtt_160.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 4000 entries.
Processing payload_csv/split/mqtt/mqtt_161.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 4000 entries.
Processing payload_csv/split/mqtt/mqtt_162.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 4000 entries.
P

Saved train_data/protocol_2_image_class_info.csv with 5500 entries.
Processing payload_csv/split/mqtt/mqtt_221.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 5500 entries.
Processing payload_csv/split/mqtt/mqtt_222.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 5500 entries.
Processing payload_csv/split/mqtt/mqtt_223.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 5500 entries.
Processing payload_csv/split/mqtt/mqtt_224.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 5500 entries.
Processing payload_csv/split/mqtt/mqtt_225.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 5625 entries.
Processing payload_csv/split/mqtt/mqtt_226.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 5625 entries.
Processing payload_csv/split/mqtt/mqtt_227.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 5625 entries.
P

Saved train_data/protocol_2_image_class_info.csv with 7125 entries.
Processing payload_csv/split/mqtt/mqtt_286.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 7125 entries.
Processing payload_csv/split/mqtt/mqtt_287.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 7125 entries.
Processing payload_csv/split/mqtt/mqtt_288.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 7125 entries.
Processing payload_csv/split/mqtt/mqtt_289.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 7125 entries.
Processing payload_csv/split/mqtt/mqtt_290.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 7250 entries.
Processing payload_csv/split/mqtt/mqtt_291.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 7250 entries.
Processing payload_csv/split/mqtt/mqtt_292.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 7250 entries.
P

Saved train_data/protocol_2_image_class_info.csv with 8750 entries.
Processing payload_csv/split/mqtt/mqtt_351.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 8750 entries.
Processing payload_csv/split/mqtt/mqtt_352.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 8750 entries.
Processing payload_csv/split/mqtt/mqtt_353.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 8750 entries.
Processing payload_csv/split/mqtt/mqtt_354.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 8750 entries.
Processing payload_csv/split/mqtt/mqtt_355.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 8875 entries.
Processing payload_csv/split/mqtt/mqtt_356.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 8875 entries.
Processing payload_csv/split/mqtt/mqtt_357.csv, Total rows: 5000
Saved train_data/protocol_2_image_class_info.csv with 8875 entries.
P