In [1]:
import numpy as np
import pandas as pd
import os
import csv, json
import math
from PIL import Image

from collections import Counter

In [2]:
import utils

In [3]:
WINDOW_SIZE = 1024
STEP_SIZE = 512

In [4]:
def get_byte_entropy(byte_array:np.ndarray):
    total_count = np.sum(byte_array)

    # P(X)
    P_X = byte_array / total_count

    # 엔트로피 H(X)를 계산
    # Shannon 엔트로피 공식 사용
    P_X_nonzero = P_X[P_X > 0]
    H_X = -np.sum(P_X_nonzero * np.log2(P_X_nonzero))

    P_H_given_X = P_X * H_X
    P_H_X = P_X * P_H_given_X

    return P_H_X

In [5]:
SIZE = 16

def export_image(filename, pixel1d:np.ndarray):
    pixel2d = [pixel1d[i * SIZE:(i + 1) * SIZE] for i in range(SIZE)]

    try:
        final_image = Image.fromarray(np.array(pixel2d, dtype=np.uint8), 'L')
        final_image.save(filename)
    except Exception as e:
        print(f"Error while saving image '{filename}'")
        print(e)
        print(pixel1d)
        print()

In [6]:
def debug_minmax(target, message):
    print(message)
    print('min:', np.min(target))
    print('max:', np.max(target))

In [7]:
def normalize_minmax(arr:np.ndarray)->np.ndarray:
    min_val = np.min(arr)
    max_val = np.max(arr)
    if min_val == max_val:
        return np.zeros_like(arr)
    else:
        return (arr - min_val) / (max_val - min_val)

In [8]:
def extract_entropy_histogram(dataset, *, export_csvfile, export_directory):
    os.makedirs(export_directory, exist_ok=True)
    with open(os.path.join(export_directory, export_csvfile), 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['index', 'path', 'filename', 'bytes'])

        for i, bytes_str in enumerate(dataset['bytes']):
            bs = json.loads(bytes_str)
            counter = Counter(bs)
            bc = [counter[i] for i in range(0, 256)]
            
            entropy = get_byte_entropy(bc)
            normalized = normalize_minmax(entropy)
            normalized = np.log(normalized + 1) / np.log(2) * 255.0
            
            writer.writerow([i, dataset['path'][i], dataset['filename'][i], json.dumps(normalized.tolist())])
            export_image(os.path.join(export_directory, f'dataset_{i}.png'), np.round(normalized, 0).tolist())

In [9]:
EXPORT_PATH = '../export'
EXPORT_NORMAL_PATH = os.path.join(EXPORT_PATH, 'normal')
EXPORT_ATTACK_PATH = os.path.join(EXPORT_PATH, 'attack')

os.makedirs(EXPORT_NORMAL_PATH, exist_ok=True)
os.makedirs(EXPORT_ATTACK_PATH, exist_ok=True)

In [None]:
'''
공격 데이터셋의 경우
'''
dataset_path = []

dataset_path.append('../dataset/attack_decoded.csv')
dataset_path.append('../dataset/attack_original.csv')

In [91]:
'''
노멀 데이터셋의 경우
'''
DATASET_BASE_PATH = '../dataset/CSV'
dataset_path = []

for dirpath, _, filenames in os.walk(DATASET_BASE_PATH):
    dataset_path.extend([os.path.join(dirpath, f) for f in filenames])

dataset_path

['../dataset/CSV\\atestset.csv',
 '../dataset/CSV\\Github.csv',
 '../dataset/CSV\\GithubGist.csv',
 '../dataset/CSV\\invokeCradleCrafter.csv',
 '../dataset/CSV\\InvokeObfuscation.csv',
 '../dataset/CSV\\IseSteroids.csv',
 '../dataset/CSV\\PoshCode.csv',
 '../dataset/CSV\\PowerShellGallery.csv',
 '../dataset/CSV\\Random.csv',
 '../dataset/CSV\\Technet.csv']

In [96]:
for path in dataset_path:
    dataset = pd.read_csv(path)

    dataset_name , _= os.path.splitext(os.path.split(path)[1])
    print(dataset_name)
    
    export_directory = os.path.join(EXPORT_NORMAL_PATH, dataset_name)
    extract_entropy_histogram(dataset,
                              export_csvfile=f'entropy.csv',
                              export_directory=export_directory
                              )

atestset
Github


  P_X = byte_array / total_count


Error while saving image '../export\normal\Github\dataset_479.png'
cannot convert float NaN to integer
[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, n

In [169]:
np.log(1) / np.log(2)

0.0