In [7]:
import sys, os
sys.path.append('../GIPS/core')
from utils import AEchunking, minHash

import pickle
import numpy as np

import chromadb
from chromadb.utils import embedding_functions

from tqdm import tqdm
import pickle

### minhash를 임베딩된 벡터 보는 방법
- 벡터를 DB에 추가하는데 오랜 시간이 걸림
- 벡터마다 차원의 크기가 매우 크기(2 ^ 14 대략 16000) 때문에 문제가 생김

### chroma DB에서 기본값으로 사용하는 임베딩 방식 사용
- 기본 임베딩 방식: all-MiniLM-L6-v2

In [3]:
with open('../GIPS/iotpot/48.pkl', 'rb') as f:
    payloads = pickle.load(f)
    
print(len(payloads))

10122


### 임베딩방식
- minhash

In [5]:
def minhash_vector(payload, window_size, K, M):
    minhashed_virtual_vectors = [0 for _ in range(M)]

    chunks = AEchunking(payload, W=window_size)
    encode_pos = minHash(chunks, K) % M

    for i in encode_pos:
        minhashed_virtual_vectors[i] = 1

    return minhashed_virtual_vectors

### 데이터 전처리

In [8]:
cnk = AEchunking # 청킹 방법
window_size = 4 # 청킹 사이즈

meta = [] # 메타 정보를 담을 배열
embedding_vectors = [] # 임베딩 벡터들을 담을 배열
indices = [] # 각 문서별 번호

cnt = 0 # 인덱스 0부터 시작
vector_size = 100

benign = 0
malware = 0
unknown = 0

for payload, label in tqdm(payloads):

	# 인덱스
	indices.append(f'{cnt}')
	cnt += 1

	# 청킹 임베딩
	vectors = minhash_vector(payload=payload, window_size=window_size, K=vector_size, M=vector_size * 10)
	embedding_vectors.append(vectors)

	# 메타 데이터
	dic = dict()

	## 라벨이 뭐로 되어 있는지
	if "MALWARE" in label:
		dic['label'] = 1
		malware += 1
	elif "BENIGN"in label:
		dic['label'] = 0
		benign += 1
	else:
		dic['label'] = 2
		unknown += 1

	## 어떤 시그니처들을 가지고 있는지
	## 시그니처들의 값은 실제 데이터에서 추출 된 값들을 이용
	signature = ''
	if 'PING' in payload:
		signature += 'PING,'
	if 'PONG' in payload:
		signature += 'PONG,'
	if 'root' in payload:
		signature += 'root,'
	if 'Password' in payload:
		signature += 'Password,'
	if 'Login' in payload:
		signature += 'Login,'
	if 'tftp' in payload:
		signature += 'tftp,'
	if '.sh' in payload:
		signature += 'sh,'

	if len(signature) != 0:
		signature = signature[: -1]

	dic['signature'] = signature
	meta.append(dic)

print(f'benign: {benign}, malware: {malware}, unknown: {unknown}')
print(meta[: 10])

100%|██████████| 10122/10122 [00:10<00:00, 941.33it/s] 

benign: 2689, malware: 1184, unknown: 6249
[{'label': 0, 'signature': ''}, {'label': 0, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 1, 'signature': ''}, {'label': 1, 'signature': ''}]





In [36]:
print(embedding_vectors[0])
print(sum(embedding_vectors[0]))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 

In [32]:
cnt = 0

for vector in embedding_vectors:
    if sum(vector) < 87:
        cnt += 1
print(cnt)

2


In [9]:
with open("./embedding_vector/2.pkl", 'wb') as f:
    pickle.dump(embedding_vectors, f)

In [10]:
# 클라이언트 생성

client = chromadb.PersistentClient('./chroma')

In [11]:
# 컬렉션 생성 혹은 가져오기

collection = client.get_or_create_collection("minahsh_DB")

In [12]:
# 컬렉션의 데이터 삽입

for i in tqdm(range(len(payloads))):
	collection.add(
    	ids=indices[i], # 인덱스
    	embeddings=embedding_vectors[i], # 임베딩
    	metadatas=meta[i], # 메타 정보
		documents=payloads[i][0], # 문서 원문
	)

100%|██████████| 10122/10122 [02:00<00:00, 84.18it/s]


In [13]:
# 각 시그니처별 역색인

ping = []
pong = []
root = []
login = []
password = []
tftp = []
sh = []


for idx, data in enumerate(payloads):
    payload = data[0]
    label = data[1]
    if "MALWARE" not in label:
        continue
    if 'PING' in payload:
        ping.append(idx)
    if 'PONG' in payload:
        pong.append(idx)
    if 'root' in payload:
        root.append(idx)
    if 'Login' in payload:
        login.append(idx)
    if 'Password' in payload:
        password.append(idx)
    if 'tftp' in payload:
        tftp.append(idx)
    if '.sh' in payload:
        sh.append(idx)
        
print('ping:', len(ping), ping)
print('pong:', len(pong), pong)
print('root:', len(root), root)
print('login:', len(login), login)
print('password', len(password), password)
print('tftp', len(tftp), tftp)
print('sh', len(sh), sh)

ping: 0 []
pong: 748 [1884, 1885, 1912, 1913, 1942, 1943, 1960, 1961, 1978, 1979, 1996, 1997, 2010, 2011, 2024, 2025, 2038, 2039, 2046, 2047, 2056, 2057, 2070, 2071, 2084, 2085, 2110, 2111, 2138, 2139, 2156, 2157, 2174, 2175, 2188, 2189, 2302, 2303, 2316, 2317, 2328, 2329, 2338, 2339, 2352, 2353, 2364, 2365, 2376, 2377, 2390, 2391, 2406, 2407, 2422, 2423, 2434, 2435, 2456, 2457, 2468, 2469, 2486, 2487, 2500, 2501, 2512, 2513, 2522, 2523, 2538, 2539, 2556, 2557, 2594, 2595, 2616, 2617, 2632, 2633, 2654, 2655, 2668, 2669, 2690, 2691, 2712, 2713, 2742, 2743, 2756, 2757, 2776, 2777, 2800, 2801, 2818, 2819, 2834, 2835, 2854, 2855, 2878, 2879, 2898, 2899, 2920, 2921, 2958, 2959, 2978, 2979, 2994, 2995, 3032, 3033, 3058, 3059, 3104, 3105, 3122, 3123, 3146, 3147, 3168, 3169, 3188, 3189, 3206, 3207, 3228, 3229, 3248, 3249, 3262, 3263, 3276, 3277, 3292, 3293, 3312, 3313, 3334, 3335, 3350, 3351, 3370, 3371, 3382, 3383, 3404, 3405, 3416, 3417, 3430, 3431, 3446, 3447, 3460, 3461, 3500, 3501, 3508, 

#### ping, login password를 포함하고 있는 공격 X
#### Pong, root는 공격이 너무 많아서 진행 X
#### sh는 전부 tftp를 포함

In [21]:
res_tftp = collection.query(
    query_embeddings=embedding_vectors[tftp[0]],
    n_results=len(tftp),
)

In [22]:
res_tftp

{'ids': [['724',
   '725',
   '922',
   '923',
   '8',
   '9',
   '1071',
   '1070',
   '1701',
   '1700',
   '419',
   '418',
   '211',
   '210',
   '565',
   '564',
   '1521',
   '1520',
   '1363',
   '1362',
   '1228',
   '1227',
   '2449',
   '2448',
   '6728',
   '6729',
   '37',
   '36',
   '1326',
   '1327',
   '4893',
   '4892',
   '4913',
   '4912',
   '538',
   '3468',
   '3469',
   '4396',
   '4397',
   '5885',
   '5884',
   '8805',
   '8804',
   '10045']],
 'distances': [[0.0,
   0.0,
   17.0,
   17.0,
   25.0,
   25.0,
   27.0,
   27.0,
   27.0,
   27.0,
   29.0,
   29.0,
   33.0,
   33.0,
   39.0,
   39.0,
   39.0,
   39.0,
   41.0,
   41.0,
   43.0,
   43.0,
   149.0,
   149.0,
   152.0,
   152.0,
   154.0,
   154.0,
   154.0,
   154.0,
   154.0,
   154.0,
   154.0,
   154.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0]],
 'metadatas': [[{'label': 1, 'signature': 'tftp'},
   {'label': 1, 'signature': 'tftp'},
   {'la

In [23]:
res_tftp_1 = collection.query(
    query_embeddings=embedding_vectors[tftp[0]],
    n_results=len(tftp),
    # 메타 필드 필터링
    where={
        "label": 1,
	},
    # 문서 내용 필터링
    # where_document={
    #     "$contains": "Content"
	# },
)

In [24]:
res_tftp_1

{'ids': [['724',
   '725',
   '922',
   '923',
   '8',
   '9',
   '1071',
   '1070',
   '1701',
   '1700',
   '419',
   '418',
   '211',
   '210',
   '565',
   '564',
   '1521',
   '1520',
   '1363',
   '1362',
   '1228',
   '1227',
   '3635',
   '3634',
   '4188',
   '4189',
   '4573',
   '4572',
   '7413',
   '7412',
   '8993',
   '8992',
   '9079',
   '9078',
   '10033',
   '10032',
   '9322',
   '9323',
   '1907',
   '2943',
   '2952',
   '3071',
   '3070',
   '9438']],
 'distances': [[0.0,
   0.0,
   17.0,
   17.0,
   25.0,
   25.0,
   27.0,
   27.0,
   27.0,
   27.0,
   29.0,
   29.0,
   33.0,
   33.0,
   39.0,
   39.0,
   39.0,
   39.0,
   41.0,
   41.0,
   43.0,
   43.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   161.0,
   161.0,
   164.0,
   164.0,
   164.0,
   164.0,
   164.0,
   164.0]],
 'metadatas': [[{'label': 1, 'signature': 'tftp'},
   {'label': 1, 'signature': 'tftp'},
 

In [21]:
benign_ = collection.query(
    query_embeddings=embedding_vectors[1],
    n_results=len(tftp),
)

In [22]:
benign_

{'ids': [['1',
   '0',
   '1147',
   '1148',
   '1142',
   '1143',
   '1169',
   '1170',
   '1165',
   '1166',
   '1157',
   '1158',
   '1144',
   '1164',
   '1163',
   '1174',
   '1173',
   '1150',
   '1149',
   '1096',
   '1097',
   '1138',
   '1139',
   '1154',
   '1153',
   '1577',
   '1576',
   '3',
   '2',
   '952',
   '953',
   '1146',
   '1145',
   '1155',
   '1156',
   '1161',
   '1162',
   '1135',
   '1134',
   '1172',
   '1171',
   '205',
   '204',
   '156']],
 'distances': [[0.0,
   0.0,
   0.08863386511802673,
   0.08863386511802673,
   0.09819384664297104,
   0.0985032320022583,
   0.09877534955739975,
   0.09877534955739975,
   0.10213661938905716,
   0.10213661938905716,
   0.10252879559993744,
   0.10252879559993744,
   0.10554090887308121,
   0.10568691045045853,
   0.10568691045045853,
   0.10834715515375137,
   0.10834715515375137,
   0.10845577716827393,
   0.10845577716827393,
   0.11050014942884445,
   0.11050014942884445,
   0.1105693131685257,
   0.110569313168

### 실험 결과
- 문자열이 조금 바뀐 경우에는 탐지가 잘되는 모습
- 하지만 같은 시그니처를 가졌지만 내용이 다른 경우에는 기본 임베딩 모델보다 탐지가 잘 안되는 모습을 보임