In [17]:
import sys, os
sys.path.append('../GIPS/core')
from utils import AEchunking

import pickle
import numpy as np

import chromadb
from chromadb.utils import embedding_functions

from tqdm import tqdm
import pickle

### minhash를 임베딩 벡터로 보는 방법
- 벡터를 DB에 추가하는데 오랜 시간이 걸림
- 벡터마다 차원의 크기가 매우 크기 때문에(2^14, 약 16,000) 문제가 생김

### chroma DB에서 기본값으로 사용하는 임베딩 방식 사용
- 기본 임베딩 방식: all-MiniLM-L6-v2

In [18]:
# Load the pickled payload dataset and print how many entries it contains.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../GIPS/iotpot/48.pkl', mode='rb') as payload_file:
    payloads = pickle.load(payload_file)

payload_count = len(payloads)
print(payload_count)

10122


### 데이터 전처리

In [19]:
# Build, for every (payload, label) entry in `payloads`:
#   - indices: the entry's position as a string id ('0', '1', ...)
#   - meta:    {'label': <int>, 'signature': <comma-separated tags>}
# and count how many entries fall into each label class.

cnk = AEchunking  # chunking method
window_size = 4  # chunking window size
emb = embedding_functions.DefaultEmbeddingFunction()  # embedding method

# The embedding vectors are read from a cache file instead of being recomputed
# (the computation is kept, commented out, inside the loop below).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): nothing here checks that the cached vectors line up with
# `payloads` — confirm the cache was produced from the same dataset.
with open('./embedding_vector/1.pkl', 'rb') as f:
    embedding_vectors = pickle.load(f)

# (substring looked for in the payload, tag written to the metadata).
# The substrings are values that were extracted from the real data.
# The order here is the order of the tags in the joined signature string.
SIGNATURES = (
    ('PING', 'PING'),
    ('PONG', 'PONG'),
    ('root', 'root'),
    ('Password', 'Password'),
    ('Login', 'Login'),
    ('tftp', 'tftp'),
    ('.sh', 'sh'),
)

meta = []  # metadata dict per entry
indices = []  # string id per entry, numbered from 0

benign = 0
malware = 0
unknown = 0

for position, (payload, label) in enumerate(tqdm(payloads)):

    # Id of this entry
    indices.append(f'{position}')

    # Chunk embedding (disabled): the payload is chunked, every chunk is
    # embedded, and the chunk vectors are averaged into one vector.
    # chunks = cnk(payload, W=window_size)
    # vectors = np.array(emb(chunks))
    # vec = vectors.sum(axis=0)
    # vec /= len(vectors)
    # embedding_vectors.append(list(vec))

    # Label class: 1 = MALWARE, 0 = BENIGN, 2 = anything else
    if "MALWARE" in label:
        label_id = 1
        malware += 1
    elif "BENIGN" in label:
        label_id = 0
        benign += 1
    else:
        label_id = 2
        unknown += 1

    # Tags of every signature substring found in the payload, comma-separated
    # (empty string when none matches).
    signature = ','.join(tag for needle, tag in SIGNATURES if needle in payload)

    meta.append({'label': label_id, 'signature': signature})

# Kept for backward compatibility: the original loop left the entry count in `cnt`.
cnt = len(indices)

print(f'benign: {benign}, malware: {malware}, unknown: {unknown}')
print(meta[: 5])

100%|██████████| 10122/10122 [00:00<00:00, 648084.89it/s]

benign: 2689, malware: 1184, unknown: 6249
[{'label': 0, 'signature': ''}, {'label': 0, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}]





In [20]:
# with open("./embedding_vector/1.pkl", 'wb') as f:
#     pickle.dump(embedding_vectors, f)

In [21]:
# Create the Chroma client (persistent, using the './chroma' path)

client = chromadb.PersistentClient(path='./chroma')

In [22]:
# Create the collection, or fetch it if it already exists

collection = client.get_or_create_collection(name="default_DB")

In [23]:
# Insert the data into the collection.
# Records are sent in batches of parallel lists instead of one add() call per
# record, which needs far fewer round trips to the store.
# NOTE(review): 1000 is assumed to be below the client's maximum batch size — confirm.
BATCH_SIZE = 1000

for start in tqdm(range(0, len(payloads), BATCH_SIZE)):
    stop = start + BATCH_SIZE
    collection.add(
        ids=indices[start:stop],  # record ids
        embeddings=embedding_vectors[start:stop],  # precomputed embeddings
        metadatas=meta[start:stop],  # metadata
        documents=[entry[0] for entry in payloads[start:stop]],  # raw document text
    )

In [24]:
# Reverse index per signature: for every keyword, the positions (in `payloads`)
# of the MALWARE-labelled entries whose payload contains that keyword.

ping, pong, root, login, password, tftp, sh = ([] for _ in range(7))

# (substring to look for, list that collects the matching positions)
keyword_buckets = (
    ('PING', ping),
    ('PONG', pong),
    ('root', root),
    ('Login', login),
    ('Password', password),
    ('tftp', tftp),
    ('.sh', sh),
)

for position, (text, tag) in enumerate(payloads):
    # Only entries labelled as malware are indexed.
    if "MALWARE" not in tag:
        continue
    for keyword, bucket in keyword_buckets:
        if keyword in text:
            bucket.append(position)

# Per keyword: title, number of matches, and a preview of the positions.
summary_rows = (
    ('ping:', ping, ping[:5]),
    ('pong:', pong, pong[:5]),
    ('root:', root, root[:5]),
    ('login:', login, login),  # the full list is printed for login
    ('password', password, password[:5]),
    ('tftp', tftp, tftp[:5]),
    ('sh', sh, sh[:5]),
)
for title, bucket, preview in summary_rows:
    print(title, len(bucket), preview)

ping: 0 []
pong: 748 [1884, 1885, 1912, 1913, 1942]
root: 290 [1900, 1901, 1906, 1907, 1930]
login: 0 []
password 0 []
tftp 44 [724, 725, 2128, 2129, 2730]
sh 42 [2128, 2129, 2730, 2731, 3082]


#### ping, login, password를 포함하고 있는 공격 X
#### Pong, root는 공격이 너무 많아서 진행 X
#### sh는 전부 tftp를 포함

In [25]:
# Similarity query: the embedding of the first entry listed in `tftp` is the
# query vector, and as many results are requested as `tftp` has entries.
tftp_query_vector = embedding_vectors[tftp[0]]
tftp_result_count = len(tftp)

res_tftp = collection.query(
    query_embeddings=tftp_query_vector,
    n_results=tftp_result_count,
)

In [50]:
# Print the id and metadata of every hit of the (single) query in `res_tftp`.
res_indices = res_tftp['ids'][0]
res_signature = res_tftp['metadatas'][0]

# Walk the hits that were actually returned rather than range(len(tftp)):
# indexing by the requested count raises IndexError if fewer rows come back.
for hit_id, hit_meta in zip(res_indices, res_signature):
    print(f'index: {hit_id}, signarue & label {hit_meta}')

index: 725, signarue & label {'label': 1, 'signature': 'tftp'}
index: 724, signarue & label {'label': 1, 'signature': 'tftp'}
index: 923, signarue & label {'label': 1, 'signature': ''}
index: 922, signarue & label {'label': 1, 'signature': ''}
index: 210, signarue & label {'label': 1, 'signature': ''}
index: 211, signarue & label {'label': 1, 'signature': ''}
index: 1228, signarue & label {'label': 1, 'signature': ''}
index: 1227, signarue & label {'label': 1, 'signature': ''}
index: 1363, signarue & label {'label': 1, 'signature': ''}
index: 1362, signarue & label {'label': 1, 'signature': ''}
index: 1701, signarue & label {'label': 1, 'signature': ''}
index: 1700, signarue & label {'label': 1, 'signature': ''}
index: 418, signarue & label {'label': 1, 'signature': ''}
index: 419, signarue & label {'label': 1, 'signature': ''}
index: 9, signarue & label {'label': 1, 'signature': ''}
index: 8, signarue & label {'label': 1, 'signature': ''}
index: 1070, signarue & label {'label': 1, 'si

In [27]:
# Same similarity query as for the first `tftp` entry, but restricted to
# records whose metadata field "label" equals 1.
label_filter = {"label": 1}  # metadata field filter

res_tftp_1 = collection.query(
    query_embeddings=embedding_vectors[tftp[0]],
    n_results=len(tftp),
    where=label_filter,
    # Document content filter (disabled):
    # where_document={
    #     "$contains": "Content"
    # },
)

In [51]:
# Print the id and metadata of every hit of the (single) query in `res_tftp_1`.
res_indices = res_tftp_1['ids'][0]
res_signature = res_tftp_1['metadatas'][0]

# Walk the hits that were actually returned rather than range(len(tftp)):
# a filtered query can return fewer rows than requested, and indexing by the
# requested count would then raise IndexError.
for hit_id, hit_meta in zip(res_indices, res_signature):
    print(f'index: {hit_id}, signarue & label {hit_meta}')

index: 725, signarue & label {'label': 1, 'signature': 'tftp'}
index: 724, signarue & label {'label': 1, 'signature': 'tftp'}
index: 923, signarue & label {'label': 1, 'signature': ''}
index: 922, signarue & label {'label': 1, 'signature': ''}
index: 210, signarue & label {'label': 1, 'signature': ''}
index: 211, signarue & label {'label': 1, 'signature': ''}
index: 1228, signarue & label {'label': 1, 'signature': ''}
index: 1227, signarue & label {'label': 1, 'signature': ''}
index: 1363, signarue & label {'label': 1, 'signature': ''}
index: 1362, signarue & label {'label': 1, 'signature': ''}
index: 1701, signarue & label {'label': 1, 'signature': ''}
index: 1700, signarue & label {'label': 1, 'signature': ''}
index: 418, signarue & label {'label': 1, 'signature': ''}
index: 419, signarue & label {'label': 1, 'signature': ''}
index: 9, signarue & label {'label': 1, 'signature': ''}
index: 8, signarue & label {'label': 1, 'signature': ''}
index: 1070, signarue & label {'label': 1, 'si

In [29]:
# Similarity query using the embedding stored at position 1 as the query vector
# (presumably a benign sample, going by the variable name — verify).
# The number of requested results matches the size of `tftp`.
benign_query_vector = embedding_vectors[1]

benign_ = collection.query(
    query_embeddings=benign_query_vector,
    n_results=len(tftp),
)

In [52]:
# Print the id and metadata of every hit of the (single) query in `benign_`.
res_indices = benign_['ids'][0]
res_signature = benign_['metadatas'][0]

# Walk the hits that were actually returned rather than range(len(tftp)):
# indexing by the requested count raises IndexError if fewer rows come back.
for hit_id, hit_meta in zip(res_indices, res_signature):
    print(f'index: {hit_id}, signarue & label {hit_meta}')

index: 0, signarue & label {'label': 0, 'signature': ''}
index: 1, signarue & label {'label': 0, 'signature': ''}
index: 1147, signarue & label {'label': 2, 'signature': ''}
index: 1148, signarue & label {'label': 2, 'signature': ''}
index: 1142, signarue & label {'label': 2, 'signature': ''}
index: 1143, signarue & label {'label': 2, 'signature': ''}
index: 1169, signarue & label {'label': 2, 'signature': ''}
index: 1170, signarue & label {'label': 2, 'signature': ''}
index: 1166, signarue & label {'label': 2, 'signature': ''}
index: 1165, signarue & label {'label': 2, 'signature': ''}
index: 1158, signarue & label {'label': 2, 'signature': ''}
index: 1157, signarue & label {'label': 2, 'signature': ''}
index: 1144, signarue & label {'label': 2, 'signature': ''}
index: 1164, signarue & label {'label': 2, 'signature': ''}
index: 1163, signarue & label {'label': 2, 'signature': ''}
index: 1173, signarue & label {'label': 2, 'signature': 'root,tftp,sh'}
index: 1174, signarue & label {'la

In [31]:
# Second collection, created or fetched under the name "docDB"
collection_doc = client.get_or_create_collection(name="docDB")

In [32]:
# Insert the data into the collection.
# No embeddings are passed here, only the raw documents.
# Records are sent in batches of parallel lists instead of one add() call per
# record, which needs far fewer calls.
# NOTE(review): 1000 is assumed to be below the client's maximum batch size — confirm.
BATCH_SIZE = 1000

for start in tqdm(range(0, len(payloads), BATCH_SIZE)):
    stop = start + BATCH_SIZE
    collection_doc.add(
        ids=indices[start:stop],  # record ids
        metadatas=meta[start:stop],  # metadata
        documents=[entry[0] for entry in payloads[start:stop]],  # raw document text
    )

In [33]:
# Text query against the document collection: the raw payload of the first
# entry listed in `tftp` is the query text, limited to records whose metadata
# field "label" equals 1.
tftp_query_text = payloads[tftp[0]][0]
malware_only = {"label": 1}

res_tftp_1_doc = collection_doc.query(
    query_texts=tftp_query_text,
    n_results=len(tftp),
    where=malware_only,
)

In [53]:
# Print the id and metadata of every hit of the (single) query in `res_tftp_1_doc`.
res_indices = res_tftp_1_doc['ids'][0]
res_signature = res_tftp_1_doc['metadatas'][0]

# Walk the hits that were actually returned rather than range(len(tftp)):
# a filtered query can return fewer rows than requested, and indexing by the
# requested count would then raise IndexError.
for hit_id, hit_meta in zip(res_indices, res_signature):
    print(f'index: {hit_id}, signarue & label {hit_meta}')

index: 725, signarue & label {'label': 1, 'signature': 'tftp'}
index: 724, signarue & label {'label': 1, 'signature': 'tftp'}
index: 1363, signarue & label {'label': 1, 'signature': ''}
index: 1362, signarue & label {'label': 1, 'signature': ''}
index: 1228, signarue & label {'label': 1, 'signature': ''}
index: 1227, signarue & label {'label': 1, 'signature': ''}
index: 923, signarue & label {'label': 1, 'signature': ''}
index: 922, signarue & label {'label': 1, 'signature': ''}
index: 1701, signarue & label {'label': 1, 'signature': ''}
index: 1700, signarue & label {'label': 1, 'signature': ''}
index: 564, signarue & label {'label': 1, 'signature': ''}
index: 565, signarue & label {'label': 1, 'signature': ''}
index: 1520, signarue & label {'label': 1, 'signature': ''}
index: 1521, signarue & label {'label': 1, 'signature': ''}
index: 418, signarue & label {'label': 1, 'signature': ''}
index: 419, signarue & label {'label': 1, 'signature': ''}
index: 210, signarue & label {'label': 1