In [9]:
import chromadb
import pickle
import numpy as np

from core.utils import AEchunking
from chromadb.utils import embedding_functions

from tqdm import tqdm
import pickle

### minhash를 임베딩된 벡터 보는 방법
- 벡터를 DB에 추가하는데 오랜 시간이 걸림
- 벡터마다 차원의 크기가 매우 크기(2 ^ 14 대략 16000) 때문에 문제가 생김

### chroma DB에서 기본값으로 사용하는 임베딩 방식 사용
- 기본 임베딩 방식: all-MiniLM-L6-v2

In [7]:
with open('./iotpot/48.pkl', 'rb') as f:
    payloads = pickle.load(f)
    
print(len(payloads))

10122


### 데이터 전처리

In [8]:
cnk = AEchunking # 청킹 방법
window_size = 4 # 청킹 사이즈
emb = embedding_functions.DefaultEmbeddingFunction() # 임베딩 방법

meta = [] # 메타 정보를 담을 배열
embedding_vectors = [] # 임베딩 벡터들을 담을 배열
indices = [] # 각 문서별 번호

cnt = 0 # 인덱스 0부터 시작

for payload, label in tqdm(payloads):

	# 인덱스
	indices.append(f'{cnt}')
	cnt += 1

	# 청킹 임베딩
	chunks = cnk(payload, W=window_size)
	vectors = np.array(emb(chunks))
	vec = vectors.sum(axis=0)
	vec /= len(vectors)
	embedding_vectors.append(list(vec))

	# 메타 데이터
	dic = dict()

	## 라벨이 뭐로 되어 있는지
	if "MALWARE" in label:
		dic['label'] = 1
	else:
		dic['label'] = 0

	## 어떤 시그니처들을 가지고 있는지
	## 시그니처들의 값은 실제 데이터에서 추출 된 값들을 이용
	signature = ''
	if 'PING' in payload:
		signature += 'PING,'
	if 'PONG' in payload:
		signature += 'PONG,'
	if 'root' in payload:
		signature += 'root,'
	if 'Password' in payload:
		signature += 'Password,'
	if 'Login' in payload:
		signature += 'Login,'
	if 'tftp' in payload:
		signature += 'tftp,'
	if '.sh' in payload:
		signature += 'sh,'

	if len(signature) != 0:
		signature = signature[: -1]

	dic['signature'] = signature
	meta.append(dic)

meta

100%|██████████| 10122/10122 [2:14:52<00:00,  1.25it/s]  


[{'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 1, 'signature': ''},
 {'label': 1, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label': 0, 'signature': ''},
 {'label

In [10]:
with open("./embedding_vector/1.pkl", 'wb') as f:
    pickle.dump(embedding_vectors, f)

In [11]:
# 클라이언트 생성

client = chromadb.PersistentClient('./vectorDB')

In [12]:
# 컬렉션 생성 혹은 가져오기

collection = client.get_or_create_collection("default_DB")

In [13]:
for i in tqdm(range(len(payloads))):
	collection.add(
    	ids=indices[i],
    	embeddings=embedding_vectors[i],
    	metadatas=meta[i],
		documents=payloads[i][0],
	)

100%|██████████| 10122/10122 [01:49<00:00, 92.06it/s]


In [14]:
# 각 시그니처별 역색인

ping = []
pong = []
root = []
login = []
password = []
tftp = []
sh = []


for idx, data in enumerate(payloads):
    payload = data[0]
    label = data[1]
    if "MALWARE" not in label:
        continue
    if 'PING' in payload:
        ping.append(idx)
    if 'PONG' in payload:
        pong.append(idx)
    if 'root' in payload:
        root.append(idx)
    if 'Login' in payload:
        login.append(idx)
    if 'Password' in payload:
        password.append(idx)
    if 'tftp' in payload:
        tftp.append(idx)
    if '.sh' in payload:
        sh.append(idx)
        
root

[1900,
 1901,
 1906,
 1907,
 1930,
 1931,
 2128,
 2129,
 2130,
 2131,
 2730,
 2731,
 2732,
 2733,
 2734,
 2735,
 2942,
 2943,
 2952,
 2953,
 3048,
 3049,
 3068,
 3069,
 3070,
 3071,
 3082,
 3083,
 3084,
 3085,
 3088,
 3089,
 3090,
 3091,
 3096,
 3097,
 3098,
 3099,
 3578,
 3579,
 3584,
 3585,
 3596,
 3597,
 3628,
 3629,
 3654,
 3655,
 4034,
 4035,
 4048,
 4049,
 4126,
 4127,
 4128,
 4129,
 4132,
 4133,
 4152,
 4153,
 4154,
 4155,
 4178,
 4179,
 4180,
 4181,
 4206,
 4207,
 4242,
 4243,
 4248,
 4249,
 4506,
 4507,
 4512,
 4513,
 4532,
 4533,
 4566,
 4567,
 4580,
 4581,
 4594,
 4595,
 4606,
 4607,
 4632,
 4633,
 4634,
 4635,
 4644,
 4645,
 4664,
 4665,
 4670,
 4671,
 4678,
 4679,
 4702,
 4703,
 4844,
 4845,
 4846,
 4847,
 4888,
 4889,
 4900,
 4901,
 4902,
 4903,
 4904,
 4905,
 4906,
 4907,
 4914,
 4915,
 4916,
 4917,
 4924,
 4925,
 4926,
 4927,
 4940,
 4941,
 4950,
 4951,
 5250,
 5251,
 5252,
 5253,
 5522,
 5523,
 5530,
 5531,
 5566,
 5567,
 5572,
 5573,
 5802,
 5803,
 5804,
 5805,
 5806,

In [15]:
collection.query(
    # query_texts='chmod 777',
	query_embeddings=embedding_vectors[4924],
    n_results=30,
    # 메타 필드 필터링
    where={
        "label": 1,
	},
    # 문서 내용 필터링
    # where_document={
    #     "$contains": "\n"
	# },
)

{'ids': [['2129',
   '2128',
   '2730',
   '2731',
   '3089',
   '3096',
   '3097',
   '3082',
   '3083',
   '3088',
   '4845',
   '4844',
   '4906',
   '4901',
   '4907',
   '4924',
   '4915',
   '4925',
   '4900',
   '4914',
   '5251',
   '5250',
   '6121',
   '6120',
   '6240',
   '6241',
   '6787',
   '6767',
   '6786',
   '6766']],
 'distances': [[0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0]],
 'metadatas': [[{'label': 1, 'signature': 'root,tftp,sh'},
   {'label': 1, 'signature': 'root,tftp,sh'},
   {'label': 1, 'signature': 'root,tftp,sh'},
   {'label': 1, 'signature': 'root,tftp,sh'},
   {'label': 1, 'signature': 'root,tftp,sh'},
   {'label': 1, 'signature': 'root,tftp,sh'},
   {'label': 1, 'signature': 'root,tftp,sh'},
   {'label': 1, 'signature': 'root,tftp,sh'},
   {'label': 1, 'signatur