In [1]:
import sys, os
sys.path.append('../GIPS/core')
from utils import AEchunking

import pickle
import numpy as np

import chromadb
from chromadb.utils import embedding_functions

from tqdm import tqdm
import pickle

### minhash를 임베딩된 벡터 보는 방법
- 벡터를 DB에 추가하는데 오랜 시간이 걸림
- 벡터마다 차원의 크기가 매우 크기(2 ^ 14 대략 16000) 때문에 문제가 생김

### chroma DB에서 기본값으로 사용하는 임베딩 방식 사용
- 기본 임베딩 방식: all-MiniLM-L6-v2

In [2]:
with open('../GIPS/iotpot/48.pkl', 'rb') as f:
    payloads = pickle.load(f)
    
print(len(payloads))

10122


### 데이터 전처리

In [3]:
cnk = AEchunking # 청킹 방법
window_size = 4 # 청킹 사이즈
emb = embedding_functions.DefaultEmbeddingFunction() # 임베딩 방법

meta = [] # 메타 정보를 담을 배열
# embedding_vectors = [] # 임베딩 벡터들을 담을 배열
with open('./embedding_vector/1.pkl', 'rb') as f:
    embedding_vectors = pickle.load(f)
indices = [] # 각 문서별 번호

cnt = 0 # 인덱스 0부터 시작

benign = 0
malware = 0
unknown = 0

for payload, label in tqdm(payloads):

	# 인덱스
	indices.append(f'{cnt}')
	cnt += 1

	# 청킹 임베딩
	# chunks = cnk(payload, W=window_size)
	# vectors = np.array(emb(chunks))
	# vec = vectors.sum(axis=0)
	# vec /= len(vectors)
	# embedding_vectors.append(list(vec))

	# 메타 데이터
	dic = dict()

	## 라벨이 뭐로 되어 있는지
	if "MALWARE" in label:
		dic['label'] = 1
		malware += 1
	elif "BENIGN"in label:
		dic['label'] = 0
		benign += 1
	else:
		dic['label'] = 2
		unknown += 1

	## 어떤 시그니처들을 가지고 있는지
	## 시그니처들의 값은 실제 데이터에서 추출 된 값들을 이용
	signature = ''
	if 'PING' in payload:
		signature += 'PING,'
	if 'PONG' in payload:
		signature += 'PONG,'
	if 'root' in payload:
		signature += 'root,'
	if 'Password' in payload:
		signature += 'Password,'
	if 'Login' in payload:
		signature += 'Login,'
	if 'tftp' in payload:
		signature += 'tftp,'
	if '.sh' in payload:
		signature += 'sh,'

	if len(signature) != 0:
		signature = signature[: -1]

	dic['signature'] = signature
	meta.append(dic)

print(f'benign: {benign}, malware: {malware}, unknown: {unknown}')
print(meta[: 5])

100%|██████████| 10122/10122 [00:00<00:00, 540322.31it/s]

benign: 2689, malware: 1184, unknown: 6249
[{'label': 0, 'signature': ''}, {'label': 0, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}]





In [4]:
# with open("./embedding_vector/1.pkl", 'wb') as f:
#     pickle.dump(embedding_vectors, f)

In [5]:
# 클라이언트 생성

client = chromadb.PersistentClient('./chroma')

In [6]:
# 컬렉션 생성 혹은 가져오기

collection = client.get_or_create_collection("default_DB")

In [7]:
# 컬렉션의 데이터 삽입

for i in tqdm(range(len(payloads))):
	collection.add(
    	ids=indices[i], # 인덱스
    	embeddings=embedding_vectors[i], # 임베딩
    	metadatas=meta[i], # 메타 정보
		documents=payloads[i][0], # 문서 원문
	)

100%|██████████| 10122/10122 [01:58<00:00, 85.35it/s] 


In [8]:
# 각 시그니처별 역색인

ping = []
pong = []
root = []
login = []
password = []
tftp = []
sh = []


for idx, data in enumerate(payloads):
    payload = data[0]
    label = data[1]
    if "MALWARE" not in label:
        continue
    if 'PING' in payload:
        ping.append(idx)
    if 'PONG' in payload:
        pong.append(idx)
    if 'root' in payload:
        root.append(idx)
    if 'Login' in payload:
        login.append(idx)
    if 'Password' in payload:
        password.append(idx)
    if 'tftp' in payload:
        tftp.append(idx)
    if '.sh' in payload:
        sh.append(idx)
        
print('ping:', len(ping), ping[: 5])
print('pong:', len(pong), pong[: 5])
print('root:', len(root), root[: 5])
print('login:', len(login), login)
print('password', len(password), password[: 5])
print('tftp', len(tftp), tftp[: 5])
print('sh', len(sh), sh[: 5])

ping: 0 []
pong: 748 [1884, 1885, 1912, 1913, 1942]
root: 290 [1900, 1901, 1906, 1907, 1930]
login: 0 []
password 0 []
tftp 44 [724, 725, 2128, 2129, 2730]
sh 42 [2128, 2129, 2730, 2731, 3082]


#### ping, login password를 포함하고 있는 공격 X
#### Pong, root는 공격이 너무 많아서 진행 X
#### sh는 전부 tftp를 포함

In [9]:
res_tftp = collection.query(
    query_embeddings=embedding_vectors[tftp[0]],
    n_results=len(tftp),
)

In [10]:
res_tftp

{'ids': [['725',
   '724',
   '923',
   '922',
   '210',
   '211',
   '1228',
   '1227',
   '1363',
   '1362',
   '1701',
   '1700',
   '418',
   '419',
   '9',
   '8',
   '1070',
   '1071',
   '1520',
   '1521',
   '564',
   '565',
   '1341',
   '1340',
   '1309',
   '1308',
   '1048',
   '1049',
   '656',
   '657',
   '516',
   '517',
   '1205',
   '1206',
   '344',
   '343',
   '704',
   '705',
   '140',
   '141',
   '1477',
   '1476',
   '1817',
   '1816']],
 'distances': [[0.0,
   0.0,
   0.0037391996011137962,
   0.0037391996011137962,
   0.0037755051162093878,
   0.0037755051162093878,
   0.004489046987146139,
   0.004489046987146139,
   0.004697577096521854,
   0.004697577096521854,
   0.005892964545637369,
   0.005892964545637369,
   0.005944156087934971,
   0.005944156087934971,
   0.006112964358180761,
   0.006112964358180761,
   0.006189025938510895,
   0.006189025938510895,
   0.0064215343445539474,
   0.0064215343445539474,
   0.00644228421151638,
   0.00644228421151638,


In [11]:
res_tftp_1 = collection.query(
    query_embeddings=embedding_vectors[tftp[0]],
    n_results=len(tftp),
    # 메타 필드 필터링
    where={
        "label": 1,
	},
    # 문서 내용 필터링
    # where_document={
    #     "$contains": "Content"
	# },
)

In [12]:
res_tftp_1

{'ids': [['725',
   '724',
   '923',
   '922',
   '210',
   '211',
   '1228',
   '1227',
   '1363',
   '1362',
   '1701',
   '1700',
   '418',
   '419',
   '9',
   '8',
   '1070',
   '1071',
   '1520',
   '1521',
   '564',
   '565',
   '2129',
   '2128',
   '2731',
   '2730',
   '3089',
   '3083',
   '3097',
   '3082',
   '3096',
   '3088',
   '4844',
   '4845',
   '4915',
   '4907',
   '4924',
   '4925',
   '4906',
   '4900',
   '4914',
   '4901',
   '5250',
   '5251']],
 'distances': [[0.0,
   0.0,
   0.0037391996011137962,
   0.0037391996011137962,
   0.0037755051162093878,
   0.0037755051162093878,
   0.004489046987146139,
   0.004489046987146139,
   0.004697577096521854,
   0.004697577096521854,
   0.005892964545637369,
   0.005892964545637369,
   0.005944156087934971,
   0.005944156087934971,
   0.006112964358180761,
   0.006112964358180761,
   0.006189025938510895,
   0.006189025938510895,
   0.0064215343445539474,
   0.0064215343445539474,
   0.00644228421151638,
   0.006442284

In [13]:
benign_ = collection.query(
    query_embeddings=embedding_vectors[1],
    n_results=len(tftp),
)

In [14]:
benign_

{'ids': [['0',
   '1',
   '1147',
   '1148',
   '1142',
   '1143',
   '1169',
   '1170',
   '1166',
   '1165',
   '1158',
   '1157',
   '1144',
   '1164',
   '1163',
   '1173',
   '1174',
   '1150',
   '1149',
   '1096',
   '1097',
   '1138',
   '1139',
   '1153',
   '1154',
   '1577',
   '1576',
   '3',
   '2',
   '952',
   '953',
   '1145',
   '1146',
   '1155',
   '1156',
   '1161',
   '1162',
   '1134',
   '1135',
   '1171',
   '1172',
   '423',
   '422',
   '204']],
 'distances': [[0.0,
   0.0,
   0.08863386511802673,
   0.08863386511802673,
   0.09819384664297104,
   0.0985032320022583,
   0.09877534955739975,
   0.09877534955739975,
   0.10213661938905716,
   0.10213661938905716,
   0.10252879559993744,
   0.10252879559993744,
   0.10554090887308121,
   0.10568691045045853,
   0.10568691045045853,
   0.10834715515375137,
   0.10834715515375137,
   0.10845577716827393,
   0.10845577716827393,
   0.11050014942884445,
   0.11050014942884445,
   0.1105693131685257,
   0.110569313168

In [15]:
collection_doc = client.get_or_create_collection("docDB")

In [16]:
# 컬렉션의 데이터 삽입

for i in tqdm(range(len(payloads))):
	collection_doc.add(
    	ids=indices[i], # 인덱스
    	metadatas=meta[i], # 메타 정보
		documents=payloads[i][0], # 문서 원문
	)

100%|██████████| 10122/10122 [05:41<00:00, 29.60it/s]


In [17]:
res_tftp_1_doc = collection_doc.query(
    query_texts=payloads[tftp[0]][0],
    n_results=len(tftp),
    where={
        "label": 1,
	},
)

In [18]:
res_tftp_1_doc

{'ids': [['725',
   '724',
   '1363',
   '1362',
   '1228',
   '1227',
   '923',
   '922',
   '1701',
   '1700',
   '564',
   '565',
   '1520',
   '1521',
   '418',
   '419',
   '210',
   '211',
   '9',
   '8',
   '1070',
   '1071',
   '2129',
   '2128',
   '2731',
   '2730',
   '3089',
   '3083',
   '3097',
   '3082',
   '3096',
   '3088',
   '4844',
   '4845',
   '4915',
   '4907',
   '4924',
   '4925',
   '4906',
   '4900',
   '4914',
   '4901',
   '5250',
   '5251']],
 'distances': [[0.0,
   0.0,
   0.23795321583747864,
   0.23795321583747864,
   0.3170430064201355,
   0.3170430064201355,
   0.402919203042984,
   0.402919203042984,
   0.4065503776073456,
   0.4065503776073456,
   0.4591413736343384,
   0.4591413736343384,
   0.49964264035224915,
   0.49964264035224915,
   0.5459277629852295,
   0.5459277629852295,
   0.5476492643356323,
   0.5476492643356323,
   0.5580764412879944,
   0.5580764412879944,
   0.6910387277603149,
   0.6910387277603149,
   0.8466423749923706,
   0.8466