In [1]:
import chromadb
import pickle
import numpy as np

from core.utils import AEchunking
from chromadb.utils import embedding_functions

from tqdm import tqdm
import pickle

### minhash를 임베딩 벡터로 보는 방법
- 벡터를 DB에 추가하는 데 오랜 시간이 걸림
- 벡터마다 차원의 크기가 매우 크기 때문에(2^14, 대략 16,000) 문제가 생김

### chroma DB에서 기본값으로 사용하는 임베딩 방식 사용
- 기본 임베딩 방식: all-MiniLM-L6-v2

In [2]:
# Read the pickled payload records from disk and report how many were loaded.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files from a trusted source.
with open('./iotpot/48.pkl', 'rb') as payload_file:
    payloads = pickle.load(payload_file)

print(len(payloads))

10122


In [3]:
# Load the previously pickled embedding vectors instead of recomputing them.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files from a trusted source.
with open('./embedding_vector/1.pkl', 'rb') as vector_file:
    embedding_vectors = pickle.load(vector_file)

### 데이터 전처리

In [120]:
cnk = AEchunking  # chunking strategy
window_size = 4  # chunk window size
emb = embedding_functions.DefaultEmbeddingFunction()  # embedding function

# (substring searched for in the payload, tag written to the metadata).
# The substrings are values that were extracted from the real data.
signature_table = (
    ('PING', 'PING'),
    ('PONG', 'PONG'),
    ('root', 'root'),
    ('Password', 'Password'),
    ('Login', 'Login'),
    ('tftp', 'tftp'),
    ('.sh', 'sh'),
)

meta = []  # one metadata dict per document
# embedding_vectors = []  # would collect one embedding per document
indices = []  # one string id per document, counting from '0'

# Number of documents per label code: [0 = benign, 1 = malware, 2 = other].
label_totals = [0, 0, 0]

for doc_no, (payload, label) in enumerate(tqdm(payloads)):

    # Document id
    indices.append(str(doc_no))

    # Chunk-averaged embedding, currently disabled (embedding_vectors is
    # loaded from a pickle file in an earlier cell instead):
    # chunks = cnk(payload, W=window_size)
    # vectors = np.array(emb(chunks))
    # vec = vectors.sum(axis=0)
    # vec /= len(vectors)
    # embedding_vectors.append(list(vec))

    # Map the label text to its integer code.
    if "MALWARE" in label:
        label_code = 1
    elif "BENIGN" in label:
        label_code = 0
    else:
        label_code = 2
    label_totals[label_code] += 1

    # Comma-separated tags of every signature found in the payload
    # (empty string when none match).
    found_tags = [tag for needle, tag in signature_table if needle in payload]
    meta.append({'label': label_code, 'signature': ','.join(found_tags)})

cnt = len(indices)  # number of ids handed out
benign, malware, unknown = label_totals

print(f'benign: {benign}, malware: {malware}, unknown: {unknown}')
print(meta[: 10])

100%|██████████| 10122/10122 [00:00<00:00, 460803.47it/s]

benign: 2689, malware: 1184, unknown: 0
[{'label': 0, 'signature': ''}, {'label': 0, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 1, 'signature': ''}, {'label': 1, 'signature': ''}]





In [5]:
# One-off cache write for embedding_vectors, left disabled. It targets the same
# path that an earlier cell reads back with pickle.load.
# with open("./embedding_vector/1.pkl", 'wb') as f:
#     pickle.dump(embedding_vectors, f)

In [6]:
# Create the Chroma client, backed by the ./vectorDB directory

client = chromadb.PersistentClient(path='./vectorDB')

In [7]:
# Fetch the collection, creating it first if it does not exist yet

collection = client.get_or_create_collection(name="default_DB")

In [13]:
# Insert the data into the collection (disabled; issues one add() call per
# document). NOTE(review): presumably disabled because the persisted
# collection already holds these records — confirm before re-enabling.

# for i in tqdm(range(len(payloads))):
# 	collection.add(
#     	ids=indices[i], # document id
#     	embeddings=embedding_vectors[i], # embedding
#     	metadatas=meta[i], # metadata
# 		documents=payloads[i][0], # original document text
# 	)

100%|██████████| 10122/10122 [01:49<00:00, 92.06it/s]


In [111]:
# Refine the labels: overwrite the stored metadata of every document.
#
# The records are sent in batches rather than one update() call per document,
# which removes thousands of individual calls. The batch size is deliberately
# conservative so that it should stay below Chroma's per-call batch limit.
UPDATE_BATCH_SIZE = 1000

# ids and metadata are matched by position, so the two lists must line up.
assert len(indices) == len(meta), "indices and meta must describe the same documents"

with tqdm(total=len(indices)) as progress:
    for start in range(0, len(indices), UPDATE_BATCH_SIZE):
        stop = start + UPDATE_BATCH_SIZE
        batch_ids = indices[start:stop]
        collection.update(
            ids=batch_ids,  # document ids
            metadatas=meta[start:stop],  # metadata for those ids
        )
        # Keep the progress bar counting documents, not batches.
        progress.update(len(batch_ids))

100%|██████████| 10122/10122 [01:28<00:00, 114.06it/s]


In [112]:
# Inverted index per signature: for each signature, the positions in
# `payloads` of the MALWARE-labelled records whose payload contains it.

ping, pong, root, login, password, tftp, sh = ([] for _ in range(7))

# (substring searched for in the payload, list collecting matching positions)
signature_buckets = (
    ('PING', ping),
    ('PONG', pong),
    ('root', root),
    ('Login', login),
    ('Password', password),
    ('tftp', tftp),
    ('.sh', sh),
)

for idx, (payload, label) in enumerate(payloads):
    # Only malware records are indexed.
    if "MALWARE" in label:
        for needle, bucket in signature_buckets:
            if needle in payload:
                bucket.append(idx)

# Report the hit count and the full position list for every signature.
for title, bucket in (
    ('ping:', ping),
    ('pong:', pong),
    ('root:', root),
    ('login:', login),
    ('password', password),
    ('tftp', tftp),
    ('sh', sh),
):
    print(title, len(bucket), bucket)

ping: 0 []
pong: 748 [1884, 1885, 1912, 1913, 1942, 1943, 1960, 1961, 1978, 1979, 1996, 1997, 2010, 2011, 2024, 2025, 2038, 2039, 2046, 2047, 2056, 2057, 2070, 2071, 2084, 2085, 2110, 2111, 2138, 2139, 2156, 2157, 2174, 2175, 2188, 2189, 2302, 2303, 2316, 2317, 2328, 2329, 2338, 2339, 2352, 2353, 2364, 2365, 2376, 2377, 2390, 2391, 2406, 2407, 2422, 2423, 2434, 2435, 2456, 2457, 2468, 2469, 2486, 2487, 2500, 2501, 2512, 2513, 2522, 2523, 2538, 2539, 2556, 2557, 2594, 2595, 2616, 2617, 2632, 2633, 2654, 2655, 2668, 2669, 2690, 2691, 2712, 2713, 2742, 2743, 2756, 2757, 2776, 2777, 2800, 2801, 2818, 2819, 2834, 2835, 2854, 2855, 2878, 2879, 2898, 2899, 2920, 2921, 2958, 2959, 2978, 2979, 2994, 2995, 3032, 3033, 3058, 3059, 3104, 3105, 3122, 3123, 3146, 3147, 3168, 3169, 3188, 3189, 3206, 3207, 3228, 3229, 3248, 3249, 3262, 3263, 3276, 3277, 3292, 3293, 3312, 3313, 3334, 3335, 3350, 3351, 3370, 3371, 3382, 3383, 3404, 3405, 3416, 3417, 3430, 3431, 3446, 3447, 3460, 3461, 3500, 3501, 3508, 

#### PING, Login, Password를 포함하고 있는 공격은 없음
#### PONG, root는 해당 공격이 너무 많아서 진행하지 않음
#### sh는 전부 tftp를 포함

In [113]:
# Query the collection with the embedding of the first record in the tftp
# index, asking for as many neighbours as that index has entries.
tftp_query_vector = embedding_vectors[tftp[0]]
tftp_hit_count = len(tftp)
res_tftp = collection.query(query_embeddings=tftp_query_vector, n_results=tftp_hit_count)

In [114]:
# Display the raw result of the unfiltered tftp query.
res_tftp

{'ids': [['724',
   '725',
   '922',
   '923',
   '210',
   '211',
   '1227',
   '1228',
   '1363',
   '1362',
   '1700',
   '1701',
   '419',
   '418',
   '9',
   '8',
   '1070',
   '1071',
   '1520',
   '1521',
   '565',
   '564',
   '1340',
   '1341',
   '1308',
   '1309',
   '1049',
   '1048',
   '657',
   '656',
   '516',
   '517',
   '1205',
   '1206',
   '343',
   '344',
   '705',
   '704',
   '140',
   '141',
   '1476',
   '1477',
   '1652',
   '1653']],
 'distances': [[0.0,
   0.0,
   0.0037391996011137962,
   0.0037391996011137962,
   0.0037755051162093878,
   0.0037755051162093878,
   0.004489046987146139,
   0.004489046987146139,
   0.004697577096521854,
   0.004697577096521854,
   0.005892964545637369,
   0.005892964545637369,
   0.005944156087934971,
   0.005944156087934971,
   0.006112964358180761,
   0.006112964358180761,
   0.006189025938510895,
   0.006189025938510895,
   0.0064215343445539474,
   0.0064215343445539474,
   0.00644228421151638,
   0.00644228421151638,


In [115]:
# Query seeded with the first record in the tftp index, restricted to records
# whose 'label' metadata equals 1 (the code the preprocessing cell assigns to
# MALWARE labels).
label_filter = {"label": 1}
res_tftp_1 = collection.query(
    query_embeddings=embedding_vectors[tftp[0]],
    n_results=len(tftp),
    where=label_filter,  # metadata field filter
    # Document-content filter (disabled):
    # where_document={
    #     "$contains": "Content"
    # },
)

In [116]:
# Display the raw result of the label-filtered tftp query.
res_tftp_1

{'ids': [['724',
   '725',
   '922',
   '923',
   '210',
   '211',
   '1227',
   '1228',
   '1363',
   '1362',
   '1700',
   '1701',
   '419',
   '418',
   '9',
   '8',
   '1070',
   '1071',
   '1520',
   '1521',
   '565',
   '564',
   '2129',
   '2128',
   '2730',
   '2731',
   '3089',
   '3096',
   '3097',
   '3082',
   '3083',
   '3088',
   '4845',
   '4844',
   '4906',
   '4901',
   '4907',
   '4924',
   '4915',
   '4925',
   '4900',
   '4914',
   '5251',
   '5250']],
 'distances': [[0.0,
   0.0,
   0.0037391996011137962,
   0.0037391996011137962,
   0.0037755051162093878,
   0.0037755051162093878,
   0.004489046987146139,
   0.004489046987146139,
   0.004697577096521854,
   0.004697577096521854,
   0.005892964545637369,
   0.005892964545637369,
   0.005944156087934971,
   0.005944156087934971,
   0.006112964358180761,
   0.006112964358180761,
   0.006189025938510895,
   0.006189025938510895,
   0.0064215343445539474,
   0.0064215343445539474,
   0.00644228421151638,
   0.006442284