In [1]:
import sys, os
sys.path.append('../GIPS/core')
from utils import AEchunking, minHash

import pickle
import numpy as np

import chromadb
from chromadb.utils import embedding_functions

from tqdm import tqdm
import pickle

### minhash를 임베딩된 벡터 보는 방법
- K = 100, M - K * 10

In [3]:
with open('../GIPS/iotpot/48.pkl', 'rb') as f:
    payloads = pickle.load(f)
    
print(len(payloads))

10122


### 임베딩방식
- minhash

In [4]:
def minhash_vector(payload, window_size, K, M):
    minhashed_virtual_vectors = [0 for _ in range(M)]

    chunks = AEchunking(payload, W=window_size)
    encode_pos = minHash(chunks, K) % M

    for i in encode_pos:
        minhashed_virtual_vectors[i] = 1

    return minhashed_virtual_vectors

### 데이터 전처리

In [5]:
cnk = AEchunking # 청킹 방법
window_size = 4 # 청킹 사이즈

meta = [] # 메타 정보를 담을 배열
# embedding_vectors = [] # 임베딩 벡터들을 담을 배열
with open('./embedding_vector/2.pkl', 'rb') as f:
	embedding_vectors  = pickle.load(f)
indices = [] # 각 문서별 번호

cnt = 0 # 인덱스 0부터 시작
vector_size = 100

benign = 0
malware = 0
unknown = 0

for payload, label in tqdm(payloads):

	# 인덱스
	indices.append(f'{cnt}')
	cnt += 1

	# 청킹 임베딩
	# vectors = minhash_vector(payload=payload, window_size=window_size, K=vector_size, M=vector_size * 10)
	# embedding_vectors.append(vectors)

	# 메타 데이터
	dic = dict()

	## 라벨이 뭐로 되어 있는지
	if "MALWARE" in label:
		dic['label'] = 1
		malware += 1
	elif "BENIGN"in label:
		dic['label'] = 0
		benign += 1
	else:
		dic['label'] = 2
		unknown += 1

	## 어떤 시그니처들을 가지고 있는지
	## 시그니처들의 값은 실제 데이터에서 추출 된 값들을 이용
	signature = ''
	if 'PING' in payload:
		signature += 'PING,'
	if 'PONG' in payload:
		signature += 'PONG,'
	if 'root' in payload:
		signature += 'root,'
	if 'Password' in payload:
		signature += 'Password,'
	if 'Login' in payload:
		signature += 'Login,'
	if 'tftp' in payload:
		signature += 'tftp,'
	if '.sh' in payload:
		signature += 'sh,'

	if len(signature) != 0:
		signature = signature[: -1]

	dic['signature'] = signature
	meta.append(dic)

print(f'benign: {benign}, malware: {malware}, unknown: {unknown}')
print(meta[: 5])

100%|██████████| 10122/10122 [00:00<00:00, 113381.97it/s]

benign: 2689, malware: 1184, unknown: 6249
[{'label': 0, 'signature': ''}, {'label': 0, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}, {'label': 2, 'signature': ''}]





In [9]:
# with open("./embedding_vector/2.pkl", 'wb') as f:
#     pickle.dump(embedding_vectors, f)

In [6]:
# 클라이언트 생성

client = chromadb.PersistentClient('./chroma')

In [7]:
# 컬렉션 생성 혹은 가져오기

collection = client.get_or_create_collection("minahsh_DB")

In [8]:
# 컬렉션의 데이터 삽입

for i in tqdm(range(len(payloads))):
	collection.add(
    	ids=indices[i], # 인덱스
    	embeddings=embedding_vectors[i], # 임베딩
    	metadatas=meta[i], # 메타 정보
		documents=payloads[i][0], # 문서 원문
	)

100%|██████████| 10122/10122 [01:56<00:00, 86.68it/s]


In [9]:
# 각 시그니처별 역색인

ping = []
pong = []
root = []
login = []
password = []
tftp = []
sh = []


for idx, data in enumerate(payloads):
    payload = data[0]
    label = data[1]
    if "MALWARE" not in label:
        continue
    if 'PING' in payload:
        ping.append(idx)
    if 'PONG' in payload:
        pong.append(idx)
    if 'root' in payload:
        root.append(idx)
    if 'Login' in payload:
        login.append(idx)
    if 'Password' in payload:
        password.append(idx)
    if 'tftp' in payload:
        tftp.append(idx)
    if '.sh' in payload:
        sh.append(idx)
        
print('ping:', len(ping), ping[: 5])
print('pong:', len(pong), pong[: 5])
print('root:', len(root), root[: 5])
print('login:', len(login), login[: 5])
print('password', len(password), password[: 5])
print('tftp', len(tftp), tftp[: 5])
print('sh', len(sh), sh[: 5])

ping: 0 []
pong: 748 [1884, 1885, 1912, 1913, 1942]
root: 290 [1900, 1901, 1906, 1907, 1930]
login: 0 []
password 0 []
tftp 44 [724, 725, 2128, 2129, 2730]
sh 42 [2128, 2129, 2730, 2731, 3082]


#### ping, login password를 포함하고 있는 공격 X
#### Pong, root는 공격이 너무 많아서 진행 X
#### sh는 전부 tftp를 포함

In [10]:
res_tftp = collection.query(
    query_embeddings=embedding_vectors[tftp[0]],
    n_results=len(tftp),
)

In [11]:
res_tftp

{'ids': [['724',
   '725',
   '923',
   '922',
   '9',
   '1070',
   '1071',
   '1701',
   '1700',
   '419',
   '418',
   '211',
   '210',
   '564',
   '565',
   '1520',
   '1521',
   '1363',
   '1362',
   '1228',
   '1227',
   '2449',
   '2448',
   '6171',
   '6170',
   '6729',
   '6728',
   '36',
   '1327',
   '1326',
   '4893',
   '4892',
   '4913',
   '4912',
   '539',
   '3469',
   '3468',
   '4397',
   '4396',
   '5885',
   '5884',
   '8805',
   '8804',
   '10045']],
 'distances': [[0.0,
   0.0,
   17.0,
   17.0,
   25.0,
   27.0,
   27.0,
   27.0,
   27.0,
   29.0,
   29.0,
   33.0,
   33.0,
   39.0,
   39.0,
   39.0,
   39.0,
   41.0,
   41.0,
   43.0,
   43.0,
   149.0,
   149.0,
   149.0,
   149.0,
   152.0,
   152.0,
   154.0,
   154.0,
   154.0,
   154.0,
   154.0,
   154.0,
   154.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0]],
 'metadatas': [[{'label': 1, 'signature': 'tftp'},
   {'label': 1, 'signature': 'tftp'},
 

In [12]:
res_tftp_1 = collection.query(
    query_embeddings=embedding_vectors[tftp[0]],
    n_results=len(tftp),
    # 메타 필드 필터링
    where={
        "label": 1,
	},
    # 문서 내용 필터링
    # where_document={
    #     "$contains": "Content"
	# },
)

In [13]:
res_tftp_1

{'ids': [['724',
   '725',
   '923',
   '922',
   '8',
   '9',
   '1070',
   '1071',
   '1701',
   '1700',
   '419',
   '418',
   '211',
   '210',
   '564',
   '565',
   '1520',
   '1521',
   '1363',
   '1362',
   '1228',
   '1227',
   '3635',
   '3634',
   '4189',
   '4188',
   '4572',
   '4573',
   '7412',
   '7413',
   '8993',
   '8992',
   '9078',
   '9079',
   '10033',
   '10032',
   '6769',
   '9322',
   '9323',
   '6980',
   '6981',
   '2943',
   '3069',
   '9197']],
 'distances': [[0.0,
   0.0,
   17.0,
   17.0,
   25.0,
   25.0,
   27.0,
   27.0,
   27.0,
   27.0,
   29.0,
   29.0,
   33.0,
   33.0,
   39.0,
   39.0,
   39.0,
   39.0,
   41.0,
   41.0,
   43.0,
   43.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   161.0,
   161.0,
   161.0,
   163.0,
   163.0,
   164.0,
   164.0,
   164.0]],
 'metadatas': [[{'label': 1, 'signature': 'tftp'},
   {'label': 1, 'signature': 'tftp'},
 

In [14]:
benign_ = collection.query(
    query_embeddings=embedding_vectors[1],
    n_results=len(tftp),
)

In [15]:
benign_

{'ids': [['0',
   '1',
   '691',
   '690',
   '3081',
   '3080',
   '9457',
   '9456',
   '10108',
   '10109',
   '424',
   '425',
   '776',
   '777',
   '3245',
   '3244',
   '921',
   '920',
   '1199',
   '1200',
   '2651',
   '2650',
   '33',
   '32',
   '753',
   '752',
   '1632',
   '1633',
   '2233',
   '2232',
   '3127',
   '3126',
   '8316',
   '8317',
   '9461',
   '9460',
   '263',
   '264',
   '561',
   '560',
   '881',
   '880',
   '1157',
   '1158']],
 'distances': [[0.0,
   0.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   155.0,
   157.0,
   157.0,
   157.0,
   157.0,
   157.0,
   157.0,
   157.0,
   157.0,
   158.0,
   158.0,
   158.0,
   158.0,
   158.0,
   158.0,
   159.0,
   159.0,
   159.0,
   159.0,
   159.0,
   159.0,
   159.0,
   159.0,
   159.0,
   159.0,
   159.0,
   159.0,
   159.0,
   159.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0,
   160.0]],
 'metadatas': [[{'label': 0, 'signature': ''},
   {'label': 0, 'signature': ''},

### 실험 결과
- 문자열이 조금 바뀐 경우에는 탐지가 잘되는 모습
- 하지만 같은 시그니처를 가졌지만 내용이 다른 경우에는 기본 임베딩 모델보다 탐지가 잘 안되는 모습을 보임