In [1]:
import pickle
import random

from GIPS.core.MV2 import MV2
from GIPS.core.JIG import JIG
from GIPS.core.SG2 import SG2

In [3]:
def GIPS(str_feature,
		 window_size, K, M, # MV2 파라미터
		 thetaJ,  # JIG 파라미터
		 vector_size, eps, minpts, ngram, hh1_size, hh2_size, hh3_size, ratio # SG2, AWL 파라미터
		):

	print(f'data no: {len(str_feature)}')

	str_feature = list(str_feature)
	for feature in str_feature:
		feature = list(feature)

	# 빅 그룹 식별
	minhashed_virtual_vectors = MV2(payloads=str_feature, window_size=window_size, K=K, M=M)

	big_group_indices = JIG(vectors=minhashed_virtual_vectors, thetaJ=thetaJ)
  
	
	big_group_payloads = []
	non_big_group_paylaods = []

	for idx, payload in enumerate(str_feature):
		if idx in big_group_indices:
			big_group_payloads.append(payload)
		else:
			non_big_group_paylaods.append(payload)
	
	# 시그니처 생성
	cluster_signatures = SG2(payloads=big_group_payloads, window_size=window_size, vector_size=vector_size, 
							 eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)
	
	signatures = set()
	for value_list in cluster_signatures.values():
		for i in value_list:
			signatures.add(i[0])

	res = list(signatures)

	return res

In [3]:
with open('./datasets/pkl/pe_string.pkl', 'rb') as f:
    malware_strings = pickle.load(f)
    
with open('./datasets/pkl/pe_benign_string.pkl', 'rb') as f:
    benign_strings = pickle.load(f)

In [4]:
# split train test

train_ratio = 0.8

random.Random(42).shuffle(malware_strings)

train_data = malware_strings[: int(len(malware_strings) * train_ratio)]
test_data = malware_strings[int(len(malware_strings) * train_ratio): ]

print(len(malware_strings), len(train_data), len(test_data))

29909 23927 5982


In [5]:
# parameters

K = 64
M = 2 ** 14
thetaJ = 0.6
window_size = 4
vector_size = 512
eps = 0.4
minpts = 5
ngram = 4
hh1_size = 5000
hh2_size = 5000
hh3_size = 5000
ratio = 0.8

In [6]:
signatures = GIPS(str_feature=train_data, window_size=window_size, K=K, M=M, thetaJ=thetaJ, vector_size=vector_size,
     eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio
)

data no: 23927
make minhashed vector


100%|██████████| 23927/23927 [05:24<00:00, 73.84it/s] 


checking big group


100%|██████████| 23927/23927 [02:31<00:00, 158.03it/s]


chunking


100%|██████████| 18601/18601 [00:53<00:00, 346.49it/s]


start DBSCAN
end DBSCAN
make signature


100%|██████████| 4/4 [00:32<00:00,  8.03s/it]

end signature





In [7]:
sig_set = set(signatures)
sig_set

{'        <requestedExecutionLevel level="asInvoker" uiAccess="false"></requestedExecutionLevel>',
 '      </requestedPrivileges>',
 '      <requestedPrivileges>',
 '    </security>',
 '    <security>',
 '  </trustInfo>',
 '  <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">',
 '!.7z{(',
 '!This is a PE executable',
 '!This program cannot be run in DOS mode.',
 '"#Z:[^',
 '#W"R%R\'',
 '#v5dBzf',
 '$ |o/~Ei',
 '$,<mpD',
 '%Tw{x;vA',
 '&"Se@E',
 '&)OMAr3',
 '&P1H@K',
 '&yjC)/',
 "'unqb%",
 "'x%Vm\\]~",
 '(q90/ ',
 ')D0R@;',
 ')U`<>:r',
 '+k9.3^',
 '.JA:J"&',
 '.NUa^D',
 '.idata',
 '.rdata',
 '.reloc',
 '.x/&137',
 '/catalog/b04.gz',
 '/class/VER103.gz',
 '/java/b04.gz',
 '/payment_gateway/heapid3.gz',
 '/sc/id3.gz',
 '/scripts/inst_VER103.gz',
 '0-[Ma!O',
 "0Kq'^[",
 '0V7&8,',
 '2Hxtrc',
 '2L#^Tq',
 '345678',
 '3yd~>q',
 '4/SH_h[p',
 '4DKNr2',
 '4FT!0L',
 '6SL"x*',
 '6{SU#Ko@!',
 '7087":gbBDFFY',
 '7087":gbICnyj',
 '7087":gbJNNqW',
 '7087":gbKkHQt',
 '7087":gbMMTKD',
 '7087":gbMeeSn',

In [8]:
res = []

for strings in test_data:
    cnt = 0
    for string in strings:
        if string in sig_set:
            cnt += 1
    res.append(cnt)
    
print(res[: 100])
print(sum(res) / len(res))

[10, 2, 74, 33, 49, 8, 19, 77, 86, 12, 7, 2, 19, 49, 55, 11, 16, 14, 39, 74, 13, 56, 4, 25, 41, 5, 5, 13, 29, 25, 6, 45, 11, 25, 49, 5, 28, 73, 13, 73, 83, 49, 37, 18, 32, 83, 77, 12, 89, 69, 44, 49, 42, 5, 5, 87, 7, 7, 24, 10, 4, 48, 19, 19, 7, 49, 103, 15, 55, 10, 75, 5, 78, 55, 9, 8, 47, 13, 45, 16, 64, 21, 107, 7, 8, 49, 55, 55, 74, 78, 29, 55, 45, 32, 64, 19, 74, 45, 5, 13]
34.32280173854898


In [9]:
benign = []

for strings in benign_strings:
    cnt = 0
    for string in strings:
        if string in sig_set:
            cnt += 1
    benign.append(cnt)
    
print(benign[: 100])
print(sum(benign) / len(benign))

[51, 30, 30, 17, 29, 11, 1, 29, 81, 11, 48, 47, 40, 60, 91, 74, 82, 57, 11, 30, 53, 14, 2, 16, 21, 10, 5, 28, 54, 69, 52, 87, 63, 12, 75, 1, 1, 7, 49, 1, 31, 8, 11, 44, 1, 65, 5, 58, 20, 54, 44, 17, 22, 91, 19, 51, 42, 50, 77, 24, 13, 78, 17, 9, 6, 47, 6, 54, 48, 13, 12, 42, 6, 87, 1, 84, 4, 14, 2, 54, 13, 3, 35, 43, 74, 32, 2, 85, 2, 74, 1, 45, 6, 50, 38, 20, 13, 22, 20, 9]
34.13263566150657


In [10]:
with open('./res/PE_benign_signature.pkl', 'rb') as f:
    white_list_dict = pickle.load(f)
    
white_list_dict

{0: [('String', 37705),
  ('Thread', 30087),
  ('.idata', 25927),
  ('System', 23398),
  ('      ', 18535),
  ('A_A^A]', 18314),
  ('.rdata', 18130),
  ('A^A]A\\', 17569),
  ('Exception', 17560),
  ('Object', 16632),
  ('UAVAWH', 15923),
  ('Delete', 15212),
  ('AUAVAWH', 14549),
  ('A_A^A]A\\_', 14242),
  ('GetCurrentProcess', 13693),
  ('Version', 13181),
  ('handle', 12310),
  ('Environment', 11557),
  ('Console', 11461),
  (' delete', 11316),
  ('UnhandledExceptionFilter', 11055),
  ('WATAUAVAWH', 10351),
  ('0123456789', 10338),
  ('Strings', 9610),
  ('string', 9553),
  ('abcdefghijklmnopqrstuvwxyz', 9216),
  ('InitializeCriticalSection', 9035),
  ('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 8595),
  ('FileName', 8594),
  ('ExitProcess', 8574),
  ('GetCurrentThread', 8133),
  ('GetWindow', 7878),
  ('A_A^A\\', 7852),
  ('connect', 7829),
  ('GetValue', 7692),
  ('length', 7363),
  ('!This program cannot be run in DOS mode.', 7281),
  ('.CRT$XIA', 7274),
  ('KERNEL32', 7201),
  ('Global', 7108)

In [11]:
white_list = set()

for key, value in white_list_dict.items():
    for string, cnt in value:
        white_list.add(string)
        
white_list

{'      ',
 '                          ',
 '      </requestedPrivileges>',
 '    </security>',
 '    <security>',
 '  </trustInfo>',
 ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~',
 ' !"#$%&\'()*+,-./0123456789:;<=>?@abcdefghijklmnopqrstuvwxyz[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~',
 " Base Class Array'",
 ' Base Class Descriptor at (',
 " Class Hierarchy Descriptor'",
 " Complete Object Locator'",
 " Type Descriptor'",
 ' delete',
 ' delete[]',
 ' new[]',
 ' using a Numara Software, Inc. product.',
 '!This program cannot be run in DOS mode.',
 '(DigiCert SHA2 Assured ID Code Signing CA',
 '(null)',
 '+\\$$Ff',
 '.00cfg',
 '.?AVbad_alloc@std@@',
 '.?AVexception@std@@',
 '.?AVtype_info@@',
 '.CRT$XCA',
 '.CRT$XCZ',
 '.CRT$XIA',
 '.CRT$XIZ',
 '.CRT$XPA',
 '.CRT$XPX',
 '.CRT$XPZ',
 '.CRT$XTA',
 '.CRT$XTZ',
 '.SEFCMD',
 '.idata',
 '.idata$2',
 '.idata$3',
 '.idata$4',
 '.idata$5',
 '.idata$6',
 '.idata$7',
 '.pdata',
 '.rdata',
 '.rdata$

In [12]:
sig_set = set()

for i in signatures:
    if i not in white_list:
        sig_set.add(i)

In [13]:
for i in range(5, 13):
	res = []
	TP = 0
	FP = 0

	print('N', i)

	for strings in test_data:
		cnt = 0
		for string in strings:
			if string in sig_set:
				cnt += 1
		if cnt > i:
			TP += 1
		else:
			FP += 1
		res.append(cnt)
		
	# print(sum(res) / len(res))
	print(f'TP: {TP}, FP: {FP}')

	benign = []

	TN = 0
	FN = 0

	for strings in benign_strings[: 5000]:
		cnt = 0
		for string in strings:
			if string in sig_set:
				cnt += 1		
		if cnt <= i:
			TN += 1
		else:
			FN += 1
		benign.append(cnt)
		
	# print(sum(benign) / len(benign))
	print(f'TN: {TN}, FN: {FN}')

	presion = round(TP / (TP + FP) , 4)
	recall = round(TP / (TP + FN), 4)

	print(f'presion: {presion}, recall: {recall}')
	print('f1-score:', round(2 * (presion * recall) / (presion + recall), 4))
	print()

N 5
TP: 4166, FP: 1816
TN: 2973, FN: 2027
presion: 0.6964, recall: 0.6727
f1-score: 0.6843

N 6
TP: 3921, FP: 2061
TN: 3106, FN: 1894
presion: 0.6555, recall: 0.6743
f1-score: 0.6648

N 7
TP: 3614, FP: 2368
TN: 3247, FN: 1753
presion: 0.6041, recall: 0.6734
f1-score: 0.6369

N 8
TP: 3456, FP: 2526
TN: 3345, FN: 1655
presion: 0.5777, recall: 0.6762
f1-score: 0.6231

N 9
TP: 3289, FP: 2693
TN: 3452, FN: 1548
presion: 0.5498, recall: 0.68
f1-score: 0.608

N 10
TP: 3177, FP: 2805
TN: 3545, FN: 1455
presion: 0.5311, recall: 0.6859
f1-score: 0.5987

N 11
TP: 2726, FP: 3256
TN: 3619, FN: 1381
presion: 0.4557, recall: 0.6637
f1-score: 0.5404

N 12
TP: 2681, FP: 3301
TN: 3672, FN: 1328
presion: 0.4482, recall: 0.6687
f1-score: 0.5367



# 문자열 추출 방법 개선

In [4]:
from GIPS.core.feature_extractor import extract_string_lower
import os
from tqdm import tqdm

In [5]:
malware_direc = './datasets/PE_malware/'
benign_direc = './datasets/ben/'

In [6]:
malware_path = os.listdir(malware_direc)

with open('./datasets/pkl/benign_md5.pkl', 'rb') as f:
    benign_path = pickle.load(f)

In [11]:
benign_strings = [] # 전체 스트링 저장
white_list = [] # train 화이트 리스트
benign_test = [] # test benign

for path_ in tqdm(benign_path[: int(len(benign_path) * 0.8)]):
	path = os.path.join(benign_direc, path_)
	
	res = extract_string_lower(path=path)
	benign_strings.append(res)
	white_list = white_list + list(res)

print(len(white_list))

with open('./datasets/pkl/white_list.pkl', 'wb') as f:
	pickle.dump(white_list, f)


for path_ in tqdm(benign_path[int(len(benign_path) * 0.8): ]):
	path = os.path.join(benign_direc, path_)
	
	res = extract_string_lower(path=path)
	benign_test.append(res)
	benign_strings.append(res)

with open('./datasets/pkl/benign_low_strings.pkl', 'wb') as f:
	pickle.dump(benign_strings, f)

 32%|███▏      | 7665/23927 [27:40<58:43,  4.62it/s]  


KeyboardInterrupt: 

In [22]:
malware_strings = [] # 전체스트링 저장
malware_train = [] # 실험 진행할 데이터
malware_test = [] # 테스트할 데이터
whitelist = set(white_list)

for path_ in tqdm(malware_path[: int(len(malware_path) * 0.8)]):
	path = os.path.join(malware_direc, path_)
	
	res = extract_string_lower(path=path)
	malware_strings.append(res)
	tmp = []
	for i in res:
		if i not in whitelist:
			tmp.append(i)
	malware_train.append(tmp)

print(len(white_list))


for path_ in tqdm(malware_path[int(len(malware_path) * 0.8): ]):
	path = os.path.join(malware_direc, path_)
	
	res = extract_string_lower(path=path)
	malware_strings.append(res)
	tmp = []
	for i in res:
		if i not in whitelist:
			tmp.append(i)
	malware_test.append(tmp)

with open('./datasets/pkl/malware_low_strings.pkl', 'wb') as f:
	pickle.dump(malware_strings, f)

100%|██████████| 29910/29910 [04:52<00:00, 102.14it/s]




In [24]:
# split train test

train_ratio = 0.8

random.Random(42).shuffle(malware_strings)

train_data = malware_strings[: int(len(malware_strings) * train_ratio)]
test_data = malware_strings[int(len(malware_strings) * train_ratio): ]

print(len(malware_strings), len(train_data), len(test_data))

29910 23928 5982


In [25]:
signatures = GIPS(str_feature=train_data, window_size=window_size, K=K, M=M, thetaJ=thetaJ, vector_size=vector_size,
     eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio
)

data no: 23928
make minhashed vector


100%|██████████| 23928/23928 [05:02<00:00, 79.12it/s] 


checking big group


100%|██████████| 23928/23928 [02:13<00:00, 178.97it/s]


chunking


100%|██████████| 18475/18475 [00:48<00:00, 378.93it/s]


start DBSCAN
end DBSCAN
make signature


100%|██████████| 3/3 [00:24<00:00,  8.03s/it]

end signature





In [26]:
sig_set = set(signatures)
sig_set

{'!this is a pe executable',
 '!this program cannot be run in dos mode.',
 '%tw{x;va',
 '.idata',
 '.rdata',
 '/catalog/b04.gz',
 '/class/ver103.gz',
 '/java/b04.gz',
 '/payment_gateway/heapid3.gz',
 '/sc/id3.gz',
 '/scripts/inst_ver103.gz',
 '345678',
 '</assembly>papaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpaddingpaddingxxpad',
 '</requestedprivileges>',
 '</security>',
 '</trustinfo>',
 '<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestversion="1.0">',
 '<requestedexecutionlevel level="asinvoker" uiaccess="false"></requestedexecutionlevel>',
 '<requestedprivileges>',
 '<security>',
 '<trustinfo xmlns="urn:schemas-microsoft-com:asm.v3">',
 '@.data',
 '`.data',
 '`.rdata',
 'active',
 'adjustwindowrectex',
 'advapi32',
 'advapi32.dll',
 'ageboxa',
 'application/*',
 'beginpaint',
 'buffer',
 'bvvcb.exe',
 'charnexta',
