In [1]:
from sklearn.preprocessing import MinMaxScaler
import pefile
from capstone import *
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import os.path
from keras.models import load_model
from os import listdir
import csv
import gc

Using TensorFlow backend.


In [2]:
import json

In [3]:
def grams_extractor(file_path, grams, size=4):
    """Count occurrences of the given n-grams in a file.

    The file is read in consecutive, non-overlapping chunks of ``size`` bytes.
    Each chunk is hex-encoded with ``bytes.hex`` and counted when it is one of
    the requested ``grams``. A trailing chunk shorter than ``size`` is encoded
    as-is, so it only counts if ``grams`` contains that shorter string.

    Args:
        file_path: path of the file to scan (opened in binary mode).
        grams: iterable of hex strings to look for.
        size: chunk length in bytes.

    Returns:
        dict mapping every entry of ``grams`` (in the given order) to the
        number of chunks equal to it; 0 when it never occurs.
    """
    freq = dict.fromkeys(grams, 0)
    with open(file_path, 'rb') as fp:
        chunk = fp.read(size)
        while chunk:
            key = chunk.hex()
            # Dict membership is O(1); it replaces a bare ``except`` that was
            # used to skip chunks that are not requested grams.
            if key in freq:
                freq[key] += 1
            chunk = fp.read(size)
    return freq


def grams_rf(freq):
    """Convert absolute gram counts into relative frequencies, in place.

    Args:
        freq: dict mapping gram -> count. It is modified in place.

    Returns:
        The same dict object, each value divided by the sum of all values.
        When the sum is 0 (no gram was found, or the dict is empty) the dict
        is returned unchanged instead of raising ZeroDivisionError.
    """
    total = sum(freq.values())
    if total == 0:
        # Nothing to scale: every count is already zero.
        return freq
    for g in freq:
        freq[g] = freq[g] / total
    return freq


def grams_row(freq, grams):
    """Return the values of ``freq`` ordered as listed in ``grams``.

    Raises KeyError when a gram is missing from ``freq``.
    """
    return [freq[gram] for gram in grams]


def normalized_row(row):
    """Min-max scale the values of one row.

    The row is turned into a single column and a new ``MinMaxScaler`` is
    fitted on it, so the scaling is relative to this row only.

    Returns:
        The scaler output, a 2-D array with one row per input value.
    """
    as_column = np.array(row).reshape(-1, 1)
    return MinMaxScaler().fit_transform(as_column)


def extract_imports(file_path, dlls, functions):
    """Build a binary import-presence vector for a PE file.

    Args:
        file_path: path of the executable to parse with ``pefile``.
        dlls: known DLL names (lowercase), one feature each.
        functions: known ``dll + function`` keys (lowercase name, or the
            ordinal as a string for imports without a name), one feature each.

    Returns:
        A list of 0/1 flags: one per entry of ``functions`` followed by one
        per entry of ``dlls``, always ``len(functions) + len(dlls)`` long
        (assuming neither list has duplicates).
        The string ``'parsing error'`` when the file cannot be parsed, or
        ``'no imports'`` when the import directory is missing or unreadable.
    """
    functions_used = dict.fromkeys(functions, 0)
    dlls_used = dict.fromkeys(dlls, 0)
    try:
        exe = pefile.PE(file_path)
    except Exception:
        return 'parsing error'
    try:
        for entry in exe.DIRECTORY_ENTRY_IMPORT:
            dll = entry.dll.decode('utf-8').lower()
            # Only flag known DLLs. An unconditional assignment would add a
            # new key for every unknown DLL and change the vector length.
            if dll in dlls_used:
                dlls_used[dll] = 1
            for func in entry.imports:
                if func.name is not None:
                    key = dll + func.name.decode('utf-8').lower()
                else:
                    key = dll + str(func.ordinal)
                if key in functions_used:
                    functions_used[key] = 1
        return list(functions_used.values()) + list(dlls_used.values())
    except Exception:
        # Best effort: a missing import directory (AttributeError) or an
        # undecodable name is reported as "no imports".
        return 'no imports'
    finally:
        # Release the file handle / mapping held by pefile.
        exe.close()


def imports_json(file_path):
    """Collect the imports of a PE file as a JSON-friendly mapping.

    Args:
        file_path: path of the executable to parse with ``pefile``.

    Returns:
        dict mapping each imported DLL name (lowercase) to the list of
        imported function names (lowercase), or ordinals as strings for
        imports without a name. Import descriptors that repeat a DLL are
        merged into one list.
        ``{}`` when the import directory is missing or unreadable, and the
        string ``'parsing error'`` when the file cannot be parsed.
    """
    try:
        exe = pefile.PE(file_path)
    except Exception:
        return 'parsing error'
    imports = {}
    try:
        for entry in exe.DIRECTORY_ENTRY_IMPORT:
            dll = entry.dll.decode('utf-8').lower()
            # setdefault keeps names already collected for this DLL if it
            # shows up in more than one import descriptor.
            names = imports.setdefault(dll, [])
            for func in entry.imports:
                if func.name is not None:
                    names.append(func.name.decode('utf-8').lower())
                else:
                    names.append(str(func.ordinal))
        return imports
    except Exception:
        return {}
    finally:
        # Release the file handle / mapping held by pefile.
        exe.close()



def get_main_code_section(sections, base_of_code):
    """Pick the section that ``base_of_code`` points into.

    Args:
        sections: iterable of objects exposing a ``VirtualAddress`` attribute.
        base_of_code: address to locate.

    Returns:
        The first section whose ``VirtualAddress`` equals ``base_of_code``.
        Otherwise the section with the greatest ``VirtualAddress`` below
        ``base_of_code``, or ``None`` when there is no such section.

    The fallback compares addresses directly instead of indexing ``sections``
    with a position taken from a sorted address list, so it also works when
    the sections are not ordered by address. The section's size and
    characteristics are not checked.
    """
    closest = None
    for section in sections:
        address = section.VirtualAddress
        if address == base_of_code:
            return section
        if address < base_of_code and (closest is None or address > closest.VirtualAddress):
            closest = section
    return closest


# Instruction group -> mnemonics. Built once at import time instead of on
# every call. Group order matters: the first group that matches wins.
_INST_GROUPS = {
    # Conditional Data Transfer
    'cdt': ['cmove', 'cmovz', 'cmovne', 'cmovnz', 'cmova', 'cmovnbe', 'cmovae', 'cmovnb', 'cmovb',
            'cmovnae', 'cmovbe', 'cmovna', 'cmovg',
            'cmovnle', 'cmovge', 'cmovnl', 'cmovl', 'cmovnge', 'cmovle', 'cmovng',
            'cmovc', 'cmovnc', 'cmovo', 'cmovno', 'cmovs', 'cmovns', 'cmovp', 'cmovpe',
            'cmovnp', 'cmovpo', ],
    # Unconditional Data Transfer
    'udt': ['mov', 'xchg', 'bswap', 'movsx', 'movzx', 'movlps', 'movqda', 'lock xchg'],
    # Stack Data Transfer
    'sdt': ['push', 'pop', 'pusha', 'pushad', 'popa', 'popad', 'popal', 'pushal'],
    'adt': ['xadd'],

    # Compared Data Transfer
    'cmpdt': ['cmpxchg', 'cmpxchg8b', ],
    # Converting
    'cvt': ['cwd', 'cdq', 'cbw', 'cwde'],
    # Binary Arithmetic Instructions
    'bai': ['adcx', 'adox', 'add', 'adc', 'sub', 'sbb', 'imul', 'imulb', 'imulw', 'imull',
            'mul', 'mulb', 'mulw', 'mull', 'idiv', 'idivb', 'idivw', 'idivl',
            'div', 'inc', 'dec', 'neg', 'cmp', 'addb', 'addw', 'addl', 'adcb',
            'adcw', 'adcl', 'subb', 'subw', 'subl', 'sbbb', 'sbbw', 'sbbl',
            'cmpb', 'cmpw', 'cmpl', 'incb', 'incw', 'incl', 'decb', 'decw',
            'decl', 'negb', 'negw', 'negl', 'lock add', 'lock adc', 'lock sbb',
            'lock sub', 'lock neg', 'lock inc', 'lock dec'],
    # Integer Arithmetic Instructions
    'iai': ['fiadd', 'fiaddr', 'ficom', 'fidiv', 'fisub', 'fimul', 'ficomp', 'fisubr', 'fidivr', 'fimulr'],
    # Decimal Arithmetic Instructions
    'dai': ['daa', 'das', 'aaa', 'aas', 'aam', 'aad', ],
    # Float Arithmetic Instructions
    'fai': ['fabs', 'fadd', 'faddp', 'fchs', 'fdiv', 'fdivp', 'fdivr', 'fdivrp', 'fiadd',
            'fidiv', 'fidivr', 'fimul',
            'fisub', 'fisubr', 'fmul', 'fmulp', 'fprem', 'fprem1', 'frndint', 'fscale', 'fsqrt',
            'fsub', 'fsubp',
            'fsubr', 'fsubrp', 'fxtract'],
    # Float Comparison Instructions
    'fci': ['fcom', 'fcomi', 'fcomip', 'fcomp', 'fcompp', 'ftst', 'fucom',
            'fucomi', 'fucomip', 'fucomp', 'fucompp', 'fxam'],
    # Stack Arithmetic Instructions
    'sai': ['fsqrt', 'fscale', 'fprem', 'frndint', 'fxtract', 'fabs', 'fchs', ],
    # Logical Instructions
    'li': ['and', 'andb', 'andw', 'andl', 'or', 'orb', 'orw', 'orl', 'xor',
           'xorb', 'xorw', 'xorl', 'not', 'notb', 'notw', 'notl', 'lock or',
           'lock and', 'lock xor', 'lock not', ],
    # Shift Rotate Instructions
    'sri': ['sar', 'shr', 'sal', 'shl', 'shrd', 'shld', 'ror', 'rol', 'rcr', 'rcl',
            'sarb', 'sarw', 'sarl', 'salb', 'salw', 'sall', 'shrb', 'shrw', 'shrl',
            'shld', 'shldw', 'shldl', 'shrd', 'shrdw', 'shrdl', ],
    # Bit Instructions
    'bii': ['bt', 'bts', 'btr', 'btc', 'bsf', 'bsr', 'lock bt', 'lock bts',
            'lock btr', 'lockbtc'],
    # Byte Instructions
    'byi': ['sete', 'setz', 'setne', 'setnz', 'seta', 'setnbe', 'setae', 'setnb', 'setnc', 'setb', 'setnae',
            'setc', 'setbe', 'setna', 'setg', 'setnle', 'setge', 'setnl', 'setl', 'setnge', 'setle', 'setng',
            'sets', 'setns', 'seto', 'setno', 'setpe', 'setp', 'setpo', 'setnp', 'test', 'testb',
            'testw', 'testl', 'crc32', 'popcnt', ],
    # Conditional Jumping
    'cj': ['je', 'jz', 'jnz', 'jnz', 'ja', 'jnbe', 'jae', 'jnb', 'jb', 'jnae', 'jbe', 'jna', 'jg',
           'jnle', 'jge', 'jnl', 'jl', 'jnge', 'jle', 'jng', 'jc', 'jnc', 'jo', 'jno', 'js', 'jns',
           'jpo', 'jnp', 'jpe', 'jp', 'jcxz', 'jecxz', 'loopz', 'loope', 'loopnz', 'loopne', 'into',
           'jne'],
    # Unconditional Jumping/Looping
    'uj': ['jmp', 'loop', 'call', 'enter', 'leave', 'lcall', 'acall', 'ljmp', ],
    # Interruptions
    'int': ['ret', 'iret', 'retn', 'int', 'retf', 'hlt', 'iretd', ],
    # Strings Instructions
    'si': ['movs', 'movsb', 'movsw', 'movsd', 'cmps', 'cmpsb', 'cmpsw', 'cmpsd', 'scas',
           'scasb', 'scasw', 'scasd', 'lods', 'lodsb', 'lodsw', 'lodsd', 'rep', 'repe',
           'repz', 'repne', 'repnz', 'stos', 'stosd', 'stosb', 'stosw', 'stosl', ],
    # I/O Instructions
    'io': ['in', 'out', 'ins', 'insb', 'insw', 'insd', 'outs', 'outsb', 'outsw', 'outsd',
           'inb', 'inw', 'insl', 'outw', 'outsl', 'outl', ],
    # Flags
    'flg': ['stc', 'clc', 'cmc', 'cld', 'std', 'lahf', 'sahf', 'pushf', 'pushfd',
            'popf', 'popfd', 'sti', 'cli', 'popfw', 'popfl', 'pushfw', 'pushfl', 'salc'],
    # Segment Register Instructions
    'seg': ['lds', 'les', 'lfs', 'lgs', 'lss', ],
    # Miscellaneous
    'misc': ['lea', 'nop', 'ud', 'xlat', 'xlatb', 'cpuid', 'prefetchw', 'prefetchwt',
             'clflush', 'clflushopt', ],

    'sr': ['xsave', 'xsavec', 'xsaveopt', 'xrstor', 'xgetbv', ],

    'rng': ['rdrand', 'rdseed'],

    'arr': ['bound', 'boundb', 'boundw', 'boundl'],

    'pmi': ['sldt', 'str', 'lldt', 'ltr', 'verr', 'verw', 'sgdt', 'sidt',
            'smsw', 'lmsw', 'lar', 'lsl', 'clts', 'arpl', 'lgdt', 'lidt', ],

    'pci': ['frstor', 'finitfninit', 'finit', 'fnop', 'fsave', 'fnsave', 'fstcw',
            'fnstcw', 'fstenv', 'fnstenv', 'fstsw', 'fnstsw', 'fwait', 'wait',
            'fclex', 'fnclex', 'fdecstp', 'ffree', 'fincstp', 'pause', 'fclex',
            'fdecstp', 'ffree', 'fincstp', 'finit', 'fldcw', 'fldenv',
            'fnclex', 'fninit', 'fnop', 'fnsave', 'fnstcw', 'fnstenv',
            'fnstsw', 'frstor', 'fsave', 'fstcw', 'fstenv', 'fstsw', 'fwait',
            'rdtsc', 'fxrstor', 'fxsave', 'invd', 'winvd', ],
    # MMX Data Transfer
    'mmxt': ['movd', 'movq'],
    # MMX Conversion
    'mmxc': ['packssdw', 'packsswb', 'packuswb', 'punpckhbw', 'punpckhdq',
             'punpckhwd', 'punpcklbw', 'punpckldq', 'punpcklwd'],
    # MMX Arithmetic Instructions
    'mmxa': ['paddb', 'paddd', 'paddsb', 'paddsw', 'paddusb', 'paddusw', 'paddw', 'pmaddwd', 'pmulhw',
             'pmullw', 'psubb', 'psubd', 'psubsb', 'psubsw', 'psubusb', 'psubusw', 'psubw'],
    # MMX Comparison
    'mmxcmp': ['pcmpeqd', 'pcmpeqb', 'pcmpeqw', 'pcmpgtb', 'pcmpgtd', 'pcmpgtw'],
    # MMX Logical
    'mmxl': ['pand', 'pandn', 'por', 'pxor'],
    # MMX Shift Rotate Instructions
    'mmxsr': ['pslld', 'psllq', 'psllw', 'psrad', 'psraw', 'psrld', 'psrlq', 'psrlw'],
    # MMX State Management
    'mmxsm': ['emms'],
    # SSE Data Transfer
    'sset': ['movaps', 'movhlps', 'movhps', 'movlhps', 'movlps', 'movmskps', 'movss', 'movups'],
    # SSE Arithmetic Instructions
    'ssea': ['addps', 'addss', 'divps', 'divss', 'maxps', 'maxss', 'minps', 'minss', 'mulps',
             'mulss', 'rcpps', 'rcpss', 'rsqrtps', 'rsqrtss', 'sqrtps', 'sqrtss', 'subps', 'subss'],
    # SSE Comparison
    'ssecmp': ['cmpps', 'cmpss', 'comiss', 'ucomiss', ],
    # SSE Logical
    'ssel': ['andnps', 'andps', 'orps', 'xorps'],
    # SSE Shuffle Unpack
    'ssesu': ['shufps', 'unpckhps', 'unpcklps'],
    # SSE Conversion
    'ssecvt': ['cvtpi2ps', 'cvtps2pi', 'cvtsi2ss', 'cvtss2si', 'cvttps2pi', 'cvttss2si'],

    # Floating Data Transfer
    'fdt': ['fbld', 'fbstp', 'fcmovb', 'fcmovbe', 'fcmove', 'fcmovnb', 'fcmovnbe', 'fcmovne',
            'fcmovnu', 'fcmovu', 'fild', 'fist', 'fistp', 'fld', 'fst', 'fstp', 'fxch', 'fisttp', ],
    # Float Transcendental
    'ftrdt': ['f2xm1', 'fcos', 'fpatan', 'fptan', 'fsin', 'fsincos', 'fyl2x', 'fyl2xp1'],
    # Float Load constant
    'flc': ['fld1', 'fldl2e', 'fldl2t', 'fldlg2', 'fldln2', 'fldpi', 'fldz'],

    'tse': ['xabort', 'xbegin', 'xbeginl', 'xbeginw', 'xend', 'xtest'],

    'ssebi': ['pavgb', 'pavgw', 'pextrw', 'pinsrw', 'pmaxsw', 'pmaxub', 'pminsw',
              'pminub', 'pmovmskb',
              'pmulhuw', 'psadbw', 'pshufw', ],
    'vmx': ['invept', 'invvpid', 'vmcall', 'vmclear', 'vmfunc', 'vmlaunch', 'vmresume', 'vmptrld',
            'vmptrst', 'vmread', 'vmwrite', 'vmxoff', 'vmxon', ]
}

# Exact mnemonic -> group. Groups are visited in reverse so that, for a
# mnemonic listed in several groups, the earliest group is the one kept.
_INST_GROUP_OF = {
    mnemonic: group
    for group, mnemonics in reversed(list(_INST_GROUPS.items()))
    for mnemonic in mnemonics
}


def get_instruction_group(inst):
    """Map an instruction mnemonic to the short name of its instruction group.

    Args:
        inst: mnemonic string. When it contains a space (e.g. a prefix such
            as ``'rep movsb'``) only the second space-separated token is
            classified.

    Returns:
        A group key of the table above, or ``'other'``. Resolution order:
        1. any token containing ``'int'`` -> ``'int'``;
        2. exact match, first group in table order;
        3. substring match in either direction, first group in table order;
        4. ``'other'``.

    NOTE(review): step 1 also catches ``'frndint'`` and ``'into'``, although
    the table lists them under other groups. This is kept because changing
    it would change the sequences this function produces.
    """
    tokens = inst.split(' ')
    mnemonic = tokens[1] if len(tokens) > 1 else tokens[0]
    if 'int' in mnemonic:
        return 'int'
    exact = _INST_GROUP_OF.get(mnemonic)
    if exact is not None:
        return exact
    # Fuzzy fallback for mnemonics missing from the table.
    for group, members in _INST_GROUPS.items():
        for member in members:
            if mnemonic in member or member in mnemonic:
                return group
    return 'other'


def fine_disassemble(exe, depth=128000):
    """Disassemble the main code section into a run-length encoded group list.

    Args:
        exe: parsed PE object exposing ``sections``, ``OPTIONAL_HEADER`` and
            ``get_memory_mapped_image()``.
        depth: instruction budget. It is only checked between disassembly
            passes, so one pass may exceed it.

    Returns:
        A list starting with the marker ``'begin'``. Each later item is a
        group name (one instruction) or a ``(group, count)`` tuple for
        ``count`` consecutive instructions of the same group.

    Raises:
        AttributeError: when no main code section is found
            (``get_main_code_section`` returned ``None``).

    NOTE(review): ``begin``/``end`` are file offsets (``PointerToRawData``),
    but they slice the memory-mapped image, which pefile documents as laid
    out by virtual address. Confirm this is intended. It is left unchanged
    because it determines which bytes are disassembled.
    """
    main_code = get_main_code_section(exe.sections, exe.OPTIONAL_HEADER.BaseOfCode)
    md = Cs(CS_ARCH_X86, CS_MODE_32)
    md.detail = True
    begin = main_code.PointerToRawData
    end = begin + main_code.SizeOfRawData
    # The image does not change between passes; build it once.
    image = exe.get_memory_mapped_image()
    last_address = 0
    last_size = 0
    ins_count = 0
    sequence_of_groups = ['begin', ]
    while True:
        for i in md.disasm(image[begin:end], begin):
            group = get_instruction_group(i.mnemonic)
            previous = sequence_of_groups[-1]
            if previous == group:
                # Second instruction in a row: start a run.
                sequence_of_groups[-1] = (group, 2)
            elif previous[0] == group:
                # Extend an existing (group, count) run.
                sequence_of_groups[-1] = (group, previous[1] + 1)
            else:
                sequence_of_groups.append(group)
            last_address = int(i.address)
            last_size = i.size
            ins_count += 1
        # Resume one byte past the last decoded instruction. If this pass
        # decoded nothing, last_address/last_size are from an earlier pass.
        begin = max(int(last_address), begin) + last_size + 1
        if begin >= end:
            break
        if ins_count > depth:
            break
    return sequence_of_groups


def quick_disassemble(path, depth=128000):
    """Parse a PE file and return its instruction-group sequence, best effort.

    Args:
        path: path of the executable.
        depth: instruction budget forwarded to ``fine_disassemble``.

    Returns:
        The list produced by ``fine_disassemble``, or ``None`` when the file
        cannot be parsed or disassembly fails. ``KeyboardInterrupt`` and
        ``SystemExit`` are not swallowed.
    """
    try:
        exe = pefile.PE(path)
    except Exception:
        return None
    try:
        return fine_disassemble(exe, depth)
    except Exception:
        return None
    finally:
        # Release the file handle / mapping held by pefile.
        exe.close()


def extract_sequence(path):
    """Encode a file's instruction-group sequence as one-hot batches.

    The sequence from ``quick_disassemble`` is expanded (runs are repeated),
    one-hot encoded and cut into windows of 128 steps.

    Args:
        path: path of the executable.

    Returns:
        float32 array of shape ``(count // 128 + 1, 128, 49)``, where
        ``count`` is the number of instructions. Unused trailing steps are
        all zero, and when ``count`` is a multiple of 128 the last window is
        entirely zero. ``None`` when disassembly failed.

    The one-hot column of a label is its position in the lexicographically
    sorted label list. That is the category order a ``OneHotEncoder`` fitted
    on these labels produces, so the output matches the encoder-based
    version without refitting an encoder on every call.
    """
    labels = ["cdt", "udt", "sdt", "adt", "cmpdt", "cvt", "bai", "iai",
              "dai", "fai", "fci", "sai", "li", "sri", "bii", "byi",
              "cj", "uj", "int", "si", "io", "flg", "seg", "misc", "sr",
              "rng", "arr", "pmi", "pci", "mmxt", "mmxc", "mmxa",
              "mmxcmp", "mmxl", "mmxsr", "mmxsm", "sset", "ssea",
              "ssecmp", "ssel", "ssesu", "ssecvt", "fdt", "ftrdt", "flc",
              "tse", "ssebi", "vmx", "other"]
    steps = 128
    vect = len(labels)

    sequence = quick_disassemble(path)
    if sequence is None:
        return None

    column_of = {label: column for column, label in enumerate(sorted(labels))}

    # Expand the run-length encoding into one column index per instruction,
    # skipping the leading 'begin' marker.
    hot_columns = []
    for item in sequence[1:]:
        if isinstance(item, str):
            hot_columns.append(column_of[item])
        else:
            hot_columns.extend([column_of[item[0]]] * item[1])
    count = len(hot_columns)

    data_array = np.zeros((count // steps + 1, steps, vect), dtype='float32')
    # View the windows as one long run of steps and set a single 1 per step.
    flat = data_array.reshape(-1, vect)
    flat[np.arange(count), np.asarray(hot_columns, dtype=np.intp)] = 1.0
    return data_array


def extract_img(path, h=64, w=64):
    """Cut a file into distinct ``h`` x ``w`` single-channel byte images.

    The file is read in consecutive tiles of ``h * w`` bytes. A trailing tile
    shorter than that is dropped, and a tile identical to an earlier one is
    kept only once (first occurrence order is preserved).

    Args:
        path: file to read (opened in binary mode).
        h: image height in pixels.
        w: image width in pixels.

    Returns:
        float32 array of shape ``(n_images, h, w, 1)`` with byte values
        scaled to [0, 1]; ``n_images`` is 0 when the file holds no full tile.
    """
    tile_size = h * w
    seen = set()    # O(1) duplicate check instead of scanning the list
    images = []
    with open(path, 'rb') as img_set:
        img_arr = img_set.read(tile_size)
        while img_arr:
            if len(img_arr) == tile_size and img_arr not in seen:
                seen.add(img_arr)
                images.append(img_arr)
            img_arr = img_set.read(tile_size)
    if not images:
        return np.zeros(shape=(0, h, w, 1), dtype='float32')
    # Decode all tiles in one pass rather than converting each byte string
    # to a Python list first.
    pixels = np.frombuffer(b''.join(images), dtype=np.uint8)
    img_list = pixels.reshape(len(images), h, w, 1).astype('float32')
    img_list /= 255
    return img_list

In [4]:
def evaluate_g_encoder(row, columns, col_parts):
    """Encode a gram feature row with the eight partial gram encoders.

    Args:
        row: feature values, aligned with ``columns``.
        columns: column names of ``row``.
        col_parts: indexable by 0..7; ``col_parts[i]`` lists the column names
            fed to encoder ``i``.

    Returns:
        float array of shape ``(1, total_code_size)``: the eight encoder
        outputs concatenated in order.

    Raises:
        ValueError: when a name in ``col_parts`` is not in ``columns``.

    Uses the module-level ``models`` list; encoder ``i`` is ``models[i + 4]``.
    """
    # Index the columns once instead of calling columns.index() per feature.
    # setdefault keeps the first position of a repeated name, as index() does.
    position = {}
    for idx, name in enumerate(columns):
        position.setdefault(name, idx)

    encoded = []
    for i in range(8):
        enc = models[i + 4]
        part = col_parts[i]
        try:
            sub_row = [row[position[cp]] for cp in part]
        except KeyError as missing:
            raise ValueError('{!r} is not in list'.format(missing.args[0])) from None
        arr = np.zeros((1, len(sub_row)))
        arr[0] = np.array(sub_row)
        row_enc = enc.predict(arr)
        encoded.append(row_enc[0])
    arr_c = np.concatenate(encoded)
    arr_enc = np.zeros((1, arr_c.shape[0]))
    arr_enc[0] = arr_c
    gc.collect()
    return arr_enc


def evaluate_df_encoder(imports):
    """Encode an import-presence vector with the DLL/function encoder.

    Args:
        imports: sequence of numeric flags, as returned by
            ``extract_imports`` on success.

    Returns:
        The output of ``models[12].predict`` for a ``(1, len(imports))``
        float array.

    Raises:
        ValueError: when ``imports`` is a string, i.e. one of the failure
            markers ``extract_imports`` returns in place of a vector.

    Uses the module-level ``models`` list.
    """
    if isinstance(imports, str):
        # Fail with a readable message instead of a numpy conversion error.
        raise ValueError('no import feature vector available: {}'.format(imports))
    enc = models[12]
    arr = np.zeros((1, len(imports)))
    arr[0] = np.array(imports)
    return enc.predict(arr)


def joined_prediction(cnn, rnn, saeg, saei):
    """Placeholder: ignores all four arguments and returns a fixed score."""
    fixed_score = 0.756
    return fixed_score


def rectification(g_row, imports, sequence, imgs, grams_pre, imp_pre, seq_pre, cnn_pre):
    """Placeholder: ignores all eight arguments and returns a fixed value."""
    fixed_value = 0.1228
    return fixed_value


In [5]:
def load_static(paths):
    """Load every model file in ``paths`` with ``load_model``.

    A garbage collection pass runs first. The returned list has the same
    order as ``paths``.
    """
    gc.collect()
    return [load_model(model_path) for model_path in paths]

In [6]:
# Directory prefix for the four core model files joined with it below.
# An empty string makes os.path.join return the bare file name, i.e. a path
# relative to the current working directory.
core_models = ''

In [7]:
# Positions in this list are relied on elsewhere: the core models come first
# (indices 0-3), then the eight gram encoders (4-11), then the DLL/function
# encoder (12).
paths = [
    os.path.join(core_models, file_name)
    for file_name in ('cnn64.h5',
                      'func_dll_fnn.h5',
                      'grams_fnn_beta_1.h5',
                      'sequencer.h5')
]
paths.extend(
    os.path.join('encoders', file_name)
    for file_name in ('grams_encoder_part_0.h5',
                      'grams_encoder_part_1.h5',
                      'grams_encoder_part_2.h5',
                      'grams_encoder_part_3.h5',
                      'grams_encoder_part_4.h5',
                      'grams_encoder_part_5.h5',
                      'grams_encoder_part_6.h5',
                      'grams_encoder_part_7.h5',
                      'dllf_encoder_part_0.h5')
)
models = load_static(paths)



In [10]:
# Folder of samples to score; swap with the commented line to use the other set.
exe_path = "D:\\DATASETPFE\\samples\\malware\\2019_S1\\test"
#exe_path = "D:\\DATASETPFE\\samples\\legit\\newlegit"
exe_list = list(listdir(exe_path))

# Each CSV below contributes only its header row; the first and last header
# cells are dropped, leaving the feature names.
original_path = "D:\\new_legit_grams.csv"
with open(original_path, 'r') as grms:
    csv_reader = csv.reader(grms)
    columns = next(csv_reader)
del columns[0], columns[-1]

with open("D:\\benchmark\\dlls encoded\\Input\\dlls_legit.csv", 'r') as dlls:
    csv_reader = csv.reader(dlls)
    col_dlls = next(csv_reader)
del col_dlls[0], col_dlls[-1]

with open("D:\\benchmark\\functions encoded\\Input\\functions_legit.csv", 'r') as func:
    csv_reader = csv.reader(func)
    col_func = next(csv_reader)
del col_func[0], col_func[-1]

In [13]:
# Read the second line of the CSV into `row`: the first next() result is
# immediately overwritten by the second.
# NOTE(review): the scoring loop further down reassigns `row` for every
# sample, so this value does not appear to reach the models - confirm whether
# this cell is still needed.
path = "..\\grams encoded\\Input\\legit_grams_min_max.csv"
with open(path, 'r') as lgm:
    csv_reader = csv.reader(lgm)
    row = next(csv_reader)
    row = next(csv_reader)

In [15]:
# Drop the first and the last cell of `row`, in place. Re-running this cell
# strips two more cells each time.
del row[0], row[-1]

In [17]:
# The column partition for the gram encoders is the same for every sample,
# so it is loaded once instead of on every iteration.
with open("D:\\benchmark\\grams encoded\\Model\\grams_columns_parts.json", 'r') as gcp:
    columns_parts = json.load(gcp)

for f in exe_list:
    sample_path = os.path.join(exe_path, f)
    # ###Grams
    freq = grams_extractor(sample_path, columns)
    grams_freq = grams_rf(freq)
    row = grams_row(grams_freq, columns)
    # NOTE(review): norm_row is computed but never used; the encoder below is
    # fed the un-normalized `row`. Confirm which one the encoders expect.
    norm_row = [nr[0] for nr in normalized_row(row)]
    # ###Imports
    imports = extract_imports(sample_path, col_dlls, col_func)
    # ###disassemble
    sequence = extract_sequence(sample_path)
    # ###images
    img_list = extract_img(sample_path)

    # Skip samples for which a feature set could not be extracted, rather
    # than letting a model raise on a failure marker or an empty batch.
    if isinstance(imports, str):
        problem = imports
    elif sequence is None:
        problem = 'disassembly failed'
    elif len(img_list) == 0:
        problem = 'file shorter than one image tile'
    else:
        problem = None
    if problem is not None:
        print('skipped', f, '-', problem)
        print('---------------------------')
        continue

    encoded_grams = evaluate_g_encoder(row, columns, columns_parts)
    # Zero every code below twice the mean activation (vectorized, in place).
    mean = np.mean(encoded_grams[0])
    encoded_grams[0][encoded_grams[0] < mean * 2] = 0.
    encoded_imports = evaluate_df_encoder(imports)
    gc.collect()
    cnn_pre = models[0].predict(img_list)
    seq_pre = models[3].predict(sequence)
    grams_pre = models[2].predict(encoded_grams)
    imp_pre = models[1].predict(encoded_imports)
    print('cnn', np.mean(cnn_pre))
    print('rnn', np.mean(seq_pre))
    print('grm', grams_pre[0][0])
    print('imp', imp_pre[0][0])
    print('---------------------------')

cnn 0.8614947
rnn 0.8819734
grm 0.99999714
imp 0.81804675
---------------------------
cnn 0.80465096
rnn 0.99631596
grm 0.9999938
imp 0.91607773
---------------------------
cnn 0.3593918
rnn 0.76451737
grm 0.9999945
imp 0.08109633
---------------------------
cnn 0.7208931
rnn 0.34044728
grm 0.9999931
imp 1.4995454e-06
---------------------------


KeyboardInterrupt: 