In [155]:
import re
from os import listdir
from os.path import isfile, join
import csv
import os
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
train_dir = '/home/hoang/Downloads/train/'
files = [f for f in listdir(train_dir) if isfile(join(train_dir, f))]

In [126]:
class CSVProcessor:
    def __init__(self, fdir, fname):
        self.fdir = fdir
        self.fname = fname
        
    def process(self):
        line = 0
        df = pd.read_csv(filepath_or_buffer=os.path.join(self.fdir, self.fname))
        return df

In [172]:
proc = CSVProcessor('data/', 'trainLabels.csv')
df = proc.process()

data, _ = train_test_split(df, test_size=0.6, random_state=42) # train on 40% given data

train, test = train_test_split(data, test_size=0.2, random_state=42)

In [180]:
train_hashes = [x for x in train['Id']]

In [181]:
SECTION_PREF = [
    'HEADER:', '.text:', '.Pav:', '.idata', '.data', '.rdata', '.bss', '.edata:',
    '.rsrc:', '.tls', '.reloc:'
]

OP_INSTR = [
    'jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add',
    'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb'
]

KEY = ['.dll', 'std::', ':dword']

# important keyword to interact with stack and manipulate memories
MEM_KW = ['FUNCTION', 'call'] # memcpy_s and memmove_s usually comes after call

In [250]:
class Preprocess:
    def __init__(self, files, min_app=30, num_top_features=5000):
        self.files = files
        self.min_app = min_app
        self.num_top_features = num_top_features
        
    def genTokenAsm(self, file):
        asm = join(train_dir, file + ".asm")
        byte = join(train_dir, file + ".bytes")
        
        with open(asm, 'r', encoding='ISO-8859-1') as asmFile:
            tokens = re.sub(r'\t|\n|\r', ' ', asmFile.read()).split()
        
        filtered = []
        opList = []
        
        for i in range(1, len(tokens) - 1):
            if tokens[i] in OP_INSTR:
                filtered.append(tokens[i])
                opList.append(tokens[i])
            
            filtered += [p for p in SECTION_PREF if p in tokens[i]]
            filtered += [k for k in KEY if k in tokens[i]]
            filtered += [tokens[i] + ' ' + tokens[i + 1] for k in MEM_KW if k == tokens[i]]
            
            # memory and function call
            if tokens[i] == '__stdcall':
                bigram = tokens[i] + ' ' + tokens[i + 1].partition('(')[0]
                filtered.append(bigram)
                filtered.append(tokens[i - 1])
            
            # define bytes
            if tokens[i] == 'db' and tokens[i + 1][0] == "'":
                filtered.append(tokens[i] + ' ' + tokens[i + 1])
        
        counts = {}
        for i in filtered:
            counts[i] = counts.get(i, 0) + 1
        
        return counts
    
    def genBagOfWordsPerFile(self, tok_map, glob_dict):
        ret = {}
        for w in glob_dict:
            ret[w] = (tok_map.get(w, 0) * 1.0) / glob_dict[w]
        return ret
    
    def getBytesAndAsmSize(self, f):
        asm = join(train_dir, f + ".asm")
        b = join(train_dir, f + ".bytes")
        return {"asm": os.stat(asm).st_size, "bytes": os.stat(b).st_size}
                
    
    def process(self):
        glob_dict = {}
        asm_map = {}
        print("Start processing files")
        start = time.time()
        
        for f in self.files:
#             print("=> Generate token for", f)
            freq = self.genTokenAsm(f)
            asm_map[f + ".asm"] = freq
            for tok in freq:
                glob_dict[tok] = glob_dict.get(tok, 0) + freq[tok]
            
        glob_dict = {k:v for (k,v) in glob_dict.items() if v >= self.min_app}
        glob_features = list(glob_dict.keys())
        df = pd.DataFrame(columns=(['file'] + glob_features) + ['asm_sz', 'byte_sz'])
        
        # top features
        sz = min(len(glob_features), self.num_top_features)
        glob_features = sorted(glob_features, key=lambda x: -glob_dict[x])
        top_features = glob_features[:sz]
        
        new_glob_dict = {}
        for f in top_features:
            new_glob_dict[f] = glob_dict.get(f, 0)
            
        
        for f in self.files:
            bag = self.genBagOfWordsPerFile(asm_map[f + ".asm"], new_glob_dict)
            sz = self.getBytesAndAsmSize(f)
            
            bag['file'] = f
            bag['asm_sz'] = sz['asm']
            bag['byte_sz'] = sz['bytes']
            
            df = df.append(bag, ignore_index=True)
        print("Done processing files after", ((time.time() - start) * 1.0 / 60), "minutes")
        return (df, top_features)

In [262]:
pre = Preprocess(train_hashes[:100])
df, top = pre.process()

Start processing files
Done processing files after 5.815332126617432 minutes


In [263]:
df

Unnamed: 0,file,HEADER:,.text:,mov,retn,sub,xor,push,call,inc,...,call sub_431374,call sub_405670,call unknown_libname_2,call sub_10012C7B,call sub_10012C86,call sub_1000A353,call sub_1000AD1A,call sub_1000ACA6,asm_sz,byte_sz
0,BOEePDvjdtscGrZwKloT,0.012456,0.034425,0.043839,0.078269,0.043078,0.025914,0.081762,0.088353,0.065385,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,7174112,2197504
1,7vjsWlX3rQL1dNenwTxV,0.000000,0.043142,0.004774,0.011776,0.010944,0.002788,0.006123,0.006939,0.006408,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,19267478,9265152
2,41o3zDTReYVFtpOB7EWl,0.011460,0.000000,0.002496,0.020103,0.010844,0.003736,0.003865,0.002492,0.015046,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,5792578,2182656
3,IeyQM436SZoWp5ndGERg,0.010962,0.011814,0.012812,0.028305,0.010247,0.008675,0.035355,0.045871,0.014117,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,2495938,816640
4,AMmdfQwU8WeIgXBNhsjp,0.000000,0.001735,0.002311,0.003112,0.006019,0.001640,0.003051,0.003062,0.005851,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,139341125,13689856
5,hlFo3fprkSOzDI7tc8Xq,0.000000,0.040243,0.005147,0.012155,0.011391,0.003207,0.006380,0.007346,0.007523,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,17943169,8448512
6,HWgZ3UYk0mxrcE9Te8O6,0.011958,0.000700,0.000325,0.001935,0.002985,0.000911,0.002897,0.002883,0.000093,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,246138,8507904
7,7w5hNgKFji2kO6dCnMzJ,0.012456,0.002921,0.005555,0.000967,0.018107,0.000273,0.002816,0.003176,0.000557,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,965791,460288
8,1fMQAqC26xFJIgKpkrwS,0.011958,0.000470,0.000180,0.001093,0.001691,0.000875,0.001459,0.002557,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,147859,8507904
9,adS4KHMlGWqDIreO7T8n,0.011460,0.007047,0.005384,0.008117,0.007661,0.005194,0.024494,0.017739,0.020247,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1471366,727552


In [267]:
top

['.rdata',
 '.data',
 '.text:',
 'mov',
 'push',
 'nop',
 'imul',
 'call',
 'xor',
 'pop',
 'cmp',
 'add',
 '.idata',
 'jmp',
 'retn',
 'sub',
 'xchg',
 'inc',
 ':dword',
 'or',
 '.bss',
 'call dword',
 'dec',
 "db 'ÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌÌ'",
 'shl',
 'shr',
 'BOOL',
 'jnb',
 'retf',
 'HEADER:',
 'call eax',
 'int',
 'FUNCTION CHUNK',
 '.dll',
 'call esi',
 'call ??3@YAXPAX@Z',
 'call edi',
 'void',
 'call ??2@YAPAXI@Z',
 'call @__security_check_cookie@4',
 'DWORD',
 'call near',
 'std::',
 'call ebx',
 '__int32',
 'HANDLE',
 'call __EH_prolog',
 "db '-",
 'UINT',
 'HRESULT',
 'long',
 'LPVOID',
 'LSTATUS',
 "db '",
 'ror',
 'HMODULE',
 'call ds:lstrlenA',
 'rol',
 '{[thunk]:',
 'HWND',
 '.rsrc:',
 'call sub_418170',
 'call ebp',
 'LONG',
 'call _memset',
 'call _free',
 'call _atexit',
 'call ecx',
 'call __EH_epilog3',
 'MMRESULT',
 'call sub_1000E69C',
 'call ds:GetLastError',
 '.tls',
 'call __decode_pointer',
 'LPSTR',
 'call unknown_libna

In [265]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier(n_estimators=10, max_depth=8, max_features=int(np.sqrt(83)), n_jobs=3)

In [268]:
rfc.fit(df.drop(columns=['file']), train['Class'][:100])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=9, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [272]:
rfc.score(df.drop(columns=['file']), train['Class'][:100])

1.0

In [None]:
rfc = RandomForestClassifier(n_estimators=200, max_depth=8, max_features=int(np.sqrt(16590)), n_jobs=3)

In [None]:
rfc.fit(df.drop(columns=['file']), train['Class'])