In [1]:
import re
from os import listdir
from os.path import isfile, join
import csv
import os
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Directory that holds the per-sample `<hash>.asm` / `<hash>.bytes` files.
# Set the TRAIN_DIR environment variable to point somewhere else; the default
# is the original local path, so existing setups behave exactly as before.
train_dir = os.environ.get('TRAIN_DIR', '/home/hoang/Downloads/train/')
# Names of the regular files located directly inside train_dir.
files = [f for f in listdir(train_dir) if isfile(join(train_dir, f))]

In [2]:
class CSVProcessor:
    """Load one CSV file, identified by a directory and a file name, as a DataFrame."""

    def __init__(self, fdir, fname):
        """Remember where the CSV lives.

        Args:
            fdir: Directory that contains the file.
            fname: Name of the file inside ``fdir``.
        """
        self.fdir = fdir
        self.fname = fname

    def process(self):
        """Read the CSV from disk and return it as a ``pandas.DataFrame``.

        The file is read again on every call; nothing is cached.
        """
        return pd.read_csv(filepath_or_buffer=os.path.join(self.fdir, self.fname))

In [3]:
# Load the label table from data/trainLabels.csv.
proc = CSVProcessor('data/', 'trainLabels.csv')
df = proc.process()

# Keep 40% of the labelled rows (test_size=0.6 sends the other 60% to the
# discarded `_`); the fixed random_state makes the split repeatable.
data, _ = train_test_split(df, test_size=0.6, random_state=42) # train on 40% given data

# Split the kept rows again: 80% become `train`, 20% become `test`.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [4]:
# Preview the label table that is already loaded in `df`; calling
# proc.process() here would re-read the CSV from disk just to display it.
df.head(10)

Unnamed: 0,Id,Class
0,01kcPWA9K2BOxQeS5Rju,1
1,04EjIdbPV5e1XroFOpiN,1
2,05EeG39MTRrI6VY21DPd,1
3,05rJTUWYAKNegBk2wE8X,1
4,0AnoOZDNbPXIr2MRBSCJ,1
5,0AwWs42SUQ19mI7eDcTC,1
6,0cH8YeO15ZywEhPrJvmj,1
7,0DNVFKwYlcjO7bTfJ5p1,1
8,0DqUX5rkg3IbMY6BLGCE,1
9,0eaNKwluUmkYdIvZ923c,1


In [5]:
# Sample ids and class labels of the training split as plain Python lists,
# both in the row order of `train`.
train_hashes = list(train['Id'])
train_class = list(train['Class'])

In [None]:
# Section-name markers. Preprocess.genTokenAsm records a marker once for every
# token that contains it.
# NOTE(review): the match is a substring test, so a token containing '.idata'
# or '.rdata' also contains '.data' - confirm the double count is intended.
SECTION_PREF = [
    'HEADER:',
    '.text:',
    '.Pav:',
    '.idata',
    '.data',
    '.rdata',
    '.bss',
    '.edata:',
    '.rsrc:',
    '.tls',
    '.reloc:',
]

# Instruction mnemonics that are counted when a token equals one of them exactly.
OP_INSTR = [
    'jmp', 'mov', 'retf', 'push', 'pop', 'xor',
    'retn', 'nop', 'sub', 'inc', 'dec', 'add',
    'imul', 'xchg', 'or', 'shr', 'cmp', 'call',
    'shl', 'ror', 'rol', 'jnb',
]

# Substrings that are recorded whenever they occur inside a token.
KEY = ['.dll', 'std::', ':dword']

# Keywords tied to stack interaction and memory manipulation. A token equal to
# one of these is recorded together with the token that follows it
# (memcpy_s and memmove_s usually come right after `call`).
MEM_KW = ['FUNCTION', 'call']

In [None]:
class Preprocess:
    """Turn disassembled samples into a table of normalised token frequencies.

    For every sample name in ``files`` the matching ``<name>.asm`` file is
    tokenised, selected tokens are counted, and each count is divided by that
    token's total count over all processed files. The token vocabularies come
    from the module-level ``OP_INSTR``, ``SECTION_PREF``, ``KEY`` and
    ``MEM_KW`` lists.
    """

    def __init__(self, files, min_app=30, num_top_features=5000, data_dir=None):
        """Store the run configuration.

        Args:
            files: Sample names without extension.
            min_app: Minimum total count (over all files) a token needs in
                order to be kept as a feature.
            num_top_features: Upper bound on the number of features kept,
                most frequent first.
            data_dir: Directory holding the ``.asm`` / ``.bytes`` files.
                When omitted, the notebook-level ``train_dir`` is used, which
                is what the class read implicitly before this parameter existed.
        """
        self.files = files
        self.min_app = min_app
        self.num_top_features = num_top_features
        self.data_dir = train_dir if data_dir is None else data_dir

    def genTokenAsm(self, file):
        """Count the feature tokens found in ``<file>.asm``.

        Args:
            file: Sample name without extension.

        Returns:
            dict mapping each recorded feature string to its number of
            occurrences in the file, in first-seen order.
        """
        asm = join(self.data_dir, file + ".asm")

        with open(asm, 'r', encoding='ISO-8859-1') as asmFile:
            # split() without arguments already breaks on tabs, newlines and
            # carriage returns, so no separate normalisation pass is needed.
            tokens = asmFile.read().split()

        filtered = []

        # The first and last token are skipped on purpose: the rules below
        # look at tokens[i - 1] and tokens[i + 1], which must both exist.
        for i in range(1, len(tokens) - 1):
            tok = tokens[i]
            nxt = tokens[i + 1]

            if tok in OP_INSTR:
                filtered.append(tok)

            filtered += [p for p in SECTION_PREF if p in tok]
            filtered += [k for k in KEY if k in tok]
            filtered += [tok + ' ' + nxt for k in MEM_KW if k == tok]

            # memory and function call: keep the callee name (text before the
            # first '(') and the token that precedes __stdcall
            if tok == '__stdcall':
                filtered.append(tok + ' ' + nxt.partition('(')[0])
                filtered.append(tokens[i - 1])

            # define bytes: only when the operand starts with a quote
            if tok == 'db' and nxt.startswith("'"):
                filtered.append(tok + ' ' + nxt)

        counts = {}
        for feat in filtered:
            counts[feat] = counts.get(feat, 0) + 1

        return counts

    def genBagOfWordsPerFile(self, tok_map, glob_dict):
        """Normalise one file's counts by the corpus-wide totals.

        Args:
            tok_map: Per-file counts as returned by ``genTokenAsm``.
            glob_dict: Total count of every kept feature over all files.

        Returns:
            dict with one entry per key of ``glob_dict``: the file's count
            (0 when absent) divided by the total.
        """
        return {w: (tok_map.get(w, 0) * 1.0) / total for w, total in glob_dict.items()}

    def getBytesAndAsmSize(self, f):
        """Return the on-disk sizes, in bytes, of ``<f>.asm`` and ``<f>.bytes``.

        Returns:
            dict with keys ``"asm"`` and ``"bytes"``.
        """
        asm = join(self.data_dir, f + ".asm")
        b = join(self.data_dir, f + ".bytes")
        return {"asm": os.stat(asm).st_size, "bytes": os.stat(b).st_size}

    def process(self):
        """Build the feature table for all configured files.

        Returns:
            ``(df, top_features)``. ``df`` has one row per file with the
            columns ``file``, one column per kept feature, ``asm_sz`` and
            ``byte_sz``. ``top_features`` lists the kept feature names, most
            frequent first.
        """
        glob_dict = {}
        asm_map = {}
        print("Start processing files")
        start = time.time()

        for f in self.files:
            freq = self.genTokenAsm(f)
            asm_map[f + ".asm"] = freq
            for tok, n in freq.items():
                glob_dict[tok] = glob_dict.get(tok, 0) + n

        # Drop tokens that are too rare over the whole corpus.
        glob_dict = {k: v for (k, v) in glob_dict.items() if v >= self.min_app}

        # top features (sorted() is stable, so ties keep first-seen order)
        sz = min(len(glob_dict), self.num_top_features)
        top_features = sorted(glob_dict, key=lambda x: -glob_dict[x])[:sz]
        new_glob_dict = {t: glob_dict[t] for t in top_features}

        # Only kept features become columns. Building the columns from the
        # unfiltered list left all-NaN columns whenever the top-N cut removed
        # something. First-seen order is preserved for the column layout.
        kept = set(top_features)
        feature_cols = [t for t in glob_dict if t in kept]
        columns = ['file'] + feature_cols + ['asm_sz', 'byte_sz']

        # Collect the rows first and build the frame once: DataFrame.append in
        # a loop is quadratic and no longer exists in pandas 2.x.
        records = []
        for f in self.files:
            bag = self.genBagOfWordsPerFile(asm_map[f + ".asm"], new_glob_dict)
            sizes = self.getBytesAndAsmSize(f)

            bag['file'] = f
            bag['asm_sz'] = sizes['asm']
            bag['byte_sz'] = sizes['bytes']

            records.append(bag)

        df = pd.DataFrame(records, columns=columns)
        print("Done processing files after", ((time.time() - start) * 1.0 / 60), "minutes")
        return (df, top_features)

In [None]:
# Build the feature table from the first 100 training samples, using the
# default min_app / num_top_features.
# Note: this rebinds `df`, which until this cell held the label table.
pre = Preprocess(train_hashes[:100])
df, top = pre.process()

In [None]:
# Display the feature table returned by Preprocess.process().
df

In [None]:
# Attach the class label as the last column. Plain assignment replaces
# df.insert(44, ...): it does not require the frame to have at least 44
# columns, and the cell can be re-run without raising "already exists".
df['label'] = train_class[:len(df)]

In [None]:
# Check the label column that was just attached to the feature table.
df['label']

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature column against the class label with the chi-squared
# statistic and select the 50 best-scoring ones. 'file' (an identifier) and
# 'label' (the target) are excluded from the inputs.
selector = SelectKBest(chi2, k=50).fit(df.drop(columns=['file', 'label']), df['label'])

In [None]:
# Only the last expression of a cell is displayed, so the get_params()
# result below is computed but not shown.
selector.get_params()
# Shape of the score array: one chi-squared score per input feature column.
selector.scores_.shape

In [None]:
# Names of the columns the selector was fitted on, in the same order as
# selector.scores_. They are derived with the same drop() that produced the
# fit input instead of slicing off the first and last column, which silently
# pairs names with the wrong scores whenever 'label' is not the last column.
c = df.drop(columns=['file', 'label']).columns
# Feature name -> chi-squared score.
d = dict(zip(c, selector.scores_))

In [None]:
# Number of feature names paired with a score.
len(c)

In [None]:
# Feature names ordered by descending score; show at most the first 100.
sorted(c, key=lambda x: -d[x])[:100]

In [None]:
# X_new is not assigned in any cell above, so this cell raised NameError on a
# fresh kernel. Build it from the fitted selector: the feature matrix reduced
# to the columns that SelectKBest kept.
X_new = selector.transform(df.drop(columns=['file', 'label']))
X_new.shape

In [None]:
# Kept feature names as returned by Preprocess.process(), most frequent first.
top

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier(n_estimators=10, max_depth=8, max_features=int(np.sqrt(83)), n_jobs=3)

In [None]:
rfc.fit(df.drop(columns=['file']), train['Class'][:100])

In [None]:
rfc.score(df.drop(columns=['file']), train['Class'][:100])

In [None]:
rfc = RandomForestClassifier(n_estimators=200, max_depth=8, max_features=int(np.sqrt(16590)), n_jobs=3)

In [None]:
rfc.fit(df.drop(columns=['file']), train['Class'])

In [6]:
from asm_proc import ASMProcess

In [7]:
# Configure the external ASMProcess helper for the first two training samples
# and their labels. This rebinds `proc`, which earlier held the CSVProcessor.
# NOTE(review): the trailing 30 and 10 look like min_app / num_top_features as
# in Preprocess - confirm against asm_proc.
proc = ASMProcess(train_dir, train_hashes[:2], train_class[:2], 30, 10)

In [8]:
# Run the external pipeline; on success this rebinds `df` and `top`.
# The recorded output for this cell shows NaN feature cells and ends with
# "ValueError: Input contains NaN ..." - TODO: handle the NaN columns
# (fill or drop) before they reach any scikit-learn step.
df, top = proc.process()

Start processing files
                   file    .text:       mov     retn  sub  xor      push  \
0  BOEePDvjdtscGrZwKloT  0.443811  0.901797  0.86922  NaN  NaN  0.930324   
1  7vjsWlX3rQL1dNenwTxV  0.556189  0.098203  0.13078  NaN  NaN  0.069676   

       call  inc       pop  ...    call __SEH_epilog  call sub_1002A4A9  BOOL  \
0  0.927179  NaN  0.913317  ...                  NaN                NaN   NaN   
1  0.072821  NaN  0.086683  ...                  NaN                NaN   NaN   

   .dll  .idata  call ??3@YAXPAX@Z  call unknown_libname_3    asm_sz  byte_sz  \
0   NaN     NaN                NaN                     NaN   7174112  2197504   
1   NaN     NaN                NaN                     NaN  19267478  9265152   

   label  
0      1  
1      2  

[2 rows x 55 columns]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').