In [155]:
import re
from os import listdir
from os.path import isfile, join
import csv
import os
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
train_dir = '/home/hoang/Downloads/train/'
files = [f for f in listdir(train_dir) if isfile(join(train_dir, f))]

In [126]:
class CSVProcessor:
    def __init__(self, fdir, fname):
        self.fdir = fdir
        self.fname = fname
        
    def process(self):
        line = 0
        df = pd.read_csv(filepath_or_buffer=os.path.join(self.fdir, self.fname))
        return df

In [172]:
proc = CSVProcessor('data/', 'trainLabels.csv')
df = proc.process()

data, _ = train_test_split(df, test_size=0.6, random_state=42) # train on 40% given data

train, test = train_test_split(data, test_size=0.2, random_state=42)

In [180]:
train_hashes = [x for x in train['Id']]

In [181]:
SECTION_PREF = [
    'HEADER:', '.text:', '.Pav:', '.idata', '.data', '.rdata', '.bss', '.edata:',
    '.rsrc:', '.tls', '.reloc:'
]

OP_INSTR = [
    'jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add',
    'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb'
]

KEY = ['.dll', 'std::', ':dword']

# important keyword to interact with stack and manipulate memories
MEM_KW = ['FUNCTION', 'call'] # memcpy_s and memmove_s usually comes after call

In [235]:
class Preprocess:
    def __init__(self, files, min_app=30, num_top_features=5000):
        self.files = files
        self.min_app = min_app
        self.num_top_features = num_top_features
        
    def genTokenAsm(self, file):
        asm = join(train_dir, file + ".asm")
        byte = join(train_dir, file + ".bytes")
        
        with open(asm, 'r', encoding='ISO-8859-1') as asmFile:
            tokens = re.sub(r'\t|\n|\r', ' ', asmFile.read()).split()
        
        filtered = []
        opList = []
        
        for i in range(1, len(tokens) - 1):
            if tokens[i] in OP_INSTR:
                filtered.append(tokens[i])
                opList.append(tokens[i])
            
            filtered += [p for p in SECTION_PREF if p in tokens[i]]
            filtered += [k for k in KEY if k in tokens[i]]
            filtered += [tokens[i] + ' ' + tokens[i + 1] for k in MEM_KW if k == tokens[i]]
            
            # memory and function call
            if tokens[i] == '__stdcall':
                bigram = tokens[i] + ' ' + tokens[i + 1].partition('(')[0]
                filtered.append(bigram)
                filtered.append(tokens[i - 1])
            
            # define bytes
            if tokens[i] == 'db' and tokens[i + 1][0] == "'":
                filtered.append(tokens[i] + ' ' + tokens[i + 1])
        
        counts = {}
        for i in filtered:
            counts[i] = counts.get(i, 0) + 1
        
        return counts
    
    def genBagOfWordsPerFile(self, tok_map, glob_dict):
        ret = {}
        for w in glob_dict:
            ret[w] = (tok_map.get(w, 0) * 1.0) / glob_dict[w]
        return ret
    
    def getBytesAndAsmSize(self, f):
        asm = join(train_dir, f + ".asm")
        b = join(train_dir, f + ".bytes")
        return {"asm": os.stat(asm).st_size, "bytes": os.stat(b).st_size}
                
    
    def process(self):
        glob_dict = {}
        asm_map = {}
        print("Start processing files")
        start = time.time()
        
        for f in self.files:
#             print("=> Generate token for", f)
            freq = self.genTokenAsm(f)
            asm_map[f + ".asm"] = freq
            for tok in freq:
                glob_dict[tok] = glob_dict.get(tok, 0) + freq[tok]
            
        glob_dict = {k:v for (k,v) in glob_dict.items() if v >= self.min_app}
        glob_features = list(glob_dict.keys())
        df = pd.DataFrame(columns=(['file'] + glob_features) + ['asm_sz', 'byte_sz'])
        
        # top features
        sz = min(len(glob_features), self.num_top_features)
        top_features = sorted(glob_features, key=lambda x: glob_dict[x])[:-sz]
        
        new_glob_dict = {}
        for f in top_features:
            new_glob_dict[f] = glob_dict[f]
            
        
        for f in self.files:
            bag = self.genBagOfWordsPerFile(asm_map[f + ".asm"], new_glob_dict)
            sz = self.getBytesAndAsmSize(f)
            
            bag['file'] = f
            bag['asm_sz'] = sz['asm']
            bag['byte_sz'] = sz['bytes']
            
            df = df.append(bag, ignore_index=True)
        print("Done processing files after", ((time.time() - start) * 1.0 / 60), "minutes")
        return (df, top_features)

In [236]:
pre = Preprocess(train_hashes[:1])
df, top = pre.process()

Start processing files
[] 48
Done processing files after 0.023972912629445394 minutes


In [226]:
df

Unnamed: 0,file,.text:,mov,retn,sub,xor,push,call,inc,pop,...,call sub_1000A460,call sub_10021C20,call __SEH_prolog,call __SEH_epilog,call sub_1002A4A9,BOOL,.dll,.idata,asm_sz,byte_sz
0,BOEePDvjdtscGrZwKloT,,,,,,,,,,...,,,,,,,,,7174112,2197504


In [191]:
scaler = StandardScaler()
d = df[['asm_sz', 'byte_sz']]
scaler.fit(d)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [192]:
scaler.transform(d)

array([[-0.29140642, -0.6766765 ],
       [ 0.20798436,  1.25115036],
       [-0.34845632, -0.68072656],
       ...,
       [-0.48985455, -1.07358203],
       [-0.10971433, -0.64022599],
       [-0.58058474,  1.04054743]])

In [193]:
df[['asm_sz', 'byte_sz']] = scaler.transform(d)

In [194]:
df

Unnamed: 0,file,HEADER:,.text:,mov,retn,sub,xor,push,call,call ds:GetThreadLocale,...,call sub_421484,call sub_422C23,call sub_422BBA,call sub_422C3B,call sub_42A2A0,call sub_41D279,call sub_426CC4,call sub_675029E0,asm_sz,byte_sz
0,BOEePDvjdtscGrZwKloT,0.000359,0.001114,1.378107e-03,0.002099,0.001086,0.000829,0.001838,0.001877,0.008065,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.291406,-0.676677
1,7vjsWlX3rQL1dNenwTxV,0.000000,0.001397,1.500718e-04,0.000316,0.000276,0.000089,0.000138,0.000147,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207984,1.251150
2,41o3zDTReYVFtpOB7EWl,0.000331,0.000000,7.845131e-05,0.000539,0.000273,0.000120,0.000087,0.000053,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.348456,-0.680727
3,IeyQM436SZoWp5ndGERg,0.000316,0.000382,4.027443e-04,0.000759,0.000258,0.000278,0.000795,0.000974,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.484590,-1.053332
4,AMmdfQwU8WeIgXBNhsjp,0.000000,0.000056,7.265543e-05,0.000083,0.000152,0.000052,0.000069,0.000065,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.166378,2.458067
5,hlFo3fprkSOzDI7tc8Xq,0.000000,0.001303,1.618015e-04,0.000326,0.000287,0.000103,0.000143,0.000156,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153298,1.028397
6,HWgZ3UYk0mxrcE9Te8O6,0.000345,0.000023,1.021178e-05,0.000052,0.000075,0.000029,0.000065,0.000061,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.577494,1.044597
7,7w5hNgKFji2kO6dCnMzJ,0.000359,0.000095,1.746352e-04,0.000026,0.000457,0.000009,0.000063,0.000067,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.547777,-1.150533
8,1fMQAqC26xFJIgKpkrwS,0.000345,0.000015,5.657878e-06,0.000029,0.000043,0.000028,0.000033,0.000054,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.581553,1.044597
9,adS4KHMlGWqDIreO7T8n,0.000331,0.000228,1.692534e-04,0.000218,0.000193,0.000166,0.000550,0.000377,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.526899,-1.077632


In [195]:
df.shape

(3477, 16590)

In [196]:
smaller_df = Preprocess(train_hashes[:10]).process()

Start processing files
Done processing files after 0.8766748348871867 minutes


In [197]:
smaller_df.shape

(10, 83)

In [206]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier(n_estimators=10, max_depth=8, max_features=int(np.sqrt(83)), n_jobs=3)

In [207]:
rfc.fit(smaller_df.drop(columns=['file']), train['Class'][:10])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=9, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [208]:
accuracy_score(rfc.predict(smaller_df.drop(columns=['file'])), train['Class'][:10])

1.0

In [211]:
rfc = RandomForestClassifier(n_estimators=200, max_depth=8, max_features=int(np.sqrt(16590)), n_jobs=3)

In [212]:
rfc.fit(df.drop(columns=['file']), train['Class'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=128, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [219]:
test.shape

(870, 2)