# IDA Pro File Path

In [1]:
# Directory that holds the JSON reports; the later cells list it with
# os.listdir() and load each regular file with json.load().
dataPath = './ben_report/'

# Set to True to pickle the parsed result of every report to resultPath.
isSave = False

# Output directory for the pickled results. It is defined unconditionally so
# that switching isSave on later (without re-running this cell) cannot leave
# the save step referring to an undefined name.
resultPath = './ben_report_parsingResults/'

# Import the Libraries Used to Read and Parse the Assembly Files

In [2]:
import os
import json
import pickle
import tqdm

from pyparsing import Word, Optional, Suppress, alphas, alphanums, SkipTo, StringEnd, White

# Define the Parser with the Pyparsing Library

In [3]:
# Characters accepted inside one operand token: letters, digits, spaces and
# the punctuation "[]+-_.". Any other character (for example ',') ends the token.
operandChars = alphanums + ' []+-_.'

# Zero to three operands; the second and third must each follow a comma.
operandParser = (
    Optional(Word(operandChars)('operand1'))
    + Optional(Suppress(White()))
    + Optional(Suppress(Word(',')) + Word(operandChars)('operand2'))
    + Optional(Suppress(Word(',')) + Word(operandChars)('operand3'))
)

# The mnemonic starts with a letter and continues with letters or digits.
mnemonicParser = Word(alphas, alphanums)('mnemonic')

# Full instruction grammar: mnemonic, then the operand list.
parser = mnemonicParser + Optional(Suppress(Word(',') + White())) + operandParser

# Ignore everything from ';' to the end of the string
# (presumably trailing disassembly comments).
parser.ignore(';' + SkipTo(StringEnd()))

{{W:(ABCD..., ABCD...) [Suppress:({W:(,) <SP><TAB><CR><LF>})]} {{{[W:(ABCD...)] [Suppress:(<SP><TAB><CR><LF>)]} [{Suppress:(W:(,)) W:(ABCD...)}]} [{Suppress:(W:(,)) W:(ABCD...)}]}}

# Test Parser

In [5]:
# Smoke test: parse the first two functions of one report and print every
# original line next to its parsed [mnemonic, operand1, operand2, operand3] row.

# Take the first regular file in sorted order so the sample is reproducible
# (os.listdir() order is arbitrary and may also return sub-directories).
sampleFileName = next(
    name for name in sorted(os.listdir(dataPath))
    if os.path.isfile(os.path.join(dataPath, name))
)

with open(os.path.join(dataPath, sampleFileName)) as jsonFilePointer:
    # load assembly code from JSON file
    jsonAsmFile = json.load(jsonFilePointer)
asm = jsonAsmFile['asm']

# parsing
resultArray = []
for function in asm[:2]:
    for basicBlock in function:
        for line in basicBlock:
            result = ['', '', '', '']

            # searchString() returns an empty result when nothing in the line
            # matches; keep the all-empty row instead of raising IndexError.
            matches = parser.searchString(line)
            if matches:
                for i, e in enumerate(matches[0]):
                    # Operand tokens may carry trailing blanks because the
                    # operand character set contains ' '.
                    e = e.strip()
                    if e.isdigit():
                        e = int(e)
                    elif e[:1].isdigit() and e[-1] == 'h':  # hexadecimal, e.g. 14h / 0Ah
                        # Requiring a leading digit keeps registers such as
                        # 'ah', 'bh', 'ch', 'dh' from being read as hex numbers.
                        try:
                            e = int(e[:-1], 16)
                        except ValueError:
                            pass  # not a valid hex literal: keep the text

                    result[i] = e

            # Replace call targets and jump targets with fixed placeholders.
            if result[0] == 'call':
                result[1] = 'FTN'
            if result[0].startswith('j'):
                result[1] = 'ADR'

            resultArray.append(result)
            print('[ Origin Code ] ', end='')
            print(line)

            print('[ Parsed Code ] ', end='')
            print(result)
            print()

print('-------------------')
print(resultArray)

[ Origin Code ] xor     eax, eax
[ Parsed Code ] ['xor', 'eax', 'eax', '']

[ Origin Code ] cmp     dword ptr [ecx], 1
[ Parsed Code ] ['cmp', 'dword ptr [ecx]', 1, '']

[ Origin Code ] setz    al
[ Parsed Code ] ['setz', 'al', '', '']

[ Origin Code ] retn
[ Parsed Code ] ['retn', '', '', '']

[ Origin Code ] sub     esp, 14h
[ Parsed Code ] ['sub', 'esp', 20, '']

[ Origin Code ] push    ebp
[ Parsed Code ] ['push', 'ebp', '', '']

[ Origin Code ] mov     ebp, [esp+18h+arg_0]
[ Parsed Code ] ['mov', 'ebp', '[esp+18h+arg_0]', '']

[ Origin Code ] push    esi
[ Parsed Code ] ['push', 'esi', '', '']

[ Origin Code ] push    edi
[ Parsed Code ] ['push', 'edi', '', '']

[ Origin Code ] mov     edi, ecx
[ Parsed Code ] ['mov', 'edi', 'ecx', '']

[ Origin Code ] lea     esi, [esp+20h+var_10]
[ Parsed Code ] ['lea', 'esi', '[esp+20h+var_10]', '']

[ Origin Code ] mov     ecx, ebp
[ Parsed Code ] ['mov', 'ecx', 'ebp', '']

[ Origin Code ] call    sub_100BF580
[ Parsed Code ] ['call', 'FTN', '

# Parsing

In [None]:
def _parseAsmLine(line):
    """Parse one assembly line into [mnemonic, operand1, operand2, operand3].

    Missing fields stay as ''. Decimal tokens and hexadecimal tokens written
    as <digit...>h are converted to int. The first operand of a 'call' is
    replaced by 'FTN', and the first operand of any mnemonic starting with
    'j' is replaced by 'ADR'. A line the parser cannot match yields four
    empty strings.
    """
    result = ['', '', '', '']

    matches = parser.searchString(line)
    if matches:
        for i, token in enumerate(matches[0]):
            # Operand tokens may carry trailing blanks because the operand
            # character set contains ' '.
            token = token.strip()
            if token.isdigit():
                token = int(token)
            elif token[:1].isdigit() and token[-1] == 'h':  # hexadecimal
                # Requiring a leading digit keeps registers such as
                # 'ah', 'bh', 'ch', 'dh' from being read as hex numbers.
                try:
                    token = int(token[:-1], 16)
                except ValueError:
                    pass  # not a valid hex literal: keep the text
            result[i] = token

    if result[0] == 'call':
        result[1] = 'FTN'
    if result[0].startswith('j'):
        result[1] = 'ADR'
    return result


# Make sure the output directory exists before the first pickle is written.
if isSave:
    os.makedirs(resultPath, exist_ok=True)

for fileName in tqdm.tqdm(os.listdir(dataPath)):
    filePath = os.path.join(dataPath, fileName)
    if not os.path.isfile(filePath):
        continue

    with open(filePath) as jsonFilePointer:
        # load assembly code from JSON file
        jsonAsmFile = json.load(jsonFilePointer)
    asm = jsonAsmFile['asm']

    # parsing: one row per instruction line, flattened over functions and basic blocks
    resultArray = [
        _parseAsmLine(line)
        for function in asm
        for basicBlock in function
        for line in basicBlock
    ]

    # save result
    if isSave:
        # splitext() drops whatever extension the report has, instead of
        # assuming a five-character '.json' suffix.
        resultFileName = os.path.splitext(fileName)[0] + '.pickle'
        with open(os.path.join(resultPath, resultFileName), 'wb') as resultFilePointer:
            pickle.dump(resultArray, resultFilePointer)