In [None]:
import os, pickle
import pandas as pd
import numpy as np

In [None]:
% run FeatureTrace.ipynb
def getFeatureProfiles(filePaths):
    """Collect the feature trace of every file in ``filePaths``.

    Each path is handed to ``FeatureTrace`` (brought into scope by the
    ``% run FeatureTrace.ipynb`` line of this cell) and the result of its
    ``getTrace_noContainTS()`` call is stored under the file's base name.

    Args:
        filePaths: iterable of file paths.

    Returns:
        dict mapping ``os.path.basename(path)`` to that file's trace. Paths
        sharing a base name overwrite each other; the last one wins.
    """
    return {
        os.path.basename(path): FeatureTrace(path).getTrace_noContainTS()
        for path in filePaths
    }

In [None]:
% run Alignment_Fast3.ipynb
% run StructMatchGap3.ipynb
% run StageMatrix.ipynb
% run Motif.ipynb
% run OutputStage.ipynb
% run CommonMotifAnalysis_Tmp.ipynb

# Run a global alignment of two traces and compute their common motifs.
# Returns the common-motif dict.
def do_globalAlignment(rep1, rep2):
    """Align ``rep2`` against ``rep1`` and return their common-motif dict.

    ``rep1`` is the base trace: it is aligned with itself and with ``rep2``,
    and the two alignments are passed through ``structMatchGap``,
    ``stageMatrix``, ``Motif``, ``OutputStage`` and ``CommonMotif`` (all
    loaded by the ``% run`` lines of this cell).

    Args:
        rep1: base execution trace.
        rep2: execution trace compared against ``rep1``.

    Returns:
        The value of ``CommonMotif.getComMotifDict()``; per the existing note
        its shape is
        ``{'s<stage>_<motif>': [CMS], oriIdxRange1, oriIdxRange2}``.
    """
    baseName = "rep1"
    # Trailing arguments of pairwise_NW, passed through unchanged
    # (presumably the alignment scoring parameters -- TODO confirm).
    nwArgs = (2, -1, -3, 1)

    # Alignment: element [2] of each pairwise_NW result is kept per trace.
    alignments = {
        'rep1': pairwise_NW(rep1, rep1, *nwArgs)[2],
        'rep2': pairwise_NW(rep1, rep2, *nwArgs)[2],
    }

    # Derive the match matrix and gap list, then the stage matrix.
    matches, gaps = structMatchGap(alignments, baseName)
    stages = stageMatrix(matches, gaps)
    motifs = Motif(stages, baseName)
    stageOutput = OutputStage(stages, None, baseName, motifs)

    traces = {"rep1": rep1, "rep2": rep2}
    return CommonMotif(stages, motifs, traces, stageOutput).getComMotifDict()

In [None]:
def getNewCommonSeq(testFileTrace, modelSequence):
    """Return the common motif sequence shared by a test trace and a model.

    The two sequences are globally aligned with ``do_globalAlignment`` and
    the common motif sequence (element ``[0]`` of every entry) is concatenated
    stage by stage.

    Args:
        testFileTrace: trace of the file under test.
        modelSequence: model API sequence to compare against.

    Returns:
        A flat list of the common sequence items; empty when either input is
        empty/falsy (no alignment is run in that case).
    """
    if not (testFileTrace and modelSequence):
        return list()

    def stageNumber(motifKey):
        # keys look like 's<stage>_<motif>'; order by the numeric stage only
        return int(motifKey.split('_')[0][1:])

    motifsByKey = do_globalAlignment(testFileTrace, modelSequence)
    merged = list()
    for motifKey in sorted(motifsByKey.keys(), key=stageNumber):
        merged.extend(motifsByKey[motifKey][0])
    return merged

In [None]:
def doGSATesting(testFileTrace, modelSequence_dict):
    """Score one test trace against every model sequence.

    For each model type (visited in sorted key order) the score is the length
    of the common sequence returned by ``getNewCommonSeq``.

    Args:
        testFileTrace: trace of the file under test.
        modelSequence_dict: dict of model type -> model API sequence; must
            contain ``"red"``, ``"blue"``, ``"green"`` and ``"sys_change"``.

    Returns:
        ``[red, blue, green, sys_change]`` scores, in that order.

    Raises:
        KeyError: if one of the four expected model types is missing.
    """
    scoreByModel = {
        modelType: len(getNewCommonSeq(testFileTrace, modelSequence_dict[modelType]))
        for modelType in sorted(modelSequence_dict.keys())
    }
    return [scoreByModel[name] for name in ("red", "blue", "green", "sys_change")]

In [None]:
def readStageMotifFile(gsaResultPath):
    """Parse a GSA stage/motif CSV into motif and stage lookup tables.

    Layout, as this parser reads it (lines are 1-based and whitespace-stripped):

    * line 1: header; the number of ``,stage`` occurrences is the stage count;
    * the lines between the header and the line starting with ``idMotif``:
      one row per trace, where comma-separated field ``i`` holds the motif of
      stage ``i`` (field 0 is ignored);
    * the line right after the ``idMotif`` line is skipped;
    * every later line has its first ``stageCount + 1`` characters dropped.
      What remains either starts with ``M`` (``<motifName>,<firstAPI>``,
      opening a new motif) or is a continuation row whose first character is
      dropped and whose remainder is appended as one more API of the current
      motif. Rows with nothing left after the prefix are ignored.

    A motif whose first API is ``"="`` is reported as the gap motif.

    Args:
        gsaResultPath: path of the CSV file (read as UTF-8, BOM tolerated).

    Returns:
        tuple ``(motifAPI_dict, stageMotif_dict, gapMotifName)`` where
        ``motifAPI_dict`` maps motif name -> list of APIs,
        ``stageMotif_dict`` maps ``'stage<i>'`` -> list of motif names (one
        per trace row) and ``gapMotifName`` is the gap motif's name or
        ``None``.

    Raises:
        ValueError: if the file is empty, has no ``idMotif`` line, or an API
            continuation row appears before any motif row.
        IndexError: if a trace row has fewer fields than stages, or a motif
            row has no API field.
    """
    # Read the file once; the with-block closes the handle on exit.
    with open(gsaResultPath, 'r', encoding='utf-8-sig') as fHandle:
        lines = [line.strip() for line in fHandle]

    if not lines:
        raise ValueError("empty stage/motif file: {}".format(gsaResultPath))

    stageCount = len(lines[0].split(',stage')) - 1
    traceCount = None
    startMotifAPILines = None

    motifAPI_dict = dict()  # record what API in every motifs
    gapMotifName = None
    motifName = None

    for index, line in enumerate(lines[1:], start=2):
        if line.startswith("idMotif"):
            traceCount = index - 2
            startMotifAPILines = index + 2  # one line after the marker is skipped

        if startMotifAPILines is None or index < startMotifAPILines:
            continue

        body = line[stageCount + 1:]  # skip the leading commas
        if not body:
            # blank / padding-only row: nothing to record
            continue

        if body[0] == 'M':
            fields = body.split(',')  # motif name and its first API
            motifName = fields[0]
            api = fields[1]
            motifAPI_dict[motifName] = [api]
            if api == "=":
                gapMotifName = motifName
        else:
            if motifName is None:
                raise ValueError(
                    "line {} of {} continues a motif before any motif row".format(
                        index, gsaResultPath))
            motifAPI_dict[motifName].append(body[1:])  # drop the separating comma

    if traceCount is None:
        raise ValueError("no 'idMotif' line found in {}".format(gsaResultPath))

    # record what motif in every stages
    stageMotif_dict = {'stage' + str(stage): list() for stage in range(1, stageCount + 1)}
    for line in lines[1:traceCount + 1]:
        motifs = line.split(",")
        for stageIndex in range(1, stageCount + 1):
            stageMotif_dict['stage' + str(stageIndex)].append(motifs[stageIndex])

    return motifAPI_dict, stageMotif_dict, gapMotifName

In [None]:
from collections import Counter
import math

def getColorAPISequence(motifAPI_dict, stageMotif_dict, gapMotifName):
    """Build the red / blue / green API sequences from per-stage motif counts.

    The number of traces is taken from the length of
    ``stageMotif_dict['stage1']``. Stages are visited in numeric order and,
    within a stage, motifs in first-seen order; the gap motif is ignored. The
    first motif reaching a threshold is selected and the stage is done:

    * count equal to the trace total          -> red, blue and green (100% common)
    * count >= ceil(2/3 of the trace total)   -> blue and green (67% common)
    * count >= ceil(1/2 of the trace total)   -> green (50% common)

    Args:
        motifAPI_dict: motif name -> list of APIs.
        stageMotif_dict: ``'stage<i>'`` -> list of motif names, one per trace.
        gapMotifName: name of the motif to ignore (may be ``None``).

    Returns:
        dict with keys ``'red'``, ``'blue'``, ``'green'``; each value is the
        concatenation of the APIs of the motifs selected for that color.
    """
    traceTotal = len(stageMotif_dict['stage1'])
    redMin = traceTotal
    blueMin = math.ceil(traceTotal * 2 / 3)
    greenMin = math.ceil(traceTotal / 2)

    def stageNumber(stageName):
        return int(stageName.split('stage')[1])

    picked = {'red': [], 'blue': [], 'green': []}
    for stageName in sorted(stageMotif_dict.keys(), key=stageNumber):
        for motif, count in Counter(stageMotif_dict[stageName]).items():
            if motif == gapMotifName:
                continue
            if count == redMin:
                colors = ('red', 'blue', 'green')
            elif count >= blueMin:
                colors = ('blue', 'green')
            elif count >= greenMin:
                colors = ('green',)
            else:
                continue
            for color in colors:
                picked[color].append(motif)
            break  # at most one motif is taken per stage

    # Expand the selected motifs into their API lists, color by color.
    sequences = {}
    for color in ('red', 'blue', 'green'):
        apis = list()
        for motif in picked[color]:
            apis += motifAPI_dict[motif]
        sequences[color] = apis
    return sequences

In [None]:
def ruleset_filter(featuredAPISequence_list):
    """Drop noisy records from a featured API sequence.

    A record's API name is the text before its first ``'#'``. A record is
    removed when any of these rules applies:

    1. its API name is one of ``CloseHandle``, ``OpenThread``,
       ``RegOpenKey``, ``RegCloseKey``;
    2. it is identical to the most recently kept record;
    3. it is a ``LoadLibrary`` record whose library (third ``'@'``-separated
       field, compared case-insensitively) is in the frequently-used list or
       has already been kept earlier in the sequence.

    Args:
        featuredAPISequence_list: iterable of API record strings.

    Returns:
        list of the records that survive the rules, in their original order.

    Raises:
        IndexError: if a ``LoadLibrary`` record has fewer than three
            ``'@'``-separated fields.
    """
    filter_api = {'CloseHandle', 'OpenThread', 'RegOpenKey', 'RegCloseKey'}

    # Libraries considered already loaded; starts with the frequently used ones.
    loadedLibrary = {'imm32', 'lpk', 'gdi32', 'kernel32', 'ntdll', 'user32', 'comctl32'}

    filteredSequence = list()
    lastAPI = None  # last record that was kept
    for api in featuredAPISequence_list:
        apiName = api.split('#')[0]
        if apiName in filter_api:
            continue
        if lastAPI and api == lastAPI:
            continue  # consecutive repeat of the last kept record

        if apiName == "LoadLibrary":
            libName = api.split("@")[2].lower()
            if libName in loadedLibrary:
                continue
            # add() stores the whole name; update() would insert its
            # individual characters and never match the library again.
            loadedLibrary.add(libName)

        filteredSequence.append(api)
        lastAPI = api

    return filteredSequence

In [None]:
def sys_change_filter(featuredAPISeq_list):
    """Keep only the records of APIs listed as changing system state.

    A record's API name is the text before its first ``'#'``; the record is
    kept when that name is in the set below.

    Args:
        featuredAPISeq_list: iterable of API record strings.

    Returns:
        list of the kept records, in their original order.
    """
    sys_state_change_api = {'LoadLibrary', 'CreateProcess', 'OpenProcess', 'ExitProcess', 'WinExec',
                            'CreateRemoteThread', 'TerminateProcess', 'TerminateThread', 'CreateThread',
                            'CopyFile', 'CreateFile', 'WriteFile', 'DeleteFile',
                            'RegSetValue', 'RegCreateKey', 'RegDeleteKey', 'RegDeleteValue', 'WinHttpConnect'}

    # NOTE(review): every CreateFile record is kept, whatever its access flags.
    # If only write-mode opens (records containing "GENERIC_WRITE") should
    # count, remove 'CreateFile' from the set and test for that flag here.
    return [api for api in featuredAPISeq_list
            if api.split("#")[0] in sys_state_change_api]

In [None]:
def getTestingModelSeq(gsa_stage_motif_result, applyRulesetFilter=False):
    """Load the model API sequences used for testing from a stage/motif CSV.

    The file is parsed with ``readStageMotifFile`` and turned into the
    red / blue / green sequences by ``getColorAPISequence``. A
    ``'sys_change'`` sequence is then derived from the red one with
    ``sys_change_filter``.

    Args:
        gsa_stage_motif_result: path of the stage/motif CSV file.
        applyRulesetFilter: when True, every color sequence is first passed
            through ``ruleset_filter`` (and ``'sys_change'`` is derived from
            the filtered red sequence). The default, False, returns the
            unfiltered color sequences; this is the result the function has
            always produced, because the filtered values used to be computed
            and then immediately overwritten.

    Returns:
        dict with keys ``'red'``, ``'blue'``, ``'green'`` and
        ``'sys_change'``, each mapping to a list of API records.
    """
    motifAPI_dict, stageMotif_dict, gapMotifName = readStageMotifFile(gsa_stage_motif_result)
    colorAPISequence_dict = getColorAPISequence(motifAPI_dict, stageMotif_dict, gapMotifName)

    if applyRulesetFilter:
        modelSequence_dict = {color: ruleset_filter(commonAPISeq)
                              for color, commonAPISeq in colorAPISequence_dict.items()}
    else:
        modelSequence_dict = dict(colorAPISequence_dict)

    modelSequence_dict['sys_change'] = sys_change_filter(modelSequence_dict['red'])
    return modelSequence_dict

In [None]:
# Which trained GSA output to load: root directory, family and generation.
base_path = "output/GSA-11952/GSA_train_17family_withoutRasMMA/"
familyName = "ibryte"
gen = "main"

# Derived names: "<family>_0.8", "<family>_0.8_<gen>" and the matching directory.
familyTag = "".join([familyName, '_0.8'])
tag = "_".join([familyTag, gen])
dirPath = "".join([base_path, familyTag, '/', gen, '/'])

# NOTE(review): "_stageMoti.csv" may be a truncated "_stageMotif.csv" --
# confirm against the file names actually present on disk.
gsa_stage_motif_result = "".join([dirPath, tag, "_stageMoti.csv"])

# Model sequences (red / blue / green / sys_change) used by the test cells.
filtered_APISequence_dict = getTestingModelSeq(gsa_stage_motif_result)

In [None]:
# Report how many API records each model sequence holds.
for modelType, apiSequence in filtered_APISequence_dict.items():
    print(modelType, len(apiSequence))

In [None]:
# Test samples of the selected family / generation.
test_base_path = "11952data/test_17family/"
data_dir_path = test_base_path+familyName+'/'+gen+'/'
# os.listdir returns entries in arbitrary order; sort them so the traces
# (and the score rows derived from them) come out in a reproducible order.
testFilePaths = [data_dir_path+f for f in sorted(os.listdir(data_dir_path))]
testFile_featureTraces = getFeatureProfiles(testFilePaths)

# Empty score table; one row per test trace is added by the scoring cell.
df = pd.DataFrame(columns=["red", "blue", "green", "sys_change"])


In [None]:
# Score every test trace against the model sequences.
scoreRows = list()
for testFileName, testFileTrace in testFile_featureTraces.items():
    scoreRows.append(doGSATesting(testFileTrace, filtered_APISequence_dict))

# Build the table once instead of growing it row by row: this avoids the
# quadratic shift-and-sort per row and makes the cell safe to re-run (rows
# are no longer appended to the previous result). Rows are reversed so the
# most recently scored trace is row 0, matching the earlier
# insert-at-top ordering, with an integer index 0..n-1.
df = pd.DataFrame(scoreRows[::-1], columns=df.columns)

In [None]:
# Display the score table (columns: red, blue, green, sys_change; one row per test trace).
df