## An implementation of RasMMA
### (Runtime API call sequence-based Motif Mining Algorithm)


In [None]:
import pickle
import os

In [None]:
% run Alignment_Fast3.ipynb

def do_globalAlignment(rep1, rep2):
    """Globally align two API sequences and return their common motif sequence.

    Both helpers used here (``globalAlign`` and ``motif_delimit``) are loaded
    from the ``Alignment_Fast3`` notebook.

    Args:
        rep1: First flattened API sequence.
        rep2: Second flattened API sequence.

    Returns:
        Whatever ``motif_delimit`` produces for the alignment result.
    """
    # NOTE(review): 10 / -1 / 0 are presumably the match / mismatch / gap
    # scores, and element [2] of the returned value the aligned common
    # sequence -- confirm against Alignment_Fast3.
    aligned = globalAlign(rep1, rep2, 10, -1, 0)
    return motif_delimit(aligned[2])

In [None]:
def removeDuplicateAPI(featureTrace):
    """Collapse runs of consecutive identical API entries into one entry.

    Args:
        featureTrace: Iterable of API call strings.

    Returns:
        A new list in which no two neighbouring entries are equal. Because
        the comparison starts from an empty string, a leading ``""`` entry
        is dropped as well.
    """
    deduped = []
    previous = ""
    for current in featureTrace:
        if previous == current:
            # Same call as the one just kept: skip the repetition.
            continue
        deduped.append(current)
        previous = current
    return deduped

def removeUnwantedAPI(featureTrace):
    """Filter uninteresting API calls out of a feature trace.

    Two kinds of entries are dropped:

    * calls whose API name (the text before the first ``#``) is in a fixed
      set of unwanted APIs;
    * ``LoadLibrary`` calls whose library (the third ``@``-separated field)
      is a frequently used system library, or was already loaded earlier in
      this same trace.

    Args:
        featureTrace: Iterable of API call strings.

    Returns:
        A new list with the remaining calls, in their original order.

    Raises:
        IndexError: If a ``LoadLibrary`` entry has fewer than three
            ``@``-separated fields.
    """
    result = []
    unwanted_api = {'CloseHandle', 'OpenThread', 'RegOpenKey', 'RegCloseKey'}
    # NOTE(review): 'advapi64' looks like it may be meant to be 'advapi32' -- confirm.
    frequently_used_lib = {'imm32', 'lpk', 'gdi32', 'kernel32', 'ntdll', 'user32', 'comctl32', 'advapi64'}

    for api in featureTrace:
        apiName = api.split('#')[0]

        if apiName == "LoadLibrary":
            libName = api.split("@")[2]
            if libName not in frequently_used_lib:
                # First load of this library: keep the call and remember the
                # library so later loads of it are dropped.
                result.append(api)
                # add(), not update(): update() would insert each character
                # of the name as a separate element.
                frequently_used_lib.add(libName)

        elif apiName not in unwanted_api:
            result.append(api)

    return result

In [None]:
% run FeatureTrace.ipynb

def initialCandidateDict(data_directory):
    """Initialize every trace as its own "to merge candidate" cluster.

    Each ``*.trace.hooklog`` file in ``data_directory`` is loaded through
    ``FeatureTrace`` (from the ``FeatureTrace`` notebook), cleaned with
    ``removeDuplicateAPI`` and ``removeUnwantedAPI``, and wrapped as a
    one-member cluster.

    Args:
        data_directory: Directory containing the hooklog files. A trailing
            path separator is optional.

    Returns:
        dict mapping an integer index ``i`` to ``(R, clusterMembers)`` where
        ``R = ("G<i>", [featureTrace])`` is the cluster representative and
        ``clusterMembers`` is a set holding the shortened trace name.
    """
    toMergeCandidate_Dict = dict()

    traceName_list = [f for f in os.listdir(data_directory) if f.endswith('.trace.hooklog')]
    for ft_count, traceName in enumerate(traceName_list):
        # os.path.join keeps this working whether or not data_directory
        # ends with a separator.
        tracePath = os.path.join(data_directory, traceName)
        featureTrace = FeatureTrace(tracePath).getTrace_noContainTS()
        featureTrace = removeDuplicateAPI(featureTrace)
        featureTrace = removeUnwantedAPI(featureTrace)

        clusterName = "G" + str(ft_count)
        # R = (clusterName, list of common motif sequences): the cluster representative.
        R = (clusterName, [featureTrace])
        clusterMembers = {shortenHooklogName(traceName)}

        toMergeCandidate_Dict[ft_count] = (R, clusterMembers)

    print("-- Finish Initializing --")
    return toMergeCandidate_Dict

In [None]:
def shortenHooklogName(traceName):
    """Shorten a hooklog file name to ``<first 6 characters>_<pid>``.

    The pid is the text between the first and second ``_`` of the name, cut
    at its first ``.``.

    Raises:
        IndexError: If ``traceName`` contains no underscore.
    """
    afterUnderscore = traceName.split("_")[1]
    pid = afterUnderscore.split(".")[0]
    return "_".join((traceName[0:6], pid))

In [None]:
def get_Rep_CommMotifSeq(Ri, Rj):
    """Compute the common motif sequence of two cluster representatives.

    Args:
        Ri: Representative tuple ``(clusterName, list of motifs)``.
        Rj: Representative tuple ``(clusterName, list of motifs)``.

    Returns:
        The common motif sequence produced by ``do_globalAlignment`` on the
        two flattened API sequences, or an empty list when either
        representative has no APIs (nothing can be aligned).
    """
    # Flatten each representative's motifs into one API sequence.
    rep1 = [api for motif in Ri[1] for api in motif]
    rep2 = [api for motif in Rj[1] for api in motif]

    if not (rep1 and rep2):
        # Previously this path hit an unbound local and raised
        # UnboundLocalError; an empty result is scored 0 by compute_Score.
        return []
    return do_globalAlignment(rep1, rep2)

In [None]:
def getInitialDict(toMergeCandidateDict):
    """Summarize the initial clusters.

    Args:
        toMergeCandidateDict: dict whose values are
            ``((clusterName, list of motifs), memberSet)``.

    Returns:
        ``{clusterName: (originalName, initialLength)}`` where
        ``originalName`` is one element of the cluster's member set and
        ``initialLength`` is the total number of entries across the
        representative's motifs.
    """
    initialDict = {}
    for candidate in toMergeCandidateDict.values():
        representative, members = candidate[0], candidate[1]
        initialLen = len([api for motif in representative[1] for api in motif])
        # Peek at one member: take it out of the set and put it straight back.
        originalName = members.pop()
        members.add(originalName)
        initialDict[representative[0]] = (originalName, initialLen)
    return initialDict

In [None]:
def getInitialNameDict(initialDict):
    """Project the initial dict down to the original names only.

    Args:
        initialDict: ``{clusterName: (originalName, initialLength)}``.

    Returns:
        ``{clusterName: originalName}``.
    """
    return {clusterName: info[0] for clusterName, info in initialDict.items()}

In [None]:
def compute_Score(Ri, Rj, Rnew):
    """Score a merged representative by its length ratio to the originals.

    The score is the flattened length of ``Rnew`` divided by the longer of
    the flattened lengths of ``Ri`` and ``Rj``.

    Args:
        Ri: Representative tuple ``(name, list of motifs)``.
        Rj: Representative tuple ``(name, list of motifs)``.
        Rnew: Candidate merged representative, same layout.

    Returns:
        The ratio as a float, or the integer 0 when ``Rnew`` has no motifs.
    """
    if not Rnew[1]:
        return 0

    def flattened_length(rep):
        # Count every entry of every motif in the representative.
        return len([api for motif in rep[1] for api in motif])

    longest_original = max(flattened_length(Ri), flattened_length(Rj))
    return float(flattened_length(Rnew)) / longest_original

In [None]:
def findMergeCandidateScoreList(toMergeCandidateDict, generatedSeqNum):
    """Score every pair of candidate clusters for a single iteration.

    Args:
        toMergeCandidateDict: dict whose values are
            ``(R, memberSet)`` with ``R = (clusterName, list of motifs)``.
        generatedSeqNum: Number used to build the temporary name
            ``"G<generatedSeqNum>"`` given to every candidate merge result.

    Returns:
        A list of ``(score, Rnew, Ri_name, Rj_name)`` tuples, one per
        unordered pair of candidates, sorted from highest to lowest score
        (ties keep pair-generation order). Empty when there are fewer than
        two candidates.
    """
    from itertools import combinations

    clusterTempName = "G" + str(generatedSeqNum)
    # R is a tuple like ('G0', [['A#A', 'C#C'], ['MMM']])
    representatives = [value[0] for value in toMergeCandidateDict.values()]

    scoreList = []
    # combinations() yields pairs in the same (i, j) order with i < j that a
    # nested index loop would.
    for Ri, Rj in combinations(representatives, 2):
        repNew_CMS = get_Rep_CommMotifSeq(Ri, Rj)  # common motif seq. of the pair
        Rnew = (clusterTempName, repNew_CMS)
        score = compute_Score(Ri, Rj, Rnew)
        scoreList.append((score, Rnew, Ri[0], Rj[0]))

    if scoreList:
        scoreList.sort(key=lambda tup: tup[0], reverse=True)  # biggest score first
        print("ScoreList Length in method : ", len(scoreList))
    else:
        print("No any merge candidate")

    return scoreList

In [None]:
def checkExactlySameCandidates(scoreList):
    """Group pairs that scored exactly 1.0 into shared member sets.

    Args:
        scoreList: list of ``(score, Rnew, Ri_name, Rj_name)`` tuples.

    Returns:
        list of ``(score, Rnew, memberSet)`` tuples. A pair with score 1.0
        that shares a name with an earlier 1.0 group is absorbed into that
        group (its names are added to the group's set and no new entry is
        emitted); every other pair yields its own entry with a two-name set.
    """
    newScoreList = []
    # Member sets of the score-1.0 groups. Each set is the very object stored
    # in the matching newScoreList tuple, so growing it here updates the
    # output entry too.
    identicalPools = []

    for rank in scoreList:
        score = rank[0]
        pair = {rank[2], rank[3]}

        if score == 1.0:
            absorbed = False
            for pool in identicalPools:
                if not pool.isdisjoint(pair):
                    pool.update(pair)
                    absorbed = True
            if absorbed:
                continue
            # New independent pair: start a new group for it.
            identicalPools.append(pair)

        newScoreList.append((score, rank[1], pair))

    return newScoreList
        

In [None]:
# # unit test
# item1 = (1.0, ("G0", "[['A#A', 'B#B','B#B', 'C#C','D#D'], 0, 2]"), "a.txt", "b.txt")
# item2 = (1.0, ("G1", "[['A#A', 'B#B','B#B', 'C#C','D#D'], 0, 2]"), "a.txt", "c.txt")
# item3 = (1.0, ("G2", "[['A#A', 'B#B','B#B', 'C#C','D#D'], 0, 2]"), "b.txt", "c.txt")
# item4 = (1.0, ("G3", "[['A#A', 'B#B','B#B', 'C#C','D#D'], 0, 2]"), "c.txt", "d.txt")
# item5 = (1.0, ("G4", "[['E#A', 'F#B'], 0, 2]"), "e.txt", "f.txt")
# item6 = (0.8, ("G5", "[['X#A', 'Y#B'], 0, 2]"), "x.txt", "y.txt")

# scoreList = [item1, item2, item3, item4, item5, item6]

# newScoreList = checkExactlySameCandidates(scoreList)
# print(newScoreList)

In [None]:
def mergeCandidateClusters(toMergeCandidateDict, intermediatePoolDict, scoreList, generatedSeqNum, initialDict, definedThreshold):
    """Merge the best-scoring candidate groups of one round.

    For every entry of ``scoreList`` (expected highest score first) whose
    members have not yet been merged in this round, the members are removed
    from ``toMergeCandidateDict`` and replaced by one new cluster named
    ``"G<generatedSeqNum>"``. Processing stops at the first entry whose
    score is below ``definedThreshold``.

    Args:
        toMergeCandidateDict: Current candidates, keyed by the integer part
            of their cluster name; modified in place.
        intermediatePoolDict: Record of every generated cluster as
            ``{seqNum: (score, newCluster, members)}``; modified in place.
        scoreList: list of ``(score, R, memberSet)`` tuples.
        generatedSeqNum: Next free sequence number for naming clusters.
        initialDict: ``{clusterName: (originalName, initialLength)}``, used
            to report initial clusters by their original name.
        definedThreshold: Minimum score required to merge.

    Returns:
        ``(toMergeCandidateDict, intermediatePoolDict, generatedSeqNum)``
        with ``generatedSeqNum`` advanced by the number of clusters created.
    """
    originalNames = getInitialNameDict(initialDict)

    alreadyMerged = set()
    for rank in scoreList:
        score = rank[0]
        memberSet = rank[2]

        # Everything after the first below-threshold entry is skipped as well.
        if score < definedThreshold:
            break

        # Each candidate may take part in at most one merge per round.
        if any(member in alreadyMerged for member in memberSet):
            continue

        clusterMembers = set()
        for member in memberSet:
            # Candidate keys are the number after the 'G' in the cluster name.
            del toMergeCandidateDict[int(member.split('G')[1])]
            # Initial clusters are recorded under their original trace name,
            # previously generated clusters under their cluster name.
            clusterMembers.add(originalNames.get(member, member))
            alreadyMerged.add(member)

        # rank[1] = (temporary name, representative); only the representative is kept.
        new_Cluster = ("G" + str(generatedSeqNum), rank[1][1])

        toMergeCandidateDict[generatedSeqNum] = (new_Cluster, clusterMembers)
        intermediatePoolDict[generatedSeqNum] = (score, new_Cluster, clusterMembers)
        generatedSeqNum += 1

    return toMergeCandidateDict, intermediatePoolDict, generatedSeqNum

In [None]:
### Main Function of RasMMA ###
import pickle

def do_RasMMA_clustering(data_directory, tag, outputPath, thresholdValue):
    """Run RasMMA clustering over all hooklog traces in a directory.

    Every trace starts as its own cluster. In each round all candidate pairs
    are scored, exactly-identical candidates are grouped, and the best
    non-overlapping groups at or above the threshold are merged. The loop
    ends when a single candidate remains or a round produces no new cluster.

    After every round the current state is checkpointed as pickle files
    under ``<outputPath>pickle/``; these temporary files are deleted once
    clustering has finished.

    Args:
        data_directory: Directory holding the ``*.trace.hooklog`` files.
        tag: Unused by this function.
        outputPath: Output directory prefix; it is concatenated directly
            with ``"pickle/"``, so it should end with a path separator.
        thresholdValue: Minimum score for two candidates to be merged.

    Returns:
        ``(intermediatePool, initialDict, roundInfos, residual)`` where
        ``intermediatePool`` maps sequence numbers to
        ``(score, newCluster, members)``, ``initialDict`` maps initial
        cluster names to ``(originalName, initialLength)``, ``roundInfos``
        maps a round number to the cluster names produced in it (round 0 is
        the initialization), and ``residual`` is the candidate dict left
        when the algorithm stopped.
    """
    pickleDir = outputPath + "pickle/"
    # Checkpoints are written every round and the cleanup lists this folder;
    # create it up front so a fresh output location does not crash the run.
    os.makedirs(pickleDir, exist_ok=True)

    intermediatePool = dict()
    roundInfos = dict()

    toMergeCandidateDict = initialCandidateDict(data_directory)
    # initialDict = {clusterName : (originalName, initialLength)}
    initialDict = getInitialDict(toMergeCandidateDict)

    roundInfos[0] = list(initialDict)  # product of round 0 (i.e., initialization)

    generatedSeqNum = len(toMergeCandidateDict)  # next free number for naming clusters

    print("-- Start Clustering --")
    print("Threshold set =", thresholdValue)
    roundCounter = 1
    # Manual resume hook: to continue an interrupted run, set roundCounter
    # (and generatedSeqNum) above to the values of the last checkpoint, e.g.
    #     generatedSeqNum = 206
    #     roundCounter = 2
    # NOTE: pickle.load must only be used on checkpoint files written by this
    # function; never point outputPath at untrusted data.
    if roundCounter != 1:
        with open(pickleDir + 'toMergeCandidate_round' + str(roundCounter) + '.pickle', 'rb') as mHandle:
            toMergeCandidateDict = pickle.load(mHandle)
        with open(pickleDir + 'roundInfos_round' + str(roundCounter - 1) + '.pickle', 'rb') as rHandle:
            roundInfos = pickle.load(rHandle)
        with open(pickleDir + 'intermediate_round' + str(roundCounter - 1) + '.pickle', 'rb') as iHandle:
            intermediatePool = pickle.load(iHandle)

    while True:
        print("Round: ", roundCounter)
        if len(toMergeCandidateDict) == 1:
            break  # only one candidate left: nothing more to merge

        # calculate scoreList in candidate clusters
        scoreList = findMergeCandidateScoreList(toMergeCandidateDict, generatedSeqNum)

        # check and merge exactly the same candidates before merging clusters
        scoreList = checkExactlySameCandidates(scoreList)

        nameIdxStart = generatedSeqNum

        toMergeCandidateDict, intermediatePool, generatedSeqNum = mergeCandidateClusters(
            toMergeCandidateDict, intermediatePool, scoreList, generatedSeqNum, initialDict, thresholdValue)

        print("generatedSeqNum now: ", generatedSeqNum)

        # Merging stops at the first score below the threshold, and every new
        # cluster advances generatedSeqNum by one. An unchanged counter hence
        # means no cluster was generated in this round: stop.
        if generatedSeqNum == nameIdxStart:
            break

        # Record the names of the clusters generated in this round.
        for idx in range(nameIdxStart, generatedSeqNum):
            roundInfos.setdefault(roundCounter, []).append(intermediatePool[idx][1][0])

        roundCounter += 1

        # Checkpoint: candidates are saved under the upcoming round number,
        # results under the round that just finished.
        with open(pickleDir + 'toMergeCandidate_round' + str(roundCounter) + '.pickle', 'wb') as mHandle:
            pickle.dump(toMergeCandidateDict, mHandle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(pickleDir + 'intermediate_round' + str(roundCounter - 1) + '.pickle', 'wb') as iHandle:
            pickle.dump(intermediatePool, iHandle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(pickleDir + 'roundInfos_round' + str(roundCounter - 1) + '.pickle', 'wb') as rHandle:
            pickle.dump(roundInfos, rHandle, protocol=pickle.HIGHEST_PROTOCOL)

    residual = toMergeCandidateDict  # candidates left when the algorithm stopped
    print("-- Finish Clustering --")

    # The checkpoints are only needed while the run is in progress.
    tempPrefixes = ('toMergeCandidate_round', 'intermediate_round', 'roundInfos_round')
    for pkl in os.listdir(pickleDir):
        if pkl.startswith(tempPrefixes):
            os.remove(pickleDir + pkl)

    print("-- Clean Temp Pickle Files --")

    return intermediatePool, initialDict, roundInfos, residual

In [None]:
def clusterInitializedReps(initializedReps_dict, tag, outputPath, thresholdValue):
    """Run the clustering rounds on already-initialized representatives.

    Same round loop as ``do_RasMMA_clustering`` but the candidates are passed
    in directly (no trace loading) and nothing is written to disk.

    Args:
        initializedReps_dict: Candidate dict
            ``{index: ((clusterName, list of motifs), memberSet)}``; it is
            used as the working dict and modified in place.
        tag: Unused by this function.
        outputPath: Unused by this function.
        thresholdValue: Minimum score for two candidates to be merged.

    Returns:
        ``(intermediatePool, initialDict, roundInfos, residual)`` where
        ``residual`` is the candidate dict left when the algorithm stopped.
    """
    candidates = initializedReps_dict  # skip initialization: use the given reps directly

    # initialDict = {clusterName : (originalName, initialLength)}
    initialDict = getInitialDict(candidates)

    intermediatePool = {}
    roundInfos = {0: list(initialDict)}  # product of round 0 (i.e., initialization)

    # NOTE(review): new clusters are numbered from len(candidates); this
    # presumably assumes the incoming keys are 0..n-1 -- verify against caller.
    nextSeqNum = len(candidates)

    print("-- Start Clustering --")
    print("Threshold set =", thresholdValue)
    roundCounter = 1

    while True:
        print("Current Round : Round ", roundCounter)
        if len(candidates) == 1:
            break  # a single candidate cannot be merged any further

        scoreList = findMergeCandidateScoreList(candidates, nextSeqNum)
        print("-- Finish scoring --")
        print("ScoreList Len : ", len(scoreList))

        scoreList = checkExactlySameCandidates(scoreList)
        print("-- Finish checking 100% same candidates --")

        firstNewSeqNum = nextSeqNum
        candidates, intermediatePool, nextSeqNum = mergeCandidateClusters(
            candidates, intermediatePool, scoreList, nextSeqNum, initialDict, thresholdValue)
        print("-- Finish merging clusters --")

        # Every generated cluster advances the counter by one, so an
        # unchanged counter means this round merged nothing: stop.
        if nextSeqNum == firstNewSeqNum:
            break

        # Record the names of the clusters generated in this round.
        roundInfos[roundCounter] = [
            intermediatePool[idx][1][0] for idx in range(firstNewSeqNum, nextSeqNum)
        ]

        roundCounter += 1

    residual = candidates  # candidates left when the algorithm stopped
    print("-- Finish Clustering --")

    return intermediatePool, initialDict, roundInfos, residual