# An implementation of the Sequence-Based Behavior Group Clustering Algorithm (SBBGCA)


In [None]:
# Shorten a hooklog file name to its first 6 characters plus "_<pid>"
def shortenHooklogName(hkName):
    """Return a short identifier for a hooklog file name.

    The result is the first six characters of ``hkName``, an underscore,
    and the text found between the first underscore and the next dot
    (called ``pid`` here).

    Raises:
        IndexError: if ``hkName`` contains no underscore.
    """
    # Second "_"-separated field, e.g. "3168.trace.hooklog".
    after_underscore = hkName.split("_")[1]
    # Keep only what precedes the first dot of that field.
    pid = after_underscore.split(".")[0]
    prefix = hkName[:6]
    return "_".join((prefix, pid))

In [None]:
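# input: two cluster representatives Ri and Rj, each a (clusterName, segmentList) tuple
# output: the new representative (list of common-motif segments) built from both inputs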
def get_Representative(Ri, Rj):
    """Build the representative shared by two cluster representatives.

    Each input is indexed as ``R[0]`` (cluster name) and ``R[1]`` (list of
    segments whose first element is a sequence). The sequences of every
    segment are concatenated, the two flat sequences are passed to
    ``do_globalAlignment``, and the resulting common motifs are laid out
    back to back with fresh start/end indices.

    Returns:
        A list of tuples
        ``(CMS, newStartIdx, newEndIdx, oriIdxRange1, oriIdxRange2)``,
        ordered by the stage number encoded in each motif key.
    """
    # Flatten the per-segment sequences of each representative.
    flat_i = [item for segment in Ri[1] for item in segment[0]]
    flat_j = [item for segment in Rj[1] for item in segment[0]]

    motifs = do_globalAlignment(flat_i, flat_j)

    def stage_of(key):
        # Keys look like 's<stage>_<motif>'; strip the leading letter.
        return int(key.split('_')[0][1:])

    merged = []
    cursor = 0  # start index of the next motif in the new representative
    for key in sorted(motifs.keys(), key=stage_of):
        entry = motifs[key]
        cms = entry[0]
        last = cursor + len(cms) - 1
        merged.append((cms, cursor, last, entry[1], entry[2]))
        cursor = last + 1

    return merged

In [None]:
% run Alignment_Fast3.ipynb
% run StructMatchGap3.ipynb
% run StageMatrix.ipynb
% run Motif.ipynb
% run OutputStage.ipynb
% run CommonMotifAnalysis_Tmp.ipynb

# Doing global alignment and Calculate common motif.
# will return a common motif dict
def do_globalAlignment(rep1, rep2):
    """Align two sequences and return their common-motif dictionary.

    ``rep1`` is used as the alignment base: it is aligned against itself
    and against ``rep2`` with ``pairwise_NW``. The alignments are then fed
    through ``structMatchGap``, ``stageMatrix``, ``Motif``, ``OutputStage``
    and ``CommonMotif`` (all loaded from the notebooks run above).

    Returns:
        The value of ``CommonMotif.getComMotifDict()``; per the original
        note its shape is
        ``{'s<stage>_<motif>': [CMS, oriIdxRange1, oriIdxRange2]}``.
    """
    base_key = "rep1"
    # Positional arguments forwarded to pairwise_NW after the two sequences.
    # NOTE(review): presumably alignment scoring parameters - confirm
    # against Alignment_Fast3.
    nw_args = (2, -1, -3, 1)

    # Element [2] of each pairwise_NW result is what structMatchGap consumes.
    alignments = {
        'rep1': pairwise_NW(rep1, rep1, *nw_args)[2],
        'rep2': pairwise_NW(rep1, rep2, *nw_args)[2],
    }

    # Derive the match matrix and gap list, then the stage matrix.
    match_matrix, gap_sequences = structMatchGap(alignments, base_key)
    stages = stageMatrix(match_matrix, gap_sequences)
    motif = Motif(stages, base_key)
    stage_output = OutputStage(stages, None, base_key, motif)

    traces = {"rep1": rep1, "rep2": rep2}
    common = CommonMotif(stages, motif, traces, stage_output)
    return common.getComMotifDict()

In [None]:
% run FeatureHooklog3.ipynb
# TODO: the output toMergeCandidate_Dict has to be changed to a set

# initialize all hooklogs as "to merge candidates clusters"
def initialCandidateDict(data_directory):
    """Create one single-member candidate cluster per hooklog file.

    Every file in ``data_directory`` whose name ends with
    ``.trace.hooklog`` is loaded through ``FeatureHooklog3`` and wrapped
    as its own cluster.

    Args:
        data_directory: directory containing the hooklog files. A trailing
            path separator is optional.

    Returns:
        dict mapping an integer index ``n`` to ``(R, clusterMembers)``
        where ``R = ("G<n>", [(featureHooklog, 0, len(featureHooklog) - 1)])``
        is the cluster representative and ``clusterMembers`` is a set
        holding the shortened hooklog name.
    """
    # Local import: the visible file never imports os itself.
    import os

    Hooklog = FeatureHooklog3

    # os.listdir returns names in arbitrary order; sorting keeps the
    # "G<n>" numbering reproducible across runs and platforms.
    hkName_list = sorted(
        f for f in os.listdir(data_directory) if f.endswith('.trace.hooklog')
    )

    toMergeCandidate_Dict = dict()
    for hk_count, hkName in enumerate(hkName_list):
        # os.path.join works whether or not data_directory ends with a
        # separator (plain concatenation silently built a wrong path).
        # NOTE(review): the meaning of the second argument (1) is defined
        # by FeatureHooklog3 - confirm there.
        hooklog_path = os.path.join(data_directory, hkName)
        featureHooklog = Hooklog(hooklog_path, 1).getHkli_noContainTS()

        clusterName = "G" + str(hk_count)
        # R = (clusterName, [(featureHooklog, fhStartIdx, fhEndIdx)])
        R = (clusterName, [(featureHooklog, 0, len(featureHooklog) - 1)])
        clusterMembers = {shortenHooklogName(hkName)}

        toMergeCandidate_Dict[hk_count] = (R, clusterMembers)

    print("-- Finish Initializing --")
    return toMergeCandidate_Dict

In [None]:
import functools

# Compute the score of Rnew.
# The score is the ratio of the new representative's length to the length of the longer original one.
def compute_Score(Ri, Rj, Rnew):
    """Score a merged representative against the two it was built from.

    Each argument is indexed as ``R[1]``: a list of segments whose first
    element is a sequence. A representative's length is the sum of its
    segment sequence lengths.

    Args:
        Ri, Rj: the two original representatives.
        Rnew: the representative produced by merging them.

    Returns:
        ``len(Rnew) / max(len(Ri), len(Rj))`` as a float. Returns ``0.0``
        when both originals are empty, since there is nothing to preserve.
    """
    def _total_length(R):
        # sum() yields 0 for a representative with no segments, whereas
        # reduce() without an initial value raises TypeError in that case
        # (e.g. when no common motif was found and Rnew[1] is empty).
        return sum(len(segment[0]) for segment in R[1])

    Lorg = max(_total_length(Ri), _total_length(Rj))
    if Lorg == 0:
        # Avoid ZeroDivisionError when both inputs are empty.
        return 0.0

    return float(_total_length(Rnew)) / Lorg

In [None]:
# Get the score list for toMergeCandidateDict (single iteration), sorted from highest to lowest score

def findMergeCandidateScoreList(toMergeCandidateDict, iterateCounter):
    """Score every unordered pair of candidate clusters.

    For each pair of representatives a merged representative is built
    with ``get_Representative`` and scored with ``compute_Score``.

    Args:
        toMergeCandidateDict: dict whose values are
            ``(representative, memberSet)`` tuples.
        iterateCounter: number used for the temporary name ``"G<n>"``
            given to every candidate merge.

    Returns:
        A list of ``(score, Rnew, Ri_name, Rj_name)`` tuples sorted by
        score, highest first. If no pair exists the list is empty and
        "No common motif" is printed.
    """
    # Only the representative (element 0) of each entry is needed here.
    representatives = [entry[0] for entry in toMergeCandidateDict.values()]
    temp_name = "G" + str(iterateCounter)

    scoreList = []
    for position, Ri in enumerate(representatives):
        for Rj in representatives[position + 1:]:
            # Rnew = (clusterName, repNew)
            Rnew = (temp_name, get_Representative(Ri, Rj))
            score = compute_Score(Ri, Rj, Rnew)
            scoreList.append((score, Rnew, Ri[0], Rj[0]))

    if not scoreList:
        print("No common motif")
        return scoreList

    # Highest score first; ties keep their insertion order.
    scoreList.sort(key=lambda item: item[0], reverse=True)
    return scoreList

In [None]:
# add Rnew into toMergeCandidateDict and remove member of Rnew from candidates.

def mergeCandidateClusters(toMergeCandidateDict, intermediatePoolDict, scoreList, iterateCounter, initialNames):
    """Merge the best-scoring cluster pairs of one iteration.

    Walks ``scoreList`` in order. Each pair whose two clusters are still
    unmerged in this pass is removed from ``toMergeCandidateDict`` and
    replaced by a new cluster named ``"G<iterateCounter>"``; the same
    cluster is also recorded in ``intermediatePoolDict``. The walk stops
    at the first pair that contains an already-merged cluster.

    Args:
        toMergeCandidateDict: candidates keyed by the integer in their
            ``"G<n>"`` name; modified in place.
        intermediatePoolDict: receives ``(score, newCluster, members)``
            per merge; modified in place.
        scoreList: ``(score, Rnew, Ri_name, Rj_name)`` tuples, best first.
        iterateCounter: index to use for the next new cluster.
        initialNames: maps initial cluster names to original hooklog names.

    Returns:
        ``(toMergeCandidateDict, intermediatePoolDict, iterateCounter)``
        with the counter advanced once per merge.
    """
    already_merged = set()

    for score, Rnew, Ri_name, Rj_name in scoreList:
        # A cluster may take part in only one merge per pass.
        if Ri_name in already_merged or Rj_name in already_merged:
            break

        # Dictionary keys are the numeric part of the cluster names.
        key_i = int(Ri_name.split('G')[1])
        key_j = int(Rj_name.split('G')[1])
        del toMergeCandidateDict[key_i]
        del toMergeCandidateDict[key_j]

        # Re-label the merged representative with its final name.
        merged_cluster = ("G" + str(iterateCounter), Rnew[1])

        # Initial clusters are listed by their original hooklog name,
        # later clusters by their own cluster name.
        members = {
            initialNames.get(Ri_name, Ri_name),
            initialNames.get(Rj_name, Rj_name),
        }

        toMergeCandidateDict[iterateCounter] = (merged_cluster, members)
        # (score, newCluster, members)
        intermediatePoolDict[iterateCounter] = (score, merged_cluster, members)
        iterateCounter += 1

        already_merged.update((Ri_name, Rj_name))

    return toMergeCandidateDict, intermediatePoolDict, iterateCounter

In [None]:
### Main Function of SBBGCA ###

import pickle

def do_SBBGCA_clustering(data_directory, tag, outputPath):
    """Run the SBBGCA clustering loop over a directory of hooklogs.

    Every hooklog starts as its own candidate cluster. Each round scores
    all candidate pairs and merges the best ones, until a single
    candidate remains or no pair can be scored.

    Args:
        data_directory: directory handed to ``initialCandidateDict``.
        tag: not used inside this function.
        outputPath: not used inside this function.

    Returns:
        ``(intermediatePool, initialNames)``: the dict of every merged
        cluster created along the way, and the mapping from initial
        cluster name to original (shortened) hooklog name.
    """
    intermediatePool = dict()

    # One single-member cluster per hooklog file.
    toMergeCandidateDict = initialCandidateDict(data_directory)

    # Map "G<n>" -> hooklog name. pop() takes the sole member out of each
    # initial member set, leaving those sets empty.
    initialNames = {
        representative[0]: members.pop()
        for representative, members in toMergeCandidateDict.values()
    }

    # New clusters are numbered after the initial ones.
    iterateCounter = len(toMergeCandidateDict)

    print("-- Start Clustering --")
    while len(toMergeCandidateDict) != 1:
        # Score every pair of the current candidates.
        scoreList = findMergeCandidateScoreList(toMergeCandidateDict, iterateCounter)
        if not scoreList:
            break

        toMergeCandidateDict, intermediatePool, iterateCounter = mergeCandidateClusters(
            toMergeCandidateDict,
            intermediatePool,
            scoreList,
            iterateCounter,
            initialNames,
        )

    print("-- Finish Clustering --")

    return intermediatePool, initialNames