# A implementation of Sequence-Based Behavior Group Clustering Algorithm


In [1]:
def shortenHooklogName(hkName):
    hashValue = hkName[0:6]
    pid = hkName.split("_")[1].split(".")[0]
    return hashValue+"_"+pid

In [2]:
# input: two R to got Representative;
def get_Representative(Ri, Rj):
    rep1 = list()
    rep2 = list()

    for i in range(len(Ri[1])): # get length of R's common motif seqs  (p.s. Ri[0] is clusterName)
        rep1 += Ri[1][i][0]
    for i in range(len(Rj[1])):
        rep2 += Rj[1][i][0]
    
    comMotif_dict = do_globalAlignment(rep1, rep2) # do Alignment
    
    repNew = list() 
    newStartIdx = 0
    
    for m in sorted(comMotif_dict.keys(), key = lambda x : int(x.split('_')[0][1:])): # sorted by stages
        cmsList = comMotif_dict[m]
        newEndIdx = newStartIdx + len(cmsList[0]) - 1
        repNew.append((cmsList[0], newStartIdx, newEndIdx, cmsList[1], cmsList[2]))
                  # [CMS, newCMSStartIdx, newCMSEndIdx, oriIdxRange1, oriIdxRange2]
        newStartIdx = newEndIdx + 1
    
    return repNew

In [3]:
% run Alignment_Fast3.ipynb
% run StructMatchGap3.ipynb
% run StageMatrix.ipynb
% run Motif.ipynb
% run OutputStage.ipynb
% run CommonMotifAnalysis_Tmp.ipynb

# Doing global alignment and Calculate common motif.
# will return a common motif dict
def do_globalAlignment(rep1, rep2):
    # Aligment
    align_dict = dict()
    BASE = "rep1"
    align_dict['rep1'] = pairwise_NW( rep1, rep1, 2, -1, -3, 1)[2]
    align_dict['rep2'] = pairwise_NW( rep1, rep2, 2, -1, -3, 1)[2]
    
    # get 'Match Matrix' and 'Gap List'
    matchMatrix, gapSeqList = structMatchGap(align_dict, BASE)
    stageMatrixResult = stageMatrix(matchMatrix, gapSeqList)
    Motif_Obj = Motif(stageMatrixResult, BASE)
    outputStage = OutputStage(stageMatrixResult, None, BASE, Motif_Obj)
    
    executionTrace_dict = {"rep1":rep1, "rep2":rep2}
    
    commonMotif = CommonMotif(stageMatrixResult, Motif_Obj, executionTrace_dict, outputStage)
    
    # comMotifdict= {'s<stage>_<motif>': [CMS], oriIdxRange1, oriIdxRange2},
    comMotif_dict = commonMotif.getComMotifDict()  
    return comMotif_dict

In [4]:
% run FeatureHooklog3.ipynb
#******************** the output toMergeCandidate_Dict have to change to set
def initialCandidateDict(data_directory):
    
#     toMergeCandidate_List = list()
    toMergeCandidate_Dict = dict()
    
    # get feature hooklogs
    Hooklog = FeatureHooklog3
    hkName_list = list(filter(lambda f:f.endswith('.trace.hooklog'), os.listdir(data_directory))) # hooklog Name List
    hk_count = 0
    for hkName in hkName_list:
        featureHooklog = Hooklog(data_directory + hkName, 1).getHkli_noContainTS()
        clusterName = "G"+str(hk_count)
        # R = tuple( clusterName, list(  tuple(featureHooklog, fhStartIdx, fhEndIdx) ) ), the representative of cluster.
        R = (clusterName, [(featureHooklog, 0, len(featureHooklog)-1)] )
        clusterMembers = set()
        hkName = shortenHooklogName(hkName)
        clusterMembers.add(hkName)
        
        toMergeCandidate_Dict[hk_count] = (R, clusterMembers)
        
        hk_count+=1
    return toMergeCandidate_Dict
#     return toMergeCandidateSet

In [5]:
import functools

def compute_Score(Ri, Rj, Rnew):
    L_Ri = functools.reduce(lambda x,y:x+y, [len(i[0]) for i in Ri[1]])
    L_Rj = functools.reduce(lambda x,y:x+y, [len(j[0]) for j in Rj[1]])
    
    Lorg = max(L_Ri, L_Rj)
    Lnew = functools.reduce(lambda x,y:x+y, [len(n[0]) for n in Rnew[1]]) 
    score = float(Lnew)/Lorg
    
    return score

In [6]:
def findMergeCandidateScoreList(toMergeCandidateDict, iterateCounter):
    scoreList = list()
    dictKeys = list(toMergeCandidateDict.keys())
    print("dictKeys", dictKeys)
    
    for i in range(len(dictKeys)):
        for j in range(i+1, len(dictKeys)):
            
            # toMergeCandidateDict[i][1] is memberSet
            Ri = toMergeCandidateDict[ dictKeys[i] ][0] # Ri is a tuple like (('G0', [[['A#A', 'C#C'], 0, 1, (0, 1), (1, 2)]]))
            Rj = toMergeCandidateDict[ dictKeys[j] ][0]
            
            # create Rnew = (clusterName , repNew)
            repNew = get_Representative(Ri, Rj)
            clusterTempName = "G" + str(iterateCounter)
            Rnew = (clusterTempName , repNew)
            
            # compute merge score of Rnew
            score = compute_Score(Ri, Rj, Rnew)
            Ri_name = Ri[0]
            Rj_name = Rj[0]
            
            if(iterateCounter == 47):
                print("Name = ", Ri_name, Rj_name)
            scoreList.append((score, Rnew, Ri_name, Rj_name))
            
    if(len(scoreList) > 0):
        scoreList.sort(key=lambda tup:tup[0], reverse=True) # sorting by score (from biggest to smallest) 
    else:
        print("No common motif")
    
#     memlist = [(s[2], s[3]) for s in scoreList]
#     print("member pairs in scoreList : iterate: ", iterateCounter,"\n", memlist)
    return scoreList

In [7]:
def mergeCandidateClusters(toMergeCandidateDict, intermediatePoolDict, scoreList, iterateCounter):
    currentMergedSet = set()
        
    for rank in scoreList:
        Ri_name = rank[2] # member1 of highest score
        Rj_name = rank[3] # member2 of highest score

        # check exclusiveness that candidate have been merged in current scoreList.
        if((Ri_name not in currentMergedSet) and (Rj_name not in currentMergedSet)):
            # remove candidates in @toMergeCandidateDict
            keyOfRi = int(Ri_name.split('G')[1])
            keyOfRj = int(Rj_name.split('G')[1])
            print("Remove : ",keyOfRi, ", ", keyOfRj)
            del toMergeCandidateDict[keyOfRi], toMergeCandidateDict[keyOfRj]

            Rnew = rank[1] # get representative of highest score
            newName = "G" + str(iterateCounter) # update clusterName
            new_Cluster = (newName, Rnew[1])

            clusterMembers = set() # create cluster member set
            clusterMembers.add(Ri_name)
            clusterMembers.add(Rj_name)

            toMergeCandidateDict[iterateCounter] = (Rnew, clusterMembers)
            intermediatePoolDict[iterateCounter] = (Rnew, clusterMembers)

            iterateCounter += 1

            currentMergedSet.add(Ri_name) # update currentMergedSet
            currentMergedSet.add(Rj_name)
#             print("currentMergedSet : ", currentMergedSet)
        else:
            break
    
    print("toMergeCandidateDict keys:", toMergeCandidateDict.keys())
#     print("intermediatePoolDict keys:", intermediatePoolDict.keys())
    return toMergeCandidateDict, intermediatePoolDict, iterateCounter

In [8]:

# Ri =('G34', [[['A#A', 'C#C'], 0, 1, (0, 1), (1, 2)], [['B#B'], 2, 3, (3, 3), (3, 3)]])

data_directory = "hooklogs/allaple_woj_g_98/"
# adict = {0: (('G0', [[['A#A', 'B#B','B#B', 'C#C','D#D'], 0, 2]]),{"a.trace.hooklog"}), 1:(('G1', [[['A#A','C#C','D#D'], 0, 2]]),{"b.trace.hooklog"})}

intermediatePool = dict()

toMergeCandidateDict = initialCandidateDict(data_directory) # initialize @toMergeCandidateDict
iterateCounter = len(toMergeCandidateDict) # counter after initialize. Used to naming clusters.

while(1):
    if(len(toMergeCandidateDict) == 1):
        break
    
    # calculate scoreList in candidate clusters
    scoreList = findMergeCandidateScoreList(toMergeCandidateDict, iterateCounter)
    if(len(scoreList) == 0):
        break
    toMergeCandidateDict, intermediatePool, iterateCounter = mergeCandidateClusters(toMergeCandidateDict, intermediatePool, scoreList, iterateCounter)
    print("iterateCounter", iterateCounter)
    
print(len(toMergeCandidateDict))


print(iterateCounter)
print("len of intermediatePool : ", len(intermediatePool))
print("intermediatePool", intermediatePool)

# iterateCounter = len(adict)
# findMergeCandidatePair(adict, iterateCounter)

# print(toMergeCandidateDict)

dictKeys [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
Remove :  0 ,  1
toMergeCandidateDict keys: dict_keys([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32])
iterateCounter 33
dictKeys [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
Remove :  2 ,  23
Remove :  3 ,  11
toMergeCandidateDict keys: dict_keys([4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])
iterateCounter 35
dictKeys [4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Remove :  4 ,  5
toMergeCandidateDict keys: dict_keys([6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35])
iterateCounter 36
dictKeys [6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 1

KeyError: 33