## SBBGCA usage
Usage of SBBGCA.ipynb

### Set variables
Set these variables before running the code:
data_directory, tag, outputPath

### To get clustering results
##### Run the startClustering() function.

It calls the do_SBBGCA_clustering() function from RasMMA.ipynb, which returns four results: intermediatePool, initialNames (called initialDict in the code), roundInfos and residual.

##### intermediatePool: a dict holding the intermediate clusters produced during clustering
##### initialNames (called initialDict in the code): a dict holding the original names of the basic groups (numbered from G0 up to G_inputHooklogNumber)

startClustering() also saves these results as pickle files in the pickle/ sub-directory of outputPath, so that later cells can reuse them without re-running the clustering.
(This matters because the performance of SBBGCA is terrible Q_Q)

### To draw the hierarchy graph of the clustering result
##### Run the drawClusteringResults() function.

It reads the pickles of the clustering results and draws the graph using matplotlib.

In [1]:
# Basic global input variables shared by the cells below.
import os

manualThresholdNumber = 0.8 # threshold of the merge score, passed to startClustering()

familyName = "eggnog"
generation = "main"

data_directory = "17 Family/"+ familyName +"/"+ generation +"/" # traces dir

# The tag is derived from the threshold instead of repeating the literal "0.8",
# so output folders and pickle names cannot drift away from the value actually used.
tag = familyName + "_" + str(manualThresholdNumber) # used for pickle name
outputPath = "output/RasMMA-test/" + tag + "/" + generation + "/"
pickleDir = outputPath + "pickle/"

# Create the output directories if they do not exist yet.
os.makedirs(outputPath, exist_ok=True)
os.makedirs(pickleDir, exist_ok=True)

In [2]:
# reverse clusterID by hooklogName

def findClusterID(nameDict, hooklogName):
    """Return the first key of ``nameDict`` whose value equals ``hooklogName``.

    If no value matches, ``hooklogName`` itself is returned unchanged.
    """
    matchingKeys = (clusterID for clusterID, name in nameDict.items() if name == hooklogName)
    return next(matchingKeys, hooklogName)

In [3]:
def getInitialNameDict(initialDict):
    """Map every key of ``initialDict`` to the first element of its value.

    Returns a new dict; ``initialDict`` is not modified.
    """
    return {clusterID: info[0] for clusterID, info in initialDict.items()}

In [4]:
# convert memberSet to List type

def getMemberList(memberSet, nameDict):
    """Return the members of ``memberSet`` as a list of cluster IDs.

    A member that occurs as a value in ``nameDict`` is replaced by the first key
    mapping to it; any other member is kept as it is.

    ``memberSet`` is only read. The earlier version emptied it with ``pop()``,
    which silently destroyed the caller's data.

    Parameters:
        memberSet: iterable of members (hooklog names or cluster IDs).
        nameDict: dict of cluster ID -> name; the names must be hashable.

    Returns:
        list with one entry per member, in the iteration order of ``memberSet``.
    """
    # Reverse index (name -> first cluster ID holding it) built once, so each
    # member is resolved with a dict lookup instead of a scan over nameDict.
    firstIdByName = dict()
    for clusterID, name in nameDict.items():
        firstIdByName.setdefault(name, clusterID)

    return [firstIdByName.get(member, member) for member in memberSet]

In [5]:
# z[0] = g1,  z[1] = g2,  z[2] = height,  z[3] = number of members
# Create structure Z

def createStructZ(intermediate_dict, nameDict):
    """Build the (len(intermediate_dict), 4) array handed to the dendrogram.

    Entries of ``intermediate_dict`` are visited in sorted key order. For each
    value ``v`` one row is written:
        [id of first member, id of second member, 1 - v[0], number of members]
    where ``v[0]`` is the merge score, ``v[1][0]`` the cluster name (printed only)
    and ``v[2]`` the collection of members. Member IDs are the integers that
    follow the first character of the cluster ID (e.g. 31 for 'G31').

    ``intermediate_dict`` is left untouched.

    NOTE(review): scipy's linkage format appears to require that row i creates
    cluster n + i and that the last column counts original observations; this
    function writes the IDs and direct-member counts exactly as stored. The run
    recorded in this notebook ends with "Linkage 'Z' uses non-singleton cluster
    before it is formed", so the mapping probably needs rework - TODO confirm.
    """
    import numpy as np
    Z = np.zeros((len(intermediate_dict) ,4))

    intermediate_list = sorted(intermediate_dict.items(), key=lambda x:x[0])

    for rowIdx, item in enumerate(intermediate_list):
        value = item[1] # get original dict value
        score = value[0]
        height = 1 - score # get cluster distance
        clusterName = value[1][0]
        memberSet = value[2] # members set
        # Pass a copy: the helper may consume the collection it receives, and the
        # caller's intermediate_dict must still hold its members afterwards.
        memberList = getMemberList(memberSet.copy(), nameDict)
        print(clusterName, " : ", memberList)

        # Convert the IDs explicitly instead of relying on numpy casting strings
        # to float during assignment; a malformed ID now fails right here.
        member1 = int(memberList[0][1::])
        member2 = int(memberList[1][1::])

        Z[rowIdx] = [member1, member2, height, len(memberList)] # set Z element

    return Z

In [6]:
def createLabelList(nameDict):
    """Return the values of ``nameDict`` ordered by the number in their key.

    The number is the integer that follows the first character of the key
    (i.e., 31 in 'G31').
    """
    orderedKeys = sorted(nameDict, key=lambda clusterID: int(clusterID[1:]))
    return [nameDict[clusterID] for clusterID in orderedKeys]

In [7]:
# do clustering and output two pickle files. (@_intermediate.pickle and @_nameDict.pickle)
% run RasMMA.ipynb

def startClustering(data_directory, tag, outputPath, thresholdValue=None):
    """Run the clustering and store its results as pickle files.

    Calls ``do_SBBGCA_clustering(data_directory, tag, outputPath, thresholdValue)``
    and writes each returned object to ``pickleDir + tag + <suffix>``:
        _intermediate.pickle, _initialDict.pickle, _roundInfos.pickle
    and, only when the returned residual is not None, _residual.pickle.

    Note that the pickles go to the notebook-level ``pickleDir`` variable, not to
    a path derived from the ``outputPath`` argument.
    """
    # Imported here because none of the cells above imports pickle themselves.
    import pickle

    # Make sure both target directories exist before anything is written.
    os.makedirs(outputPath, exist_ok=True)
    os.makedirs(pickleDir, exist_ok=True)

    intermediatePool, initialDict, roundInfos, residual = do_SBBGCA_clustering(data_directory, tag, outputPath, thresholdValue)

    # (file suffix, object) pairs to persist, in the order they are written
    pickledResults = [
        ('_intermediate.pickle', intermediatePool),
        ('_initialDict.pickle', initialDict),
        ('_roundInfos.pickle', roundInfos),
    ]
    if residual is not None:
        # the residual is optional; no file is written when there is none
        pickledResults.append(('_residual.pickle', residual))

    for suffix, result in pickledResults:
        with open(pickleDir + tag + suffix, 'wb') as handle:
            pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)
            


# Main Cell
If the kernel dies at some generatedSeqNum:
##### Modify RasMMA.ipynb
Edit lines 30–31 of do_SBBGCA_clustering(), then re-run all cells.

In [8]:
# Usage example: run the clustering and print a timestamp before and after it,
# so the duration of the run can be read from the cell output.

import datetime

timestampFormat = "%Y-%b-%d %H:%M"

date_time = datetime.datetime.now()
print(date_time.strftime(timestampFormat))

startClustering(data_directory, tag, outputPath, thresholdValue=manualThresholdNumber)

date_time = datetime.datetime.now()
print(date_time.strftime(timestampFormat))

2018-Apr-13 14:15
-- Finish Initializing --
-- Start Clustering --
Threshold set = 0.8
ScoreList Length in method :  1830
generatedSeqNum now:  85
ScoreList Length in method :  666
generatedSeqNum now:  96
ScoreList Length in method :  325
generatedSeqNum now:  101
ScoreList Length in method :  210
generatedSeqNum now:  104
ScoreList Length in method :  153
generatedSeqNum now:  105
ScoreList Length in method :  136
generatedSeqNum now:  106
ScoreList Length in method :  120
generatedSeqNum now:  106
-- Finish Clustering --
2018-Apr-13 14:22


### The cell below writes the CSV files: _GroupInfo, _Descendants, _Motifs
The merge score of each cluster can be found in _GroupInfo.csv.

In [9]:
import pickle

# Read the clustering results back from the pickle files under pickleDir.
# The three names assigned here (intermediate, initialDict, roundInfos) are
# used by the rest of this cell.
# NOTE(review): pickle.load can execute code embedded in the file - only load
# pickles that were produced by this notebook.
with open(pickleDir + tag + '_intermediate.pickle', 'rb') as handle:
    intermediate = pickle.load(handle)
with open(pickleDir + tag + '_initialDict.pickle', 'rb') as handle:
    initialDict = pickle.load(handle)
with open(pickleDir + tag + '_roundInfos.pickle', 'rb') as handle:
    roundInfos = pickle.load(handle)
    
# calculate motif lengths of all common motifs
def getMotifsLengthList(motifs):
    """Return the length of every motif, in input order.

    Each motif carries its start index at position 1 and its end index at
    position 2; the length is ``end - start + 1``.
    """
    return [motif[2] - motif[1] + 1 for motif in motifs]

def findGeneratedRoundNumber(clusterName, roundInfosDict):
    """Return the first key of ``roundInfosDict`` whose value contains ``clusterName``.

    Returns -1 when no value contains it.
    """
    matchingRounds = (
        roundNumber
        for roundNumber, roundContent in roundInfosDict.items()
        if clusterName in roundContent
    )
    return next(matchingRounds, -1)

import csv

# Accumulators filled while walking the merged clusters in sorted key order:
descendant_dict = dict()  # clusterName -> set of all non-"G" members found beneath it
groupInfo_list = list()   # one GroupInfo.csv row (tuple) per merged cluster
groupMotif_dict = dict()  # clusterName -> motifs of that cluster

intermediate_list = sorted(intermediate.items(), key=lambda x : x[0])
for item in intermediate_list:
    value = item[1] # get original dict value
    score, memberSet = value[0], value[2]
    clusterName, motifs = value[1][0], value[1][1]

    # motif statistics: per-motif lengths, their sum, and the motif count
    motifsLens = getMotifsLengthList(motifs)
    totalMotifLen = sum(motifsLens)
    motifsCount = len(motifs)

    # A member whose name starts with "G" is itself a cluster: take over the
    # descendants already recorded for it. Any other member counts as itself.
    descendants = set()
    for member in memberSet:
        if member[0] != "G":
            descendants.add(member)
        else:
            descendants.update(descendant_dict[member])

    descendant_dict[clusterName] = descendants
    groupMotif_dict[clusterName] = motifs

    roundNumber = findGeneratedRoundNumber(clusterName, roundInfos)
    groupInfo_list.append(
        (roundNumber, clusterName, score, memberSet, motifsCount, motifsLens, totalMotifLen)
    )

# keep the descendant sets for later use
with open(pickleDir + tag + "_descendant.pickle", 'wb') as f:
    pickle.dump(descendant_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

# write file "GroupInfo.csv" : round, clusterName, score, members, motifCount,
# common motifs length list, total motif length
with open(outputPath + tag + "_GroupInfo.csv", 'w', newline='') as infoFile:
    spamwriter = csv.writer(infoFile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = ["Round", "ClusterName", "SimilarityScore", "Members", "MotifsCount", "Motifs_Length", "Total_MotifLength"]
    spamwriter.writerow(header)

    # Initial clusters (i.e., hooklogs) come first, ordered by the number in
    # their name. They are reported as round 0 with no score and one motif.
    initialKeys = sorted(initialDict.keys(), key = lambda x : int(x[1::]))
    for key in initialKeys:
        initialName = initialDict[key][0]
        initialLength = initialDict[key][1]
        # something like this: (0, "G1", "N/A", "abc", 1, 109, 109)
        originDataRow = (0, key, "N/A", initialName, 1, initialLength, initialLength)
        spamwriter.writerow(originDataRow)

    # then one row per merged cluster, in the order they were collected
    spamwriter.writerows(groupInfo_list)
        
# write file "Descendants.csv" : clusterName, number of descendants, descendants
with open(outputPath + tag + "_Descendants.csv", "w", newline='') as descFile:
    spamwriter = csv.writer(descFile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = ["ClusterName", "Descendant Counts", "Descendants"]
    spamwriter.writerow(header)

    # clusters ordered by the number in their name
    orderedClusterNames = sorted(descendant_dict, key = lambda x : int(x[1::]))
    spamwriter.writerows(
        (name, len(descendant_dict[name]), descendant_dict[name])
        for name in orderedClusterNames
    )
        
# write file "Motifs.csv" :  clusterName, MotifIndex, MotifLength, apis
with open(outputPath + tag + "_Motifs.csv", 'w', newline='', encoding='utf-8') as motifFile:
    spamwriter = csv.writer(motifFile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = ["ClusterName", "MotifIndex", "MotifLength", "Common Motif APIs"]
    spamwriter.writerow(header)

    for key in sorted(groupMotif_dict.keys(), key = lambda x : int(x[1::])):
        for motifIdx, motif in enumerate(groupMotif_dict[key]):
            motifAPIs = motif[0]
            motifLen = len(motifAPIs)
            # One row per API; only the first row of a motif carries the
            # cluster name, motif index and motif length.
            for apiPosition, api in enumerate(motifAPIs):
                if apiPosition == 0:
                    row = (key, motifIdx, motifLen, api)
                else:
                    row = ("", "", "", api)
                spamwriter.writerow(row)
            
# output residual information of SBBGCA

residualPicklePath = pickleDir + tag + '_residual.pickle'

# startClustering() writes the residual pickle only when the clustering returned
# a residual, so a missing file is a normal outcome and must not abort the cell.
if os.path.isfile(residualPicklePath):
    with open(residualPicklePath, 'rb') as handle:
        residual = pickle.load(handle)

    # append a "Residual Clusters" section to the GroupInfo.csv written above
    with open(outputPath + tag + "_GroupInfo.csv", 'a', newline='') as expandGroupInfo:
        spamwriter = csv.writer(expandGroupInfo, delimiter=',',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
        header = ["ClusterName", "Members", "MotifLength"]

        spamwriter.writerow("")  # empty row separating the two sections
        spamwriter.writerow(("Residual Clusters:","",""))
        spamwriter.writerow(header)

        for key, value in residual.items():
            clusterName = value[0][0]
            motifsList = value[0][1]
            motifLens = getMotifsLengthList(motifsList)
            members = value[1]
            if( len(members) == 0 ):
                # a residual without members is reported as "N/A"
                row = (clusterName, "N/A", motifLens)
            else:
                row = (clusterName, members, motifLens)

            spamwriter.writerow(row)
else:
    print("No residual pickle at " + residualPicklePath + " - residual section skipped.")

### Below cell used for drawing trees.

In [10]:
# draw pics

def drawClusteringResults(picklePath, outputPath, tag, upgma_threshold):
    
    #     Dependencies
    import pickle
    import scipy
    import scipy.cluster.hierarchy as sch
    import matplotlib.pylab as plt
    %matplotlib inline
    
    # read the results from pickle files
    with open(picklePath + tag + '_intermediate.pickle', 'rb') as handle:
        intermediate = pickle.load(handle)
    with open(picklePath + tag + '_initialDict.pickle', 'rb') as handle:
        initialDict = pickle.load(handle)
    with open(picklePath + tag + '_roundInfos.pickle', 'rb') as handle:
        roundInfos = pickle.load(handle)
        
    initialNameDict = getInitialNameDict(initialDict)
    print("Original Names : ", initialNameDict)
    print("round informations", roundInfos)
    
    # It have to create the Z structure for drawing purpose.
    Z = createStructZ(intermediate, initialNameDict)
    
    label_list = createLabelList(initialNameDict) # create graph labels by nameDict

    (orig_x, orig_y) = plt.rcParams['figure.figsize']
    plt.rcParams['figure.figsize'] = (6, 10) #---input

    # P = sch.dendrogram(Z, color_threshold = upgma_threshold, orientation = 'right') # no label
    P = sch.dendrogram(Z, color_threshold = upgma_threshold, labels = label_list, orientation = 'right')

#     plt.axvline(x=upgma_threshold, linewidth=1, color='black', linestyle='--')
    locs, labels = plt.yticks()
    # plt.xticks(  np.arange(0,1.1,0.1)) #---Align axis-x 900(0, 0.35, 0.05) 909(0,0.6,0.1) 855(0, 0.9, 0.1)
    plt.setp(labels, fontsize = 14)
    plt.tight_layout()

    plt.rcParams['figure.figsize'] = (orig_x, orig_y)
    plt.savefig(outputPath+'SBBGCA_'+tag+'.pdf', dpi=300)

In [11]:
# Usage example: draw the hierarchy graph of the clustering results.
# 0.01 is the colour threshold handed to the dendrogram.

drawClusteringResults(picklePath=pickleDir, outputPath=outputPath, tag=tag, upgma_threshold=0.01)

Original Names :  {'G12': '2e4850_3368', 'G39': 'ab9903_3220', 'G47': 'bfefab_3212', 'G6': '1192d4_3372', 'G14': '382d34_3292', 'G41': 'ac6a42_3284', 'G49': 'c9955c_3040', 'G2': '077940_3264', 'G26': '817033_2896', 'G36': '9fd1e8_3116', 'G40': 'ac6a42_1312', 'G45': 'bf6239_3268', 'G56': 'd5e1d2_3308', 'G25': '78ac49_3284', 'G9': '24d255_3264', 'G55': 'd3e023_3364', 'G31': '8bcf66_2916', 'G0': '008e76_3272', 'G8': '20a21e_3332', 'G44': 'bf6239_3212', 'G46': 'bfefab_3196', 'G37': 'a4d38d_3316', 'G32': '8e2bca_2992', 'G50': 'caecca_3200', 'G28': '85ed58_3356', 'G52': 'cfc485_2996', 'G27': '841bfb_3296', 'G18': '460dad_3324', 'G22': '63117f_3260', 'G42': 'ad7615_2928', 'G5': '1192d4_3296', 'G21': '524d5d_2992', 'G53': 'cfc485_3328', 'G35': '9b791b_3232', 'G19': '4a6eb4_2972', 'G23': '751eba_3328', 'G54': 'cfd1ac_3296', 'G17': '456ced_3316', 'G30': '897e05_2916', 'G7': '121340_2924', 'G57': 'dc813f_3288', 'G51': 'cf5d72_3320', 'G59': 'e64f40_3280', 'G38': 'a56c48_2920', 'G11': '2e4850_3284'

ValueError: Linkage 'Z' uses non-singleton cluster before it is formed.