## SBBGCA usage
Usage of SBBGCA.ipynb

### Set Variable 
Set the following variables before running the code:
data_directory, tag, outputPath

### Want to get clustering results
##### run startClustering() function.

It calls the do_SBBGCA_clustering() function in SBBGCA.ipynb, which returns (among other values) 2 results: intermediatePool and initialNames.

##### intermediatePool: a dict holding the intermediate results of the clustering
##### initialNames: a dict that contains the original names of the basic groups (from G0 to G_inputHooklogNumber)

The startClustering() function also saves these results as pickle files under outputPath (in its pickle/ subdirectory) so they can be reused later without re-running the clustering.
(Because the performance of SBBGCA is terrible Q_Q)

### Want to draw hierarchy graph of clustering result
##### run drawClusteringResults() function.

It reads the pickles of the clustering results and draws the graph using matplotlib.

In [None]:
# Basic global inputs shared by the cells below.

import os

manualThresholdNumber = 0.8  # threshold of the merge score

data_directory = "hooklogs/Test_Family/"
tag = "testFam_0.8"  # used as the prefix of the pickle file names
outputPath = "output/SBBGCA_Clustering/Test_Family/"
pickleDir = outputPath + "pickle/"

# Create the output directories if they do not exist yet.  exist_ok=True
# replaces the separate isdir() check, so there is no window between the
# check and the creation in which another process could create the directory.
os.makedirs(outputPath, exist_ok=True)
os.makedirs(pickleDir, exist_ok=True)

In [None]:
# reverse clusterID by hooklogName

def findClusterID(nameDict, hooklogName):
    """Return the key of nameDict whose value equals hooklogName.

    The first matching key in the dict's iteration order wins.  When no
    value matches, hooklogName itself is returned unchanged.
    """
    matching_ids = (
        clusterID for clusterID, name in nameDict.items() if name == hooklogName
    )
    return next(matching_ids, hooklogName)

In [None]:
def getInitialNameDict(initialDict):
    """Build a {key: name} dict from initialDict.

    Every value of initialDict is indexed with [0]; that first element is
    stored as the name under the same key.
    """
    return {clusterID: entry[0] for clusterID, entry in initialDict.items()}

In [None]:
# convert memberSet to List type

def getMemberList(memberSet, nameDict):
    """Return the members of memberSet as a list of cluster IDs.

    Each member is mapped through findClusterID(): a member that equals a
    name stored in nameDict is replaced by that name's key, and any other
    member is kept as-is.

    The members are read by iteration instead of being popped, so the
    caller's memberSet is left intact (popping emptied the set that the
    caller passed in).  The order of the returned list follows the
    iteration order of memberSet, which is arbitrary for a set.
    """
    return [findClusterID(nameDict, member) for member in memberSet]

In [None]:
# z[0] = g1,  z[1] = g2,  z[2] = height
# Create structure Z

def createStructZ(intermediate_dict, nameDict):
    """Build the 4-column array Z describing the clustering hierarchy.

    One row is written per entry of intermediate_dict, visiting the entries
    in ascending key order.  Row layout: [member1, member2, height, count]
    where member1/member2 are the first two member IDs with their first
    character stripped, height is 1 - score, and count is the number of
    members listed for the entry.
    """
    import numpy as np

    Z = np.zeros((len(intermediate_dict), 4))

    ordered_entries = sorted(intermediate_dict.items(), key=lambda pair: pair[0])

    for row, (_, entry) in enumerate(ordered_entries):
        distance = 1 - entry[0]  # entry[0] is the score; the height is 1 - score
        label = entry[1][0]
        # entry[2] is the member set; convert it to a list of cluster IDs
        members = getMemberList(entry[2], nameDict)
        print(label, " : ", members)

        # strip the first character of each ID (for an ID such as 'G31'
        # this leaves '31')
        first, second = members[0][1::], members[1][1::]

        Z[row] = [first, second, distance, len(members)]

    return Z

In [None]:
def createLabelList(nameDict):
    """Return the values of nameDict ordered by the number in their key.

    Everything after the first character of a key is parsed as an int and
    used as the sort key (i.e., 31 for 'G31').
    """
    def cluster_number(clusterID):
        return int(clusterID[1::])

    return [nameDict[clusterID] for clusterID in sorted(nameDict, key=cluster_number)]

In [None]:
# Do the clustering and write the results as pickle files (<tag>_intermediate.pickle, <tag>_initialDict.pickle, <tag>_roundInfos.pickle and, if present, <tag>_residual.pickle)
% run SBBGCA.ipynb

def startClustering(data_directory, tag, outputPath, thresholdValue=None):
    """Run the SBBGCA clustering and save its results as pickle files.

    Calls do_SBBGCA_clustering() and writes each object it returns to
    <outputPath>/pickle/<tag><suffix>.pickle:

      _intermediate : intermediatePool
      _initialDict  : initialDict
      _roundInfos   : roundInfos
      _residual     : residual (written only when it is not None)

    Parameters:
        data_directory, tag, outputPath, thresholdValue: forwarded
            unchanged to do_SBBGCA_clustering().  tag is also the prefix
            of the pickle file names, and outputPath is the directory
            under which the pickle/ subdirectory is created.

    Returns:
        None.
    """
    import os
    import pickle

    # Derive the pickle directory from the outputPath argument instead of
    # relying on a module-level variable, and make sure it exists before
    # anything is written.  makedirs() also creates outputPath itself.
    pickle_dir = os.path.join(outputPath, "pickle")
    os.makedirs(pickle_dir, exist_ok=True)

    intermediatePool, initialDict, roundInfos, residual = do_SBBGCA_clustering(
        data_directory, tag, outputPath, thresholdValue)

    results = [
        ('_intermediate', intermediatePool),
        ('_initialDict', initialDict),
        ('_roundInfos', roundInfos),
    ]
    # the residual is optional: it is only saved when the clustering produced one
    if residual is not None:
        results.append(('_residual', residual))

    for suffix, result in results:
        pickle_file = os.path.join(pickle_dir, tag + suffix + '.pickle')
        with open(pickle_file, 'wb') as handle:
            pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Usage example: run the clustering and print a timestamp before and after it.

import datetime

# timestamp taken right before the clustering starts
date_time = datetime.datetime.now()
print(format(date_time, "%Y-%b-%d %H:%M"))

startClustering(data_directory, tag, outputPath, thresholdValue=manualThresholdNumber)

# timestamp taken right after the clustering has finished
date_time = datetime.datetime.now()
print(format(date_time, "%Y-%b-%d %H:%M"))

In [None]:
# draw pics

def drawClusteringResults(picklePath, outputPath, tag, upgma_threshold):
    
    #     Dependencies
    import pickle
    import scipy
    import scipy.cluster.hierarchy as sch
    import matplotlib.pylab as plt
    %matplotlib inline
    
    # read the results from pickle files
    with open(picklePath + tag + '_intermediate.pickle', 'rb') as handle:
        intermediate = pickle.load(handle)
    with open(picklePath + tag + '_initialDict.pickle', 'rb') as handle:
        initialDict = pickle.load(handle)
    with open(picklePath + tag + '_roundInfos.pickle', 'rb') as handle:
        roundInfos = pickle.load(handle)
        
    initialNameDict = getInitialNameDict(initialDict)
    print("Original Names : ", initialNameDict)
    print("round informations", roundInfos)
    
    # It have to create the Z structure for drawing purpose.
    Z = createStructZ(intermediate, initialNameDict)
    
    label_list = createLabelList(initialNameDict) # create graph labels by nameDict

    (orig_x, orig_y) = plt.rcParams['figure.figsize']
    plt.rcParams['figure.figsize'] = (6, 10) #---input

    # P = sch.dendrogram(Z, color_threshold = upgma_threshold, orientation = 'right') # no label
    P = sch.dendrogram(Z, color_threshold = upgma_threshold, labels = label_list, orientation = 'right')

#     plt.axvline(x=upgma_threshold, linewidth=1, color='black', linestyle='--')
    locs, labels = plt.yticks()
    # plt.xticks(  np.arange(0,1.1,0.1)) #---Align axis-x 900(0, 0.35, 0.05) 909(0,0.6,0.1) 855(0, 0.9, 0.1)
    plt.setp(labels, fontsize = 14)
    plt.tight_layout()

    plt.rcParams['figure.figsize'] = (orig_x, orig_y)
    plt.savefig(outputPath+'SBBGCA_'+tag+'.pdf', dpi=300)

In [None]:
# Usage example: draw the hierarchy graph from the pickles written by the
# clustering step, using 0.01 as the dendrogram colour threshold.

drawClusteringResults(
    picklePath=pickleDir,
    outputPath=outputPath,
    tag=tag,
    upgma_threshold=0.01,
)