In [None]:
from ipynb.fs.full.functions_header import *
from scipy import stats

In [1]:
# Modularity maximisation
def clauset_old(G):
    '''Run Clauset-Newman-Moore greedy modularity maximisation on network G.

    Returns a pair of integer column vectors:
      * sizes, shape (number of communities, 1): row k holds the number of
        nodes in community k, in the order the communities are returned by
        greedy_modularity_communities.
      * allocation, shape (len(G.nodes), 1): row n holds the community
        number assigned to node n.

    The node labels are used directly as row indices into the allocation
    array, so they must be valid integer indices for an array of length
    len(G.nodes).'''
    communities = list(greedy_modularity_communities(G))

    # one row per community holding its member count
    sizes = np.array([len(members) for members in communities], dtype=int).reshape(-1, 1)

    # one row per node; every node starts at 0 and is overwritten with the
    # number of the community that contains it
    allocation = np.zeros((len(G.nodes), 1))
    for label, members in enumerate(communities):
        allocation[sorted(members), 0] = label

    return sizes, allocation.astype(int)

# Modularity maximisation
def clauset(G, n_nodes=1468):
    '''Run Clauset-Newman-Moore greedy modularity maximisation on network G.

    Parameters:
      G       -- network passed to greedy_modularity_communities. Its node
                 labels are used directly as row indices, so they must be
                 integers in the range [0, n_nodes).
      n_nodes -- number of rows in the returned allocation vector. Defaults
                 to 1468, the value that used to be hard-coded here; pass
                 another value for networks drawn from a different number
                 of samples.

    Returns an integer column vector of shape (n_nodes, 1) in which row n
    holds the community number assigned to node n.

    NOTE: rows whose index does not appear in any community keep their
    initial value 0, which cannot be told apart from membership of
    community 0.'''
    communities = list(greedy_modularity_communities(G))

    clusts_alloc = np.zeros((n_nodes, 1))

    # write each community's number into the rows of its member nodes
    for label, members in enumerate(communities):
        clusts_alloc[sorted(members), 0] = label

    return clusts_alloc.astype(int)

In [None]:
# Reindexing clusters
def cluster_reindex(df_cluster, ind_start):
    '''Relabel every clustering so that labels are ordered by cluster size.

    Input: df_cluster, a data frame of clusters; ind_start is the position of
    the first column holding cluster assignments (every column from there on
    is treated as one clustering).

    Output: (cluster_size, cluster_copy)
      * cluster_size: one column per clustering; row r holds the size of the
        cluster that is relabelled r. Rows beyond the number of clusters
        found in a column stay 0.
      * cluster_copy: copy of df_cluster with the assignments relabelled,
        i.e. 0 is the largest cluster, 1 is the second largest cluster etc.
        df_cluster itself is not modified.

    Example: cluster_reindex(parasites_cluster,5)'''
    assignment_cols = df_cluster.columns[ind_start:]

    # largest label over all clusterings + 1 = maximum number of clusters
    cluster_size_max = df_cluster.iloc[:, ind_start:].max().max() + 1
    # number of clusterings performed
    cluster_no = df_cluster.shape[1] - ind_start

    cluster_copy = df_cluster.copy()
    cluster_size = pd.DataFrame(np.zeros((cluster_size_max, cluster_no), dtype=int),
                                columns=assignment_cols)

    for col in assignment_cols:
        # original labels in the first column, sorted by cluster size
        # (sizes are in the second column)
        ordered = cluster_order(df_cluster[col], style='size')
        n_found = ordered.shape[0]

        # Single-step .loc writes: the chained form frame[col][rows] = ...
        # may assign into a temporary copy and leave the frame untouched
        # (always the case under pandas Copy-on-Write).
        cluster_size.loc[np.arange(n_found), col] = ordered[:, 1]

        for new_label in range(n_found):
            # Membership is always read from the untouched df_cluster, so
            # labels written earlier in this loop can never be relabelled
            # a second time.
            members = df_cluster[col] == ordered[new_label, 0]
            cluster_copy.loc[members, col] = new_label

    return cluster_size, cluster_copy

# functions for cluster analysis
def return_cluster(data, ind, clust, str_col):
    '''Return the set of values in column `ind` for the rows assigned to a cluster.

    A row belongs to the cluster when its entry in column `str_col` equals
    `clust`. The result is empty when no row matches.
    Example: return_cluster(parasites_cluster_reindex, 'Sample', 0, 'louvain_25' )'''
    identifiers = data[ind]
    in_cluster = data[str_col] == clust
    return set(identifiers[in_cluster])

def compare_cluster(data, ind, clust_ind1, clust_ind2, str_col1, str_col2):
    '''Return the members shared by two clusters, as a set.

    The first cluster is `clust_ind1` in clustering column `str_col1`, the
    second is `clust_ind2` in clustering column `str_col2`. `ind` is the
    column whose values identify the members (want 'Sample' as default).
    Example: compare_cluster(parasites_cluster_reindex, 'Sample', 0, 0, 'HC_single', 'louvain_25')'''
    first_members = return_cluster(data, ind, clust_ind1, str_col1)
    second_members = return_cluster(data, ind, clust_ind2, str_col2)
    return first_members.intersection(second_members)

def cluster_order(cluster_list, style='size'):
    '''Count the members of each cluster and return the counts in sorted order.

    Parameters:
      cluster_list -- iterable of cluster labels, one entry per point.
      style        -- 'size' (default): largest cluster first;
                      'index': ascending cluster label.

    Returns a 2-column array: cluster label in the first column and the
    corresponding size in the second column. Ties keep the order in which
    the labels first appear in cluster_list. An empty cluster_list gives an
    empty array of shape (0, 2).

    Raises ValueError if style is neither 'size' nor 'index'.

    Example: cluster_order(parasites_cluster['louvain_25'])'''
    if style not in ('size', 'index'):
        raise ValueError("style must be 'size' or 'index', got {!r}".format(style))

    clusterCount = collections.Counter(cluster_list)
    if not clusterCount:
        # nothing to count; keep the 2-column layout callers index into
        return np.empty((0, 2), dtype=int)

    labels = np.array(list(clusterCount.keys()))
    counts = np.array(list(clusterCount.values()))
    table = np.column_stack((labels, counts))

    # stable sort so that equally sized clusters come out in a reproducible order
    if style == 'size':
        order = np.argsort(-counts, kind='stable')
    else:
        order = np.argsort(labels, kind='stable')
    return table[order]

def get_cluster_features(data, ind, clust, str_col, att):
    '''Return the `att` column for the points assigned to the given cluster.

    The values of column `ind` for the cluster members are used as
    positional row numbers into `data` (via iloc), so `ind` must hold row
    positions.
    Note: use 'index' instead of 'sample' in order to get right features.
    Example: get_cluster_features(parasites_cluster_reindex, 'index', 0, 'louvain_25', 'k13Class')'''
    row_positions = list(return_cluster(data, ind, clust, str_col))
    cluster_rows = data.iloc[row_positions, :]
    return cluster_rows[att]

# give cluster features for one type of clustering
def get_cluster_features_meta(meta, clustering, att):
    '''Print, for every cluster label from 0 to clustering.max(), a count of
    the values of attribute `att` among the rows of `meta` in that cluster.

    `clustering` gives one cluster label per row of `meta`. Nothing is returned.
    Example: get_cluster_features_meta(meta, parasites_cluster_reindex['karrer_sbm_50'], 'k13Class')'''
    n_labels = clustering.max() + 1
    for label in range(n_labels):
        print("\n")
        print("Cluster: ", label)
        # index labels of the rows of meta that fall in this cluster
        member_index = list(meta.index[clustering == label])
        member_values = meta[att][member_index]
        print(collections.Counter(member_values))

def count_cluster_features(data, ind, clust, str_col, att):
    '''Count how often each value of attribute `att` occurs among the points of the given cluster.

    Returns a collections.Counter built from get_cluster_features.
    Example: count_cluster_features(parasites_cluster_reindex, 'index', 0, 'louvain_25', 'k13Class' )'''
    return collections.Counter(get_cluster_features(data, ind, clust, str_col, att))

# print all cluster contents
def cluster_contents(df_cluster, ind_start, cluster_no):
    '''Print what is in each cluster for every clustering column.

    Clustering columns are those from position ind_start onwards. For each
    of them, and for each of the first cluster_no clusters, the value counts
    of the attributes in columns 1 to 3 of df_cluster are printed
    (year, site and k13Class according to the original notes).
    Example: cluster_contents(parasites_cluster_reindex, 5, 4)'''
    attribute_cols = df_cluster.columns[1:4]

    for clustering_col in df_cluster.columns[ind_start:]:
        # one banner per clustering algorithm
        print('\n -----------------', clustering_col, '-----------------\n')

        for label in range(cluster_no):
            # only the first cluster_no clusters are reported
            print('\n Cluster: ', label, '\n')

            for attribute in attribute_cols:
                print(count_cluster_features(df_cluster, 'index', label, clustering_col, attribute))
                
def cluster_contents_cluster(df_cluster, cluster_ind, cluster_no):
    '''Print what is in each cluster of a single clustering column.

    cluster_ind is the name of the clustering column. For each of the first
    cluster_no clusters, the value counts of the attributes in columns 1 to 3
    of df_cluster are printed.'''
    attribute_cols = df_cluster.columns[1:4]

    print('\n -----------------', cluster_ind, '-----------------\n')
    for label in range(cluster_no):
        # only the first cluster_no clusters are reported
        print('\n Cluster: ', label, '\n')

        for attribute in attribute_cols:
            print(count_cluster_features(df_cluster, 'index', label, cluster_ind, attribute))
    

In [None]:
def mcmc_cluster(path):
    '''Convert MCMC samples into clusters by taking the posterior mode.

    Reads a header-less CSV from `path` and returns a 1-D array holding, for
    each column of the file, the most frequent value in that column.'''
    samples = pd.read_csv(path, header=None)
    # transpose so that each row collects all draws of one file column
    draws_per_column = samples.T.values
    mode_values, _ = stats.mode(draws_per_column, axis=1)
    return mode_values.reshape(-1)