In [None]:
from ipynb.fs.full.functions_header import *

In [None]:
# functions for data manipulation and graph creation

def filter_data(data, str_col, list_filter):
    '''Keep only the rows whose value in column str_col is one of list_filter.

    The input frame is copied first, so `data` itself is never modified.
    Example: filter_data(train_df, 'Patient', ['ID00007637202177411956430'])'''
    snapshot = data.copy()
    keep_rows = snapshot[str_col].isin(list_filter)
    return snapshot[keep_rows]

def entries_thresh(data, thresh, str_col):
    '''Return a 0/1 integer indicator per row for column str_col against thresh.

    A row is flagged (1) when its value is strictly greater than thresh. The
    one exception is thresh == 1, where the comparison is inclusive (>=).
    Example: entries_thresh(data, 0.1, 'weight')'''
    column = data[str_col]
    # thresh == 1 is the only case that uses an inclusive comparison.
    passes = (column >= thresh) if thresh == 1 else (column > thresh)
    return passes.astype(int)

def data_thresh(data, thresh, str_col):
    '''Return the rows of data that pass the threshold on column str_col.

    The pass/fail rule is the one implemented by entries_thresh; the input
    frame is copied before filtering.
    Example: data_thresh(data, 0.1, 'weight')'''
    snapshot = data.copy()
    is_kept = entries_thresh(snapshot, thresh, str_col) == 1
    return snapshot.loc[is_kept]

def get_G_square(array_dense):
    '''Build a graph from pairwise values given in condensed (vector) form.

    array_dense is passed through squareform, which expands a condensed
    vector of pairwise values into the symmetric square matrix that is then
    used as the adjacency matrix. Non-zero entries become edges.
    Example: get_G_square(data['weight'])'''

    array_adj = squareform(array_dense)
    # nx.from_numpy_matrix was removed in networkx 3.0; from_numpy_array is
    # its replacement. Fall back to the old name on releases that lack it.
    build_graph = getattr(nx, 'from_numpy_array', None)
    if build_graph is None:
        build_graph = nx.from_numpy_matrix
    return build_graph(array_adj)

def get_G_bin(data, thresh, str_col):
    '''Return the graph obtained by thresholding column str_col of data.

    The column is first turned into 0/1 indicators by entries_thresh and the
    indicators are then handed to get_G_square.
    Example: get_G_bin(data, 0.5, 'weight')'''
    indicators = entries_thresh(data, thresh, str_col)
    return get_G_square(indicators)

def data_remove(data, meta, meta_col, meta_class_list):
    '''Remove every row of data that touches a sample of the given class(es).

    Samples are looked up in meta: those whose meta_col value is in
    meta_class_list are collected from the 'Sample' column. Any row of data
    whose 'V1' (source node) or 'V2' (target node) is one of those samples is
    dropped. The printed count is the number of matching samples in meta,
    not the number of rows removed. data itself is not modified.
    Example: data_remove(data, meta, 'k13Class', ['C580Y'])'''
    meta_list = list(meta['Sample'][meta[meta_col].isin(meta_class_list)])
    print(len(meta_list), 'parasites removed.')

    # Filter with a positional boolean mask rather than drop(index labels):
    # dropping by label would also delete unrelated rows that happen to share
    # an index label when the index is not unique.
    in_source = data['V1'].isin(meta_list)
    in_target = data['V2'].isin(meta_list)
    keep_rows = ~(in_source | in_target).to_numpy()
    return data[keep_rows].copy()

In [None]:
#### plotting functions

def plt_deg(degree_sequence, base = 0):
    '''Scatter-plot the degree distribution: x = degree, y = number of nodes.

    base selects the axis transform applied to both axes: 10 for log10,
    'e' for the natural log, anything else for no transform.
    Example: plt_deg(G_degrees, base = 'e')'''
    tally = collections.Counter(degree_sequence)
    deg, cnt = zip(*tally.items())

    plt.figure(figsize=(15,6))
    # Pick the (x, y) values first, then draw once.
    if base == 10:
        xs, ys = np.log10(deg), np.log10(cnt)
    elif base == 'e':
        xs, ys = np.log(deg), np.log(cnt)
    else:
        xs, ys = deg, cnt

    ax = sns.regplot(x = xs, y = ys, fit_reg = False)
    ax.set(xlabel = 'Degree', ylabel = 'Number of nodes with degree', title='Degree distribution')
    plt.show()
    
def plt_deg_cdf(degree_sequence, base = 0):
    '''Plot the survival function (1 - ecdf) of a degree sequence.

    The empirical CDF is built with ECDF (brought in by the header import);
    the plotted quantity 1 - ecdf(d) is the fraction of nodes with degree
    strictly greater than d, i.e. P(D > d).

    base selects the transform applied to both axes: 10 for log10, 'e' for
    the natural log, anything else (default 0) for no transform. The default
    mirrors plt_deg so both plotting helpers can be called the same way.
    Example: plt_deg_cdf(G_degree_strength, base = 'e')'''
    ecdf = ECDF(degree_sequence)
    survival = 1 - ecdf.y

    plt.figure(figsize=(15,6))
    if (base == 10):
        ax = sns.regplot(x = np.log10(ecdf.x), y = np.log10(survival), fit_reg = False)
    elif (base == 'e'):
        ax = sns.regplot(x = np.log(ecdf.x), y = np.log(survival), fit_reg = False)
    else:
        ax = sns.regplot(x = ecdf.x, y = survival, fit_reg = False)

    # The y axis shows 1 - ecdf, which is P(D > d) (not 1 - P(D > d)).
    ax.set(xlabel = 'Degree', ylabel = 'P(D > d)', title = 'Degree distribution (1-ecdf)')
    plt.show()
    

def plt_boxplots(data_y, meta, title):
    '''Draw a 2x2 grid of boxplots of data_y grouped by four meta columns.

    Panels, in row-major order: 'Site', 'Year', 'k13Class', 'crt_class'.
    Each panel title is the given title followed by the grouping name.
    Example: plt_boxplots(df_deg.mean(axis = 1), meta, 'Test')'''
    _, axes = plt.subplots(nrows = 2, ncols = 2, figsize = (20,12))

    # (meta column, title suffix) for each panel, in row-major grid order.
    panels = (
        ('Site', " distribution by Site"),
        ('Year', " distribution by Year"),
        ('k13Class', " distribution by k13Class"),
        ('crt_class', " distribution by CRT class"),
    )

    for axis, (column, suffix) in zip(axes.flat, panels):
        sns.boxplot(x = meta[column], y = data_y, ax = axis)
        axis.set_title(title + suffix)

In [None]:
# Sampling functions
def sample_G(df, str_weight, seed):
    '''Sample a random graph by drawing each edge from a Bernoulli trial.

    The values in column str_weight are used as per-edge success
    probabilities. NumPy's global random state is seeded with seed before
    drawing. Returns the graph built by get_G_square together with the
    sampled 0/1 array.'''
    np.random.seed(seed)
    edge_probs = np.array(df[str_weight])
    draws = np.random.binomial(n = 1, p = edge_probs)
    sampled = np.transpose(draws)  # sampled edge indicators
    graph = get_G_square(sampled)
    return graph, sampled

def get_G_lcc(G):
    '''Return the largest connected component of G and its node list.

    Returns a pair (subgraph, nodes): the subgraph of G induced by the
    biggest connected component, and that component's nodes as a list.'''
    components = nx.connected_components(G)
    biggest = max(components, key=len)
    Gc_ind = list(biggest)
    return G.subgraph(Gc_ind), Gc_ind

def sample_G_G_lcc(df, str_weight, seed):
    '''Sample a graph and extract its largest connected component in one call.

    Returns (G, B, G_lcc, G_lcc_ind): the outputs of sample_G followed by
    the outputs of get_G_lcc applied to the sampled graph.'''
    graph_and_draws = sample_G(df, str_weight, seed)
    lcc_and_nodes = get_G_lcc(graph_and_draws[0])
    return graph_and_draws[0], graph_and_draws[1], lcc_and_nodes[0], lcc_and_nodes[1]

# def get_ind_gc(data, ind, str_col):
#     '''Example: ~parasites_cluster['index'].isin(Gc_weight)'''
#     return(data[str_col].isin(ind))

def get_edgelist_lcc(df, Gc_ind, source_str, target_str):
    '''Return the rows of df whose two endpoints both belong to Gc_ind.

    df is an edge list; source_str and target_str name its two endpoint
    columns. A row is kept only when both its source and its target value
    appear in Gc_ind (e.g. the node list returned by get_G_lcc). A copy is
    returned, so df is left untouched.'''
    # Membership is tested on the df argument itself (not on a notebook
    # global), so the function works for whatever edge list is passed in.
    source_in = df[source_str].isin(Gc_ind)
    target_in = df[target_str].isin(Gc_ind)
    # Use a positional mask so the result does not depend on df's index.
    keep_rows = (source_in & target_in).to_numpy()
    return df[keep_rows].copy()

def G_add_weight(G1, df):
    '''Return a copy of G1 with edge weights filled in from df.

    df must have the columns 'source_ind', 'target_ind' and 'weight'. For
    each of the first len(G.edges()) rows, the 'weight' attribute of the edge
    (source_ind, target_ind) is set to the row's weight. G1 is not modified.

    Prints "Length of df accepted" when df has exactly one row per edge.
    Raises IndexError when df has fewer rows than the graph has edges, and
    KeyError when a row names an edge that is not in the graph.'''
    G = G1.copy()
    n_edges = len(G.edges())

    if n_edges == len(df):
        print ("Length of df accepted")
    elif len(df) < n_edges:
        # One row per edge is required; fail up front with the same
        # exception type a positional out-of-bounds lookup would raise.
        raise IndexError(
            "df has %d rows but the graph has %d edges" % (len(df), n_edges))

    # Read the three columns directly instead of building a row Series per
    # edge: a mixed-dtype row would upcast integer node ids to float.
    rows = df.iloc[:n_edges]
    for source_ind, target_ind, weight in zip(
            rows['source_ind'], rows['target_ind'], rows['weight']):
        G[source_ind][target_ind]['weight'] = weight

    return(G)