# Packages

In [1]:
%config Completer.use_jedi = False

# Standard Python packages
import pandas as pd
import numpy as np
import collections
from IPython.display import display, display_html

# Standard plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Import Network Analysis Packages
import networkx as nx

# Science packages
from scipy.spatial.distance import squareform
from statsmodels.distributions.empirical_distribution import ECDF

# sklearn packages for Clustering
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from networkx.algorithms.community import greedy_modularity_communities

import community as community_louvain

# import self created functions
from ipynb.fs.full.functions_cluster_analysis import *
from ipynb.fs.full.functions_data_manipulation import *
from ipynb.fs.full.functions_karrer import *

# Disabled alternative import of networkx's community sub-module (kept for reference).
# from networkx.algorithms import community
# Disabled install hint for the Louvain dependency.
# NOTE(review): `import community as community_louvain` above is presumably provided by
# the `python-louvain` distribution; whether the separate `community` package is needed
# should be confirmed before re-enabling this line.
# !pip install community, python-louvain
# Apply seaborn's 'whitegrid' style to the plots produced in this notebook.
sns.set(style = 'whitegrid')

# Reading Data

We read in the data `kelch_IBD_graph.csv` and rename the column `'Probability_of_edge'` to `'weight'` as the NetworkX package recognizes this immediately as a weight-attribute. Then we further index the parasites with numbers from 0 to 1467 accordingly, which will make the indexing easier.

In [2]:
# Read the per-sample meta data from `meta_data.csv` into `meta`
# (it is displayed together with the edge list in the next cell).
meta = pd.read_csv("meta_data.csv")

# Disabled: lookup from the site codes to human-readable region names.
# geo = {"ETH": "Eastern Thailand", "WKH": "Western Cambodia", "NEKH": "North Eastern Cambodia",
#        "NKH": "North Cambodia", "VN": "Vietnam", "LA": "Laos"}

# Disabled: integer encodings (codes + unique values) of the categorical
# meta-data columns 'Site', 'k13Class' and 'Year'.
# geo_codes, geo = pd.factorize(meta['Site'])
# k13_codes, k13 = pd.factorize(meta['k13Class'])
# year_codes, year = pd.factorize(meta['Year'])

In [3]:
# Load the weighted edge list and rename its columns: 'Probability_of_edge' becomes
# 'weight' (so it is recognised as the weight attribute), 'V1'/'V2' become
# 'source'/'target'.
data = (
    pd.read_csv("kelch_IBD_graph.csv")
      .rename(columns={'Probability_of_edge': 'weight', 'V1': 'source', 'V2': 'target'})
)

# Factorise all parasite labels in one go (all sources first, then all targets) so
# that each distinct parasite gets one integer code, assigned in order of first
# appearance. `parasites` holds the labels, `codes` the code of every endpoint.
codes, parasites = pd.factorize(np.concatenate((data['source'], data['target'])))

# integer code -> parasite label
parasites_dict = dict(enumerate(parasites))

# Derived from the data instead of being hard-coded, so the cell stays correct if
# the input file changes.
n = len(parasites)  # number of parasites (1468 for this data set)
n1 = len(data)      # number of non-zero entries in adjacency matrix (upper triangular part) (1076778)

# add columns, where parasites are indices from 0 to n-1:
# the first n1 codes belong to the sources, the remaining n1 to the targets
data['source_ind'] = codes[:n1]
data['target_ind'] = codes[n1:]
# data['type'] = 'undirected'

# keep only the columns used downstream, in a fixed order
data = data.reindex(['source', 'target', 'source_ind', 'target_ind', 'weight'], axis = 1)

# display everything
display(data)
display(meta)

Unnamed: 0,source,target,source_ind,target_ind,weight
0,PD0498-C,PD0500-C,0,1,0.000003
1,PD0498-C,PD0501-C,0,2,0.000004
2,PD0498-C,PD0502-C,0,3,1.000000
3,PD0498-C,PD0575-C,0,4,1.000000
4,PD0498-C,PD0578-C,0,5,0.966176
...,...,...,...,...,...
1076773,RCN13098,RCN13101,1464,1466,0.000124
1076774,RCN13098,RCN13103,1464,1467,0.999985
1076775,RCN13100,RCN13101,1465,1466,0.000391
1076776,RCN13100,RCN13103,1465,1467,0.880700


Unnamed: 0,Sample,Site,Year,k13Class,crt_class
0,PD0498-C,ETH,2011,R539T,no_nea
1,PD0500-C,ETH,2011,C580Y,no_nea
2,PD0501-C,ETH,2011,C580Y,no_nea
3,PD0502-C,ETH,2011,R539T,no_nea
4,PD0575-C,ETH,2012,R539T,no_nea
...,...,...,...,...,...
1463,RCN13097,NEKH,2017,C580Y,T93S
1464,RCN13098,NEKH,2017,C580Y,T93S
1465,RCN13100,NEKH,2018,C580Y,T93S
1466,RCN13101,NEKH,2018,WT,H97Y


# Sampling Binary Graphs

In this section, we threshold the data at different thresholds $\alpha$ between 0 and 1. We use a step size of $\frac{1}{20}$ (i.e. `r = 20`, giving $\alpha = 0.05, 0.10, \dots, 1$) as our thresholds in the function `data_thresh`, which returns only the edges with `'weight'` $> \alpha$. The computation takes a while, so we save the relevant summary statistics into data frames, which can be reused later or when the kernel is rerun.

Note that in particular, we will have access to matrices which give relevant *local* quantities (e.g. degree, local clustering coefficient, centrality etc.) for each parasite at a given threshold $\alpha$. We will summarize *global* quantities (e.g. average degree, average local clustering coefficient, global clustering coefficient, density etc.) in the `data_summary`, or `summary_statistics.csv`.

In [4]:
# Threshold the data at r equally spaced levels and collect, for every threshold,
# per-parasite statistics (one row per parasite index, one column per threshold)
# and two global statistics of the resulting binary graph.
r = 20                             # number of thresholds: 1/r, 2/r, ..., 1
X_deg = np.zeros((n, r))           # degree distribution
X_clust = np.zeros((n, r))         # matrix for local clustering coefficient
X_centrality = np.zeros((n, r))    # eigenvector centrality
X_between = np.zeros((n, r))       # betweenness centrality
X_close = np.zeros((n, r))         # closeness centrality

X_density = np.zeros((r, 1))       # Density
X_transitivity = np.zeros((r, 1))  # Global clustering coefficient

# Betweenness and closeness centrality were disabled in the original cell
# (presumably for run time). They are opt-in here; while the flag is False,
# X_between and X_close stay all-zero.
compute_path_centralities = False


def _fill_column(X, col, node_values):
    """Write a {node index: value} mapping into column `col` of `X`.

    The nodes are used directly as row indices, so they must be integers in
    0..n-1; nodes that are absent from the mapping keep their initial value 0.
    """
    X[list(node_values.keys()), col] = list(node_values.values())


for i in range(r):
    alpha = (i + 1) / r
    print("Threshold: ", alpha)

    # Binary graph at this threshold. The full thresholded graph is used (not only
    # its largest component) because the matrices above have one row per parasite.
    # NOTE(review): the original assignment to G_bin was left incomplete -- confirm
    # that the full graph is what was intended here.
    G_main = get_G_bin(data, alpha, 'weight')
    G_bin = G_main

    # local (per-parasite) statistics
    _fill_column(X_deg, i, dict(G_bin.degree()))
    _fill_column(X_centrality, i, nx.eigenvector_centrality(G_bin))
    _fill_column(X_clust, i, nx.clustering(G_bin))

    if compute_path_centralities:
        _fill_column(X_between, i, nx.betweenness_centrality(G_bin))
        _fill_column(X_close, i, nx.closeness_centrality(G_bin))

    # global statistics
    X_density[i, :] = nx.density(G_bin)
    X_transitivity[i, :] = nx.transitivity(G_bin)

Threshold:  0.05
Threshold:  0.1
Threshold:  0.15
Threshold:  0.2
Threshold:  0.25
Threshold:  0.3


KeyboardInterrupt: 

In [14]:
# For each of r equally spaced thresholds, build the binary graph, restrict it to
# its largest connected component (LCC) and record global statistics of that
# component. Every array has one row per threshold.
#
# NOTE: X_deg, X_clust, X_density and X_transitivity are (re-)bound here as (r, 1)
# arrays of LCC statistics; arrays of the same names created elsewhere in the
# notebook are replaced by running this cell.
r = 20

(X_deg,            # Average degree
 X_edges,          # no. of edges
 X_nodes,          # no. of nodes
 X_clust,          # avg local clustering coefficient
 X_density,        # Density
 X_transitivity,   # Global clustering coefficient
 ) = (np.zeros((r, 1)) for _ in range(6))

for i in range(r):
    alpha = (i + 1) / r
    print("Threshold: ", alpha)

    # thresholded graph and the sub-graph induced by its largest component
    G_main = get_G_bin(data, alpha, 'weight')
    lcc_ind = list(max(nx.connected_components(G_main), key=len))
    G_bin = G_main.subgraph(lcc_ind)

    # degrees of the nodes inside the component
    G_degrees = [deg for _, deg in G_bin.degree()]

    # size of the component
    X_nodes[i, 0] = G_bin.number_of_nodes()
    X_edges[i, 0] = G_bin.number_of_edges()

    # connectivity statistics of the component
    X_deg[i, 0] = np.mean(G_degrees)
    X_clust[i, 0] = nx.average_clustering(G_bin)
    X_density[i, 0] = nx.density(G_bin)
    X_transitivity[i, 0] = nx.transitivity(G_bin)

Threshold:  0.05
Threshold:  0.1
Threshold:  0.15
Threshold:  0.2
Threshold:  0.25
Threshold:  0.3
Threshold:  0.35
Threshold:  0.4
Threshold:  0.45
Threshold:  0.5
Threshold:  0.55
Threshold:  0.6
Threshold:  0.65
Threshold:  0.7
Threshold:  0.75
Threshold:  0.8
Threshold:  0.85
Threshold:  0.9
Threshold:  0.95
Threshold:  1.0


In [22]:
# Assemble the per-threshold statistics of the largest connected component into one
# (r x 7) table -- first column is the threshold itself -- and store it as CSV.
lcc_thresholds = np.arange(1, r + 1).reshape(r, 1) / r

lcc_columns = {
    'Threshold': lcc_thresholds,
    'No_nodes': X_nodes,
    'No_edges': X_edges,
    'avg_degree': X_deg,
    'avg_clust_local': X_clust,
    'density': X_density,
    'transitivity': X_transitivity,
}

# every entry is an (r, 1) column, so stacking them side by side gives the table
data_summary_lcc = np.column_stack(tuple(lcc_columns.values()))

df_lcc = pd.DataFrame(data_summary_lcc, columns = list(lcc_columns.keys()))
df_lcc.to_csv("Excel files/Summaries/summaries_lcc.csv", index = False)

In [27]:
# Summarise the thresholded graphs (one row per threshold) and store the summary
# together with the per-parasite matrices as CSV files.
#
# This cell needs the per-parasite arrays of shape (n, r) produced by the per-node
# thresholding loop. The largest-connected-component cell re-binds X_deg and X_clust
# to (r, 1) arrays, so fail early with a clear message instead of an obscure
# reshape error if those are what is currently in the kernel.
# NOTE: X_density and X_transitivity have shape (r, 1) in both cells, so they cannot
# be checked here -- make sure they also come from the per-node thresholding loop.
per_node = {
    'X_deg': X_deg,
    'X_clust': X_clust,
    'X_centrality': X_centrality,
    'X_between': X_between,
    'X_close': X_close,
}
for name, mat in per_node.items():
    if mat.shape != (n, r):
        raise ValueError(
            f"{name} has shape {mat.shape}, expected {(n, r)}; "
            "re-run the per-parasite thresholding cell before this one."
        )

thresholds = np.arange(1, r + 1) / r   # 1/r, 2/r, ..., 1

# Store values summarising the graph
deg_total = X_deg.sum(axis = 0)        # sum of all degrees per threshold
data_summary = np.column_stack((thresholds,                        # Threshold indices
                                np.count_nonzero(X_deg, axis = 0), # Number of nodes (degree > 0)
                                deg_total / 2,                     # Number of edges
                                deg_total / n,                     # average degree
                                X_clust.sum(axis = 0) / n,         # average clustering coefficient
                                X_transitivity,                    # Global clustering coefficient
                                X_density,                         # Density
                               ))

df = pd.DataFrame(data_summary, columns = ['Threshold',
                                           'Number of nodes',
                                           'Number of edges',
                                           'Average degree',
                                           'Average local clustering coefficient',
                                           'Global clustering coefficient',
                                           'Density'])

df.to_csv("summary_statistics.csv",index = False)

# Per-parasite matrices: one column per threshold, one row per parasite index.
# X_between and X_close are zero-filled unless the betweenness/closeness
# computations in the thresholding loop are switched on.
df_deg, df_clust, df_centrality, df_between, df_close = (
    pd.DataFrame(mat, columns = thresholds) for mat in per_node.values()
)

for frame, path in ((df_deg, "summary_degree.csv"),
                    (df_clust, "summary_clustering.csv"),
                    (df_centrality, "summary_centrality.csv"),
                    (df_between, "summary_between.csv"),
                    (df_close, "summary_close.csv")):
    frame.to_csv(path, index = False)

display(df)

NameError: name 'r' is not defined