We will use the data `kelch_IBD_graph.csv` which, given two parasites, gives the probability of *identity by descent* (IBD). In our data, we will refer to this as `'weight'`. Here, we analyze the data in the form of a weighted graph.

The main package for network analysis will be the <font color = blue>NetworkX</font> package in Python. We also use <font color = blue>sklearn</font> and <font color = blue>scipy</font> to look into *hierarchical clustering*.

In `useful_functions.py`, we define the functions `entries_thresh` and `data_thresh`, which return the binary entries 0 and 1, as well as the "updated" data set, respectively. We also use `plt_deg_count` to plot the distribution of the degree sequence.

# Packages

In [1]:
%config Completer.use_jedi = False

# Standard Python packages
import pandas as pd
import numpy as np
import collections
from IPython.display import display, display_html

# Standard plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Import Network Analysis Packages
import networkx as nx

# Science packages
from scipy.spatial.distance import squareform
from statsmodels.distributions.empirical_distribution import ECDF

# sklearn packages for Clustering
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from networkx.algorithms.community import greedy_modularity_communities

import community as community_louvain

# import self created functions
from ipynb.fs.full.functions_cluster_analysis import *
from ipynb.fs.full.functions_data_manipulation import *
from ipynb.fs.full.functions_karrer import *

# from networkx.algorithms import community
# NOTE(review): the `community` module imported above (as community_louvain) is provided by
# the python-louvain package; the separate PyPI package named `community` should not be
# needed — confirm. Install with: %pip install python-louvain
# Use seaborn's 'whitegrid' style for the plots in this notebook.
sns.set(style = 'whitegrid')

# Reading in Data

We read in the data `kelch_IBD_graph.csv` and rename the column `'Probability_of_edge'` to `'weight'` as the NetworkX package recognizes this immediately as a weight-attribute. Then we further index the parasites with numbers from 0 to 1467 accordingly, which will make the indexing easier.

In [2]:
# Read the per-sample metadata table; the displayed output shows the columns
# Sample, Site, Year, k13Class and crt_class.
meta = pd.read_csv("meta_data.csv")

# Currently unused (kept for reference): readable names for the site codes and
# integer encodings of the Site, k13Class and Year columns.
# geo = {"ETH": "Eastern Thailand", "WKH": "Western Cambodia", "NEKH": "North Eastern Cambodia",
#        "NKH": "North Cambodia", "VN": "Vietnam", "LA": "Laos"}

# geo_codes, geo = pd.factorize(meta['Site'])
# k13_codes, k13 = pd.factorize(meta['k13Class'])
# year_codes, year = pd.factorize(meta['Year'])

In [3]:
# Main data: load the pairwise IBD table and rename its columns.
# 'Probability_of_edge' -> 'weight' so NetworkX recognises it as the weight attribute,
# 'V1'/'V2' -> 'source'/'target'.
data = pd.read_csv("kelch_IBD_graph.csv").rename(
    columns = {'Probability_of_edge': 'weight', 'V1': 'source', 'V2': 'target'})

# number of rows = entries of the upper triangular part of the adjacency matrix (1076778)
n1 = data.shape[0]

# Get parasite types. Sources and targets are factorised together so that a parasite
# receives the same integer code in both columns (codes follow order of first appearance).
codes, parasites = pd.factorize(np.concatenate((data['source'], data['target'])))
parasites_dict = dict(enumerate(parasites))   # integer code -> parasite name

# Number of parasites, derived from the data instead of being hard-coded
# (1468 for the data set shown in the output below).
n = len(parasites)

# Add columns where parasites are indices from 0 to n-1:
# the first n1 codes belong to 'source', the remaining n1 codes to 'target'.
data['source_ind'] = codes[:n1]
data['target_ind'] = codes[n1:]
# data['type'] = 'undirected'

# keep only the columns used downstream, in a fixed order
data = data.reindex(['source', 'target', 'source_ind', 'target_ind', 'weight'], axis = 1)

# display everything
display(data)
display(meta)

Unnamed: 0,source,target,source_ind,target_ind,weight
0,PD0498-C,PD0500-C,0,1,0.000003
1,PD0498-C,PD0501-C,0,2,0.000004
2,PD0498-C,PD0502-C,0,3,1.000000
3,PD0498-C,PD0575-C,0,4,1.000000
4,PD0498-C,PD0578-C,0,5,0.966176
...,...,...,...,...,...
1076773,RCN13098,RCN13101,1464,1466,0.000124
1076774,RCN13098,RCN13103,1464,1467,0.999985
1076775,RCN13100,RCN13101,1465,1466,0.000391
1076776,RCN13100,RCN13103,1465,1467,0.880700


Unnamed: 0,Sample,Site,Year,k13Class,crt_class
0,PD0498-C,ETH,2011,R539T,no_nea
1,PD0500-C,ETH,2011,C580Y,no_nea
2,PD0501-C,ETH,2011,C580Y,no_nea
3,PD0502-C,ETH,2011,R539T,no_nea
4,PD0575-C,ETH,2012,R539T,no_nea
...,...,...,...,...,...
1463,RCN13097,NEKH,2017,C580Y,T93S
1464,RCN13098,NEKH,2017,C580Y,T93S
1465,RCN13100,NEKH,2018,C580Y,T93S
1466,RCN13101,NEKH,2018,WT,H97Y


# Sampling

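Sample edges based on the `'weight'` parameter 100 times to get an overview of how the network behaves: in each of the 100 runs, every edge is drawn independently as present (1) or absent (0) with probability equal to its `'weight'`.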

In [34]:
# Seed NumPy's global generator so the sampled edge indicators are repeatable.
np.random.seed(1234)

# r: number of sampling runs (one column of A and of B per run)
r = 100

# A: the 'weight' column repeated r times, one column per run -> shape (rows of data, r)
A = np.tile(data['weight'].to_numpy(), (r, 1)).T

# B: one Bernoulli draw per edge and run, with success probability taken from A
B = np.random.binomial(1, A)

In [None]:
# Zero-initialised containers for the sampling results.

# Per-node statistics, shape (n, r): one row per parasite, one column per sampling run.
#   X_deg        - degree
#   X_clust      - local clustering coefficient
#   X_centrality - eigenvector centrality
#   X_between    - betweenness centrality
#   X_close      - closeness centrality
X_deg, X_clust, X_centrality, X_between, X_close = (np.zeros((n, r)) for _ in range(5))

# Per-run statistics, shape (r, 1).
#   X_density      - density
#   X_transitivity - global clustering coefficient
X_density, X_transitivity = (np.zeros((r, 1)) for _ in range(2))

In [None]:
# For every sampling run (column of B): build the sampled graph and record its
# per-node and per-run statistics in the pre-allocated X_* arrays.
for i in range(B.shape[1]):
    print("Iteration: ", i)

    # Graph of run i, built from its column of 0/1 edge indicators.
    # NOTE(review): get_G_square comes from the star-imported helper notebooks; the
    # indexing below assumes its node labels are integer row indices into X_* — confirm.
    G_bin = get_G_square(B[:,i])

    # Read the degree view once and split it into parallel node / degree lists.
    degree_by_node = dict(G_bin.degree())
    G_nodes = list(degree_by_node)
    G_degrees = list(degree_by_node.values())
    X_deg[G_nodes,i] = G_degrees

    # Each NetworkX function returns a {node: value} dict that is written into
    # column i of the paired matrix; nodes absent from the graph keep their 0.
    for X_target, node_metric in ((X_centrality, nx.eigenvector_centrality),
                                  (X_clust, nx.clustering),
                                  (X_between, nx.betweenness_centrality),
                                  (X_close, nx.closeness_centrality)):
        M = node_metric(G_bin)
        X_target[list(M.keys()),i] = list(M.values())

    # Whole-graph statistics of run i.
    X_density[i,:] = nx.density(G_bin)
    X_transitivity[i,:] = nx.transitivity(G_bin)

In [None]:
# Summarise every sampling run and store all results as CSV files.
#
# NOTE(review): the file names written here (working directory, e.g.
# "sampling_degree_summary.csv") differ from the ones read by the loading cell
# ("Excel Files/Sampling/sampling_degree.csv", ...). The files apparently have to be
# moved/renamed by hand in between — confirm and align the two cells.

run_ids = np.arange(r)                 # sampling run labels 0..r-1
degree_totals = X_deg.sum(axis = 0)    # sum of all degrees per run (= 2 * number of edges)

# Store values summarising the graph: one row per run.
# The node count uses a vectorised axis sum; the builtin sum() would loop over the
# rows in Python and silently change meaning if a star-import shadowed `sum`.
data_summary = np.column_stack((run_ids,                        # Sampling runs
                                (X_deg != 0).sum(axis = 0),     # Number of nodes with non-zero degree
                                degree_totals / 2,              # Number of edges
                                degree_totals / n,              # Average degree (over all n parasites)
                                X_clust.sum(axis = 0) / n,      # Average clustering coefficient (over all n parasites)
                                X_transitivity,                 # Global clustering coefficient (Transitivity)
                                X_density,                      # Density
                               ))

df_summary = pd.DataFrame(data_summary, columns = ['Run',
                                           'Number of nodes',
                                           'Number of edges',
                                           'Average degree',
                                           'Average local clustering coefficient',
                                           'Global clustering coefficient',
                                           'Density'])

df_summary.to_csv("sampling_statistics.csv",index = False)

# Per-node matrices as data frames: one row per parasite, one column per run.
df_deg = pd.DataFrame(X_deg, columns = run_ids)
df_clust = pd.DataFrame(X_clust, columns = run_ids)
df_centrality = pd.DataFrame(X_centrality, columns = run_ids)
df_between = pd.DataFrame(X_between, columns = run_ids)
df_close = pd.DataFrame(X_close, columns = run_ids)

# Store them, in the same order as before.
for out_name, out_frame in (("sampling_degree_summary.csv", df_deg),
                            ("sampling_clustering_summary.csv", df_clust),
                            ("sampling_centrality_summary.csv", df_centrality),
                            ("sampling_between_summary.csv", df_between),
                            ("sampling_close_summary.csv", df_close)):
    out_frame.to_csv(out_name, index = False)

In [6]:
# Load the stored results of a sampling run: the five per-node tables
# (degree, clustering, eigenvector centrality, betweenness, closeness)
# followed by the per-run summary statistics.
df_deg, df_clust, df_centrality, df_between, df_close, df_summary = (
    pd.read_csv(csv_path)
    for csv_path in ("Excel Files/Sampling/sampling_degree.csv",
                     "Excel Files/Sampling/sampling_clustering.csv",
                     "Excel Files/Sampling/sampling_centrality.csv",
                     "Excel Files/Sampling/sampling_between.csv",
                     "Excel Files/Sampling/sampling_close.csv",
                     "Excel Files/Sampling/sampling_statistics.csv")
)