# Preliminaries

In [1]:
%config Completer.use_jedi = False

# Standard packages
import pandas as pd
import numpy as np
import collections
from IPython.display import display, display_html

# Standard plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Network Analysis
import networkx as nx

# Science packages
from scipy.spatial.distance import squareform
from statsmodels.distributions.empirical_distribution import ECDF
from scipy import stats
import statsmodels.api as sm

# Own functions
from ipynb.fs.full.functions_cluster_analysis import *
from ipynb.fs.full.functions_data_manipulation import *
from ipynb.fs.full.functions_karrer import *
from ipynb.fs.full.functions_clustering_plot import *

# Global seaborn style applied to every figure in this notebook.
# Alternative style kept for reference: sns.set(style = 'whitegrid')
sns.set(style = 'white')

# Data

In [87]:
# Main data: pairwise edge list. 'Probability_of_edge' is renamed to 'weight' so it is
# recognised as the weight attribute; 'V1'/'V2' become 'source'/'target'.
data = pd.read_csv("kelch_IBD_graph.csv")
data = data.rename(columns = {'Probability_of_edge': 'weight', 'V1': 'source', 'V2': 'target'})

# Factorize both endpoint columns together so each parasite name gets exactly one
# integer code. `codes` holds the codes of all sources followed by all targets;
# `parasites` holds the unique names, and the position of a name is its code.
codes, parasites = pd.factorize(np.concatenate((data['source'], data['target'])))

# Sizes are derived from the data instead of being hard-coded, so they cannot go
# stale when the input file changes.
n = len(parasites)  # number of parasites (1468 for the original data set)
n1 = len(data)      # number of non-zero entries in adjacency matrix (upper triangular part) (1076778)

# add columns where parasites are integer indices from 0 to n-1
data['source_ind'] = codes[:n1]  # first half of `codes` belongs to the 'source' column
data['target_ind'] = codes[n1:]  # second half belongs to the 'target' column
data['type'] = 'undirected'

# keep only the columns used downstream, in a fixed order
data = data.reindex(['source', 'target', 'source_ind', 'target_ind', 'weight', 'type'], axis = 1)

In [88]:
# Load the per-parasite meta data table
meta = pd.read_csv("meta_data.csv")

# Lookup dictionaries keyed by the position of each parasite in `parasites`.
# NOTE(review): the meta data is read by row label i, so this assumes that row i
# of meta_data.csv describes parasites[i] -- confirm both are in the same order.

# node number -> parasite name
parasites_dict = dict(enumerate(parasites))

# node number -> k13 mutation class
parasites_mut = {i: meta['k13Class'][i] for i in range(len(parasites))}

# node number -> Year
parasites_year = {i: meta['Year'][i] for i in range(len(parasites))}

# node number -> Site
parasites_site = {i: meta['Site'][i] for i in range(len(parasites))}

# node number -> CRT class
parasites_crt = {i: meta['crt_class'][i] for i in range(len(parasites))}

## Creating considered graphs

In [None]:
# Cut-offs on the 'weight' column used for the two thresholded graphs
thresh_half = 0.5
thresh_strict = 1 - 0.5**10

# Threshold 0.5: binary graph, then weights attached from the thresholded
# edge list, then its largest connected component
G_bin50 = get_G_bin(data, thresh_half, 'weight')
edges_bin50 = data_thresh(data, thresh_half, 'weight')
G_bin50 = G_add_weight(G_bin50, edges_bin50)
G_bin50_lcc, G_bin50_lcc_ind = get_G_lcc(G_bin50)

# Threshold 1-0.5^10: same three steps with the stricter cut-off
G_bin1 = get_G_bin(data, thresh_strict, 'weight')
edges_bin1 = data_thresh(data, thresh_strict, 'weight')
G_bin1 = G_add_weight(G_bin1, edges_bin1)
G_bin1_lcc, G_bin1_lcc_ind = get_G_lcc(G_bin1)

# Sampled graph (fixed seed) and its largest connected component.
# The second return value of sample_G is stored as the 'Sampled' column;
# rows with Sampled == 1 are then used to attach the weights
# (presumably 1 marks the edges that were drawn -- confirm in sample_G).
G_sample, sampled_flags = sample_G(data, 'weight', seed = 1234)
data['Sampled'] = sampled_flags
edges_sampled = filter_data(data, 'Sampled', [1])
G_sample = G_add_weight(G_sample, edges_sampled)
G_sample_lcc, G_sample_lcc_ind = get_G_lcc(G_sample)

In [None]:
# Export the considered graphs to GML, one file per graph, in the order listed.
# NOTE(review): G_weighted is not assigned in any cell shown before this one;
# on a fresh kernel this cell fails unless it is provided elsewhere -- confirm
# where it is meant to be built.
gml_exports = [
    (G_weighted, "Export_Graphs/G_weighted.gml"),
    (G_bin50, "Export_Graphs/G_bin50.gml"),
    (G_bin50_lcc, "Export_Graphs/G_bin50_lcc.gml"),
    (G_bin1, "Export_Graphs/G_bin1.gml"),
    (G_bin1_lcc, "Export_Graphs/G_bin1_lcc.gml"),
    (G_sample, "Export_Graphs/G_sample.gml"),
    (G_sample_lcc, "Export_Graphs/G_sample_lcc.gml"),
]
for graph_to_save, gml_path in gml_exports:
    nx.write_gml(graph_to_save, gml_path)

## Creating trimmed graphs (only run once)

In [None]:
# Reference graph for trimming: binary graph at threshold 1-0.5^10 and its LCC.
# Edge weights are intentionally not attached in this cell (the G_add_weight
# step is disabled here), so G_bin1 is rebuilt without them.
strict_threshold = 1 - 0.5**10
G_bin1 = get_G_bin(data, strict_threshold, 'weight')

# lcc_ind is the second value returned by get_G_lcc; it is used afterwards to
# cut the other thresholded graphs down to the same node set.
G_bin1_lcc, lcc_ind = get_G_lcc(G_bin1)

In [None]:
# Trimmed networks for thresholds 0.25, 0.5 and 0.75: each thresholded graph is
# restricted to the nodes in lcc_ind (returned by get_G_lcc(G_bin1) in the
# previous cell), so all trimmed graphs share one node set.

# binary graphs for the three thresholds
G_bin25 = get_G_bin(data, 0.25, 'weight')
G_bin50 = get_G_bin(data, 0.5, 'weight')
G_bin75 = get_G_bin(data, 0.75, 'weight')

# add weights from the correspondingly thresholded edge lists
G_bin25 = G_add_weight(G_bin25, data_thresh(data, 0.25, 'weight'))
G_bin50 = G_add_weight(G_bin50, data_thresh(data, 0.5, 'weight'))
G_bin75 = G_add_weight(G_bin75, data_thresh(data, 0.75, 'weight'))

# create subgraphs with only the given nodes
G_sub25 = G_bin25.subgraph(lcc_ind)
G_sub50 = G_bin50.subgraph(lcc_ind)
G_sub75 = G_bin75.subgraph(lcc_ind)
G_sub1 = G_bin1_lcc  # the strictest graph is already restricted to these nodes

# export the full thresholded graphs that are new in this cell
nx.write_gml(G_bin25, "Export_Graphs/G_bin25.gml")
nx.write_gml(G_bin75, "Export_Graphs/G_bin75.gml")

# export the trimmed graphs; the variables are G_sub25/G_sub50/G_sub75
# (the former Gsub_25/Gsub_50/Gsub_75 names were never defined and raised NameError)
nx.write_gml(G_sub25, "Export_Graphs/G_sub25.gml")
nx.write_gml(G_sub50, "Export_Graphs/G_sub50.gml")
nx.write_gml(G_sub75, "Export_Graphs/G_sub75.gml")