In [40]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib as mpl
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout
import pymc3 as pm

In [41]:
# Number of decimals kept when probabilities are rounded for labels.
DECIMAL_PRECISION = 2
# Edges must have a probability strictly above this value to appear in the graph.
PROBABILITY_THRESHOLD = 0.1

# Input and output currently live in the same folder.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
_COMBINED_FOLDER = '/Users/jugne/Documents/TnT-material/hiv/inf/final/full_tiedParams_heated_sameNe/combined/'
INPUT_FOLDER_PATH = _COMBINED_FOLDER
OUTPUT_FOLDER_PATH = _COMBINED_FOLDER

# Base name (without extension) of the analyser table; the graph image reuses it.
FILE_NAME = 'env_pol_saConstrained_s1_tiedParams_heated_sameNe_combined_tr_analyser'


In [42]:
from pathlib import Path

# Make sure the output folder exists before anything is written to it:
# missing parent folders are created, and an already existing folder is fine.
output_dir = Path(OUTPUT_FOLDER_PATH)
output_dir.mkdir(exist_ok=True, parents=True)

In [43]:
# Read the tab-separated analyser table "<INPUT_FOLDER_PATH><FILE_NAME>.txt".
analyser_table_path = INPUT_FOLDER_PATH + FILE_NAME + ".txt"
tnt_analyser = pd.read_csv(analyser_table_path, sep="\t")

In [44]:
# Host list: the text before the first underscore of every column name,
# de-duplicated (and sorted) by np.unique.
hosts = np.unique([column.partition('_')[0] for column in tnt_analyser.columns])

In [45]:
###################################################################
####### get probabilities of different kinds of transmission ######
###################################################################


def _share_of_rows(mask):
    """Return, per column, the fraction of rows in which boolean `mask` is True."""
    return mask.sum() / len(mask)


# Each probability below is the share of rows (one value per column) whose
# entry satisfies the stated condition.

# prob of direct transmission: entry equals 1
prob_direct = _share_of_rows(tnt_analyser == 1)

# prob of indirect or direct transmission: entry is at least 1
prob_indirectAndDirect = _share_of_rows(tnt_analyser >= 1)

# prob of indirect transmission: entry is greater than 1
prob_indirect = _share_of_rows(tnt_analyser > 1)

# prob of no transmission: entry equals 0
prob_noTr = _share_of_rows(tnt_analyser == 0)

# count intermediate unobserved transmissions: entries below 1 become NaN,
# every remaining entry is reduced by one (so a direct transmission gives 0)
n_unobserved_transmissions = tnt_analyser.where(tnt_analyser >= 1) - 1

In [46]:
def _split_pair_names(pair_names):
    """Split "<from>_<to>" names at the first underscore.

    Returns two lists, (sources, recipients), in the order of `pair_names`.
    Raises IndexError if a name contains no underscore.
    """
    sources = [name.split('_', 1)[0] for name in pair_names]
    recipients = [name.split('_', 1)[1] for name in pair_names]
    return sources, recipients


def _node_table(sources, recipients):
    """Return a one-column ('id') frame holding the sorted, unique edge endpoints.

    The endpoints are used as-is. Splitting recipient names on '_' a second
    time would shorten any recipient that itself contains an underscore, and
    the resulting id would then match no value in the edges' 'to' column.
    """
    return pd.DataFrame({'id': np.unique(sources + recipients)})


def _probability_labels(edges):
    """Return the 'probability' column rounded to DECIMAL_PRECISION, as strings."""
    return np.around(edges.loc[:, 'probability'],
                     decimals=DECIMAL_PRECISION).astype(str)


##########################################
####### indirect dataframes ##############
##########################################
# pairs with a non-zero probability of indirect transmission
names_indirect = tnt_analyser.columns[prob_indirect>0]
from_indirect, to_indirect = _split_pair_names(names_indirect)

edges_indirect = pd.DataFrame({'from': from_indirect,
                               'to': to_indirect,
                               'probability': prob_indirect[prob_indirect>0],
                               'median_unobserved': n_unobserved_transmissions.loc[:,names_indirect].median()})
nodes_indirect = _node_table(from_indirect, to_indirect)

# add edge labels, set precision
edges_indirect.loc[:, 'label'] = _probability_labels(edges_indirect)

###################################
######## direct dataframes ########
###################################
# pairs with a non-zero probability of direct transmission
names_direct = tnt_analyser.columns[prob_direct>0]
from_direct, to_direct = _split_pair_names(names_direct)

edges_direct = pd.DataFrame({'from': from_direct,
                             'to': to_direct,
                             'probability': prob_direct[prob_direct>0]})
nodes_direct = _node_table(from_direct, to_direct)

# add edge labels, set precision
edges_direct.loc[:, 'label'] = _probability_labels(edges_direct)

###############################################################
######### direct and indirect transmission dataframes #########
###############################################################
# pairs with a non-zero probability of any (direct or indirect) transmission
names_indirectAndDirect = tnt_analyser.columns[prob_indirectAndDirect>0]
from_indirectAndDirect, to_indirectAndDirect = _split_pair_names(names_indirectAndDirect)

# per pair: the non-missing unobserved-transmission counts, joined by spaces
inf_intermediates = [' '.join(str(count) for count in n_unobserved_transmissions[name].dropna().values)
                     for name in names_indirectAndDirect]

edges_indirectAndDirect = pd.DataFrame({'from': from_indirectAndDirect,
                                        'to': to_indirectAndDirect,
                                        'probability': prob_indirectAndDirect[prob_indirectAndDirect>0],
                                        'n_unobserved': inf_intermediates,
                                        'median_unobserved': np.nan_to_num(n_unobserved_transmissions.loc[:,names_indirectAndDirect].median())})
nodes_indirectAndDirect = _node_table(from_indirectAndDirect, to_indirectAndDirect)

# add edge labels, set precision
edges_indirectAndDirect.loc[:, 'label'] = _probability_labels(edges_indirectAndDirect)

# calculate root probability: one minus the summed probability of all edges
# pointing at the host (a host with no incoming edge therefore gets 1)
root = [1 - edges_indirectAndDirect.loc[edges_indirectAndDirect.loc[:, 'to'] == h, 'probability'].sum()
        for h in nodes_indirectAndDirect['id']]
nodes_indirectAndDirect.loc[:, 'root_probability'] = np.around(root, decimals=3)

In [51]:
##################################################
####### save inferred transmission history #######
##################################################
transmission_csv_path = OUTPUT_FOLDER_PATH + 'inferred_transmission.csv'
edges_indirectAndDirect.to_csv(transmission_csv_path)

##################################################
####### save inferred root probabilities  ########
##################################################
root_prob_csv_path = OUTPUT_FOLDER_PATH + 'inferred_root_prob.csv'
nodes_indirectAndDirect.to_csv(root_prob_csv_path)

In [48]:
# Apply the probability threshold for inclusion in the graph: only edges
# strictly above PROBABILITY_THRESHOLD are kept.
above_threshold = edges_indirectAndDirect['probability'] > PROBABILITY_THRESHOLD
edges_indirectAndDirect_trh = edges_indirectAndDirect.loc[above_threshold]

# Build the graph as a DiGraph so that edge direction ('from' -> 'to') is
# preserved; probability and median_unobserved travel along as edge attributes.
G = nx.from_pandas_edgelist(edges_indirectAndDirect_trh,
                            source='from',
                            target='to',
                            edge_attr=['probability', 'median_unobserved'],
                            create_using=nx.DiGraph())

# Node colors by root probability: each host's root probability is mapped
# through the "Greens" colormap and stored as a hex colour string.
# plt.get_cmap is used because mpl.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9; plt.get_cmap works on old and new versions.
cmap = plt.get_cmap("Greens")
node_col = {
    host: mpl.colors.rgb2hex(cmap(root_probability))
    for host, root_probability in zip(nodes_indirectAndDirect.id,
                                      nodes_indirectAndDirect.root_probability)
}

nx.set_node_attributes(G, node_col, "fillcolor")

# set edge width ("penwidth") to the transmission probability of the edge
edge_probability = nx.get_edge_attributes(G, 'probability')
nx.set_edge_attributes(G, edge_probability, "penwidth")

# rounded per-edge values used for labelling
dct_rounded_probs = {
    edge: round(probability, DECIMAL_PRECISION)
    for edge, probability in edge_probability.items()
}
dct_rounded_unobserved_count = {
    edge: round(median_count, DECIMAL_PRECISION)
    for edge, median_count in nx.get_edge_attributes(G, 'median_unobserved').items()
}
# combined "probability, median unobserved" text per edge
dct_label = {
    edge: str(dct_rounded_probs[edge])+', '+str(dct_rounded_unobserved_count[edge])
    for edge in edge_probability.keys()
}

# Only the rounded probability is applied as the edge label.
# NOTE(review): dct_label is built but not used here - pass it instead of
# dct_rounded_probs if the combined label is wanted.
nx.set_edge_attributes(G, dct_rounded_probs, "label")

In [49]:
# Hand the graph to Graphviz through networkx's agraph bridge, lay it out
# with "dot" and render it to "<OUTPUT_FOLDER_PATH><FILE_NAME>.png".
A = nx.nx_agraph.to_agraph(G)
A.layout(prog='dot')

graph_image_path = OUTPUT_FOLDER_PATH + FILE_NAME + '.png'
# extra command-line switches passed on to the layout program
dot_args = '-Grankdir=LR -Gsplines=true -Goverlap="false" -Nshape=hexagon -Nstyle=filled'
A.draw(graph_image_path, prog='dot', args=dot_args)

In [50]:
# Display the node table (host id and root probability) as the cell output.
nodes_indirectAndDirect

Unnamed: 0,id,root_probability
0,A,0.263
1,B,0.384
2,C,0.0
3,D,0.002
4,E,0.0
5,F,0.199
6,G,0.126
7,H,0.0
8,I,0.0
9,K,0.0
