In [109]:
import pandas as pd
import numpy as np
import networkx as nx
from math import log
from pathlib import Path

In [114]:
# Locate the GenGraph output for one sampling window.
date_str = "2020-03-18_2020-03-22"
base_path = Path("/sfs/lustre/bahamut/scratch/jho5ze/bionets/covid/data/Houston_gengraph/")
graph_path = base_path / date_str / f"{date_str}.xml"
seq_input_path = base_path / date_str / f"{date_str}_seq_input.txt"

# Genome count = number of lines in the sequence-input file minus one
# (presumably the file has one non-genome line, e.g. a header -- TODO confirm).
# Lines are counted lazily so the whole file is never held in memory.
with open(seq_input_path) as src:
    sequences = sum(1 for _ in src) - 1

# `test` is the GraphML block graph; `sequences` is the genome count.
# Paths live in their own names so neither variable changes type mid-cell.
test = nx.read_graphml(graph_path)
sequences

2

In [139]:
def _genome_ids(node_attrs):
    """Return the set of genome ids listed in a node's comma-separated ``"ids"`` attribute."""
    return set(node_attrs["ids"].split(","))


def expand_ggraph(ggraph):
    """Expand a block graph into a directed graph with one node per character.

    Each node of ``ggraph`` must carry a ``"sequence"`` string and a
    comma-separated ``"ids"`` string. Every character of a node's sequence
    becomes a node named ``"<node>_<index>"`` with two attributes:
    ``sequence`` (the single character) and ``num_genomes`` (the number of
    distinct ids on the originating node).

    Edges:
      * consecutive characters of the same original node are linked with
        ``weight`` equal to that node's genome count;
      * the last character of a node is linked to the first character of
        each successor with ``weight`` equal to the number of ids the two
        original nodes have in common.

    Parameters
    ----------
    ggraph : directed graph (networkx-style)
        Must support ``nodes()``, ``nodes[n]`` and ``successors(n)``.

    Returns
    -------
    networkx.DiGraph
        The per-character graph.

    Raises
    ------
    ValueError
        If any node has an empty ``"sequence"``; such a node has no
        character to attach incoming or outgoing edges to.
    """
    # Validate up front: an empty sequence would otherwise surface later as a
    # ``None`` edge endpoint or a bare IndexError on ``sequence[0]``.
    for node in ggraph.nodes():
        if not ggraph.nodes[node]["sequence"]:
            raise ValueError(
                f"node {node!r} has an empty 'sequence'; "
                "cannot expand it into per-character nodes"
            )

    new_graph = nx.DiGraph()
    # Name of each original node's first character node, once it has been
    # created (either by the node itself or earlier by one of its parents).
    entry_points = {node: None for node in ggraph.nodes()}

    for node in ggraph.nodes():
        node_info = ggraph.nodes[node]
        ids = _genome_ids(node_info)
        num_genomes = len(ids)

        if entry_points[node] is None:
            # Nothing created yet for this node: start from character 0.
            last_node = None
            entry_points[node] = f"{node}_0"
            start_idx = 0
        else:
            # A parent already created "<node>_0"; continue from character 1.
            last_node = entry_points[node]
            start_idx = 1

        for ix, character in enumerate(node_info["sequence"][start_idx:], start=start_idx):
            new_node = f"{node}_{ix}"
            new_graph.add_node(new_node, sequence=character, num_genomes=num_genomes)
            if last_node is not None:
                new_graph.add_edge(last_node, new_node, weight=num_genomes)
            last_node = new_node

        # Link the last character of this node to each child's entry character.
        for child in ggraph.successors(node):
            child_info = ggraph.nodes[child]
            child_ids = _genome_ids(child_info)

            if entry_points[child] is None:
                child_node = f"{child}_0"
                # Count distinct ids, consistent with how num_genomes is
                # computed for every other character node.
                new_graph.add_node(
                    child_node,
                    sequence=child_info["sequence"][0],
                    num_genomes=len(child_ids),
                )
                entry_points[child] = child_node

            new_graph.add_edge(last_node, entry_points[child], weight=len(ids & child_ids))

    return new_graph
        
def add_root(ggraph, num_genomes):
    """Make sure the graph has a single entry node, adding ``"root"`` if needed.

    Nodes with in-degree 0 are collected. If there is exactly one, the graph
    is left untouched. Otherwise a node named ``"root"`` is added with
    attributes ``sequence="N"`` and ``num_genomes=num_genomes`` and is linked
    to every in-degree-0 node.

    Parameters
    ----------
    ggraph : directed graph (networkx-style), modified in place
        Must support ``nodes()``, ``nodes[n]``, ``in_degree(n)``,
        ``add_node`` and ``add_edge``.
    num_genomes : int
        Genome count stored on the synthetic root.

    Returns
    -------
    node name, or tuple
        The single in-degree-0 node when exactly one exists; otherwise the
        tuple ``("root", no_in_nodes)``. The two shapes differ; this is kept
        as-is so existing callers keep working.
    """
    # Collected fully before any mutation of the graph.
    no_in_nodes = [node for node in ggraph.nodes() if ggraph.in_degree(node) == 0]
    if len(no_in_nodes) == 1:
        return no_in_nodes[0]

    # Attributes must be passed as keyword arguments: with ``attr_dict=...``
    # the graph stores one attribute literally named "attr_dict", so lookups
    # of "num_genomes" on the root fail.
    ggraph.add_node("root", sequence="N", num_genomes=num_genomes)
    for node in no_in_nodes:
        edge_attrs = {}
        # Every genome passing through an entry node arrives from the root,
        # so the edge weight is that node's genome count when it is known.
        node_genomes = ggraph.nodes[node].get("num_genomes")
        if node_genomes is not None:
            edge_attrs["weight"] = node_genomes
        ggraph.add_edge("root", node, **edge_attrs)
    return "root", no_in_nodes
    
# def calc_uncertainty(ggraph):
#     contribution=0
#     dep_uncert=0#the dependent uncertainty
    
#     cur_num_nodes=ggraph.number_of_nodes()#the number of nodes currently in the graph given a previous annotation
#     init_prob=1/float(cur_num_nodes)#initial probability is 1/number of nodes left in ontology
#     init_uncert=-(log(init_prob)/log(2))# since all the nodes have equal probability the average ends up being -log P		
#     #for every node in the ontology get its contribution to the annotation uncertainty
#     #this part skips the inner loop because the summation will just be the -log(j_prob)
#     num_root=0
#     for j in ggraph.nodes():
# #         if not j == annot_node:#if this term is in the ancestors induced by a node it has a probability of 1 and uncertainty of 0
#         induced_nodes=set(ggraph.predecessors(j)).union(set([j]))#get the number of nodes that cannot follow this one in an annotation
#         j_num_induce=len(induced_nodes)
#         j_num_desc=len(list(ggraph.successors(j)))
#         if (j_num_induce == 0):
#             num_root+=1
#             assert num_root <= 1
#         if(ggraph.number_of_nodes()==j_num_induce+j_num_desc):
#             j_probability=1
#         else: j_probability=1/float(ggraph.number_of_nodes()-j_num_induce-j_num_desc)
#         contribution+= -log(j_probability)/log(2)
#     dep_uncert=contribution*init_prob #probability that a block is active * the conditional entropy when it is active
#     result=dep_uncert+init_uncert
#     return result

def calc_uncertainty(ggraph, num_genomes):
    """Compute the genome-weighted uncertainty (in bits) of a graph.

    With ``G = num_genomes``, ``G_x`` the ``num_genomes`` attribute of node
    ``x`` and ``G_xy`` the ``weight`` of edge ``x -> y``, the result is::

        - sum_x (G_x / G) * log2(G_x / G)
        - sum_x sum_{y in successors(x)} (G_xy / G) * log2(G_xy / G_x)

    Terms whose count is zero contribute nothing (the usual ``0 * log 0 = 0``
    convention) instead of raising a math domain error.

    Parameters
    ----------
    ggraph : directed graph (networkx-style)
        Must support ``nodes()``, ``nodes[n]``, ``successors(n)`` and
        ``ggraph[u][v]``. Every node needs a ``"num_genomes"`` attribute and
        every edge a ``"weight"`` attribute.
    num_genomes : int
        Total number of genomes in the graph.

    Returns
    -------
    float
        The uncertainty; ``0`` for a graph without nodes.

    Raises
    ------
    KeyError
        If a node has no ``"num_genomes"`` attribute or an edge has no
        ``"weight"`` attribute.
    """
    # Local import: log2 is more accurate than log(x, 2), which divides two
    # rounded natural logarithms.
    from math import log2

    uncert = 0

    for node in ggraph.nodes():
        try:
            node_genomes = ggraph.nodes[node]["num_genomes"]
        except KeyError:
            # Fail loudly: carrying on would silently reuse the previous
            # node's count (or hit an unbound local on the first node).
            raise KeyError(
                f"node {node!r} has no 'num_genomes' attribute; "
                f"attributes found: {dict(ggraph.nodes[node])}"
            ) from None

        if node_genomes == 0:
            # No genome passes through this node, so neither the node term
            # nor any of its outgoing-edge terms contributes.
            continue

        # First term: uncertainty about which node a genome is at.
        node_fraction = node_genomes / num_genomes
        uncert -= node_fraction * log2(node_fraction)

        # Second term: uncertainty about which child a genome moves to.
        for child in ggraph.successors(node):
            weight = ggraph[node][child]["weight"]
            if weight == 0:
                continue
            uncert -= (weight / num_genomes) * log2(weight / node_genomes)

    return uncert

Currently calculates:

$ -\log_{2} \frac{1}{Nodes} + \frac{1}{Nodes} \sum_{x \in Nodes} -\log_{2}\frac{1}{\# nodes\_not\_connected\_to\_x} $

Need to do: 

$ -\sum_{x \in Nodes} \frac{G_x}{G} \log_{2} \frac{G_x}{G} - \sum_{x \in Nodes}\sum_{y \in children(x)} \frac{G_x}{G} \frac{G_{x\_to\_y}}{G_x} \log_{2}\frac{G_{x\_to\_y}}{G_x} $

Where $G$ is the number of genomes in the graph, $G_x$ is the number of genomes going through node $x$, and $G_{x\_to\_y}$ is the number of genomes moving from $x$ to $y$.

In [140]:
# Expand the block graph to one node per character, make sure it has a single
# entry node, then compute the uncertainty over all genomes.
new = expand_ggraph(test)
add_root(new, sequences)  # may add a "root" node to `new` in place; return value unused here
calc_uncertainty(new, sequences)

12.0

In [127]:
# Weight of the edge from the first character of Aln_1_14 to the first character of Aln_1_16.
new['Aln_1_14_0']['Aln_1_16_0']["weight"]

1

In [None]:
# Display the expanded graph object.
new

In [107]:
# Attributes of the first predecessor of the expanded node 'Aln_1_1_15349'.
new.nodes[list(new.predecessors('Aln_1_1_15349'))[0]]

{'attr_dict': {'sequence': 'T', 'num_genomes': 2}}

In [94]:
# Raw attributes of block node 'Aln_1_1' in the original (unexpanded) graph.
test.nodes['Aln_1_1']

{'ids': 'TX-HMH-792,TX-HMH-793',
 'TX-HMH-792_leftend': 1,
 'TX-HMH-792_rightend': 15352,
 'TX-HMH-793_leftend': 1,
 'TX-HMH-793_rightend': 15352,
 'sequence': 'AGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTA

In [60]:
# Print each block node's sequence length and show the grand total.
tot = 0
for node in test.nodes():
    seq_len = len(test.nodes[node]["sequence"])
    print(node, seq_len)
    tot += seq_len
tot

Aln_1_1 15352
Aln_1_2 1
Aln_1_3 1
Aln_1_4 12556
Aln_1_5 1
Aln_1_6 1
Aln_1_7 1779
Aln_1_8 3
Aln_1_9 3
Aln_1_10 6
Aln_1_11 1
Aln_1_12 1
Aln_1_13 1
Aln_1_14 1
Aln_1_15 1
Aln_1_16 81


29789

In [26]:
# Show the attribute keys of every block node that lists fewer than two genome ids.
for node in test.nodes():
    attrs = test.nodes[node]
    ids = attrs["ids"].split(",")
    if len(ids) >= 2:
        continue
    print(attrs.keys())

dict_keys(['ids', 'TX-HMH-792_leftend', 'TX-HMH-792_rightend', 'sequence'])
dict_keys(['ids', 'TX-HMH-793_leftend', 'TX-HMH-793_rightend', 'sequence'])
dict_keys(['ids', 'TX-HMH-792_leftend', 'TX-HMH-792_rightend', 'sequence'])
dict_keys(['ids', 'TX-HMH-793_leftend', 'TX-HMH-793_rightend', 'sequence'])
dict_keys(['ids', 'TX-HMH-792_leftend', 'TX-HMH-792_rightend', 'sequence'])
dict_keys(['ids', 'TX-HMH-793_leftend', 'TX-HMH-793_rightend', 'sequence'])
dict_keys(['ids', 'TX-HMH-792_leftend', 'TX-HMH-792_rightend', 'sequence'])
dict_keys(['ids', 'TX-HMH-793_leftend', 'TX-HMH-793_rightend', 'sequence'])
dict_keys(['ids', 'TX-HMH-792_leftend', 'TX-HMH-792_rightend', 'sequence'])
dict_keys(['ids', 'TX-HMH-793_leftend', 'TX-HMH-793_rightend', 'sequence'])
