In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
def Get_Neighbors(Graph, contig_path, alignment_type, filename):
    """Collect the edges touching every listed contig that is present in ``Graph``.

    Contig names are read, one per line, from the file
    ``contig_path + alignment_type + '_' + filename + suffix``; the suffix is
    ``'.common_contigs.list'`` when ``alignment_type == 'osab'`` and
    ``'.final_contig.list'`` otherwise.

    Parameters
    ----------
    Graph : directed graph
        Must support ``node in Graph``, ``predecessors(node)`` and
        ``successors(node)`` (e.g. a ``networkx.DiGraph``).
    contig_path : str
        Path prefix; it is concatenated verbatim, so it has to end with a
        path separator.
    alignment_type : str
        Label stored under the ``'Alignment'`` key of every found contig; it
        also selects the file name and suffix.
    filename : str
        Sample identifier used in the file name.

    Returns
    -------
    tuple
        ``(Edge_List, islands, node_attr_list)`` where ``Edge_List`` is a list
        of ``(source, target)`` pairs, ``islands`` lists found contigs that
        have no incident edge, and ``node_attr_list`` maps each found contig
        to ``{'Alignment': alignment_type}``.

    Side effects: prints the number of contig names read, every island, and
    the number of names that are not nodes of ``Graph``.
    """
    if alignment_type == 'osab':
        suffix = '.common_contigs.list'
    else:
        suffix = '.final_contig.list'
    filepath = contig_path + alignment_type + '_' + filename + suffix

    # Close the file deterministically; strip surrounding whitespace and drop
    # blank lines so they are not misreported as missing nodes.
    with open(filepath, 'r') as handle:
        contigs = [name for name in (line.strip() for line in handle) if name]
    print(alignment_type, '--->', len(contigs))

    Edge_List = []
    node_attr_list = dict()
    islands = []
    missing = 0
    for c in contigs:
        # Explicit membership test instead of a blanket ``except Exception``:
        # only genuinely absent nodes are counted, real errors still surface.
        if c not in Graph:
            missing += 1
            continue
        pred = list(Graph.predecessors(c))
        succ = list(Graph.successors(c))
        # Incoming edges first, then outgoing ones.
        edges = [(p, c) for p in pred] + [(c, s) for s in succ]
        if not edges:
            print(c)
            islands.append(c)
        Edge_List.extend(edges)
        node_attr_list[c] = {'Alignment': alignment_type}
    print(missing, ' Nodes not found')
    return Edge_List, islands, node_attr_list
    

In [3]:
# Sample to analyse and the (machine-specific, absolute) locations of its inputs.
Sample_id = 'HotsprSampleMS60_FD'
Contigs_Path = '/Users/harihara/Mount/osa_osb_all_samples/'
Graph_path = '/Users/harihara/Mount/MetaCarvel_output/wo_rd/'
# Load the sample's oriented.gml graph from its per-sample scaffold directory.
G = nx.read_gml(''.join([Graph_path, Sample_id, '_scaffolds_no_rd/oriented.gml']))

In [4]:
# Run the neighbour extraction once per alignment class, in the order osa, osb, osab.
(edge_list_osa, islands_osa, nodes_attr_osa), \
    (edge_list_osb, islands_osb, nodes_attr_osb), \
    (edge_list_osab, islands_osab, nodes_attr_osab) = [
        Get_Neighbors(G, Contigs_Path, tag, Sample_id) for tag in ('osa', 'osb', 'osab')
    ]

# Pool the edges and the isolated contigs of the three classes.
edge_list = [*edge_list_osa, *edge_list_osb, *edge_list_osab]
islands = [*islands_osa, *islands_osb, *islands_osab]

# Merge the annotations into nodes_attr_osa (in place). A contig listed in
# several classes keeps the label of the last one: osab over osb over osa.
nodes_attr_osa.update({**nodes_attr_osb, **nodes_attr_osab})

# Sub-graph made of the pooled edges plus the edge-less contigs, with the
# merged annotations attached as node attributes.
G_sub_graph = nx.DiGraph(edge_list)
G_sub_graph.add_nodes_from(islands)
nx.set_node_attributes(G_sub_graph, nodes_attr_osa)

osa ---> 1189
Ga0308421_1028461
Ga0308421_1040827
Ga0308421_1055800
Ga0308421_1064045
Ga0308421_1069971
Ga0308421_1071797
Ga0308421_1074751
Ga0308421_1075313
Ga0308421_1087576
Ga0308421_1109848
Ga0308421_1121223
Ga0308421_1122835
Ga0308421_1122978
Ga0308421_1131321
Ga0308421_1133244
Ga0308421_1137196
Ga0308421_1150755
Ga0308421_1154672
651  Nodes not found
osb ---> 3415
Ga0308421_1003143
Ga0308421_1004521
Ga0308421_1004666
Ga0308421_1009807
Ga0308421_1009945
Ga0308421_1010166
Ga0308421_1011659
Ga0308421_1017394
Ga0308421_1018075
Ga0308421_1019764
Ga0308421_1024325
Ga0308421_1026850
Ga0308421_1026963
Ga0308421_1035644
Ga0308421_1038653
Ga0308421_1038779
Ga0308421_1040122
Ga0308421_1040288
Ga0308421_1040761
Ga0308421_1051768
Ga0308421_1059278
Ga0308421_1064045
Ga0308421_1065792
Ga0308421_1066045
Ga0308421_1066339
Ga0308421_1069003
Ga0308421_1070047
Ga0308421_1070182
Ga0308421_1070694
Ga0308421_1071291
Ga0308421_1075619
Ga0308421_1083315
Ga0308421_1085747
Ga0308421_1094205
Ga0308421_10984

In [5]:
# Count the sub-graph nodes that have an entry in the merged annotation dict.
ctr = 0
for n in G_sub_graph.nodes():
    if n not in nodes_attr_osa:
        continue
    ctr += 1

In [6]:
# Share of sub-graph nodes that carry an alignment annotation. Both numbers are
# derived from the live objects instead of the hard-coded literals 2160 and
# 3694, so the figure stays correct when the inputs change.
annotated_nodes = sum(1 for node_name in G_sub_graph.nodes() if node_name in nodes_attr_osa)
total_nodes = G_sub_graph.number_of_nodes()
# Guard against an empty sub-graph to avoid a ZeroDivisionError.
percent_annotated = annotated_nodes * 100 / total_nodes if total_nodes else 0.0
print(percent_annotated, '% of the nodes are annotated')

58.47319978343259 % of the nodes are annotated


In [7]:
# Weakly connected components of the full graph; a component's id is its
# position in this list.
orig_connected_components = list(nx.weakly_connected_components(G))

# Flatten the components into two parallel columns: node name and component id.
Nodes, Conn_ids = [], []
for ctr, conn in enumerate(orig_connected_components):
    Nodes.extend(conn)
    Conn_ids.extend([ctr] * len(conn))
# Leave ctr equal to the number of components, as a manual counter would.
ctr = len(orig_connected_components)
df_orig_Conn = pd.DataFrame(data = {'Nodes':Nodes, 'conn-id':Conn_ids})

# Weakly connected components of the annotated sub-graph.
connected_components = list(nx.weakly_connected_components(G_sub_graph))

In [8]:
# Per-component summary of the annotated sub-graph: which alignment labels occur
# in the component, how many nodes carry each label, and whether the component
# is 'Unique' (at most one real label, ignoring unannotated 'NA' nodes).
#
# 'conn-id' is the 0-based position of the component in connected_components,
# so it can be used directly as connected_components[conn_id] and follows the
# same convention as df_orig_Conn['conn-id'] (previously it was off by one).
op = []
for ctr, conn in enumerate(connected_components):
    counts = {'osa': 0, 'osb': 0, 'osab': 0, 'NA': 0}
    for node in conn:
        # Nodes without an annotation entry are tallied under 'NA'.
        align = nodes_attr_osa.get(node, {}).get('Alignment', 'NA')
        counts[align] = counts.get(align, 0) + 1
    aligns = {label for label, seen in counts.items() if seen}
    Unique = len(aligns - {'NA'}) <= 1
    op.append({'conn-id': ctr, 'Aligns': aligns, 'Unique': Unique, 'Node Counts': len(conn),
               'OSA': counts['osa'], 'OSB': counts['osb'], 'OSAB': counts['osab'],
               'None': counts['NA']})
df_op_conn = pd.DataFrame(op)

In [47]:
# Relate every sub-graph component to the component of the full graph that
# contains it, then print one row per full-graph component:
#   d[i] -> number of sub-graph nodes that fall inside full-graph component i
#   e[i] -> how many of those nodes have an alignment annotation
#
# One-off node -> component-id lookup; this replaces a full DataFrame scan per
# sub-graph component with a constant-time dictionary access.
node_to_orig_comp = {node: idx
                     for idx, comp in enumerate(orig_connected_components)
                     for node in comp}

d = {}
e = {}
for c in connected_components:
    # Any member identifies the enclosing component; take the first one.
    i = node_to_orig_comp[next(iter(c))]
    d[i] = d.get(i, 0) + len(c)
    e[i] = e.get(i, 0) + sum(1 for node in c if node in nodes_attr_osa)

print(' Connected Component   #Contigs in connected component   #Contigs in subgraph    #Contigs mapping to OSA/OSB')
print(' -----------------------------------------------------------------------------------------------------------')
for k in d:
    print('|\t',k,'\t    |\t\t', len(orig_connected_components[k]), 
          '\t\t       |\t', d[k], '\t\t|\t', e[k],'\t\t   |')


 Connected Component   #Contigs in connected component   #Contigs in subgraph    #Contigs mapping to OSA/OSB
 -----------------------------------------------------------------------------------------------------------
|	 37 	    |		 3053 		       |	 2658 		|	 1466 		   |
|	 787 	    |		 6 		       |	 6 		|	 4 		   |
|	 1375 	    |		 11 		       |	 11 		|	 9 		   |
|	 1579 	    |		 3 		       |	 3 		|	 3 		   |
|	 2088 	    |		 10 		       |	 9 		|	 4 		   |
|	 623 	    |		 23 		       |	 23 		|	 13 		   |
|	 2645 	    |		 37 		       |	 37 		|	 25 		   |
|	 3986 	    |		 4 		       |	 4 		|	 4 		   |
|	 4559 	    |		 6 		       |	 6 		|	 5 		   |
|	 6285 	    |		 9 		       |	 8 		|	 5 		   |
|	 6650 	    |		 2 		       |	 2 		|	 1 		   |
|	 7021 	    |		 5 		       |	 5 		|	 4 		   |
|	 8854 	    |		 2 		       |	 2 		|	 2 		   |
|	 9623 	    |		 2 		       |	 2 		|	 2 		   |
|	 7770 	    |		 10 		       |	 10 		|	 3 		   |
|	 1657 	    |		 10 		       |	 10 		|	 6 		   |
|	 13 	    |

|	 20092 	    |		 1 		       |	 1 		|	 1 		   |
|	 20367 	    |		 1 		       |	 1 		|	 1 		   |
|	 21214 	    |		 1 		       |	 1 		|	 1 		   |
|	 21445 	    |		 1 		       |	 1 		|	 1 		   |
|	 1138 	    |		 1 		       |	 1 		|	 1 		   |
|	 1625 	    |		 1 		       |	 1 		|	 1 		   |
|	 1679 	    |		 1 		       |	 1 		|	 1 		   |
|	 3387 	    |		 1 		       |	 1 		|	 1 		   |
|	 3429 	    |		 1 		       |	 1 		|	 1 		   |
|	 3488 	    |		 1 		       |	 1 		|	 1 		   |
|	 3940 	    |		 1 		       |	 1 		|	 1 		   |
|	 5486 	    |		 1 		       |	 1 		|	 1 		   |
|	 5671 	    |		 1 		       |	 1 		|	 1 		   |
|	 6073 	    |		 1 		       |	 1 		|	 1 		   |
|	 7127 	    |		 1 		       |	 1 		|	 1 		   |
|	 7652 	    |		 1 		       |	 1 		|	 1 		   |
|	 7680 	    |		 1 		       |	 1 		|	 1 		   |
|	 9371 	    |		 1 		       |	 1 		|	 1 		   |
|	 9911 	    |		 1 		       |	 1 		|	 1 		   |
|	 9934 	    |		 1 		       |	 1 		|	 1 		   |
|	 10154 	    |		 1 		       |	 1 		|	 1 		   |
|	 10186

In [101]:
# List, in sorted order, every node of one sub-graph component together with
# its annotation (or 'Unannotated' when it has none).
comp_id = 10
for node in sorted(connected_components[comp_id]):
    print(node, nodes_attr_osa.get(node, 'Unannotated'))

Ga0308421_1007477 {'Alignment': 'osa'}
Ga0308421_1011222 Unannotated
Ga0308421_1012128 {'Alignment': 'osb'}
Ga0308421_1017987 {'Alignment': 'osb'}
Ga0308421_1019297 {'Alignment': 'osb'}
Ga0308421_1019522 {'Alignment': 'osb'}
Ga0308421_1025462 Unannotated
Ga0308421_1028899 {'Alignment': 'osb'}
Ga0308421_1049325 {'Alignment': 'osb'}
Ga0308421_1054555 {'Alignment': 'osab'}
Ga0308421_1060638 {'Alignment': 'osb'}
Ga0308421_1060733 {'Alignment': 'osb'}
Ga0308421_1063866 {'Alignment': 'osa'}
Ga0308421_1075154 {'Alignment': 'osb'}
Ga0308421_1085568 {'Alignment': 'osb'}
Ga0308421_1089042 {'Alignment': 'osb'}
Ga0308421_1089542 {'Alignment': 'osb'}
Ga0308421_1092963 {'Alignment': 'osb'}
Ga0308421_1095448 Unannotated
Ga0308421_1097019 {'Alignment': 'osa'}
Ga0308421_1099336 Unannotated
Ga0308421_1103545 Unannotated
Ga0308421_1104825 {'Alignment': 'osab'}
Ga0308421_1106848 {'Alignment': 'osb'}
Ga0308421_1107171 Unannotated
Ga0308421_1108763 {'Alignment': 'osb'}
Ga0308421_1117961 Unannotated
Ga030842

In [84]:
# Size of the full-graph weakly connected component at index 2645.
len(orig_connected_components[2645])

37

In [64]:
# Total number of nodes in the annotated sub-graph.
G_sub_graph.number_of_nodes()

3694

In [65]:
# Dump the node set of the first weakly connected component of the sub-graph.
print(connected_components[0])

{'Ga0308421_1080345', 'Ga0308421_1125999', 'Ga0308421_1145719', 'Ga0308421_1125208', 'Ga0308421_1011339', 'Ga0308421_1153583', 'Ga0308421_1135600', 'Ga0308421_1115506', 'Ga0308421_1102577', 'Ga0308421_1018521', 'Ga0308421_1028008', 'Ga0308421_1128579', 'Ga0308421_1089890', 'Ga0308421_1024334', 'Ga0308421_1055137', 'Ga0308421_1129311', 'Ga0308421_1132773', 'Ga0308421_1101571', 'Ga0308421_1118432', 'Ga0308421_1115516', 'Ga0308421_1156899', 'Ga0308421_1030504', 'Ga0308421_1145656', 'Ga0308421_1008446', 'Ga0308421_1137178', 'Ga0308421_1039854', 'Ga0308421_1068533', 'Ga0308421_1138531', 'Ga0308421_1100278', 'Ga0308421_1099293', 'Ga0308421_1132427', 'Ga0308421_1012599', 'Ga0308421_1105851', 'Ga0308421_1036512', 'Ga0308421_1099171', 'Ga0308421_1018772', 'Ga0308421_1080804', 'Ga0308421_1079897', 'Ga0308421_1127311', 'Ga0308421_1015783', 'Ga0308421_1049796', 'Ga0308421_1113224', 'Ga0308421_1015914', 'Ga0308421_1072700', 'Ga0308421_1123831', 'Ga0308421_1072171', 'Ga0308421_1149555', 'Ga0308421_1