In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from os import listdir, mkdir
from os.path import isdir


In [2]:
def Load_PAF(filepath):
    lines = open(filepath).readlines()
    header = ['Query','Qlen','QStart','QEnd','Orientation','Subject','SLen',
              'SStart','SEnd','Matches','AlignLength','MAPQ']
    op = []
    for l in lines:
        l = l.split('\t')[:12]
        op.append(dict(zip(header, l)))
    df = pd.DataFrame(op)
    df[['Qlen','QStart','QEnd','SLen','SStart',
        'SEnd','Matches','AlignLength','MAPQ']] = df[['Qlen','QStart','QEnd','SLen','SStart',
                                                      'SEnd','Matches','AlignLength','MAPQ']].astype('int')
    df['PIdent'] = df['AlignLength']/df['Qlen']*100
    df = df.loc[df.groupby(['Query'])['PIdent'].idxmax()]
    return df


In [3]:
def Return_Memberships(df_osa, df_osb, G, genome, quality_cutoff = 80):
    osa_contigs_aligned = df_osa[df_osa['PIdent'] >= quality_cutoff]['Query'].tolist()
    osa_contigs_not_aligned = df_osa[df_osa['PIdent'] < quality_cutoff]['Query'].tolist()

    osb_contigs_aligned = df_osb[df_osb['PIdent'] >= quality_cutoff]['Query'].tolist()
    osb_contigs_not_aligned = df_osb[df_osb['PIdent'] < quality_cutoff]['Query'].tolist()
    
    df_osa_grp = df_osa[df_osa['PIdent'] >= quality_cutoff]
    df_osa_grp = df_osa.set_index('Query').T.to_dict()
    
    df_osb_grp = df_osb[df_osb['PIdent'] >= quality_cutoff]
    df_osb_grp = df_osb.set_index('Query').T.to_dict()
    
    d_memberships = {}
    contigs = set(df_osa['Query'].tolist()).union(set(df_osb['Query'].tolist())).union(set(list(G.nodes())))
    for c in contigs:
        if c in osa_contigs_aligned:
            try: d_memberships[c]['Memberships'].append('OSA')
            except KeyError: d_memberships[c] = {'Memberships':['OSA']}

        if c in osb_contigs_aligned:
            try: d_memberships[c]['Memberships'].append('OSB')
            except KeyError: d_memberships[c] = {'Memberships':['OSB']}

        if c not in osa_contigs_aligned and c not in osb_contigs_aligned:
            d_memberships[c] = {'Memberships' : []} 
        
        try: d_memberships[c]['OSA_PIdent'] = df_osa_grp[c]['PIdent']
        except KeyError: d_memberships[c]['OSA_PIdent'] = 0
        
        try: d_memberships[c]['OSB_PIdent'] = df_osb_grp[c]['PIdent']
        except KeyError: d_memberships[c]['OSB_PIdent'] = 0
        
        if genome == 'OSA':
            try:
                Start = min(df_osa_grp[c]['SStart'], df_osa_grp[c]['SEnd'])
                End = max(df_osa_grp[c]['SStart'], df_osa_grp[c]['SEnd'])
                Orientation = df_osa_grp[c]['Orientation']
                d_memberships[c]['Start'] = Start
                d_memberships[c]['End'] = End
                d_memberships[c]['Orientation'] = Orientation
            except KeyError:
                d_memberships[c]['Start'] = -1
                d_memberships[c]['End'] = -1
                d_memberships[c]['Orientation'] = '*'
                
        if genome == 'OSB':
            try:
                Start = min(df_osb_grp[c]['SStart'], df_osb_grp[c]['SEnd'])
                End = max(df_osb_grp[c]['SStart'], df_osb_grp[c]['SEnd'])
                Orientation = df_osb_grp[c]['Orientation']
                d_memberships[c]['Start'] = Start
                d_memberships[c]['End'] = End
                d_memberships[c]['Orientation'] = Orientation
            except KeyError:
                d_memberships[c]['Start'] = -1
                d_memberships[c]['End'] = -1
                d_memberships[c]['Orientation'] = '*'
    nx.set_node_attributes(G, d_memberships)
    return G

In [38]:
def Assign_Coordinates(subg, genome):
    conn = subg.to_undirected()
    nodes = conn.nodes()
    alignments = []
    for n in nodes:
        if genome in conn.nodes[n]['Memberships']:
            alignments.append(n)
    if len(alignments) == 0:
        return
    else:
        print(len(alignments))
        start = alignments[0]
        Q = [start]
        visited = set({})
        while len(Q) > 0:
            n = Q.pop(0)
            visited.add(n)
            neighbors = conn.neighbors(n)
            for u in neighbors:
                if u not in alignments:
                    print(u)
                if genome not in conn.nodes[u]['Memberships']:
                    if subg.has_edge(n, u):
                        edge = (n, u)
                        edge_orientation = subg.edges[edge]['orientation']
                        edge_overlap = int(float(subg.edges[edge]['mean']))
                        c2_length = int(subg.nodes[u]['length'])
                        s1, e1 = subg.nodes[n]['Start'], subg.nodes[n]['End']
                        orientation = subg.nodes[n]['Orientation']
                        e1 = s1 + int(subg.nodes[n]['length'])
                        if orientation == '+' and subg.nodes[n]['orientation'] == 'REV':
                            s1, e1 = e1, s1
                        if orientation == '-' and subg.nodes[n]['orientation'] == 'REV':
                            s1, e1 = e1, s1
                        if edge_orientation == 'EE':
                            e2 = e1 + edge_overlap
                            s2 = e2 + c2_length
                        if edge_orientation == 'EB':
                            s2 = e1 + edge_overlap
                            e2 = s2 + c2_length
                        if edge_orientation == 'BB':
                            s2 = s1 + edge_overlap
                            e2 = s2 + c2_length
                        if edge_orientation == 'BE':
                            e2 = s1 + edge_overlap
                            s2 = e2 + c2_length
                        start,end = (s2, e2)
                        flag = 0
                    else:
                        edge = (u, n)
                        edge_orientation = subg.edges[edge]['orientation']
                        edge_overlap = int(float(subg.edges[edge]['mean']))
                        c1_length = int(subg.nodes[u]['length'])
                        s2, e2 = subg.nodes[n]['Start'], subg.nodes[n]['End']
                        e2 = s2 + int(subg.nodes[n]['length'])
                        orientation = subg.nodes[n]['Orientation']
                        if orientation == '+' and subg.nodes[n]['orientation'] == 'REV':
                            s2, e2 = e2, s2
                        if orientation == '-' and subg.nodes[n]['orientation'] == 'REV':
                            s2, e2 = e2, s2
                        if edge_orientation == 'EE':
                            e1 = e2 - edge_overlap
                            s1 = e1 - c1_length
                        if edge_orientation == 'EB':
                            e1 = s2 - edge_overlap
                            s1 = e1 - c1_length
                        if edge_orientation == 'BB':
                            s1 = s2 - edge_overlap
                            e1 = s1 - c1_length
                        if edge_orientation == 'BE':
                            s1 = e2 - edge_overlap
                            e1 = s1 - c1_length
                        start,end = (s1,e1)
                        flag = 1
                    d = {'Contig':u, 'Parent_Node':n,'Start':start, 'End':end,
                         'Membership':subg.nodes[u]['Memberships'],
                         'Contig_Length':int(subg.nodes[u]['length']),
                         'Parent_Length':int(subg.nodes[n]['length']), 
                         'OSA_PIdent(Contig)':subg.nodes[u]['OSA_PIdent'],
                         'OSB_PIdent(Contig)':subg.nodes[u]['OSB_PIdent'],
                         'OSA_PIdent(Parent)':subg.nodes[n]['OSA_PIdent'],
                         'OSB_PIdent(Parent)':subg.nodes[n]['OSB_PIdent'],
                         'Contig_Orientation(MetaCarvel)':subg.nodes[u]['orientation'],
                         'Contig_Orientation(Minimap2)':subg.nodes[u]['Orientation'],
                         'Parent_Orientation(MetaCarvel)':subg.nodes[n]['orientation'],
                         'Parent_Orientation(Minimap2)':subg.nodes[n]['Orientation']}
                    conn.nodes[u]['Start'] = start
                    conn.nodes[u]['End'] = end
                    conn.nodes[u]['Memberships'].append('Graph')

                    subg.nodes[u]['Start'] = start
                    subg.nodes[u]['End'] = end
                    subg.nodes[u]['Memberships'].append('Graph')
                    
                    print(d)
                    
                    if flag == 0:
                        d['Parent_Type'] = 'Parent'
                    elif flag == 1:
                        d['Parent_Type'] = 'Descendant'
                    ####Assign Coordinates to u based on n
                if u not in visited:
                    Q.append(u)
        
    

In [5]:
pardir = '/Users/harihara/Mount-2/hotspring_metagenome/Synechococcus_paper_analysis/reassembly/'

f = 'HotsprottomLayer_megahit_assembled_contigs_osa'

sample_id = f.replace("_megahit_assembled_contigs_osa","")
osa_path = pardir+'contig_mapping_paf/'+sample_id+'_osa_contigs_aligned_to_osa.paf'
osb_path = pardir+'contig_mapping_paf/'+sample_id+'_osa_contigs_aligned_to_osb.paf'
g_path =  pardir+f+'/'+sample_id+'_osa_scaffolds/oriented.gml'

In [8]:
df_osa = Load_PAF(osa_path)
df_osb = Load_PAF(osb_path)

G = nx.read_gml(g_path)
G = Return_Memberships(df_osa, df_osb, G, "OSA", quality_cutoff = 80)

In [39]:
connected_components = list(nx.weakly_connected_components(G))

In [40]:
for c in connected_components:
    print("*", len(c))
    subg = G.subgraph(c)
    Assign_Coordinates(subg, "OSA")

* 1
1
* 1
* 35
15
k141_4734
{'Contig': 'k141_4734', 'Parent_Node': 'k141_5981', 'Start': 335900, 'End': 335357, 'Membership': ['Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph'], 'Contig_Length': 543, 'Parent_Length': 340, 'OSA_PIdent(Contig)': 73.48066298342542, 'OSB_PIdent(Contig)': 73.48066298342542, 'OSA_PIdent(Parent)': 99.11764705882354, 'OSB_PIdent(Parent)': 103.5294117647059, 'Contig_Orientation(MetaCarvel)': 'REV', 'Contig_Orientation(Minimap2)': '+', 'Parent_Orientation(MetaCarvel)': 'FOW', 'Parent_Orientation(Minimap2)': '-'}
k141_1339
{'Contig': 'k141_1339', 'Parent_Node': 'k141_5981', 'Start': 335078, 'End': 335563, 'Membership': ['Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph'], 'Contig_Length': 485, 'Parent_Length': 340, 'OSA_PIdent(Contig)': 69.89690721649484, 'OSB_PIdent(Contig)': 0, 'OSA_PIdent(Parent)': 99.11764705882354, 'OSB_PIdent(Parent)': 103.5294117647059, 'Contig_Orientation(MetaCarvel)': 'FOW', 'Contig_Orientation(Minimap2)': '-', 'P

k141_2636
{'Contig': 'k141_2636', 'Parent_Node': 'k141_3460', 'Start': 994308, 'End': 995321, 'Membership': ['Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph'], 'Contig_Length': 1013, 'Parent_Length': 262, 'OSA_PIdent(Contig)': 54.689042448173744, 'OSB_PIdent(Contig)': 74.33366238894374, 'OSA_PIdent(Parent)': 97.32824427480917, 'OSB_PIdent(Parent)': 100.0, 'Contig_Orientation(MetaCarvel)': 'FOW', 'Contig_Orientation(Minimap2)': '+', 'Parent_Orientation(MetaCarvel)': 'FOW', 'Parent_Orientation(Minimap2)': '-'}
k141_987
{'Contig': 'k141_987', 'Parent_Node': 'k141_3460', 'Start': 993794, 'End': 995658, 'Membership': ['Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph

{'Contig': 'k141_2375', 'Parent_Node': 'k141_3144', 'Start': 694994, 'End': 693498, 'Membership': ['Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph'], 'Contig_Length': 1496, 'Parent_Length': 374, 'OSA_PIdent(Contig)': 71.4572192513369, 'OSB_PIdent(Contig)': 71.5909090909091, 'OSA_PIdent(Parent)': 100.0, 'OSB_PIdent(Parent)': 100.0, 'Contig_Orientation(MetaCarvel)': 'REV', 'Contig_Orientation(Minimap2)': '-', 'Parent_Orientation(MetaCarvel)': 'REV', 'Parent_Orientation(Minimap2)': '-'}
k141_2375
{'Contig': 'k141_2375', 'Parent_Node': 'k141_6513', 'Start': 693434, 'End': 691938, 'Membership': ['Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'G

k141_17
{'Contig': 'k141_17', 'Parent_Node': 'k141_1603', 'Start': 1444945, 'End': 1444418, 'Membership': ['Graph', 'Graph', 'Graph', 'Graph'], 'Contig_Length': 527, 'Parent_Length': 234, 'OSA_PIdent(Contig)': 43.07400379506641, 'OSB_PIdent(Contig)': 60.91081593927894, 'OSA_PIdent(Parent)': 100.0, 'OSB_PIdent(Parent)': 0, 'Contig_Orientation(MetaCarvel)': 'REV', 'Contig_Orientation(Minimap2)': '-', 'Parent_Orientation(MetaCarvel)': 'FOW', 'Parent_Orientation(Minimap2)': '-'}
k141_143
{'Contig': 'k141_143', 'Parent_Node': 'k141_1603', 'Start': 1444455, 'End': 1444942, 'Membership': ['OSB', 'Graph', 'Graph', 'Graph', 'Graph'], 'Contig_Length': 487, 'Parent_Length': 234, 'OSA_PIdent(Contig)': 28.54209445585216, 'OSB_PIdent(Contig)': 100.0, 'OSA_PIdent(Parent)': 100.0, 'OSB_PIdent(Parent)': 0, 'Contig_Orientation(MetaCarvel)': 'FOW', 'Contig_Orientation(Minimap2)': '+', 'Parent_Orientation(MetaCarvel)': 'FOW', 'Parent_Orientation(Minimap2)': '-'}
k141_801
{'Contig': 'k141_801', 'Parent_Nod

* 2
2
* 8
8
* 4
3
k141_3050
{'Contig': 'k141_3050', 'Parent_Node': 'k141_1343', 'Start': 2688614, 'End': 2689300, 'Membership': ['OSB', 'Graph'], 'Contig_Length': 686, 'Parent_Length': 1888, 'OSA_PIdent(Contig)': 68.07580174927114, 'OSB_PIdent(Contig)': 96.6472303206997, 'OSA_PIdent(Parent)': 100.3707627118644, 'OSB_PIdent(Parent)': 100.0, 'Contig_Orientation(MetaCarvel)': 'FOW', 'Contig_Orientation(Minimap2)': '-', 'Parent_Orientation(MetaCarvel)': 'FOW', 'Parent_Orientation(Minimap2)': '-'}
* 1
* 2
2
* 6
6
* 5
5
* 1
1
* 1
1
* 3
3
* 5
4
k141_2336
{'Contig': 'k141_2336', 'Parent_Node': 'k141_5109', 'Start': 1815914, 'End': 1815505, 'Membership': ['OSB', 'Graph'], 'Contig_Length': 409, 'Parent_Length': 2312, 'OSA_PIdent(Contig)': 34.96332518337408, 'OSB_PIdent(Contig)': 100.0, 'OSA_PIdent(Parent)': 100.0, 'OSB_PIdent(Parent)': 99.9567474048443, 'Contig_Orientation(MetaCarvel)': 'REV', 'Contig_Orientation(Minimap2)': '+', 'Parent_Orientation(MetaCarvel)': 'REV', 'Parent_Orientation(Minim

1
* 2
2
* 1
* 4
3
k141_584
{'Contig': 'k141_584', 'Parent_Node': 'k141_3951', 'Start': 1419992, 'End': 1420765, 'Membership': ['OSB', 'Graph'], 'Contig_Length': 773, 'Parent_Length': 1218, 'OSA_PIdent(Contig)': 79.94825355756791, 'OSB_PIdent(Contig)': 80.46571798188874, 'OSA_PIdent(Parent)': 100.24630541871922, 'OSB_PIdent(Parent)': 100.08210180623973, 'Contig_Orientation(MetaCarvel)': 'FOW', 'Contig_Orientation(Minimap2)': '+', 'Parent_Orientation(MetaCarvel)': 'FOW', 'Parent_Orientation(Minimap2)': '+'}
k141_584
{'Contig': 'k141_584', 'Parent_Node': 'k141_1883', 'Start': 1419890, 'End': 1420663, 'Membership': ['OSB', 'Graph', 'Graph'], 'Contig_Length': 773, 'Parent_Length': 311, 'OSA_PIdent(Contig)': 79.94825355756791, 'OSB_PIdent(Contig)': 80.46571798188874, 'OSA_PIdent(Parent)': 100.0, 'OSB_PIdent(Parent)': 100.64308681672026, 'Contig_Orientation(MetaCarvel)': 'FOW', 'Contig_Orientation(Minimap2)': '+', 'Parent_Orientation(MetaCarvel)': 'REV', 'Parent_Orientation(Minimap2)': '-'}
*

1
* 5
4
k141_4316
{'Contig': 'k141_4316', 'Parent_Node': 'k141_5632', 'Start': 207237, 'End': 206829, 'Membership': ['Graph'], 'Contig_Length': 408, 'Parent_Length': 520, 'OSA_PIdent(Contig)': 68.62745098039215, 'OSB_PIdent(Contig)': 69.36274509803921, 'OSA_PIdent(Parent)': 96.92307692307692, 'OSB_PIdent(Parent)': 100.0, 'Contig_Orientation(MetaCarvel)': 'REV', 'Contig_Orientation(Minimap2)': '-', 'Parent_Orientation(MetaCarvel)': 'REV', 'Parent_Orientation(Minimap2)': '-'}
k141_4316
{'Contig': 'k141_4316', 'Parent_Node': 'k141_784', 'Start': 207371, 'End': 206963, 'Membership': ['Graph', 'Graph'], 'Contig_Length': 408, 'Parent_Length': 556, 'OSA_PIdent(Contig)': 68.62745098039215, 'OSB_PIdent(Contig)': 69.36274509803921, 'OSA_PIdent(Parent)': 97.3021582733813, 'OSB_PIdent(Parent)': 100.0, 'Contig_Orientation(MetaCarvel)': 'REV', 'Contig_Orientation(Minimap2)': '-', 'Parent_Orientation(MetaCarvel)': 'FOW', 'Parent_Orientation(Minimap2)': '+'}
* 1
1
* 1
* 2
2
* 1
1
* 1
1
* 1
1
* 2
2
* 1

* 1
1
* 1
1
* 1
1
* 1
1
* 1
1
* 3
3
* 1
1
* 1
1
* 1
* 1
* 2
2
* 2
2
* 1
* 1
1
* 1
1
* 1
1
* 2
1
k141_4880
{'Contig': 'k141_4880', 'Parent_Node': 'k141_6306', 'Start': 894467, 'End': 895130, 'Membership': ['Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph'], 'Contig_Length': 663, 'Parent_Length': 336, 'OSA_PIdent(Contig)': 79.03469079939669, 'OSB_PIdent(Contig)': 79.18552036199095, 'OSA_PIdent(Parent)': 100.0, 'OSB_PIdent(Parent)': 100.0, 'Contig_Orientation(MetaCarvel)': 'FOW', 'Contig_Orientation(Minimap2)': '-', 'Parent_Orientation(MetaCarvel)': 'FOW', 'Parent_Orientation(Minimap2)': '+'}
* 1
1
* 1
1
* 2
1
k141_6024
{'Contig': 'k141_6024', 'Parent_Node': 'k141_4898', 'Start': 9716, 'End': 10417, 'Membership': ['OSB', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph', 'Graph'], 'Contig_Length': 701, 'Parent_Length': 797, 'OSA_PIdent(Contig)': 68.188302425107, 'OSB_PIdent(Contig)': 100.14265335235378, 'OSA_PIdent(Parent)': 99.49811794228356, 'OSB_PIdent(Parent)': 99.62358845671268, 'Contig