In [4]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from os import listdir, mkdir
from os.path import isdir

def Load_PAF(filepath):
    """Load a PAF alignment file and keep the best alignment per query.

    Only the first 12 tab-separated columns of every line are used; any
    further columns (optional tags) are ignored. Blank lines are skipped.

    Parameters
    ----------
    filepath : str
        Path to the tab-separated PAF file.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``Query`` with the 12 named columns plus
        ``PIdent``, computed here as ``AlignLength / Qlen * 100``. For each
        query, the row with the largest ``PIdent`` is retained. If the file
        contains no alignment lines, an empty frame with the same columns
        is returned.
    """
    header = ['Query','Qlen','QStart','QEnd','Orientation','Subject','SLen',
              'SStart','SEnd','Matches','AlignLength','MAPQ']
    int_columns = ['Qlen','QStart','QEnd','SLen','SStart',
                   'SEnd','Matches','AlignLength','MAPQ']

    records = []
    # The context manager closes the handle even if parsing fails.
    with open(filepath) as handle:
        for line in handle:
            line = line.rstrip('\r\n')
            # A blank line would yield a row of missing values that cannot
            # be cast to int further down.
            if not line.strip():
                continue
            fields = line.split('\t')[:12]
            records.append(dict(zip(header, fields)))

    if not records:
        return pd.DataFrame(columns=header + ['PIdent'])

    df = pd.DataFrame(records)
    df[int_columns] = df[int_columns].astype('int')
    df['PIdent'] = df['AlignLength']/df['Qlen']*100
    # idxmax returns the row label of the best-scoring alignment per query.
    df = df.loc[df.groupby(['Query'])['PIdent'].idxmax()]
    return df
 
def Assign_Coords(subg, filter_str):
    """Estimate coordinates for nodes of ``subg`` that lack ``filter_str``.

    Every node whose ``'Memberships'`` attribute does not contain
    ``filter_str`` is treated as a contig to be placed. For each such contig
    and each of its neighbours that has alignment coordinates
    (``'Orientation'`` different from ``'*'``), a start/end pair is derived
    from the neighbour's ``'Start'`` and ``'length'``, the connecting edge's
    ``'orientation'`` code (``EE``/``EB``/``BB``/``BE``) and the edge's
    ``'mean'`` value.

    Parameters
    ----------
    subg : networkx.DiGraph
        Directed (sub)graph. Nodes are read for the attributes
        ``Memberships``, ``Orientation``, ``orientation``, ``Start``,
        ``length``, ``OSA_PIdent`` and ``OSB_PIdent``; edges for
        ``orientation`` and ``mean``.
    filter_str : str
        Membership label (the callers in this file pass ``'OSA'`` or
        ``'OSB'``).

    Returns
    -------
    list of dict
        One record per (contig, placed neighbour) pair. ``'Parent_Type'`` is
        ``'Parent'`` when the edge runs neighbour -> contig and
        ``'Descendant'`` when it runs contig -> neighbour.
    """
    g_undir = subg.to_undirected()

    # Nodes that carry the requested membership label.
    anchored = set()
    for n in subg.nodes():
        try:
            if filter_str in subg.nodes[n]['Memberships']:
                anchored.add(n)
        except KeyError:
            # A missing node attribute raises KeyError (not NetworkXError);
            # such nodes are simply not counted as anchored.
            continue

    # Contigs to place, with the number of anchored neighbours each one has.
    values, orderslist = [], []
    for n in subg.nodes:
        try:
            if filter_str not in subg.nodes[n]['Memberships']:
                neighbors = list(subg.predecessors(n)) + list(subg.successors(n))
                values.append(len(set(neighbors).intersection(anchored)))
                orderslist.append(n)
        except KeyError:
            pass
    # Process contigs in ascending order of anchored-neighbour count.
    orderslist = np.array(orderslist)[np.argsort(values)]

    op = []
    for src in orderslist:
        for n in g_undir.neighbors(src):
            # Neighbours without alignment coordinates cannot anchor src.
            if subg.nodes[n]['Orientation'] == '*':
                continue

            # Prefer the n -> src edge when it exists; otherwise src -> n.
            neighbor_is_parent = subg.has_edge(n, src)
            edge = (n, src) if neighbor_is_parent else (src, n)
            edge_orientation = subg.edges[edge]['orientation']
            edge_overlap = int(float(g_undir.edges[edge]['mean']))
            src_length = int(g_undir.nodes[src]['length'])

            # The neighbour's span is its aligned start plus its own length.
            n_start = subg.nodes[n]['Start']
            n_end = n_start + int(subg.nodes[n]['length'])
            # A neighbour flagged 'REV' has its begin/end swapped.
            if (subg.nodes[n]['Orientation'] in ('+', '-')
                    and subg.nodes[n]['orientation'] == 'REV'):
                n_start, n_end = n_end, n_start

            if neighbor_is_parent:
                # src lies after the neighbour: offsets are added.
                if edge_orientation == 'EE':
                    end = n_end + edge_overlap
                    start = end + src_length
                elif edge_orientation == 'EB':
                    start = n_end + edge_overlap
                    end = start + src_length
                elif edge_orientation == 'BB':
                    start = n_start + edge_overlap
                    end = start + src_length
                elif edge_orientation == 'BE':
                    end = n_start + edge_overlap
                    start = end + src_length
                else:
                    # Unknown orientation code: skip rather than emit
                    # coordinates left over from a previous iteration.
                    continue
                parent_type = 'Parent'
            else:
                # src lies before the neighbour: offsets are subtracted.
                if edge_orientation == 'EE':
                    end = n_end - edge_overlap
                    start = end - src_length
                elif edge_orientation == 'EB':
                    end = n_start - edge_overlap
                    start = end - src_length
                elif edge_orientation == 'BB':
                    start = n_start - edge_overlap
                    end = start - src_length
                elif edge_orientation == 'BE':
                    start = n_end - edge_overlap
                    end = start - src_length
                else:
                    continue
                parent_type = 'Descendant'

            op.append({'Contig':src, 'Parent_Node':n,'Start':start, 'End':end,
                       'Membership':subg.nodes[src]['Memberships'],
                       'Contig_Length':int(subg.nodes[src]['length']),
                       'Parent_Length':int(subg.nodes[n]['length']),
                       'OSA_PIdent(Contig)':subg.nodes[src]['OSA_PIdent'],
                       'OSB_PIdent(Contig)':subg.nodes[src]['OSB_PIdent'],
                       'OSA_PIdent(Parent)':subg.nodes[n]['OSA_PIdent'],
                       'OSB_PIdent(Parent)':subg.nodes[n]['OSB_PIdent'],
                       'Contig_Orientation(MetaCarvel)':subg.nodes[src]['orientation'],
                       'Contig_Orientation(Minimap2)':subg.nodes[src]['Orientation'],
                       'Parent_Orientation(MetaCarvel)':subg.nodes[n]['orientation'],
                       'Parent_Orientation(Minimap2)':subg.nodes[n]['Orientation'],
                       'Parent_Type':parent_type})
    return op

def Return_Memberships(df_osa, df_osb, G, genome, quality_cutoff = 80):
    """Annotate graph nodes with alignment membership and coordinates.

    Parameters
    ----------
    df_osa, df_osb : pandas.DataFrame
        Alignment tables with at least the columns ``Query``, ``PIdent``,
        ``SStart``, ``SEnd`` and ``Orientation``, one row per query.
    G : networkx graph
        Graph whose nodes are contig names; it is modified in place.
    genome : str
        ``'OSA'`` or ``'OSB'``: selects which table supplies the ``Start``,
        ``End`` and ``Orientation`` attributes. For any other value those
        three attributes are not set.
    quality_cutoff : number, optional
        A contig is a member of a genome when its ``PIdent`` against that
        genome is strictly greater than this value.

    Returns
    -------
    The same graph ``G``, whose nodes now carry ``Memberships`` (list holding
    ``'OSA'`` and/or ``'OSB'``), ``OSA_PIdent``, ``OSB_PIdent`` (0 when the
    contig has no row in the table) and, for a recognised ``genome``,
    ``Start``/``End``/``Orientation`` (``-1``/``-1``/``'*'`` when the contig
    has no row in the selected table).
    """
    # Sets make the per-contig membership tests below O(1) instead of O(n).
    osa_contigs_aligned = set(df_osa.loc[df_osa['PIdent'] > quality_cutoff, 'Query'])
    osb_contigs_aligned = set(df_osb.loc[df_osb['PIdent'] > quality_cutoff, 'Query'])

    # Per-contig alignment records keyed by query name.
    # NOTE(review): these tables include alignments at or below
    # quality_cutoff, so PIdent and coordinates are reported for
    # low-quality hits too. The original code computed a filtered frame and
    # then overwrote it with this unfiltered one; confirm which was intended.
    df_osa_grp = df_osa.set_index('Query').T.to_dict()
    df_osb_grp = df_osb.set_index('Query').T.to_dict()

    if genome == 'OSA':
        coord_table = df_osa_grp
    elif genome == 'OSB':
        coord_table = df_osb_grp
    else:
        coord_table = None

    d_memberships = {}
    contigs = set(df_osa['Query']).union(df_osb['Query']).union(G.nodes())
    for c in contigs:
        memberships = []
        if c in osa_contigs_aligned:
            memberships.append('OSA')
        if c in osb_contigs_aligned:
            memberships.append('OSB')
        attrs = {'Memberships': memberships}

        attrs['OSA_PIdent'] = df_osa_grp[c]['PIdent'] if c in df_osa_grp else 0
        attrs['OSB_PIdent'] = df_osb_grp[c]['PIdent'] if c in df_osb_grp else 0

        if coord_table is not None:
            hit = coord_table.get(c)
            if hit is None:
                # No alignment to the selected genome: mark as unplaced.
                attrs['Start'] = -1
                attrs['End'] = -1
                attrs['Orientation'] = '*'
            else:
                attrs['Start'] = min(hit['SStart'], hit['SEnd'])
                attrs['End'] = max(hit['SStart'], hit['SEnd'])
                attrs['Orientation'] = hit['Orientation']

        d_memberships[c] = attrs

    nx.set_node_attributes(G, d_memberships)
    return G

def Main(osa_path, osb_path, Graph_path, genome, quality_cutoff):
    """Build the table of estimated coordinates for one scaffold graph.

    Parameters
    ----------
    osa_path, osb_path : str
        Paths of the PAF files passed to ``Load_PAF``.
    Graph_path : str
        Path of the GML graph read with ``networkx.read_gml``.
    genome : str
        Membership label passed on to ``Return_Memberships`` and
        ``Assign_Coords``.
    quality_cutoff : number
        Threshold forwarded to ``Return_Memberships``.

    Returns
    -------
    pandas.DataFrame
        De-duplicated records produced by ``Assign_Coords``, with the
        ``Membership`` column converted to ``str``. An empty frame is
        returned when no node qualifies.

    Side effects
    ------------
    Prints the number of nodes in the graph.
    """
    df_osa = Load_PAF(osa_path)
    df_osb = Load_PAF(osb_path)

    G = nx.read_gml(Graph_path)
    G = Return_Memberships(df_osa, df_osb, G, genome, quality_cutoff = quality_cutoff)
    oplist = []
    print(len(G.nodes()))

    for c in G.nodes():
        neighbors = list(G.predecessors(c)) + list(G.successors(c))
        # Only neighbourhoods with at least one neighbour lacking the genome
        # label are processed; neighbours without a 'Memberships' attribute
        # are ignored for this check.
        has_unlabelled_neighbor = any(
            'Memberships' in G.nodes[n] and genome not in G.nodes[n]['Memberships']
            for n in neighbors
        )
        if has_unlabelled_neighbor:
            subg = G.subgraph([c]+neighbors)
            oplist += Assign_Coords(subg, genome)

    df_scaffold = pd.DataFrame(oplist)
    # With no records there is no 'Membership' column to convert.
    if df_scaffold.empty:
        return df_scaffold
    # Lists are unhashable; stringify so drop_duplicates can compare rows.
    df_scaffold['Membership'] = df_scaffold['Membership'].astype(str)
    df_scaffold = df_scaffold.drop_duplicates()
    return df_scaffold


1. Sort the contigs based on their start coordinate. 
2. Starting at the first contig, assign coordinates to its putative descendants based on all its neighbors found in the reference. 
3. The coordinates must respect the orientation, length and overlaps. 

In [5]:
# Driver: for every sample directory under Mount/ whose name contains 'osa'
# or 'osb', run Main on that sample's PAF files and scaffold graph, then
# write the resulting table as a tab-separated file.
files = listdir('Mount/')

# Create the output directories one level at a time if they are missing.
out_root = 'Research-Activities/Data/Hot-Spring/Missing_Contig_Coords/'
for out_dir in (out_root, out_root + 'OSA/', out_root + 'OSB/'):
    if not isdir(out_dir):
        mkdir(out_dir)

for f in files:
    if not isdir('Mount/'+f):
        continue
    # tag appears in directory and file names; genome is the label given to Main.
    for tag, genome in (('osa', 'OSA'), ('osb', 'OSB')):
        if tag not in f:
            continue
        sample_id = f.replace("_megahit_assembled_contigs_" + tag, "")
        paf_prefix = ('/Users/harihara/Mount/contig_mapping_paf/' + sample_id
                      + '_' + tag + '_contigs_aligned_to_')
        osa_path = paf_prefix + 'osa.paf'
        osb_path = paf_prefix + 'osb.paf'
        g_path = '/Users/harihara/Mount/' + f + '/' + sample_id + '_' + tag + '_scaffolds/oriented.gml'
        df_scaffold = Main(osa_path, osb_path, g_path, genome, 80)
        print(f,len(df_scaffold))
        df_scaffold.to_csv(out_root + genome + '/' + sample_id + '.txt',
                           sep = "\t")

20197
HotsprSampleOSM4_megahit_assembled_contigs_osa 10293
16528
Hotspr20Samplet1_megahit_assembled_contigs_osb 6463
28551
HotsprSampleOS65_megahit_assembled_contigs_osb 4178
15134
HotsprSampleMS65_megahit_assembled_contigs_osa 5382
6563
HotsprSampOS1265_megahit_assembled_contigs_osb 3240
7252
HotsprSampleOSM2_megahit_assembled_contigs_osa 1466
11124
HotsprSampOS1260_megahit_assembled_contigs_osb 4843
31374
HotsprSampleOSM3_megahit_assembled_contigs_osb 8426
13530
HotsprSampleMSe3_megahit_assembled_contigs_osa 7892
11919
HotsprSampleOSM1_megahit_assembled_contigs_osa 4350
16700
HotsprSampleMS13_megahit_assembled_contigs_osa 12276
8782
HotsprSampleOS55_megahit_assembled_contigs_osb 5798
49373
Hotspr2Sampleee2_megahit_assembled_contigs_osb 26114
9079
HotsprSampleOS65_megahit_assembled_contigs_osa 3432
41894
Hotspr20SampleP4_megahit_assembled_contigs_osb 20747
17406
HotsprSamplt10cd_megahit_assembled_contigs_osa 10769
18573
HotsprSampleMSe2_megahit_assembled_contigs_osa 14041
15487
Hotspr