In [4]:
import pandas as pd
from os import listdir
import numpy as np

# Input locations used by the loader functions below.
# NOTE(review): these are absolute, machine-specific paths; the notebook will only
# run where this mount exists.

# Root holding one '<sample>_scaffolds/' directory per sample (read by Load_Coords_Info).
Scaffolds_path = '/Users/harihara/Mount/Strain_Analysis/Scaffolds/'
# Root holding one '<sample>/' directory of bin files per sample (read by Load_Bins).
Bins_path = '/Users/harihara/Mount/Strain_Analysis/Metabat_Results/asm_graph_coverages/'
# Root holding '<sample>/Coords_After_Delinking.txt' (read by Load_Coords_Info).
Binnacle_path = '/Users/harihara/Mount/Strain_Analysis/Binnacle/'
# Directory holding '<sample>_bin_stats.txt' files (read by Parse_Checkm_Results).
CheckM_Path = '/Users/harihara/Mount/Strain_Analysis/CheckM-Results/asm_graph_coverages/BinStats/'
# Directory holding the 'osa_', 'osb_' and 'osab_' contig lists (read by Load_Synechococcus_Alignments).
synec_aligns = '/Users/harihara/Mount/Strain_Analysis/Synechococcus_alignments/'

def Get_Feature_Information(filedir):
    """Build a contig -> feature-label table from the feature files in ``filedir``.

    Four files are read: ``three_bubbles``, ``four_bubbles``, ``complex_bubbles``
    (one tab-separated list of contigs per line) and ``plasmids`` (one
    Python-list-style line such as ``['c1', 'c2']`` per plasmid). Every contig
    on line ``i`` of a file is labelled ``<Prefix><i>`` (e.g. ``Three_Bubble_0``,
    ``Plasmid_3``). Files are processed in the order above, so a contig listed
    in several files keeps the label of the last file that mentions it.

    Parameters
    ----------
    filedir : str
        Directory prefix; file names are appended by plain string
        concatenation, so it must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Contig`` with a single ``Feature`` column.

    Raises
    ------
    OSError
        If any of the four files cannot be opened.
    """
    def _read_lines(filename):
        # Context manager guarantees the handle is closed after reading.
        with open(filedir + filename, 'r') as handle:
            return handle.readlines()

    feature_dict = dict()

    bubble_files = (('three_bubbles', 'Three_Bubble_'),
                    ('four_bubbles', 'Four_Bubble_'),
                    ('complex_bubbles', 'Complex_Bubble_'))
    for filename, prefix in bubble_files:
        # The counter is the line number, so blank lines still consume an id.
        for bub_ctr, bubble in enumerate(_read_lines(filename)):
            for c in bubble.replace("\n", "").split("\t"):
                # Trailing tabs / blank lines yield empty tokens, which are not contigs.
                if len(c) == 0:
                    continue
                feature_dict[c] = prefix + str(bub_ctr)

    for bub_ctr, plasmid in enumerate(_read_lines('plasmids')):
        # Strip the list punctuation, leaving a comma-separated run of quoted names.
        stripped = plasmid.replace("\n", "").replace("[", "").replace("]", "").replace(" ", "")
        for c in stripped.split(","):
            c = c.replace("'", "")
            # An empty list ("[]") or a blank line produces an empty token.
            if len(c) == 0:
                continue
            feature_dict[c] = 'Plasmid_' + str(bub_ctr)

    df_feature = pd.DataFrame(data = {'Contig': list(feature_dict.keys()),
                                      'Feature': list(feature_dict.values())})
    return df_feature.set_index('Contig')

def Load_Coords_Info(sampleid):
    """Load a sample's contig coordinates and annotate them with feature labels.

    Reads ``<Binnacle_path><sampleid>/Coords_After_Delinking.txt`` (tab
    separated, no header) and the feature table produced by
    ``Get_Feature_Information`` for ``<Scaffolds_path><sampleid>_scaffolds/``.
    For each feature label, ``Num-components`` is the number of distinct
    ``CC_After_Delinking`` values among its contigs.

    Returns
    -------
    (coords, feature_components)
        ``coords``: the coordinate rows indexed by ``CC_After_Delinking`` with
        ``Contig``, ``Feature`` and ``Num-components`` columns (the latter two
        are NaN for contigs without a feature).
        ``feature_components``: one row per (``CC_After_Delinking``,
        ``Feature``) pair, indexed by ``CC_After_Delinking``, with columns
        ``Feature`` and the group mean of ``Num-components``.
    """
    coord_columns = ['CC_After_Delinking', 'CC_Before_Delinking', 'Contig', 'Start', 'End',
                     'In_Graph', 'Length']
    feature_table = Get_Feature_Information(Scaffolds_path+sampleid+'_scaffolds/')
    coords = pd.read_csv(Binnacle_path+sampleid+'/Coords_After_Delinking.txt', sep = '\t',
                         names = coord_columns, index_col = 'Contig')

    # Contigs that carry a feature, with the component each one ended up in.
    annotated = coords.join(feature_table, how = 'inner')[['CC_After_Delinking', 'Feature']]

    # Number of distinct components each feature is spread across.
    spread = annotated.groupby('Feature').agg({'CC_After_Delinking':'nunique'})
    spread = spread.rename(columns = {'CC_After_Delinking':'Num-components'}).reset_index()

    # Attach that count to every annotated contig; the component id is dropped
    # because the coordinate table already has it.
    annotated = annotated.reset_index().merge(spread, on = 'Feature').set_index('Contig')
    annotated = annotated.drop(columns = ['CC_After_Delinking'])

    coords = coords.join(annotated, how = 'left')

    per_component = coords.groupby(['CC_After_Delinking', 'Feature']).mean()
    feature_components = per_component[['Num-components']].reset_index(level=1)

    coords = coords.reset_index().set_index('CC_After_Delinking')
    return coords, feature_components

def agg_num_bubbles(group):
    """Count the rows of ``group`` whose ``'Num-components'`` value equals 1.

    Used as a ``groupby(...).apply`` callback in ``Load_Bins``.

    The previous single-row branch tested ``len(group.iloc[0] == 1)``, i.e. the
    length of a boolean Series, which is truthy for any row; a lone row was
    therefore always counted, even when its ``Num-components`` was not 1. The
    same rule is now applied to groups of every size.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain a ``'Num-components'`` column.

    Returns
    -------
    int
        Number of rows with ``Num-components == 1`` (0 for an empty group;
        NaN values never match).
    """
    return int((group['Num-components'] == 1).sum())
    
def Load_Bins(sampleid):
    """Combine per-component statistics with the bin assignment of a sample.

    Every file in ``<Bins_path><sampleid>/`` whose name does not start with
    ``'cluster'`` is treated as one bin: each non-blank line is parsed as an
    integer ``CC_After_Delinking`` id and mapped to the file name (the BinID).
    If an id appears in several files, the file listed last by ``listdir`` wins.

    Parameters
    ----------
    sampleid : str
        Sample identifier used to build the input paths.

    Returns
    -------
    (df_stats, df_bin_contigs)
        ``df_stats``: per-``BinID`` sums of ``Num_Contigs``, ``TotalBP``,
        ``Num_Bubbles``, ``Broken_Bubbles``, ``Num_Scaffolds`` and
        ``Num_Singleton``; components not listed in any bin are grouped under
        ``'Unbinned'``.
        ``df_bin_contigs``: the coordinate table from ``Load_Coords_Info``
        (indexed by ``CC_After_Delinking``) with a ``BinID`` column.

    Raises
    ------
    ValueError
        If a non-blank line of a bin file is not an integer.
    """
    df_coords, df_features = Load_Coords_Info(sampleid)

    # Size of each component: number of contigs and total length.
    df_coords_stats = df_coords.reset_index().groupby('CC_After_Delinking').agg({'Contig': 'count',
                                                                                 'Length': 'sum'})
    df_coords_stats = df_coords_stats.rename(columns={'Contig':'Num_Contigs', 'Length':'TotalBP'})

    # Features per component: 'Feature' counts all of them, 'Num_Bubbles' those
    # counted by agg_num_bubbles; the difference is reported as 'Broken_Bubbles'.
    df_feature_agg = df_features.reset_index().groupby('CC_After_Delinking')[['Feature']].count()
    df_feature_agg['Num_Bubbles'] = df_features.groupby('CC_After_Delinking').apply(agg_num_bubbles)
    df_feature_agg['Broken_Bubbles'] = df_feature_agg['Feature'] - df_feature_agg['Num_Bubbles']
    del df_feature_agg['Feature']
    df_coords_stats = df_coords_stats.join(df_feature_agg, how = 'left')
    # Components without any feature have no row in df_feature_agg.
    df_coords_stats['Num_Bubbles'] = df_coords_stats['Num_Bubbles'].fillna(0)
    df_coords_stats['Broken_Bubbles'] = df_coords_stats['Broken_Bubbles'].fillna(0)

    d = {}
    for b in listdir(Bins_path+sampleid+'/'):
        if b.startswith('cluster'):
            continue
        # 'with' closes each bin file; previously the handles were leaked.
        with open(Bins_path+sampleid+'/'+b) as handle:
            for s in handle:
                s = s.strip()
                # A blank (e.g. trailing) line is not an id and used to crash int('').
                if not s:
                    continue
                d[int(s)] = b

    df_op = pd.DataFrame(data = {'CC_After_Delinking':list(d.keys()), 'BinID':list(d.values())})
    df_op = df_op.set_index('CC_After_Delinking')
    df_bin_contigs = df_op.join(df_coords, how = 'right')
    df_bin_contigs['BinID'] = df_bin_contigs['BinID'].fillna('Unbinned')

    df_op = df_op.join(df_coords_stats, how = 'outer')
    df_op['BinID'] = df_op['BinID'].fillna('Unbinned')
    # Flag each component as a multi-contig scaffold or a single-contig
    # singleton so that the per-bin sum yields counts of each kind.
    df_op['Num_Scaffolds'] = 0
    df_op['Num_Singleton'] = 0
    df_op.loc[df_op['Num_Contigs'] > 1, 'Num_Scaffolds'] = 1
    df_op.loc[df_op['Num_Contigs'] == 1, 'Num_Singleton'] = 1

    df_stats = df_op.groupby('BinID').sum()
    return df_stats, df_bin_contigs

def Parse_Checkm_Results(sampleid):
    """Parse a sample's CheckM bin-stats table and join it onto the bin summary.

    Reads ``<CheckM_Path><sampleid>_bin_stats.txt``, skips its first three
    lines and splits every remaining line into space-separated tokens. A space
    that is immediately followed by ``'('`` is kept inside the token, so a
    value such as ``name (UID123)`` stays in one field.

    Parameters
    ----------
    sampleid : str
        Sample identifier used to build the input paths.

    Returns
    -------
    (df, df_bins)
        ``df``: per-bin statistics from ``Load_Bins`` with the columns
        ``Marker lineage``, ``# genomes``, ``Completeness``, ``Contamination``,
        ``StrainHeterogeneity`` and ``Contamination_Normalized`` joined on
        (right join, so bins absent from the CheckM table keep NaN there).
        ``df_bins``: the per-contig table returned by ``Load_Bins``.

    Raises
    ------
    ValueError
        If the parsed rows do not yield exactly ``len(column_names)`` columns.
    """
    df_stats, df_bins = Load_Bins(sampleid)
    filepath = CheckM_Path+sampleid+'_bin_stats.txt'
    column_names = ['Bin' ,'Marker lineage','# genomes','# markers','# marker sets','Completeness',
                    'Contamination','StrainHeterogeneity','Genome size (bp)','# ambiguous bases',
                    '# scaffolds','# contigs','N50 (scaffolds)','N50 (contigs)','Mean scaffold length (bp)',
                    'Mean contig length (bp)','Longest scaffold (bp)','Longest contig (bp)','GC',
                    'GC std (scaffolds > 1kbp)','Coding density','Translation table','# predicted genes',
                    '0','1','2','3','4','5+']

    # 'with' closes the file; previously the handle was leaked.
    with open(filepath, 'r') as handle:
        lines = handle.readlines()

    mat = []
    # NOTE(review): the first three lines are assumed to be the table header - confirm.
    for l in lines[3:]:
        temp_str = ''
        vec = []
        # Scanning starts at index 1, so the first character of each line is ignored.
        for i in range(1, len(l)):
            # Bounds check: a line whose last character is a space used to raise IndexError.
            next_is_paren = i + 1 < len(l) and l[i+1] == '('
            if l[i] != ' ' or next_is_paren:
                temp_str += l[i]
            else:
                if len(temp_str) > 0:
                    vec.append(temp_str)
                temp_str = ''
        # Whatever is still in temp_str here (e.g. the trailing newline) is discarded.
        mat.append(vec)
    df = pd.DataFrame(data = mat)
    df.columns = column_names
    df = df.set_index('Bin')
    df[['Completeness','Contamination',
        'StrainHeterogeneity']] = df[['Completeness','Contamination','StrainHeterogeneity']].apply(pd.to_numeric)
    # Explicit copy so the column added next is not written onto a slice of the parsed frame.
    df = df[['Marker lineage','# genomes','Completeness','Contamination','StrainHeterogeneity']].copy()
    df['Contamination_Normalized'] = df['Contamination']*(100-df['StrainHeterogeneity'])/100
    # Lines that produced no tokens become all-NaN rows and are removed here.
    df = df.dropna()
    df = df.join(df_stats, how = 'right')
    return df, df_bins

def Load_Synechococcus_Alignments(sampleid):
    """Read the three Synechococcus contig lists for a sample.

    The files ``osa_<sampleid>.final_contig.list``,
    ``osb_<sampleid>.final_contig.list`` and
    ``osab_<sampleid>.common_contigs.list`` are read from ``synec_aligns``;
    each line (newline removed) becomes one list entry.

    Parameters
    ----------
    sampleid : str
        Sample identifier embedded in the file names.

    Returns
    -------
    (osa, osb, osab) : tuple of list of str
        The entries of the three files, in file order.

    Raises
    ------
    OSError
        If any of the three files cannot be opened.
    """
    def _read_list(filename):
        # Context manager closes the handle; previously all three were leaked.
        with open(synec_aligns + filename, 'r') as handle:
            return [line.replace("\n", "") for line in handle]

    osa = _read_list('osa_'+sampleid+'.final_contig.list')
    osb = _read_list('osb_'+sampleid+'.final_contig.list')
    osab = _read_list('osab_'+sampleid+'.common_contigs.list')
    return osa, osb, osab


In [6]:
# For each sample, build the per-bin summary (CheckM columns joined onto the
# bin statistics) and the per-contig table with bin assignments.
# The *_bins frames are reused by the cells below.
df_MS50_summ, df_MS50_bins = Parse_Checkm_Results('MS50')
df_MS55_summ, df_MS55_bins = Parse_Checkm_Results('MS55')
df_MS60_summ, df_MS60_bins = Parse_Checkm_Results('MS60')
df_MS65_summ, df_MS65_bins = Parse_Checkm_Results('MS65')

In [8]:
osa, osb, osab = Load_Synechococcus_Alignments('MS50')
# Total length of the Synechococcus-aligned contigs per bin, largest first.
# 'Length' is selected before summing so that only that column is aggregated;
# summing every column (including the string 'Feature' column) and discarding
# the rest is wasteful and depends on the pandas version's numeric_only default.
# NOTE(review): a contig present in more than one of the three lists is selected
# (and its length counted) once per occurrence - confirm that this is intended.
(df_MS50_bins.reset_index().set_index('Contig')
 .loc[osa + osb + osab]
 .groupby('BinID')[['Length']].sum()
 .sort_values(by = 'Length', ascending = False))

Unnamed: 0_level_0,Length
BinID,Unnamed: 1_level_1
Unbinned,5701166
bin_result.69,208636
bin_result.9,46199


In [9]:
osa, osb, osab = Load_Synechococcus_Alignments('MS55')
# Total length of the Synechococcus-aligned contigs per bin, largest first.
# 'Length' is selected before summing so that only that column is aggregated;
# summing every column (including the string 'Feature' column) and discarding
# the rest is wasteful and depends on the pandas version's numeric_only default.
# NOTE(review): a contig present in more than one of the three lists is selected
# (and its length counted) once per occurrence - confirm that this is intended.
(df_MS55_bins.reset_index().set_index('Contig')
 .loc[osa + osb + osab]
 .groupby('BinID')[['Length']].sum()
 .sort_values(by = 'Length', ascending = False))

Unnamed: 0_level_0,Length
BinID,Unnamed: 1_level_1
Unbinned,4302393
bin_result.44,1912986
bin_result.85,129413
bin_result.27,30004
bin_result.90,19327
bin_result.94,13804
bin_result.63,2447
bin_result.102,2039
bin_result.40,556


In [29]:
osa, osb, osab = Load_Synechococcus_Alignments('MS60')
# Total length of the Synechococcus-aligned contigs per bin, largest first.
# 'Length' is selected before summing so that only that column is aggregated;
# summing every column (including the string 'Feature' column) and discarding
# the rest is wasteful and depends on the pandas version's numeric_only default.
# NOTE(review): a contig present in more than one of the three lists is selected
# (and its length counted) once per occurrence - confirm that this is intended.
(df_MS60_bins.reset_index().set_index('Contig')
 .loc[osa + osb + osab]
 .groupby('BinID')[['Length']].sum()
 .sort_values(by = 'Length', ascending = False))

Unnamed: 0_level_0,Length
BinID,Unnamed: 1_level_1
Unbinned,4383055
bin_result.25,1836832
bin_result.19,272527
bin_result.57,67064
bin_result.55,62421


In [15]:
osa, osb, osab = Load_Synechococcus_Alignments('MS65')
# Total length of the Synechococcus-aligned contigs per bin, largest first.
# 'Length' is selected before summing so that only that column is aggregated;
# summing every column (including the string 'Feature' column) and discarding
# the rest is wasteful and depends on the pandas version's numeric_only default.
# NOTE(review): a contig present in more than one of the three lists is selected
# (and its length counted) once per occurrence - confirm that this is intended.
(df_MS65_bins.reset_index().set_index('Contig')
 .loc[osa + osb + osab]
 .groupby('BinID')[['Length']].sum()
 .sort_values(by = 'Length', ascending = False))

Unnamed: 0_level_0,Length
BinID,Unnamed: 1_level_1
Unbinned,2210357
bin_result.39,1909395
bin_result.43,39193
bin_result.1,20946


In [17]:
# Per CC_After_Delinking value, count the non-null entries of every column among
# the selected contigs. osa/osb/osab are whatever the most recently executed
# Load_Synechococcus_Alignments cell assigned.
df_MS65_bins.reset_index().set_index('Contig').loc[osa+osb+osab].groupby('CC_After_Delinking').count()

Unnamed: 0_level_0,BinID,CC_Before_Delinking,Start,End,In_Graph,Length,Feature,Num-components
CC_After_Delinking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
46,1,1,1,1,1,1,0,0
59,11,11,11,11,11,11,2,2
118,5,5,5,5,5,5,0,0
141,2,2,2,2,2,2,0,0
192,3,3,3,3,3,3,0,0
...,...,...,...,...,...,...,...,...
508417,3,3,3,3,3,3,0,0
508502,1,1,1,1,1,1,0,0
508571,1,1,1,1,1,1,0,0
508589,1,1,1,1,1,1,0,0


In [22]:
# Show all contig rows whose index label (CC_After_Delinking) is 141.
df_MS65_bins.loc[141]

Unnamed: 0_level_0,BinID,Contig,CC_Before_Delinking,Start,End,In_Graph,Length,Feature,Num-components
CC_After_Delinking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
141,bin_result.39,k141_399530,101,0,9004,1,9004,,
141,bin_result.39,k141_703803,101,8751,9260,1,509,,
141,bin_result.39,k141_100826,101,7452,9056,1,1604,,
141,bin_result.39,k141_581881,101,13172,9111,1,4061,,
