In [1]:
import os 
import sys
from subprocess import run
import importlib
import mixing
importlib.reload(mixing)
import glob
from adjustText import adjust_text
import pandas as pd
import seaborn as sns
import numpy as np
import re
import matplotlib.pyplot as plt
from collections import defaultdict
plt.rcParams["figure.figsize"] = (4,4)
sns.__version__

'0.13.2'

In [None]:
# 1) Run this to construct long read assemblies
RUN_FLY_BASH = '''
#!/bin/bash 
#SBATCH --job-name=checkM
#SBATCH --time=4:00:00
#SBATCH --mem=96G
#SBATCH --cpus-per-task=16
#SBATCH --exclude=m005
#SBATCH --account pmg
#SBATCH --output=/burg/pmg/users/ic2465/Projects/MANU_copangraph/KmerMixing/flye-%x-%j.log

flye --meta --threads 16 --pacbio-hifi ${SAMPLE} -o ${OUTDIR}
'''

# 2) Select all contigs in all long read assemblies with length > 1M
# python split_and_filter_long_reads.py <ASSEMBLY> 1 <CANDIATE_GENOMES_OUTDIR>

# 3) Run checkM on all contigs > 1M 
RUN_CHECKM_BASH = '''
#!/bin/bash 
#SBATCH --job-name=checkM
#SBATCH --time=4:00:00
#SBATCH --mem=90G
#SBATCH --cpus-per-task=16
#SBATCH --exclude=m005
#SBATCH --account pmg
#SBATCH --output=/burg/pmg/users/ic2465/Projects/MANU_copangraph/KmerMixing/checkM-%x-%j.log
checkm lineage_wf -x fa $DIR ${DIR}/checkm_out -t 16 --file output.txt
'''

# 4) Select contigs with Contamination < 5%
# checkm qa checkm_out/lineage.ms checkm_out -o 2 --tab_table --file checkm_output.tsv
# then filter table to contamination < 5 in ipython. Saved to Bin_contamination_lt5.csv

# 5) Run dRep at 99, 97, 95 % ANI
# /usr/bin/time -v dRep dereplicate -g low_contam_bins/*.fa -p 32 --ignoreGenomeQuality -sa 0.95 --clusterAlg average dreped_bins_95


In [34]:
# 6) select representatives
ANI=95
drep_data = pd.read_csv(f'../data/KmerMixing/gherig/bins/dreped_bins_{ANI}/data_tables/Cdb.csv')
checkm_data = pd.read_csv('../data/KmerMixing/gherig/bins/checkm_out/Bins_contamination_lt5.csv')
drep_data['bin_id'] = drep_data.genome.apply(lambda x: os.path.splitext(x)[0])
full_data = pd.merge(checkm_data, drep_data, on='bin_id')

cluster_reps = full_data.groupby('secondary_cluster').apply(lambda x: x.loc[x.completeness == x.completeness.max(),:].iloc[0,:]).reset_index(drop=True)
cluster_reps.to_csv(f'../data/KmerMixing/gherig/bins/drep_representatives_{ANI}/cluster_reps.csv',index=None)
# 7) move genomes to representative dir
ALL_BINS= '/burg/pmg/users/ic2465/Projects/MANU_copangraph/data/KmerMixing/gherig/bins/contigs_gt_1M'
REP_BINS= f'/burg/pmg/users/ic2465/Projects/MANU_copangraph/data/KmerMixing/gherig/bins/drep_representatives_{ANI}'
for b in cluster_reps.bin_id:
    cmd = 'cp ' + os.path.join(ALL_BINS, f'{b}.fa') + ' ' + os.path.join(REP_BINS, f'{b}.fa')
    run(cmd, shell=True)

  cluster_reps = full_data.groupby('secondary_cluster').apply(lambda x: x.loc[x.completeness == x.completeness.max(),:].iloc[0,:]).reset_index(drop=True)


In [1]:
# 7) call gherig run to launch the seq in genome analysii
RUN_GHERIG_MIXING='''#!/bin/bash 
#SBATCH --job-name=kmer-mixing
#SBATCH --time=48:00:00
#SBATCH --mem=128G
#SBATCH --cpus-per-task=1
#SBATCH --exclude=m005
#SBATCH --account pmg
#SBATCH --output=/burg/pmg/users/ic2465/Projects/MANU_copangraph/KmerMixing/repmix-%x-%j.log
python replicate_run.py ${r} ${g}
'''

In [6]:
# - some useful paths
ANI = 99
#PREF=f'hmlgy_panel_gherig_mag{ANI}_'
PREF_COV=f'hmlgy_panel_coverage_magani{ANI}_'
PREF_MULTI=f'hmlgy_panel_multigenome_magani{ANI}_'
OUTDIR='/burg/pmg/users/ic2465/Projects/MANU_copangraph/data/KmerMixing/gherig/'

In [None]:
# 8) preprocess raw output
get_base = lambda x: os.path.splitext(os.path.basename(x))[0]
data = pd.read_csv(f'../data/KmerMixing/gherig/ani_{ANI}_mixing_results.csv', index_col=0)
data['file'] = data.tool
data['genome_ani'] = ANI
data.tool = data.file.apply(lambda x: 'copangraph' if 'copangraph' in get_base(x) else 'megahit')
data['parameter'] = data.file.apply(lambda x: float(get_base(x)[1:]) if 'copangraph' not in get_base(x) else float(get_base(x).split('_')[3]))
data
plot_dat = data.groupby(['parameter', 'tool', 'file', 'genome_ani']).apply(
    lambda x: pd.Series(
        [
            x.nid.shape[0],  # how many nodes?
            x.total_bp.sum(), # total bp in graph?
            x.max_bp.sum(),  # total bp in graph, if we only take the longest seq in each node?
            x.is_multi_genome.sum()/x.shape[0],  # proportion of multi genome nodes?
            x.is_single_genome.sum()/x.shape[0],  # proportion of single genome nodes?
            (x.unmapped_bp == x.total_bp).sum()/x.shape[0], # proportion of unmapped nodes
            x.num_genomes_in.sum()/x.shape[0],  # average number of genomes per node?
            x.has_unmapped.sum()/x.shape[0], # proportion of nodes containing some unmapped sequence?
            x.unmapped_bp.sum(), # sum total unmapped
            x.unmapped_bp.sum()/x.total_bp.sum()# propotion bp unmappable
        ],
        index=[
            'num_nodes', 'total_bp_in_graph', 'sum_max_seq_per_node', 
            'multi_genome_node_proportion', 'single_genome_node_proportion', 'unmapped_node_proportion',
            'mean_num_genomes_per_node', 'node_has_unmapped_sequences_proportion', 
            'total_unmapped_bp', 'proportion_bp_unmappable'
        ]
    )
).reset_index()
plot_dat['proportion_bp_mappable'] = 1-plot_dat.proportion_bp_unmappable
print('n50 calc...')
plot_dat['n50'] = plot_dat.file.apply(lambda x: mixing.n50(x))
print('num edge calc...')
plot_dat['num_edges'] = plot_dat.file.apply(lambda x: mixing.num_edges(x))
plot_dat.to_csv(f'ani_{ANI}_plot_dat.csv', index=None)


In [None]:
# 9) plot data
plot_dat = pd.read_csv('ani_99_plot_dat.csv')
plot_dat


Unnamed: 0,parameter,tool,file,genome_ani,num_nodes,total_bp_in_graph,sum_max_seq_per_node,multi_genome_node_proportion,single_genome_node_proportion,unmapped_node_proportion,mean_num_genomes_per_node,node_has_unmapped_sequences_proportion,total_unmapped_bp,proportion_bp_unmappable,proportion_bp_mappable,n50,num_edges
0,0.005,copangraph,/burg/pmg/users/ic2465/Projects/MANU_copangrap...,99,1203098.0,1340150000.0,1058038000.0,0.044872,0.182432,0.772696,0.325932,0.790403,999600800.0,0.745887,0.254113,2279,275751
1,0.01,copangraph,/burg/pmg/users/ic2465/Projects/MANU_copangrap...,99,1223090.0,1339838000.0,955694300.0,0.05546,0.187229,0.757311,0.357938,0.791212,968227300.0,0.722645,0.277355,1771,316363
2,0.02,copangraph,/burg/pmg/users/ic2465/Projects/MANU_copangrap...,99,1243165.0,1339523000.0,868864800.0,0.069864,0.190843,0.739294,0.399913,0.792199,942258100.0,0.703428,0.296572,1472,355628
3,0.03,copangraph,/burg/pmg/users/ic2465/Projects/MANU_copangrap...,99,1250211.0,1339418000.0,848068500.0,0.075128,0.191916,0.732955,0.416946,0.792299,935222700.0,0.698231,0.301769,1398,369122
4,0.04,copangraph,/burg/pmg/users/ic2465/Projects/MANU_copangrap...,99,1253397.0,1339377000.0,839653100.0,0.0774,0.192396,0.730204,0.426151,0.7923,931650900.0,0.695585,0.304415,1369,375111
5,0.05,copangraph,/burg/pmg/users/ic2465/Projects/MANU_copangrap...,99,1255173.0,1339359000.0,834546300.0,0.078926,0.192495,0.728579,0.433617,0.792393,929385000.0,0.693903,0.306097,1350,378454
6,0.06,copangraph,/burg/pmg/users/ic2465/Projects/MANU_copangrap...,99,1256473.0,1339349000.0,831431400.0,0.079968,0.192484,0.727548,0.438847,0.792535,927791100.0,0.692718,0.307282,1336,380860
7,0.1,copangraph,/burg/pmg/users/ic2465/Projects/MANU_copangrap...,99,1259308.0,1339334000.0,825922800.0,0.08215,0.192347,0.725503,0.450269,0.792704,922911000.0,0.689082,0.310918,1306,385786
8,21.0,megahit,/burg/pmg/users/ic2465/Projects/MANU_copangrap...,99,21383619.0,1432286000.0,1432286000.0,0.110418,0.20255,0.687032,0.535583,0.687032,1021420000.0,0.71314,0.28686,103,24414695
9,29.0,megahit,/burg/pmg/users/ic2465/Projects/MANU_copangrap...,99,5136069.0,1045655000.0,1045655000.0,0.054212,0.175409,0.770379,0.317683,0.770379,824182800.0,0.788198,0.211802,295,3235339


In [3]:
def plot_unlabelled_version(ax, name, tight_layout=True):
    name = os.path.splitext(name)[0]
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    frame1 = plt.gca()
    frame1.legend().set_visible(False)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(f'{name}_UNLBLD.pdf', dpi=1400, bbox_inches='tight')
    plt.savefig(f'{name}_UNLBLD.png', dpi=900, bbox_inches='tight')
    plt.clf()

In [25]:
# SUPPLEMENTARY FIGURES SHOWING PROPERTIES BY PARAMETERS

for prop, lims in [
    ('mean_num_genomes_per_node', (0.15, 0.6)), 
    ('n50', (0, 3000)), 
    #('total_bp_in_graph', (0.75*1e9, 1.5*1e9)), 
    #('num_nodes', (5*1e5, 2.2*1e7)), 
    #('num_edges', (2*1e5, 25*1e6))
    ('total_bp_in_graph', None), 
    ('num_nodes', None), 
    ('num_edges', None)
]:

    # megahit mean genomes per node
    ax=sns.lineplot(x=plot_dat.loc[plot_dat.tool == 'megahit', :].parameter, y=plot_dat.loc[plot_dat.tool == 'megahit', :][prop], color='red')
    if lims is not None:
        ax.set_ylim(*lims)
    ax.set_xticks(plot_dat.loc[plot_dat.tool == 'megahit'].parameter.sort_values())
    name = OUTDIR + PREF + f'megahit_{prop}.pdf'
    plt.savefig(name, dpi=1400, bbox_inches='tight')
    plot_unlabelled_version(ax, name)

    # copangraph mean genomes per node
    ax=sns.lineplot(x=plot_dat.loc[plot_dat.tool == 'copangraph', :].parameter, y=plot_dat.loc[plot_dat.tool == 'copangraph', :][prop], color='blue')
    if lims is not None:
        ax.set_ylim(*lims)
    ax.set_xticks(plot_dat.loc[plot_dat.tool == 'copangraph'].parameter.sort_values())
    name = OUTDIR + PREF + f'copangraph_{prop}.pdf'
    plt.savefig(name, dpi=1400, bbox_inches='tight')
    plot_unlabelled_version(ax, name)




No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that 

<Figure size 400x400 with 0 Axes>

In [52]:
# MULTI_GENOME VS N50, NUM_NODES, NUM_EDGES SCATTER

ANI=99 
plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat.n50, y=plot_dat.multi_genome_node_proportion, hue=plot_dat.tool, s=20)
texts = list()
for i in plot_dat.index:
    if plot_dat.loc[i, 'tool'] == 'megahit':
        ano = f'k={int(plot_dat.parameter[i])}'
    else:
        ano = f'sd={float(plot_dat.parameter[i])}'
    plt.annotate(ano, (plot_dat.n50[i], plot_dat.multi_genome_node_proportion[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_n50bymultigenome.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat.num_nodes, y=plot_dat.multi_genome_node_proportion, hue=plot_dat.tool, s=20)
plt.xscale('log')
texts = list()
for i in plot_dat.index:
    if plot_dat.loc[i, 'tool'] == 'megahit':
        ano = f'k={int(plot_dat.parameter[i])}'
    else:
        ano = f'sd={float(plot_dat.parameter[i])}'
    plt.annotate(ano, (plot_dat.num_nodes[i], plot_dat.multi_genome_node_proportion[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_numnodebymultigenome.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat.num_edges, y=plot_dat.multi_genome_node_proportion, hue=plot_dat.tool, s=20)
plt.xscale('log')
texts = list()
for i in plot_dat.index:
    if plot_dat.loc[i, 'tool'] == 'megahit':
        ano = f'k={int(plot_dat.parameter[i])}'
    else:
        ano = f'sd={float(plot_dat.parameter[i])}'
    plt.annotate(ano, (plot_dat.num_edges[i], plot_dat.multi_genome_node_proportion[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_numedgesbymultigenome.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

# MULTI_GENOME VS N50, NUM_NODES, NUM_EDGES SCATTER COPAN ONLY
plot_dat_cpn = plot_dat.loc[plot_dat.tool == 'copangraph',:]
plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat_cpn.n50, y=plot_dat_cpn.multi_genome_node_proportion, s=20)
texts = list()
for i in plot_dat_cpn.index:
    ano = f'sd={float(plot_dat_cpn.parameter[i])}'
    plt.annotate(ano, (plot_dat_cpn.n50[i], plot_dat_cpn.multi_genome_node_proportion[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_n50bymultigenome_copanonly.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat_cpn.num_nodes, y=plot_dat_cpn.multi_genome_node_proportion, s=20)
texts = list()
for i in plot_dat_cpn.index:
    ano = f'sd={float(plot_dat_cpn.parameter[i])}'
    plt.annotate(ano, (plot_dat_cpn.num_nodes[i], plot_dat_cpn.multi_genome_node_proportion[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_numnodebymultigenome_copanonly.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat_cpn.num_edges, y=plot_dat_cpn.multi_genome_node_proportion, s=20)
texts = list()
for i in plot_dat_cpn.index:
    ano = f'sd={float(plot_dat_cpn.parameter[i])}'
    plt.annotate(ano, (plot_dat_cpn.num_edges[i], plot_dat_cpn.multi_genome_node_proportion[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_numedgesbymultigenome_copanonly.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


<Figure size 400x400 with 0 Axes>

<Figure size 400x400 with 0 Axes>

<Figure size 400x400 with 0 Axes>

<Figure size 400x400 with 0 Axes>

<Figure size 400x400 with 0 Axes>

<Figure size 400x400 with 0 Axes>

In [75]:
# COVERAGE VS N50, NUM_NODES, NUM_EDGES SCATTER
ANI=99
get_base = lambda x: os.path.splitext(os.path.basename(x))[0]
asm_dat = pd.read_csv('ani_99_plot_dat.csv')
data = pd.read_csv('../data/KmerMixing/gherig/ani_99_covered_positions.csv', index_col=0)
#data.genome = [f'{g}_{i}' for i, g in zip(data.index, data.genome)]
data['genome_ani'] = ANI
data['tool'] = data.file.apply(lambda x: 'copangraph' if 'copangraph' in get_base(x) else 'megahit')
data['parameter'] = data.file.apply(lambda x: float(get_base(x)[1:]) if 'copangraph' not in get_base(x) else float(get_base(x).split('_')[3]))
data_melt = data.melt(id_vars=['tool', 'parameter', 'file', 'genome'], value_vars=['covered_bp', 'total_bp', 'proportion_covered'])
data_melt = data_melt.merge(asm_dat, on=['tool', 'parameter', 'file'])
## coverage by n50
plot_dat = data_melt.groupby(['tool', 'parameter']).apply(
    lambda x: pd.Series([
        x.n50.iloc[0], 
        x.num_nodes.iloc[0], 
        x.num_edges.iloc[0], 
        x.loc[x.variable == 'proportion_covered'].value.mean(), 
        x.loc[x.variable == 'covered_bp'].value.sum() / x.loc[x.variable == 'total_bp'].value.sum()
    ],
        index = [
        'n50',
        'num_nodes',
        'num_edges',
        'macro_cov',
        'micro_cov'
        ]
    )
).reset_index()
plot_dat

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat.n50, y=plot_dat.micro_cov, hue=plot_dat.tool, s=20)
texts = list()
for i in plot_dat.index:
    if plot_dat.loc[i, 'tool'] == 'megahit':
        ano = f'k={int(plot_dat.parameter[i])}'
    else:
        ano = f'sd={float(plot_dat.parameter[i])}'
    plt.annotate(ano, (plot_dat.n50[i], plot_dat.micro_cov[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_n50bymicrocov.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

# coverage by num_nodes
plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat.num_nodes, y=plot_dat.micro_cov, hue=plot_dat.tool, s=20)
plt.xscale('log')
texts = list()
for i in plot_dat.index:
    if plot_dat.loc[i, 'tool'] == 'megahit':
        ano = f'k={int(plot_dat.parameter[i])}'
    else:
        ano = f'sd={float(plot_dat.parameter[i])}'
    plt.annotate(ano, (plot_dat.num_nodes[i], plot_dat.micro_cov[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_numnodesbymicrocov.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name)

# coverage by num_edges 
plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat.num_edges, y=plot_dat.micro_cov, hue=plot_dat.tool, s=20)
plt.xscale('log')
texts = list()
for i in plot_dat.index:
    if plot_dat.loc[i, 'tool'] == 'megahit':
        ano = f'k={int(plot_dat.parameter[i])}'
    else:
        ano = f'sd={float(plot_dat.parameter[i])}'
    plt.annotate(ano, (plot_dat.num_edges[i], plot_dat.micro_cov[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_numedgesbymicrocov.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name)

# MULTI_GENOME VS N50, NUM_NODES, NUM_EDGES SCATTER COPAN ONLY
plot_dat_cpn = plot_dat.loc[plot_dat.tool == 'copangraph',:]
plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat_cpn.n50, y=plot_dat_cpn.micro_cov, s=20)
texts = list()
for i in plot_dat_cpn.index:
    ano = f'sd={float(plot_dat_cpn.parameter[i])}'
    plt.annotate(ano, (plot_dat_cpn.n50[i], plot_dat_cpn.micro_cov[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_n50bymicrocov_copanonly.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat_cpn.num_nodes, y=plot_dat_cpn.micro_cov, s=20)
texts = list()
for i in plot_dat_cpn.index:
    ano = f'sd={float(plot_dat_cpn.parameter[i])}'
    plt.annotate(ano, (plot_dat_cpn.num_nodes[i], plot_dat_cpn.micro_cov[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_numnodebymicrocov_copanonly.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=plot_dat_cpn.num_edges, y=plot_dat_cpn.micro_cov, s=20)
texts = list()
for i in plot_dat_cpn.index:
    ano = f'sd={float(plot_dat_cpn.parameter[i])}'
    plt.annotate(ano, (plot_dat_cpn.num_edges[i], plot_dat_cpn.micro_cov[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF + 'gherig_numedgesbymicrocov_copanonly.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

  plot_dat = data_melt.groupby(['tool', 'parameter']).apply(
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


<Figure size 400x400 with 0 Axes>

<Figure size 400x400 with 0 Axes>

<Figure size 400x400 with 0 Axes>

<Figure size 400x400 with 0 Axes>

<Figure size 400x400 with 0 Axes>

<Figure size 400x400 with 0 Axes>

In [4]:
cov_records = glob.glob('../data/KmerMixing/gherig/*coverage_records.csv')
cov_records = pd.concat(pd.read_csv(e) for e in cov_records)
cov_records.sort_values(by=[ 'assembler', 'parameter', 'metric'], inplace=True)
cov_records = cov_records.loc[cov_records.parameter != 0,:]
cov_records.reset_index(drop=True, inplace=True)
cov_records

## F-score by n50
subrecord = cov_records.loc[cov_records.metric == 'cov_F-score', :]
#subrecord.to_csv(OUTDIR + PREF_COV + '_coverage_F-score_raw_dat.csv', index=None)
subrecord
#plt.figure(figsize=(4,4))
#ax = sns.scatterplot(x=subrecord.n50, y=subrecord.value, hue=subrecord.assembler, s=20)
#texts = list()
#for i in subrecord.index:
#    if subrecord.loc[i, 'assembler'] == 'megahit':
#        ano = f'k={int(subrecord.parameter[i])}'
#    else:
#        ano = f'sd={float(subrecord.parameter[i])}'
#    plt.annotate(ano, (subrecord.n50[i], subrecord.value[i]), textcoords='offset points', xytext=(2, 2), size=8)
#name = OUTDIR + PREF_COV + 'cov_F-score_n50.pdf'
#plt.savefig(name, dpi=1400, bbox_inches='tight')
#plot_unlabelled_version(ax, name, tight_layout=False)
#
## recall by n50
#subrecord = cov_records.loc[cov_records.metric == 'cov_recall', :]
#subrecord
#plt.figure(figsize=(4,4))
#ax = sns.scatterplot(x=subrecord.n50, y=subrecord.value, hue=subrecord.assembler, s=20)
#texts = list()
#for i in subrecord.index:
#    if subrecord.loc[i, 'assembler'] == 'megahit':
#        ano = f'k={int(subrecord.parameter[i])}'
#    else:
#        ano = f'sd={float(subrecord.parameter[i])}'
#    plt.annotate(ano, (subrecord.n50[i], subrecord.value[i]), textcoords='offset points', xytext=(5, 5), size=8)
#name = OUTDIR + PREF_COV + 'cov_recall_n50.pdf'
#plt.savefig(name, dpi=1400, bbox_inches='tight')
#plot_unlabelled_version(ax, name, tight_layout=False)
#
## recall by n50
#subrecord = cov_records.loc[cov_records.metric == 'cov_precsion', :]
#subrecord
#plt.figure(figsize=(4,4))
#ax = sns.scatterplot(x=subrecord.n50, y=subrecord.value, hue=subrecord.assembler, s=20)
#texts = list()
#for i in subrecord.index:
#    if subrecord.loc[i, 'assembler'] == 'megahit':
#        ano = f'k={int(subrecord.parameter[i])}'
#    else:
#        ano = f'sd={float(subrecord.parameter[i])}'
#    plt.annotate(ano, (subrecord.n50[i], subrecord.value[i]), textcoords='offset points', xytext=(5, 5), size=8)
#name = OUTDIR + PREF_COV + 'cov_precision_n50.pdf'
#plt.savefig(name, dpi=1400, bbox_inches='tight')
#plot_unlabelled_version(ax, name, tight_layout=False)
#
## F-score by num_edges
#subrecord = cov_records.loc[cov_records.metric == 'cov_F-score', :]
#subrecord
#plt.figure(figsize=(4,4))
#ax = sns.scatterplot(x=subrecord.num_edges, y=subrecord.value, hue=subrecord.assembler, s=20)
#ax.set_xscale('log')
#texts = list()
#for i in subrecord.index:
#    if subrecord.loc[i, 'assembler'] == 'megahit':
#        ano = f'k={int(subrecord.parameter[i])}'
#    else:
#        ano = f'sd={float(subrecord.parameter[i])}'
#    plt.annotate(ano, (subrecord.num_edges[i], subrecord.value[i]), textcoords='offset points', xytext=(5, 5), size=8)
#name = OUTDIR + PREF_COV + 'cov_F-score_num_edges.pdf'
#plt.savefig(name, dpi=1400, bbox_inches='tight')
#plot_unlabelled_version(ax, name, tight_layout=False)
#
## recall by num_edges 
#subrecord = cov_records.loc[cov_records.metric == 'cov_recall', :]
#subrecord
#plt.figure(figsize=(4,4))
#ax = sns.scatterplot(x=subrecord.num_edges, y=subrecord.value, hue=subrecord.assembler, s=20)
#ax.set_xscale('log')
#texts = list()
#for i in subrecord.index:
#    if subrecord.loc[i, 'assembler'] == 'megahit':
#        ano = f'k={int(subrecord.parameter[i])}'
#    else:
#        ano = f'sd={float(subrecord.parameter[i])}'
#    plt.annotate(ano, (subrecord.num_edges[i], subrecord.value[i]), textcoords='offset points', xytext=(5, 5), size=8)
#name = OUTDIR + PREF_COV + 'cov_recall_num_edges.pdf'
#plt.savefig(name, dpi=1400, bbox_inches='tight')
#plot_unlabelled_version(ax, name, tight_layout=False)
#
#
## F-score by num_nodes
#subrecord = cov_records.loc[cov_records.metric == 'cov_F-score', :]
#subrecord
#plt.figure(figsize=(4,4))
#ax = sns.scatterplot(x=subrecord.num_nodes, y=subrecord.value, hue=subrecord.assembler, s=20)
#ax.set_xscale('log')
#texts = list()
#for i in subrecord.index:
#    if subrecord.loc[i, 'assembler'] == 'megahit':
#        ano = f'k={int(subrecord.parameter[i])}'
#    else:
#        ano = f'sd={float(subrecord.parameter[i])}'
#    plt.annotate(ano, (subrecord.num_nodes[i], subrecord.value[i]), textcoords='offset points', xytext=(5, 5), size=8)
#name = OUTDIR + PREF_COV + 'cov_F-score_num_nodes.pdf'
#plt.savefig(name, dpi=1400, bbox_inches='tight')
#plot_unlabelled_version(ax, name, tight_layout=False)
#
## recall by num_nodes 
#subrecord = cov_records.loc[cov_records.metric == 'cov_recall', :]
#subrecord
#plt.figure(figsize=(4,4))
#ax = sns.scatterplot(x=subrecord.num_nodes, y=subrecord.value, hue=subrecord.assembler, s=20)
#ax.set_xscale('log')
#texts = list()
#for i in subrecord.index:
#    if subrecord.loc[i, 'assembler'] == 'megahit':
#        ano = f'k={int(subrecord.parameter[i])}'
#    else:
#        ano = f'sd={float(subrecord.parameter[i])}'
#    plt.annotate(ano, (subrecord.num_nodes[i], subrecord.value[i]), textcoords='offset points', xytext=(5, 5), size=8)
#name = OUTDIR + PREF_COV + 'cov_recall_num_nodes.pdf'
#plt.savefig(name, dpi=1400, bbox_inches='tight')
#plot_unlabelled_version(ax, name, tight_layout=False)

Unnamed: 0,assembler,parameter,name,n50,num_nodes,num_edges,metric,value
0,copangraph,0.001,gherig_copan_sd_0.001,2475,1175776,247772,cov_F-score,0.773083
6,copangraph,0.005,gherig_copan_sd_0.005,1825,1207357,315215,cov_F-score,0.790058
12,copangraph,0.01,gherig_copan_sd_0.01,1481,1229141,360768,cov_F-score,0.8009
18,copangraph,0.02,gherig_copan_sd_0.02,1270,1250303,403586,cov_F-score,0.811608
24,copangraph,0.03,gherig_copan_sd_0.03,1217,1257578,418093,cov_F-score,0.813971
30,copangraph,0.04,gherig_copan_sd_0.04,1195,1260739,424231,cov_F-score,0.814993
36,copangraph,0.05,gherig_copan_sd_0.05,1181,1262531,427526,cov_F-score,0.81572
42,copangraph,0.1,gherig_copan_sd_0.1,1150,1266432,434087,cov_F-score,0.81765
48,copangraph,0.75,gherig_copan_sd_0.75,1148,1266662,434520,cov_F-score,0.817658
54,megahit,21.0,k21.fastg,103,21383619,24414695,cov_F-score,0.794718


In [None]:
print('reading mix records')
ms_records = glob.glob('../data/KmerMixing/gherig/*multisample_record.csv')
ms_records = pd.concat(pd.read_csv(e) for e in ms_records)
ms_records.rename({'param':'parameter'}, axis=1, inplace=True)
ms_records.sort_values(by=[ 'assembler', 'parameter'], inplace=True)
ms_records.reset_index(drop=True, inplace=True)

ms_records

# node-max-bp to ref
plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=ms_records.num_nodes, y=ms_records.prop_multigenome_coverage_bp, hue=ms_records.assembler, s=20)
ax.set_xscale('log')
texts = list()
for i in ms_records.index:
    if ms_records.loc[i, 'assembler'] == 'megahit':
        ano = f'k={int(ms_records.parameter[i])}'
    else:
        ano = f'sd={float(ms_records.parameter[i])}'
    plt.annotate(ano, (ms_records.num_nodes[i], ms_records.prop_multigenome_coverage_bp[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF_MULTI + 'multi_genome_coverage_nodes.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=ms_records.num_edges, y=ms_records.prop_multigenome_coverage_bp, hue=ms_records.assembler, s=20)
ax.set_xscale('log')
texts = list()
for i in ms_records.index:
    if ms_records.loc[i, 'assembler'] == 'megahit':
        ano = f'k={int(ms_records.parameter[i])}'
    else:
        ano = f'sd={float(ms_records.parameter[i])}'
    plt.annotate(ano, (ms_records.num_edges[i], ms_records.prop_multigenome_coverage_bp[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF_MULTI + 'multi_genome_coverage_edges.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=ms_records.n50, y=ms_records.prop_multigenome_coverage_bp, hue=ms_records.assembler, s=20)
texts = list()
for i in ms_records.index:
    if ms_records.loc[i, 'assembler'] == 'megahit':
        ano = f'k={int(ms_records.parameter[i])}'
    else:
        ano = f'sd={float(ms_records.parameter[i])}'
    plt.annotate(ano, (ms_records.n50[i], ms_records.prop_multigenome_coverage_bp[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF_MULTI + 'multi_genome_coverage_n50.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

# node-max-bp to total-in-graph
plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=ms_records.num_nodes, y=ms_records.prop_maxlen_multigenome_bp, hue=ms_records.assembler, s=20)
ax.set_xscale('log')
texts = list()
for i in ms_records.index:
    if ms_records.loc[i, 'assembler'] == 'megahit':
        ano = f'k={int(ms_records.parameter[i])}'
    else:
        ano = f'sd={float(ms_records.parameter[i])}'
    plt.annotate(ano, (ms_records.num_nodes[i], ms_records.prop_maxlen_multigenome_bp[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF_MULTI + 'maxlen_multigenome_nodes.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=ms_records.num_edges, y=ms_records.prop_maxlen_multigenome_bp, hue=ms_records.assembler, s=20)
ax.set_xscale('log')
texts = list()
for i in ms_records.index:
    if ms_records.loc[i, 'assembler'] == 'megahit':
        ano = f'k={int(ms_records.parameter[i])}'
    else:
        ano = f'sd={float(ms_records.parameter[i])}'
    plt.annotate(ano, (ms_records.num_edges[i], ms_records.prop_maxlen_multigenome_bp[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF_MULTI + 'maxlen_multigenome_edges.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=ms_records.n50, y=ms_records.prop_maxlen_multigenome_bp, hue=ms_records.assembler, s=20)
texts = list()
for i in ms_records.index:
    if ms_records.loc[i, 'assembler'] == 'megahit':
        ano = f'k={int(ms_records.parameter[i])}'
    else:
        ano = f'sd={float(ms_records.parameter[i])}'
    plt.annotate(ano, (ms_records.n50[i], ms_records.prop_maxlen_multigenome_bp[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF_MULTI + 'maxlen_multigenome_n50.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

# node to total-nodes
plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=ms_records.num_nodes, y=ms_records.prop_multigenome_nodes, hue=ms_records.assembler, s=20)
ax.set_xscale('log')
texts = list()
for i in ms_records.index:
    if ms_records.loc[i, 'assembler'] == 'megahit':
        ano = f'k={int(ms_records.parameter[i])}'
    else:
        ano = f'sd={float(ms_records.parameter[i])}'
    plt.annotate(ano, (ms_records.num_nodes[i], ms_records.prop_multigenome_nodes[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF_MULTI + 'prop_multigenome_nodes_nodes.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=ms_records.num_edges, y=ms_records.prop_multigenome_nodes, hue=ms_records.assembler, s=20)
ax.set_xscale('log')
texts = list()
for i in ms_records.index:
    if ms_records.loc[i, 'assembler'] == 'megahit':
        ano = f'k={int(ms_records.parameter[i])}'
    else:
        ano = f'sd={float(ms_records.parameter[i])}'
    plt.annotate(ano, (ms_records.num_edges[i], ms_records.prop_multigenome_nodes[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF_MULTI + 'prop_multigenome_nodes_edges.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

plt.figure(figsize=(4,4))
ax = sns.scatterplot(x=ms_records.n50, y=ms_records.prop_multigenome_nodes, hue=ms_records.assembler, s=20)
texts = list()
for i in ms_records.index:
    if ms_records.loc[i, 'assembler'] == 'megahit':
        ano = f'k={int(ms_records.parameter[i])}'
    else:
        ano = f'sd={float(ms_records.parameter[i])}'
    plt.annotate(ano, (ms_records.n50[i], ms_records.prop_multigenome_nodes[i]), textcoords='offset points', xytext=(5, 5), size=8)
name = OUTDIR + PREF_MULTI + 'prop_multigenome_nodes_n50.pdf'
plt.savefig(name, dpi=1400, bbox_inches='tight')
plot_unlabelled_version(ax, name, tight_layout=False)

reading mix records


Unnamed: 0,assembler,parameter,name,n50,num_nodes,num_edges,sum_maxlen_bp,total_ref_bps,sum_multigenome_nodes,prop_multigenome_nodes,sum_maxlen_multigenome_bp,prop_maxlen_multigenome_bp,sum_multigenome_coverage_bp,prop_multigenome_coverage_bp
0,copangraph,0.001,gherig_copan_sd_0.001,2475,1175776,247772,1325970016,413817357,23283,0.019802,14729568,0.011109,25934496,0.062671
1,copangraph,0.005,gherig_copan_sd_0.005,1825,1207357,315215,1199250946,413817357,32629,0.027025,24914307,0.020775,46646006,0.112721
2,copangraph,0.01,gherig_copan_sd_0.01,1481,1229141,360768,1091140280,413817357,45709,0.037188,37046223,0.033952,74325279,0.179609
3,copangraph,0.02,gherig_copan_sd_0.02,1270,1250303,403586,999025638,413817357,64831,0.051852,51245403,0.051295,110716966,0.26755
4,copangraph,0.03,gherig_copan_sd_0.03,1217,1257578,418093,976369105,413817357,71930,0.057197,54839302,0.056167,119912112,0.289771
5,copangraph,0.04,gherig_copan_sd_0.04,1195,1260739,424231,967290372,413817357,75203,0.05965,56169640,0.058069,124155465,0.300025
6,copangraph,0.05,gherig_copan_sd_0.05,1181,1262531,427526,961796562,413817357,77312,0.061236,56835501,0.059093,126988739,0.306871
7,copangraph,0.1,gherig_copan_sd_0.1,1150,1266432,434087,952910795,413817357,81514,0.064365,57696764,0.060548,131817625,0.318541
8,copangraph,0.75,gherig_copan_sd_0.75,1148,1266662,434520,952369450,413817357,81866,0.064631,57827270,0.060719,132282543,0.319664
9,megahit,21.0,k21.fastg,103,21383619,24414695,1432285844,413817357,353565,0.016534,34988490,0.024428,77700839,0.187766
