In [1]:
# Bash scripts are run via SLURM.
# see commands/04_run_gempipe_recon.sh

# Create filtering table


In [1]:
import pandas as pnd
import gempipe

In [2]:
# parse metadata

metadata = pnd.read_csv('genomes_all/raw_ncbi_1598.txt', sep='\t', index_col=0)
metadata = metadata.iloc[[i for i in range(len(metadata)) if i%2==0], ]  # remove report lines
metadata['infraspecific_name'] = metadata['infraspecific_name'].str.replace('strain=', '') 
metadata['infraspecific_name'] = metadata['infraspecific_name'].str.replace('na', '') 
metadata['isolate'] = metadata['isolate'].str.replace('na', '')
metadata['strain_isolate'] = metadata['infraspecific_name'] + metadata['isolate']

filt_table = metadata[['strain_isolate']].copy()


In [7]:
# parse MAGS


filt_table['excluded_M'] = False
for acc, row in filt_table.iterrows(): 
    if 'derived from metagenome' in str(metadata.loc[acc, 'excluded_from_refseq']):
        filt_table.loc[acc, 'excluded_M'] = True


In [10]:
%%capture

# parse ANI

fig, triangular = gempipe.animatrix(
    tree_original='fastani/ANIclustermap_dendrogram.nwk', 
    triangular='fastani/ANIclustermap_matrix.tsv',
    verbose=True, replace0=0, cellannot=False, fastmode=True,
)

filt_table['ANI_with_ts'] = None
filt_table['excluded_T'] = None
for acc, row in filt_table.iterrows(): 
    if acc in triangular.index:
        filt_table.loc[acc, 'ANI_with_ts'] = triangular.loc[acc, 'GCA_000016825.1']
        if triangular.loc[acc, 'GCA_000016825.1'] < 95:  # type strain
            filt_table.loc[acc, 'excluded_T'] = True


In [12]:
%%capture 

# parse Q metrics

summary_table, summary_table_filt, fig = gempipe.get_filtering_summary(
    working_dir='gempipe/working/',
    thr_N50=19000, thr_nc=240, thr_bm=3, thr_bf=2
)

filt_table['ncontigs'] = None
filt_table['sum_len'] = None
filt_table['N50'] = None
filt_table['BUSCO_F%'] = None
filt_table['BUSCO_M%'] = None
filt_table['excluded_Q'] = None
filt_table['GSMM_reconstructed'] = ''
for acc, row in filt_table.iterrows(): 
    
    if acc in summary_table.index:
        filt_table.loc[acc, 'ncontigs'] = summary_table.loc[acc, 'ncontigs']
        filt_table.loc[acc, 'sum_len'] = summary_table.loc[acc, 'sum_len']
        filt_table.loc[acc, 'N50'] = summary_table.loc[acc, 'N50']
        filt_table.loc[acc, 'BUSCO_F%'] = summary_table.loc[acc, 'BUSCO_F%']
        filt_table.loc[acc, 'BUSCO_M%'] = summary_table.loc[acc, 'BUSCO_M%']
        
        if acc in summary_table_filt.index:
            filt_table.loc[acc, 'excluded_Q'] = False
            filt_table.loc[acc, 'GSMM_reconstructed'] = 'yes'
        else:
            filt_table.loc[acc, 'excluded_Q'] = True

In [13]:
filt_table

Unnamed: 0_level_0,strain_isolate,excluded_M,ANI_with_ts,excluded_T,ncontigs,sum_len,N50,BUSCO_F%,BUSCO_M%,excluded_Q,GSMM_reconstructed
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GCA_000712555.1,LTH2584,False,95.729347,,25,2.066,245555,3.5,1.0,True,
GCA_000712565.2,TMW1.656,False,95.806763,,17,1.95,190607,4.0,3.5,True,
GCA_000722535.2,TMW1.112,False,95.577881,,12,2.032,327314,1.5,0.7,False,yes
GCA_000758185.1,LTH5448,False,96.06131,,36,1.98,113363,1.7,2.7,False,yes
GCA_001046835.1,IRT,False,99.977783,,1,1.994,1993967,0.2,0.5,False,yes
...,...,...,...,...,...,...,...,...,...,...,...
GCA_000410995.1,I5007,False,96.12925,,7,2.093,1947706,0.5,0.2,False,yes
GCA_000439275.1,TD1,False,96.046661,,1,2.145,2145445,0.0,0.2,False,yes
GCA_041888795.1,BG-R46,False,95.16214,,3,2.252,2225488,0.0,0.2,False,yes
GCA_041888805.1,DSM 17938,False,95.203659,,3,2.249,2223332,0.0,0.2,False,yes


In [14]:
filt_table.to_excel('tables/Supplementary File 4.xlsx')