## Compare the assemblies in GenBank, GTDB, and RAST

Each has a different set. What are the unions and intersections?

In [1]:
# A lot of this is not used, but we import it so we have it later!
import os
import sys
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import seaborn as sns
import numpy as np

import math
import re

from PhiSpyAnalysis import theils_u, DateConverter, printmd
from PhiSpyAnalysis import read_phages, read_gtdb, read_checkv, read_base_pp, read_categories, read_metadata, read_gbk_metadata

from scipy.stats import pearsonr, f_oneway
from sklearn.linear_model import LinearRegression
from sklearn import decomposition
from sklearn.ensemble import RandomForestClassifier

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd, tukeyhsd, MultiComparison
from statsmodels.multivariate.manova import MANOVA


import subprocess
import gzip


In [2]:
# Matches the leading run of word characters (letters, digits, underscore),
# i.e. everything before the '.' that introduces the accession version.
rv = re.compile('^\\w+')
def remove_ver(x):
    """
    Strip the version suffix from an assembly accession.

    The value is converted with str() and its leading run of word characters
    is returned, so 'GCA_000006155.2' becomes 'GCA_000006155'.

    :param x: the accession (any object; str(x) is what gets matched)
    :return: the leading word-character prefix of str(x); if str(x) does not
             start with a word character (e.g. it is empty) it is returned
             unchanged instead of raising AttributeError
    """
    text = str(x)
    found = rv.search(text)
    # search() returns None when nothing matches; calling .group() on that
    # used to abort a whole .apply() over the accession column.
    if found is None:
        return text
    return found.group()

# GTDB



In [3]:
# GTDB metadata, plus a copy of each accession with the version stripped
gtdb = read_gtdb().assign(
    assembly_nover=lambda frame: frame['assembly_accession'].apply(remove_ver)
)
# show the versioned and version-less accessions side by side
gtdb.loc[:, ['assembly_accession', 'assembly_nover']]

Unnamed: 0,assembly_accession,assembly_nover
0,GCA_000006155.2,GCA_000006155
1,GCA_000007385.1,GCA_000007385
2,GCA_000008605.1,GCA_000008605
3,GCA_000010565.1,GCA_000010565
4,GCA_000013845.2,GCA_000013845
...,...,...
191522,GCA_902166935.1,GCA_902166935
191523,GCA_902166945.1,GCA_902166945
191524,GCA_902167295.1,GCA_902167295
191525,GCA_902167305.1,GCA_902167305


In [4]:
list(gtdb.columns)

['accession',
 'ambiguous_bases',
 'checkm_completeness',
 'checkm_contamination',
 'checkm_marker_count',
 'checkm_marker_lineage',
 'checkm_marker_set_count',
 'checkm_strain_heterogeneity',
 'coding_bases',
 'coding_density',
 'contig_count',
 'gc_count',
 'gc_percentage',
 'genome_size',
 'gtdb_genome_representative',
 'gtdb_representative',
 'gtdb_taxonomy',
 'gtdb_type_designation',
 'gtdb_type_designation_sources',
 'gtdb_type_species_of_genus',
 'l50_contigs',
 'l50_scaffolds',
 'longest_contig',
 'longest_scaffold',
 'lsu_23s_contig_len',
 'lsu_23s_count',
 'lsu_23s_length',
 'lsu_23s_query_id',
 'lsu_5s_contig_len',
 'lsu_5s_count',
 'lsu_5s_length',
 'lsu_5s_query_id',
 'lsu_silva_23s_blast_align_len',
 'lsu_silva_23s_blast_bitscore',
 'lsu_silva_23s_blast_evalue',
 'lsu_silva_23s_blast_perc_identity',
 'lsu_silva_23s_blast_subject_id',
 'lsu_silva_23s_taxonomy',
 'mean_contig_length',
 'mean_scaffold_length',
 'mimag_high_quality',
 'mimag_low_quality',
 'mimag_medium_quali

In [5]:
# RAST metadata, loaded through the project helper.
# The full table can also be read directly, but it is large -- don't try this at home!
# metadf = pd.read_csv("../small_data/patric_genome_metadata.tsv.gz", compression='gzip', header=0, delimiter="\t")
rast = read_metadata().assign(
    assembly_nover=lambda frame: frame['assembly_accession'].apply(remove_ver)
)
# show the versioned and version-less accessions side by side
rast.loc[:, ['assembly_accession', 'assembly_nover']]

Unnamed: 0,assembly_accession,assembly_nover
0,GCA_000003135.1,GCA_000003135
1,GCA_000003215.1,GCA_000003215
2,GCA_000003645.1,GCA_000003645
3,GCA_000003925.1,GCA_000003925
4,GCA_000003955.1,GCA_000003955
...,...,...
320171,GCF_900167595.1,GCF_900167595
320172,GCF_900167605.1,GCF_900167605
320173,GCF_900167615.1,GCF_900167615
320174,GCF_900167625.1,GCF_900167625


# GenBank

This assembly summary comes from the [GenBank FTP site](ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/). You want the [assembly_summary.txt](ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt) file specifically from the bacteria directory (but don't try to open the bacteria directory listing in your browser!)

In [6]:
# GenBank assembly summary, plus a copy of each accession with the version stripped
gbk = read_gbk_metadata().assign(
    assembly_nover=lambda frame: frame['assembly_accession'].apply(remove_ver)
)
# show the versioned and version-less accessions side by side
gbk.loc[:, ['assembly_accession', 'assembly_nover']]

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,assembly_accession,assembly_nover
0,GCA_900128725.1,GCA_900128725
1,GCA_008244535.1,GCA_008244535
2,GCA_011046815.1,GCA_011046815
3,GCA_011054035.1,GCA_011054035
4,GCA_011331185.1,GCA_011331185
...,...,...
949928,GCA_017746755.1,GCA_017746755
949929,GCA_017746715.1,GCA_017746715
949930,GCA_013266695.1,GCA_013266695
949931,GCA_015831295.1,GCA_015831295


In [7]:
# Prophage predictions per genome; maxcontigs=-1 disables contig length filtering
phagesdf = read_phages(maxcontigs=-1).assign(
    assembly_nover=lambda frame: frame['assembly_accession'].apply(remove_ver)
)
phagesdf

Please note that this was run with git commit 86420e1 that has 567,404 genomes parsed.
Initially there were 3,265,453 kept phages,but after filtering we kept 3,265,453 prophages from 567,399 genomes

Unnamed: 0,assembly_accession,assembly_name,Genome length,Contigs,Phage Contigs,Total Predicted Prophages,Kept,No phage genes,Not enough genes,bp prophage,assembly_nover
0,GCA_000043285.1,ASM4328v1,705557.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,GCA_000043285
1,GCA_000046685.1,ASM4668v1,1581384.0,1.0,1.0,18.0,3.0,0.0,15.0,97011.0,GCA_000046685
2,GCA_000046705.1,ASM4670v1,1931047.0,1.0,1.0,36.0,7.0,0.0,29.0,278490.0,GCA_000046705
3,GCA_000046845.1,ASM4684v1,3598621.0,1.0,1.0,13.0,3.0,4.0,6.0,83112.0,GCA_000046845
4,GCA_000047365.1,ASM4736v1,4840898.0,3.0,3.0,22.0,3.0,3.0,16.0,213427.0,GCA_000047365
...,...,...,...,...,...,...,...,...,...,...,...
567399,GCA_905187425.1,Xanthomonas_sp._CPBF_424_-_hybrid_assembly,4900930.0,1.0,1.0,46.0,7.0,9.0,30.0,214278.0,GCA_905187425
567400,GCA_905188235.1,ASM90518823v1,4963609.0,1.0,1.0,29.0,4.0,4.0,21.0,178993.0,GCA_905188235
567401,GCA_905219375.1,QI0054,3034314.0,1.0,1.0,12.0,4.0,3.0,5.0,178527.0,GCA_905219375
567402,GCA_905219385.1,QI0055,3034113.0,1.0,1.0,12.0,4.0,3.0,5.0,178526.0,GCA_905219385


In [11]:
# Column totals across all genomes for the prophage count columns
phagesdf.loc[
    :,
    ['Total Predicted Prophages', 'Not enough genes', 'No phage genes', 'Kept'],
].sum()

Total Predicted Prophages    21562059.0
Not enough genes             15047785.0
No phage genes                3248821.0
Kept                          3265453.0
dtype: float64

# What is in common between the groups?

In [9]:
# Compare the data sets using the full accession, INCLUDING the version suffix,
# so GCA_xxx.1 and GCA_xxx.2 count as different genomes here.
gbkaa=set(gbk['assembly_accession'])
rastaa=set(rast['assembly_accession'])
gtdbaa=set(gtdb['assembly_accession'])
phagesaa=set(phagesdf['assembly_accession'])


# pairwise overlaps between the three sources
gbr = gbkaa.intersection(rastaa)
gbg = gbkaa.intersection(gtdbaa)
gtr = gtdbaa.intersection(rastaa)

print(f"Between GBK and RAST there are {len(gbr):,} genomes in common")
print(f"Between GBK and GTDB there are {len(gbg):,} genomes in common")
print(f"Between GTDB and RAST there are {len(gtr):,} genomes in common")
print()

# accessions in each source that do not appear in the prophage results
gbnotdone = gbkaa - phagesaa
gtnotdone = gtdbaa - phagesaa
ranotdone = rastaa - phagesaa
print(f"There are {len(gbnotdone):,} phages in Genbank that have not been analyzed")
print(f"There are {len(gtnotdone):,} phages in GTDB that have not been analyzed")
# this count is for RAST; the message previously repeated "Genbank"
print(f"There are {len(ranotdone):,} phages in RAST that have not been analyzed")
print()

# accessions present in all three sources but absent from the prophage results
allmissing = gbnotdone.intersection(gtnotdone).intersection(ranotdone)
print(f"There are {len(allmissing):,} phages in all three that have not been analyzed")

# Disabled export: change False to True to write one "accession<TAB>ftp_path"
# line for every accession in allmissing.
if False:
    with open("../data/unprocessed_phages.txt", 'w') as out:
        # Select every missing accession in one pass. The previous loop filtered
        # on a single hard-coded accession instead of the loop variable.
        missing_rows = gbk['assembly_accession'].isin(allmissing)
        gbk.loc[missing_rows, ['assembly_accession', 'ftp_path']].to_csv(out, sep="\t", header=False, index=False)

Between GBK and RAST there are 244,870 genomes in common
Between GBK and GTDB there are 189,704 genomes in common
Between GTDB and RAST there are 140,463 genomes in common

There are 339,231 phages in Genbank that have not been analyzed
There are 65,392 phages in GTDB that have not been analyzed
There are 186,485 phages in Genbank that have not been analyzed

There are 52,219 phages in all three that have not been analyzed


In [10]:
print("IF WE IGNORE GENOME VERSIONS")

# Rebuild the accession sets from the version-less column. These deliberately
# replace the versioned sets of the same names built in the previous cell.
gbkaa = set(gbk['assembly_nover'])
rastaa = set(rast['assembly_nover'])
gtdbaa = set(gtdb['assembly_nover'])
phagesaa = set(phagesdf['assembly_nover'])

# pairwise overlaps between the three sources
gbr = gbkaa & rastaa
gbg = gbkaa & gtdbaa
gtr = gtdbaa & rastaa

# size of each source
print(f"There are {len(gbkaa):,} genomes in GenBank")
print(f"There are {len(rastaa):,} genomes in PATRIC")
print(f"There are {len(gtdbaa):,} genomes in GTDB")

print(f"Between GenBank and PATRIC there are {len(gbr):,} genomes in common")
print(f"Between GenBank and GTDB there are {len(gbg):,} genomes in common")
print(f"Between GTDB and PATRIC there are {len(gtr):,} genomes in common")
print()

# accessions in each source that do not appear in the prophage results
gbnotdone = gbkaa.difference(phagesaa)
gtnotdone = gtdbaa.difference(phagesaa)
ranotdone = rastaa.difference(phagesaa)
print(f"There are {len(gbnotdone):,} phages in Genbank that have not been analyzed")
print(f"There are {len(gtnotdone):,} phages in GTDB that have not been analyzed")
print(f"There are {len(ranotdone):,} phages in PATRIC that have not been analyzed")
print()

# accessions present in all three sources but absent from the prophage results
allmissing = gbnotdone & gtnotdone & ranotdone
print(f"There are {len(allmissing):,} phages in all three that have not been analyzed")

IF WE IGNORE GENOME VERSIONS
There are 904,681 genomes in GenBank
There are 320,033 genomes in PATRIC
There are 191,521 genomes in GTDB
Between GenBank and PATRIC there are 249,334 genomes in common
Between GenBank and GTDB there are 191,370 genomes in common
Between GTDB and PATRIC there are 143,445 genomes in common

There are 337,599 phages in Genbank that have not been analyzed
There are 64,358 phages in GTDB that have not been analyzed
There are 183,308 phages in PATRIC that have not been analyzed

There are 53,262 phages in all three that have not been analyzed
