## Compare the assemblies in GenBank, GTDB, and RAST

Each has a different set. What are the unions and intersections?

In [3]:
# A lot of this is not used, but we import it so we have it later!
import os
import sys
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import seaborn as sns
import numpy as np

import math
import re

from PhiSpyAnalysis import theils_u, DateConverter, file_to_accession

from scipy.stats import pearsonr, f_oneway
from sklearn.linear_model import LinearRegression
from sklearn import decomposition
from sklearn.ensemble import RandomForestClassifier

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd, tukeyhsd, MultiComparison
from statsmodels.multivariate.manova import MANOVA


import subprocess
import gzip


# this is a neat trick for getting markdown in our output
# see https://stackoverflow.com/questions/23271575/printing-bold-colored-etc-text-in-ipython-qtconsole
# for the inspiration
from IPython.display import Markdown, display
def printmd(string, color="black"):
    """Display `string` as Markdown in the cell output, coloured with `color`.

    The text is wrapped in an HTML <span> whose CSS `color` style is set to
    `color`, and the result is handed to IPython's Markdown renderer.
    """
    span_template = "<span style='color:{}'>{}</span>"
    display(Markdown(span_template.format(color, string)))

# GTDB



In [18]:
# GTDB
# Read the GTDB bac120 metadata table and rename the GenBank accession column
# to 'assembly_accession' so it has the same name as in the other tables.
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk; the warning fragment in this cell's saved output
# looks like the mixed-type DtypeWarning that chunked parsing produces.
gtdb = pd.read_csv(
    "../data/bac120_metadata_r95.tsv.gz",
    compression='gzip',
    header=0,
    delimiter="\t",
    low_memory=False,
).rename(columns={'ncbi_genbank_assembly_accession': 'assembly_accession'})
gtdb['assembly_accession']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


0         GCA_000006155.2
1         GCA_000007385.1
2         GCA_000008605.1
3         GCA_000010565.1
4         GCA_000013845.2
               ...       
191522    GCA_902166935.1
191523    GCA_902166945.1
191524    GCA_902167295.1
191525    GCA_902167305.1
191526    GCA_902167325.1
Name: assembly_accession, Length: 191527, dtype: object

In [19]:
# RAST
# the full data set. Don't try this at home!
# metadf = pd.read_csv("../small_data/patric_genome_metadata.tsv.gz", compression='gzip', header=0, delimiter="\t")
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk; the warning fragment in this cell's saved output
# looks like the mixed-type DtypeWarning that chunked parsing produces.
rast = pd.read_csv(
    "../data/patric_genome_metadata.tsv.gz",
    compression='gzip',
    header=0,
    delimiter="\t",
    low_memory=False,
)
rast['assembly_accession']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


0         GCA_003181115.1
1         GCF_001007995.1
2         GCF_001307505.1
3         GCA_000196515.1
4         GCA_002163025.1
               ...       
433517    GCA_000379245.1
433518    GCA_000379345.1
433519    GCA_000379265.1
433520    GCA_000379205.1
433521    GCA_000372125.1
Name: assembly_accession, Length: 433522, dtype: object

# GenBank

This assembly summary comes from the [GenBank FTP site](ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/). You want the [assembly_summary.txt](ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt) file specifically for bacteria (but don't try to open the bacteria list in your browser!)

In [20]:
# GenBank
# header=1: the column names are taken from the second line of the file; the
# line before it is skipped. The first column name arrives as
# '# assembly_accession', so it is renamed to match the other tables.
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk; the warning fragment in this cell's saved output
# looks like the mixed-type DtypeWarning that chunked parsing produces.
gbk = pd.read_csv(
    "../data/assembly_summary.txt.gz",
    compression='gzip',
    header=1,
    delimiter="\t",
    low_memory=False,
).rename(columns={'# assembly_accession': 'assembly_accession'})
gbk['assembly_accession']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


0         GCA_900128725.1
1         GCA_008244535.1
2         GCA_011046815.1
3         GCA_011054035.1
4         GCA_011331185.1
               ...       
949928    GCA_017746755.1
949929    GCA_017746715.1
949930    GCA_013266695.1
949931    GCA_015831295.1
949932    GCA_015686645.1
Name: assembly_accession, Length: 949933, dtype: object

In [23]:
# Per-genome prophage prediction counts (one row per parsed genome file).
phagesdf = pd.read_csv(
    "../data/phages_per_genome.tsv.gz",
    compression='gzip',
    header=0,
    delimiter="\t",
)

# Record which commit of the repository these numbers were produced with.
git_describe_raw = subprocess.check_output(["git", "describe", "--always"])
githash = git_describe_raw.strip().decode()

print(f"Please note that this was run with git commit {githash} that has {phagesdf.shape[0]:,} genomes parsed and {phagesdf['Total Predicted Prophages'].sum():,} total prophages")

Please note that this was run with git commit b783953 that has 553,082 genomes parsed and 20,946,107 total prophages


In [25]:
# Derive an accession for every row from its 'Contig' file name using
# file_to_accession, and store it under the same column name used by the
# GenBank / GTDB / RAST tables so the sets can be compared directly.
contig_accessions = phagesdf['Contig'].apply(file_to_accession)
phagesdf['assembly_accession'] = contig_accessions
phagesdf

Unnamed: 0,Contig,Genome length,Contigs,Phage Contigs,Total Predicted Prophages,Kept,No phage genes,Not enough genes,bp prophage,assembly_accession
0,GCA_000003135.1_ASM313v1_genomic.gbff.gz,2396359,114,10,16,2,1,13,48916,GCA_000003135.1
1,GCA_000003645.1_ASM364v1_genomic.gbff.gz,5269725,1,1,31,1,10,20,40297,GCA_000003645.1
2,GCA_000003925.1_ASM392v1_genomic.gbff.gz,5561906,1,1,38,6,13,19,268081,GCA_000003925.1
3,GCA_000003955.1_ASM395v1_genomic.gbff.gz,5790501,1,1,46,6,11,29,166286,GCA_000003955.1
4,GCA_000005825.2_ASM582v2_genomic.gbff.gz,4249248,3,3,33,3,9,21,93416,GCA_000005825.2
...,...,...,...,...,...,...,...,...,...,...
553077,GCA_902860175.1_LMG_5997_genomic.gbff.gz,7197255,38,21,33,2,14,17,69051,GCA_902860175.1
553078,GCA_902860185.1_LMG_6103_genomic.gbff.gz,6497464,13,8,22,0,10,12,0,GCA_902860185.1
553079,GCA_902860195.1_LMG_7053_genomic.gbff.gz,6702936,200,148,33,1,11,21,12819,GCA_902860195.1
553080,GCA_902860205.1_LMG_6001_genomic.gbff.gz,6320373,36,19,35,2,21,12,41572,GCA_902860205.1


# What is in common between the groups?

In [41]:
# Distinct assembly accessions known to each source. Missing values (NaN) are
# dropped first so that a missing accession is never counted as a genome.
gbkaa = set(gbk['assembly_accession'].dropna())
rastaa = set(rast['assembly_accession'].dropna())
gtdbaa = set(gtdb['assembly_accession'].dropna())
phagesaa = set(phagesdf['assembly_accession'].dropna())

# Pairwise overlaps between the three sources.
gbr = gbkaa & rastaa
gbg = gbkaa & gtdbaa
gtr = gtdbaa & rastaa

print(f"Between GBK and RAST there are {len(gbr):,} genomes in common")
print(f"Between GBK and GTDB there are {len(gbg):,} genomes in common")
print(f"Between GTDB and RAST there are {len(gtr):,} genomes in common")
print()

# Accessions listed by each source that have no row in phagesdf yet.
gbnotdone = gbkaa - phagesaa
gtnotdone = gtdbaa - phagesaa
ranotdone = rastaa - phagesaa
print(f"There are {len(gbnotdone):,} genomes in Genbank that have not been analyzed")
print(f"There are {len(gtnotdone):,} genomes in GTDB that have not been analyzed")
print(f"There are {len(ranotdone):,} genomes in RAST that have not been analyzed")
print()

# Accessions that all three sources list and that have not been analyzed.
allmissing = gbnotdone & gtnotdone & ranotdone
print(f"There are {len(allmissing):,} genomes in all three that have not been analyzed")

# Write accession + ftp_path for every missing genome. A single isin() filter
# selects the rows for *each* accession in allmissing (the previous loop
# filtered on one hard-coded accession on every iteration) and keeps the rows
# in gbk's order, so the file content is deterministic.
unprocessed = gbk.loc[
    gbk['assembly_accession'].isin(allmissing),
    ['assembly_accession', 'ftp_path'],
]
unprocessed.to_csv("../data/unprocessed_phages.txt", sep="\t", header=False, index=False)

Between GBK and RAST there are 244,870 genomes in common
Between GBK and GTDB there are 189,704 genomes in common
Between GTDB and RAST there are 140,463 genomes in common

There are 353,548 phages in Genbank that have not been analyzed
There are 65,392 phages in GTDB that have not been analyzed
There are 200,803 phages in Genbank that have not been analyzed

There are 52,219 phages in all three that have not been analyzed
