## Compare the assemblies in GenBank, GTDB, and RAST

Each has a different set. What are the unions and intersections?

In [1]:
import os
import sys
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import subprocess
import gzip

In [2]:
# this is a neat trick for getting markdown in our output
# see https://stackoverflow.com/questions/23271575/printing-bold-colored-etc-text-in-ipython-qtconsole
# for the inspiration
from IPython.display import Markdown, display
def printmd(string, color="black"):
    """Show `string` as rendered Markdown in the cell output, inside a colored span.

    string: the text to display; it is passed through the Markdown renderer.
    color:  value substituted into the span's CSS ``color`` style ("black" by default).
    """
    template = "<span style='color:{}'>{}</span>"
    display(Markdown(template.format(color, string)))

# GTDB



In [3]:
# GTDB
# Read the gzipped, tab-separated GTDB metadata table; the first line holds the column names.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, so every column gets one consistent dtype.
# NOTE(review): the warning previously captured under this cell looks like pandas'
# mixed-dtype DtypeWarning, which this option is meant to address — confirm on re-run.
gtdb = pd.read_csv(
    "../data/bac120_metadata_r95.tsv.gz",
    compression='gzip',
    header=0,
    delimiter="\t",
    low_memory=False,
)
gtdb

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,accession,ambiguous_bases,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,checkm_strain_heterogeneity,coding_bases,coding_density,...,ssu_silva_blast_align_len,ssu_silva_blast_bitscore,ssu_silva_blast_evalue,ssu_silva_blast_perc_identity,ssu_silva_blast_subject_id,ssu_silva_taxonomy,total_gap_length,trna_aa_count,trna_count,trna_selenocysteine_count
0,GB_GCA_000006155.2,1916,93.12,0.00,1171,g__Bacillus (UID902),324,0.0,4305660,80.178992,...,none,none,none,none,none,none,42300,16,31,0
1,GB_GCA_000007385.1,0,99.82,0.00,481,c__Gammaproteobacteria (UID4202),276,0.0,4190634,84.805944,...,1541,2846,0,100,JXEG01000201.4293.5839,Bacteria;Proteobacteria;Gammaproteobacteria;Xa...,0,20,53,0
2,GB_GCA_000008605.1,67,100.00,0.00,235,f__Spirochaetaceae (UID2512),124,0.0,1048744,92.155875,...,1545,2854,0,100,CP003679.231299.232859,Bacteria;Spirochaetes;Spirochaetia;Spirochaeta...,0,20,45,0
3,GB_GCA_000010565.1,0,100.00,0.63,295,p__Firmicutes (UID1022),158,0.0,2608397,86.217312,...,1526,2819,0,100,AP009389.1049070.1050595,Bacteria;Firmicutes;Clostridia;Clostridiales;P...,0,20,51,1
4,GB_GCA_000013845.2,0,100.00,0.00,332,o__Clostridiales (UID1375),124,0.0,2428396,82.037966,...,1509,2787,0,100,CP000312.233320.234832,Bacteria;Firmicutes;Clostridia;Clostridiales;C...,0,20,95,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191522,RS_GCF_902166935.1,0,99.62,0.09,1312,g__Klebsiella (UID5140),336,0.0,4894244,87.293148,...,1536,2837,0,100,MAPL01000082.143.1696,Bacteria;Proteobacteria;Gammaproteobacteria;En...,0,20,87,1
191523,RS_GCF_902166945.1,0,99.94,0.72,1162,f__Enterobacteriaceae (UID5121),336,0.0,5162721,86.540042,...,1536,2837,0,100,CP016813.16076.17629,Bacteria;Proteobacteria;Gammaproteobacteria;En...,0,20,86,1
191524,RS_GCF_902167295.1,0,99.51,0.00,303,p__Bacteroidetes (UID2591),203,0.0,5737447,90.122088,...,1490,2747,0,99.933,JX101438.1.1490,Bacteria;Bacteroidetes;Bacteroidia;Chitinophag...,0,20,58,0
191525,RS_GCF_902167305.1,0,99.51,0.00,303,p__Bacteroidetes (UID2591),203,0.0,5738096,90.147873,...,1490,2747,0,99.933,JX101438.1.1490,Bacteria;Bacteroidetes;Bacteroidia;Chitinophag...,0,20,58,0


In [5]:
# RAST
# the full data set. Don't try this at home!
# A copy of this table also exists under ../small_data/ (same file name); the earlier,
# commented-out version of this cell read that one instead:
# metadf = pd.read_csv("../small_data/patric_genome_metadata.tsv.gz", compression='gzip', header=0, delimiter="\t")
#
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, so every column gets one consistent dtype.
# NOTE(review): the warning previously captured under this cell looks like pandas'
# mixed-dtype DtypeWarning, which this option is meant to address — confirm on re-run.
# NOTE(review): genome_id values look like dotted identifiers (e.g. 469009.4); if they are
# ever used as join keys, consider reading that column as str so numeric parsing cannot
# alter them — confirm against downstream use.
rast = pd.read_csv(
    "../data/patric_genome_metadata.tsv.gz",
    compression='gzip',
    header=0,
    delimiter="\t",
    low_memory=False,
)
rast

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,genome_id,genome_name,organism_name,taxon_id,genome_status,strain,serovar,biovar,pathovar,mlst,...,motility,sporulation,temperature_range,optimal_temperature,salinity,oxygen_requirement,habitat,disease,comments,additional_metadata
0,469009.4,"""'Brassica napus' phytoplasma strain TW1""",,469009,WGS,TW1,,,,,...,,,,,,,,,Genome sequence of a strain of bacteria that c...,sample_type:metagenomic assembly;collected_by:...
1,1309411.5,"""'Deinococcus soli' Cha et al. 2014 strain N5""",,1309411,Complete,N5,,,,,...,,,,,,,,,Genome sequencing of a Gamma-Radiation-Resista...,sample_type:bacterial
2,1123738.3,"""'Echinacea purpurea' witches'-broom phytoplas...",,1123738,WGS,NCHU2014,,,,,...,,,,C,,,,,'Echinacea purpurea' witches'-broom phytoplasm...,lab_host:Catharanthus roseus
3,551115.6,"""'Nostoc azollae' 0708""",'Nostoc azollae' 0708,551115,Complete,708,,,,,...,Yes,,Mesophilic,-,,Aerobic,Multiple,,"Nostoc azollae 0708. Nostoc azollae 0708, also...",
4,1856298.3,"""'Osedax' symbiont bacterium Rs2_46_30_T18 str...",,1856298,WGS,Rs2_46_30_T18,,,,,...,,,,,,,,,"In this study, we simulate the Deepwater Horiz...",sample_type:metagenomic assembly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433517,1131286.3,zeta proteobacterium SCGC AB-137-J06,zeta proteobacterium SCGC AB-137-J06,1131286,WGS,SCGC AB-137-J06,,,,,...,,,,,,,,,Single cell genome sequencing of biomineralizi...,
433518,1131287.3,zeta proteobacterium SCGC AB-602-C20,zeta proteobacterium SCGC AB-602-C20,1131287,WGS,SCGC AB-602-C20,,,,,...,,,,,,,,,Single cell genome sequencing of biomineralizi...,
433519,1131288.3,zeta proteobacterium SCGC AB-602-E04,zeta proteobacterium SCGC AB-602-E04,1131288,WGS,SCGC AB-602-E04,,,,,...,,,,,,,,,Single cell genome sequencing of biomineralizi...,
433520,1131289.3,zeta proteobacterium SCGC AB-604-B04,zeta proteobacterium SCGC AB-604-B04,1131289,WGS,SCGC AB-604-B04,,,,,...,,,,,,,,,Single cell genome sequencing of biomineralizi...,


# GenBank

This assembly summary comes from the [GenBank FTP site](ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/). You want the [assembly_summary.txt](ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt) file specifically from the `bacteria` directory (but don't try to open the bacteria directory listing in your browser!)

In [6]:
# GenBank
# Load the gzipped, tab-separated assembly summary. header=1 takes the column names
# from the second line of the file.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError for this path —
# confirm the actual file name under ../data before re-running.
gbk = pd.read_csv(
    "../data/assembly_summary_.txt.gz",
    sep="\t",
    header=1,
    compression='gzip',
)
gbk

FileNotFoundError: [Errno 2] No such file or directory: '../data/assembly_summary_.txt.gz'

In [None]:
def split_aa(x):
    """Split `x` on every "." and return the list of resulting pieces.

    A string with a single dot (for example "GCA_000001405.28") gives a
    two-item list; a string without a dot gives a one-item list.
    """
    pieces = x.split(".")
    return pieces

# phagesdf = pd.concat([pd.DataFrame.from_records(phagesdf['Contig'].apply(get_acc_name), columns=[acccol, 'Name']), phagesdf], axis=1)
# phagesdf

# Split every '# assembly_accession' value on "." and place the two resulting
# columns ('assembly', 'accession') in front of the original GenBank table.
# The concat is column-wise (axis=1), so rows are matched by index.
gbkaa = pd.concat(
    [
        pd.DataFrame.from_records(
            gbk['# assembly_accession'].apply(split_aa),
            columns=['assembly', 'accession'],
        ),
        gbk,
    ],
    axis=1,
)
gbkaa

In [None]:
# Shape of the array of distinct values in the 'assembly' column.
gbkaa['assembly'].unique().shape

In [None]:
# Peek at the GTDB column named for NCBI GenBank assembly accessions.
gtdb.loc[:, 'ncbi_genbank_assembly_accession']

In [None]:
# Peek at the assembly accession column of the RAST table.
rast.loc[:, 'assembly_accession']

In [None]:
# Peek at the assembly accession column of the GenBank table
# (the leading "# " is part of the column name).
gbk.loc[:, '# assembly_accession']

In [None]:
# Show every row whose 'assembly' part equals GCA_000001405.
gbkaa.loc[gbkaa['assembly'].eq('GCA_000001405')]