In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Notebook-wide configuration: plot style, RNG seed and pandas display limits.
sns.set_style("darkgrid")
# Seed NumPy's global RNG. The later `DataFrame.sample(500)` calls pass no
# `random_state`, so they presumably rely on this seed for repeatability.
np.random.seed(930525)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

# Report each distinct warning once instead of on every occurrence.
warnings.simplefilter('once')

# Render figures inline and print the versions of the imported packages.
%matplotlib inline
%load_ext watermark
%watermark --iversions

pandas  1.1.4
seaborn 0.10.1
numpy   1.20.1



In [3]:
# Path to the assembly summary table (read in the next cell as tab-separated text).
file_path = "/mnt/btrfs/data/type_1/assemblies/assembly_summary.txt"

  and should_run_async(code)


In [6]:
# Load the assembly summary as a tab-separated table. The first line of the file is
# skipped so that the second line supplies the column names (which is why the first
# column is called '# assembly_accession').
# NOTE(review): the warning left in this cell's output looks like a pandas DtypeWarning
# (mixed-type columns) — confirm, and consider explicit dtypes or low_memory=False.
df_assembly = pd.read_csv(file_path, sep="\t", skiprows=1)

  and should_run_async(code)
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [15]:
# Keep an independent copy of the table as loaded; `df_assembly` itself is
# overwritten by the filtering cell further down.
df_assembly_clean = df_assembly.copy()

  and should_run_async(code)


In [8]:
# List the column names of the assembly table.
df_assembly.columns

Index(['# assembly_accession', 'bioproject', 'biosample', 'wgs_master',
       'refseq_category', 'taxid', 'species_taxid', 'organism_name',
       'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
       'release_type', 'genome_rep', 'seq_rel_date', 'asm_name', 'submitter',
       'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path',
       'excluded_from_refseq', 'relation_to_type_material'],
      dtype='object')

In [13]:
# Peek at the `genome_rep` column (used as a filter criterion below).
df_assembly["genome_rep"]

0         Full
1         Full
2         Full
3         Full
4         Full
          ... 
210328    Full
210329    Full
210330    Full
210331    Full
210332    Full
Name: genome_rep, Length: 210333, dtype: object

In [30]:
# Keep only assemblies that are
#   * RefSeq "representative genome" or "reference genome" entries,
#   * full genome representations,
#   * the latest version of the assembly, and
#   * linked to a WGS master record (non-missing `wgs_master`).
df_assembly = (
    df_assembly
    .query('refseq_category in ("representative genome", "reference genome")')
    .query('genome_rep == "Full"')
    .query('version_status == "latest"')
)

# Missing values are detected with `notna()` rather than by comparing the
# stringified column against the text 'nan'.
mask_master = df_assembly['wgs_master'].notna()

# `.copy()` makes the result an independent frame, so the column assignments in
# later cells (`in_gtdb`, `taxid`) do not raise SettingWithCopyWarning.
df_assembly = df_assembly.loc[mask_master, :].copy()

  and should_run_async(code)


In [33]:
# Show the accessions that survived the filtering step.
df_assembly["# assembly_accession"]

0         GCF_002287175.1
14        GCF_002287215.1
15        GCF_001571405.1
76        GCF_000765475.1
151       GCF_001462395.1
               ...       
209414    GCF_900239495.1
209463    GCF_013366925.1
209724    GCF_003397585.1
209726    GCF_003397705.1
210009    GCF_002191155.1
Name: # assembly_accession, Length: 9806, dtype: object

In [37]:
# Path to the GTDB taxonomy table (release 95, judging by the path); it is read in the
# next cell as headerless tab-separated text.
tax_file = "/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/r95.gtdb.tax"

In [40]:
# Read the taxonomy table without a header row, so columns are numbered 0, 1, ...
df_tax_clean = pd.read_csv(tax_file, header=None, sep="\t")

# Work on a copy so the table as loaded stays available in `df_tax_clean`.
df_tax = df_tax_clean.copy()

  and should_run_async(code)


In [79]:
# Accession stems (the text before the first '.') found in column 0 of the
# taxonomy table, collected into a set for fast membership tests.
in_gtdb = {accession.split('.')[0] for accession in df_tax[0]}

In [86]:

def _stem_in_gtdb(accessions):
    """Return a boolean NumPy array marking accessions whose stem is in `in_gtdb`.

    The stem is the text before the first '.'. The work is done with pandas'
    vectorised string methods, so a missing accession yields False instead of
    raising AttributeError as a plain `.split` call on NaN would.
    """
    return accessions.str.split('.').str[0].isin(in_gtdb).to_numpy()


# An assembly counts as present in GTDB when either its own accession or its
# paired accession (`gbrs_paired_asm`) is listed there.
refseq = _stem_in_gtdb(df_assembly["# assembly_accession"])
genbank = _stem_in_gtdb(df_assembly["gbrs_paired_asm"])

# `assign` returns a new frame, which avoids writing into a filtered slice
# (SettingWithCopyWarning).
df_assembly = df_assembly.assign(in_gtdb=refseq | genbank)

In [88]:
# Lookup table from taxid to lineage string (headerless TSV). The two columns are
# named right after loading.
tax_file_path = "/mnt/btrfs/data/type_1/assemblies/taxtmp/tid2gg.srt.txt"

df_tax_refseq = pd.read_csv(tax_file_path, header=None, sep="\t").set_axis(
    ["taxid", "gg"], axis=1
)

  and should_run_async(code)


In [96]:
# Spot check: look up the lineage row for taxid 2193.
df_tax_refseq.loc[df_tax_refseq["taxid"] == 2193]

Unnamed: 0,taxid,gg
1137181,2193,k__Archaea;p__Euryarchaeota;c__Methanomicrobia...


In [102]:
# Check that the lookup table's taxid column converts cleanly to integers.
df_tax_refseq["taxid"].astype(int)

0               10
1              100
2           100000
3          1000000
4          1000001
            ...   
2316613     999995
2316614     999996
2316615     999997
2316616     999998
2316617     999999
Name: taxid, Length: 2316618, dtype: int64

0            2161
14           2193
15           2200
76           2226
151          2309
           ...   
209414    2781388
209463    2782701
209724    2792977
209726    2792979
210009    2801452
Name: taxid, Length: 9806, dtype: int64

In [98]:
# Preview of the left join on `taxid`: every assembly row is kept and `gg` is NaN
# where no taxid matches. The result is only displayed here, not stored.
pd.merge(df_assembly, df_tax_refseq, on="taxid", how="left")

Unnamed: 0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,...,seq_rel_date,asm_name,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material,in_gtdb,gg
0,GCF_002287175.1,PRJNA224116,SAMN04229035,LMVM00000000.1,representative genome,2161,2161,Methanobacterium bryantii,strain=M.o.H.,,...,2017/09/06,ASM228717v1,University of California Santa Barbara,GCA_002287175.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002...,,assembly from type material,True,
1,GCF_002287215.1,PRJNA224116,SAMN04229038,LMVO00000000.1,representative genome,2193,2193,Methanocorpusculum parvum,strain=XII,,...,2017/09/06,ASM228721v1,University of California Santa Barbara,GCA_002287215.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002...,,assembly from type material,True,
2,GCF_001571405.1,PRJNA224116,SAMD00044722,BCNX00000000.1,representative genome,2200,2200,Methanoculleus thermophilus,strain=CR-1,,...,2016/01/19,ASM157140v1,"Bioproduction Research Institute, National Ins...",GCA_001571405.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001...,,assembly from type material,True,
3,GCF_000765475.1,PRJNA224116,SAMN03067868,JRHO00000000.1,representative genome,2226,2226,Methanococcoides methylutens,strain=DSM 2657,,...,2014/10/10,Mmet_Version1,King Abdullah University of Science and Techno...,GCA_000765475.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,assembly from type material,True,
4,GCF_001462395.1,PRJNA224116,SAMN03154506,LNTB00000000.1,representative genome,2309,2309,Pyrodictium occultum,strain=PL-19,,...,2015/12/08,ASM146239v1,Oak Ridge National Laboratory,GCA_001462395.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001...,,assembly from type material,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9801,GCF_900239495.1,PRJNA224116,SAMEA104389409,OENE00000000.1,representative genome,2781388,2781243,Tenacibaculum finnmarkense genomovar ulcerans,strain=TNO010,,...,2020/04/03,TNO010_V1,INRA,GCA_900239495.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900...,,assembly from type material,False,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia...
9802,GCF_013366925.1,PRJNA224116,SAMN14144370,JAALDK000000000.1,representative genome,2782701,2782701,Paraburkholderia youngii,strain=JPY169,,...,2020/06/22,ASM1336692v1,University of Pretoria,GCA_013366925.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/013...,,assembly from type material,False,k__Bacteria;p__Proteobacteria;c__Betaproteobac...
9803,GCF_003397585.1,PRJNA224116,SAMN09373177,QJUV00000000.1,representative genome,2792977,2792977,Gardnerella piotii,strain=UGent 18.01,,...,2018/08/16,ASM339758v1,Ghent University,GCA_003397585.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003...,,assembly from type material,False,k__Bacteria;p__Actinobacteria;c__Actinomycetia...
9804,GCF_003397705.1,PRJNA224116,SAMN09373170,QJVB00000000.1,representative genome,2792979,2792979,Gardnerella swidsinskii,strain=GS 9838-1,,...,2018/08/16,ASM339770v1,Ghent University,GCA_003397705.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003...,,assembly from type material,False,k__Bacteria;p__Actinobacteria;c__Actinomycetia...


In [112]:
# Cast both join keys to integers so the merge compares like with like.
# `assign` builds new frames instead of writing into existing ones, which avoids
# SettingWithCopyWarning when `df_assembly` is a filtered slice.
df_assembly = df_assembly.assign(taxid=df_assembly['taxid'].astype(int))
df_tax_refseq = df_tax_refseq.assign(taxid=df_tax_refseq['taxid'].astype(int))

# Attach the lineage string (`gg`) to every assembly. The left join keeps
# assemblies without a matching taxid (their `gg` is NaN).
df_assembly_gg = pd.merge(df_assembly, df_tax_refseq, on="taxid", how="left")

# Optional, currently disabled: keep only assemblies that are not in GTDB.
# df_assembly_gg = df_assembly_gg.query("not in_gtdb").copy().reset_index(drop=True)

  and should_run_async(code)


In [115]:
# Draw a random subset of 500 annotated assemblies and save it.
# `random_state` is pinned (same value as the notebook seed) so the subset does not
# depend on how many random draws happened earlier in the kernel session.
df_filtered = df_assembly_gg.sample(500, random_state=930525)

df_filtered.to_csv("../data/assemblies_wgs.500.csv")

  and should_run_async(code)


In [121]:
# Show the first rows of the sampled subset.
df_filtered.head()

  and should_run_async(code)


Unnamed: 0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,...,seq_rel_date,asm_name,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material,in_gtdb,gg
3126,GCF_000825685.1,PRJNA224116,SAMEA2771239,CCXM00000000.1,representative genome,467174,467174,Rickettsia hoogstraalii,strain=Croatica,,...,2014/09/23,Rickettsia hoogstraalii Croatica,URMITE,GCA_000825685.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,assembly from type material,True,k__Bacteria;p__Proteobacteria;c__Alphaproteoba...
6181,GCF_003990185.1,PRJNA224116,SAMN10644075,RZHF00000000.1,representative genome,1258546,1258546,Halomonas nanhaiensis,strain=JCM 18142,,...,2019/01/01,ASM399018v1,Zhejiang Sci-Tech University,GCA_003990185.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003...,,assembly from type material,True,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
9052,GCF_003449035.1,PRJNA224116,SAMN09759696,QRGB00000000.1,representative genome,2292082,2292082,Pedobacter indicus,strain=SM1810,,...,2018/09/04,ASM344903v1,Shandong University Qingdao Campus,GCA_003449035.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003...,,assembly from type material,True,k__Bacteria;p__Bacteroidetes;c__Sphingobacteri...
1342,GCF_014648075.1,PRJNA224116,SAMD00245318,BMQK00000000.1,representative genome,83378,83378,Streptomyces ruber,strain=JCM 3131,,...,2020/09/12,ASM1464807v1,WFCC-MIRCEN World Data Centre for Microorganis...,GCA_014648075.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/014...,,assembly from type material,False,k__Bacteria;p__Actinobacteria;c__Actinomycetia...
950,GCF_002995745.1,PRJNA224116,SAMN08638840,PVXQ00000000.1,representative genome,52704,52704,Clostridium vincentii,strain=DSM 10228,,...,2018/03/14,ASM299574v1,Goettingen Genomics Laboratory,GCA_002995745.1,identical,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002...,,assembly from type material,True,k__Bacteria;p__Firmicutes;c__Clostridia;o__Eub...


In [120]:
# Inspect the BioProject identifiers of the sampled assemblies.
df_filtered["bioproject"]

3126    PRJNA224116
6181    PRJNA224116
9052    PRJNA224116
1342    PRJNA224116
950     PRJNA224116
           ...     
1164    PRJNA224116
6861    PRJNA224116
2611    PRJNA224116
8785    PRJNA224116
5483    PRJNA224116
Name: bioproject, Length: 500, dtype: object

In [148]:
# Write plain identifier lists, one value per line (no index column, no header row):
# the `biosample` values of the sampled subset and the `wgs_master` values of all
# annotated assemblies.
for id_column, out_path in (
    (df_filtered['biosample'], "../data/biosamples.500.txt"),
    (df_assembly_gg['wgs_master'], "../data/wgs_master.all.txt"),
):
    id_column.to_csv(out_path, index=False, header=False)

In [None]:
# Write another random 500-row subset to a second CSV.
# NOTE(review): this draws a fresh sample instead of reusing `df_filtered`, so the rows
# will generally differ from assemblies_wgs.500.csv; the cell also has no execution
# count. Confirm whether it is still needed.
df_assembly_gg.sample(500).to_csv("../data/assemblies_wgs.csv")

In [105]:
from glob import glob

  and should_run_async(code)


In [106]:
# Collect the paths of all .gbff files in the assemblies directory.
wgs_master_files = glob("/mnt/btrfs/data/type_1/assemblies/*.gbff")

  and should_run_async(code)


In [130]:
import json

In [145]:
# Number of entries currently held in `results`.
# NOTE(review): in the visible notebook `results` is first assigned in the cell that
# follows, so on a top-to-bottom run this cell would fail with NameError; it probably
# belongs below that cell.
len(results)

6

In [143]:
# For every WGS master accession, call the external `bio` command-line tool and keep
# whatever `bio runinfo ... --sample` prints.
# NOTE(review): this issues two shell calls per accession and was interrupted by hand
# in the saved run — consider caching results or limiting the number of accessions.
results = []
for wgs_master in df_assembly_gg["wgs_master"].values:
    # The captured output `f` is not used afterwards in the visible code; presumably
    # the call is made for its side effect of fetching the record — TODO confirm.
    f = !bio fetch {wgs_master}
    result = !bio runinfo {wgs_master} --sample
    # Keep only non-empty outputs, with the captured lines concatenated without a
    # separator into a single string.
    if result:
        result = ''.join(result)
#         d_result['wgs_master'] = wgs_master
        results.append(result)

^C


  out = process_handler(cmd, lambda p: p.communicate()[0], subprocess.STDOUT)


^C


  _warn("subprocess %s is still running" % self.pid,


^C


  out = process_handler(cmd, lambda p: p.communicate()[0], subprocess.STDOUT)


^C


  _cleanup()


^C
^C
^C
^C
^C
^C
^C
^C


KeyboardInterrupt: 

In [123]:
# Search the .gbff files one by one for the text "SRR" using `rg` (presumably ripgrep)
# and stop at the first file that produces any output.
# NOTE(review): this reuses the name `results`, overwriting the list built by the
# runinfo loop above. If no file matches, the loop ends with `file` set to the last path.
for file in wgs_master_files:
    results = !rg "SRR" {file}
    if len(results) > 0:
        break

KeyboardInterrupt: 

In [125]:
# Print the contents of the file the search loop above stopped on.
!cat {file}

LOCUS       NZ_AGJL01000000      1818783 bp    DNA     linear   BCT 07-JUN-2020
DEFINITION  Methanotorris formicicus Mc-S-70, whole genome shotgun sequencing
            project.
ACCESSION   NZ_AGJL00000000
VERSION     NZ_AGJL00000000.1  GI:374637162
DBLINK      BioProject: PRJNA224116
            BioSample: SAMN02261372
            Assembly: GCF_000243455.1
KEYWORDS    WGS; GSC:MIGS:2.1; HIGH_QUALITY_DRAFT; RefSeq.
SOURCE      Methanotorris formicicus Mc-S-70
  ORGANISM  Methanotorris formicicus Mc-S-70
            Archaea; Euryarchaeota; Methanomada group; Methanococci;
            Methanococcales; Methanocaldococcaceae; Methanotorris.
REFERENCE   1  (bases 1 to 1818783)
  AUTHORS   Lucas,S., Han,J., Lapidus,A., Cheng,J.-F., Goodwin,L., Pitluck,S.,
            Peters,L., Land,M.L., Hauser,L., Sieprawska-Lupa,M., Takai,K.,
            Miyazaki,J., Whitman,W. and Woyke,T.J.
  CONSRTM   US DOE Joint Genome Institute (JGI-PGF)
  TITLE     The draft genome of Methanotorris formicicus Mc-S

In [68]:
# Row and column count of the merged assembly/taxonomy table.
df_assembly_gg.shape

(3548, 26)