In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

sns.set_style("darkgrid")
np.random.seed(930525)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

warnings.simplefilter('once')

%matplotlib inline
%load_ext watermark
%watermark --iversions

numpy   1.19.5
pandas  1.1.4
seaborn 0.10.1



In [2]:
from glob import glob

  and should_run_async(code)


In [3]:
files = glob("/mnt/btrfs/data/type_1/sra/runinfor.*.txt")

In [4]:
import json

In [5]:
sras = []

for file in files:
    with open(file) as inf:
        filename = ".".join(file.split(".")[-3:-1])
        contents = inf.read()
        if len(contents) > 0:
            js = json.loads(contents)
            if type(js) != type(sras):
                js['wgs_master'] = filename
                sras.append(js)
            else:
                for d in js:
                    d['wgs_master'] = filename
                    sras.append(d)
            

In [6]:
df_sra = pd.DataFrame.from_records(sras)

In [7]:
df_sra.download_path.values

array(['https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-6/SRR2886784/SRR2886784.1',
       'https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-6/SRR2886880/SRR2886880.1',
       'https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos3/sra-pub-run-20/SRR11038190/SRR11038190.1',
       ...,
       'https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos3/sra-pub-run-21/ERR3046676/ERR3046676.1',
       'https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos3/sra-pub-run-20/ERR3046932/ERR3046932.1',
       'https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos1/sra-pub-run-2/SRR8847018/SRR8847018.1'],
      dtype=object)

In [8]:
df_sra.head()

Unnamed: 0,Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,download_path,Experiment,...,Tumor,CenterName,Submission,Consent,RunHash,ReadHash,wgs_master,LibraryName,Subject_ID,AssemblyName
0,SRR2886784,2017-08-01 00:00:32,2015-11-01 02:48:19,3776199,569917417,3776199,150,277,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/s...,SRX1406647,...,no,"UNIVERSITY OF CALIFORNIA, SANTA BARBARA",SRA308460,public,B3D91B8C88595960A14E5899ECA06D2B,22BE7CC5D2703B673FB9AF0CA3517131,LMVM00000000.1,,,
1,SRR2886880,2017-08-01 00:00:32,2015-11-01 21:05:38,2395447,361505594,2395447,150,177,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/s...,SRX1406759,...,no,"UNIVERSITY OF CALIFORNIA, SANTA BARBARA",SRA308487,public,E738CBB0367D44574137C7D7F48EBCF4,4ED2E095F415C2E3BB6958B3DE7E10D4,LMVO00000000.1,,,
2,SRR11038190,2020-02-07 18:27:13,2020-02-07 18:24:21,3101206,1556805412,3101206,502,746,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX7690448,...,no,NORTH CAROLINA STATE UNIVERSITY,SRA1039248,public,40932E74CD0AD0A03ED796098C097DBD,8D51B9EB6984CA2DE8FE2BD21E410865,WFIY00000000.1,Acidianus_infernus_Illumina,,
3,SRR11038191,2020-02-07 18:27:16,2020-02-07 18:26:16,218658,1130230661,0,5168,272,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX7690447,...,no,NORTH CAROLINA STATE UNIVERSITY,SRA1039248,public,E128C60B2086D564E54FF3C9D720ED16,949C68C299081D419C8D40B9FDAA5F81,WFIY00000000.1,Acidianus_infernus_PacBio,,
4,SRR4236987,2016-09-11 07:09:36,2016-09-11 07:13:04,2850334,860800868,2850334,302,336,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/s...,SRX2158223,...,no,JGI,SRA467326,public,F56FEFE1F9893DC04F115A5C280A6108,9D71A380EB8A49C6CE75F0167A2281FB,FNOF00000000.1,BBPBB,,


In [9]:
(df_sra["Platform"] == "ILLUMINA").sum()

  and should_run_async(code)


3723

In [10]:
df_sra = df_sra.query("Platform == 'ILLUMINA'").copy()

In [11]:
file_path = "/mnt/btrfs/data/type_1/assemblies/assembly_summary.txt"

df_assembly = pd.read_csv(file_path, sep="\t", skiprows=1)

df_assembly_clean = df_assembly.copy()

df_assembly = df_assembly.query('refseq_category in ("representative genome", "reference genome")')

df_assembly = df_assembly.query('genome_rep == "Full"')

df_assembly = df_assembly.query('version_status == "latest"')

mask_master = df_assembly['wgs_master'].astype(str) != 'nan'

df_assembly = df_assembly.loc[mask_master,:]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [12]:
tax_file = "/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/r95.gtdb.tax"

In [13]:
df_tax_clean = pd.read_csv(tax_file, header=None, sep="\t")

df_tax = df_tax_clean.copy()

In [14]:
in_gtdb = set([_.split('.')[0] for _ in df_tax[0]])

In [15]:
refseq = np.array([_.split('.')[0] in in_gtdb for _ in df_assembly["# assembly_accession"]])
genbank = np.array([_.split('.')[0] in in_gtdb for _ in df_assembly["gbrs_paired_asm"]])

df_assembly["in_gtdb"] = refseq | genbank

In [16]:
tax_file_path = "/mnt/btrfs/data/type_1/assemblies/taxtmp/tid2gg.srt.txt"

df_tax_refseq = pd.read_csv(tax_file_path, sep="\t", header=None)

df_tax_refseq.columns = ["taxid", "gg"]

In [17]:
df_assembly['taxid'] = df_assembly['taxid'].astype(int)
df_tax_refseq['taxid'] = df_tax_refseq['taxid'].astype(int)

df_assembly_gg = pd.merge(df_assembly, df_tax_refseq, on="taxid", how="left")

In [18]:
df_merged = pd.merge(df_assembly_gg, df_sra, how="inner", on="wgs_master")

In [19]:
df_merged["in_gtdb"].sum()

2619

In [20]:
df_merged.shape

(3723, 61)

In [21]:
df_merged.to_csv("../data/assembly_summary.sra.csv", index=False)

In [22]:
df_merged.sample(500)["Run"].to_csv("../data/srastudy.500.txt", index=False, header=False)
df_merged.sample(100)["Run"].to_csv("../data/srastudy.100.txt", index=False, header=False)

In [23]:
df_merged.ftp_path.values

array(['ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/287/175/GCF_002287175.1_ASM228717v1',
       'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/287/215/GCF_002287215.1_ASM228721v1',
       'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/729/545/GCF_009729545.1_ASM972954v1',
       ...,
       'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/890/705/GCF_900890705.1_Kiritimatiellales_strain_F21_draft_genome',
       'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/890/425/GCF_900890425.1_Kiritimatiellales_strain_F1_draft_genome',
       'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/004/768/465/GCF_004768465.1_ASM476846v1'],
      dtype=object)

In [24]:
df_merged.SRAStudy

0       SRP065574
1       SRP065585
2       SRP247566
3       SRP089436
4       SRP008004
          ...    
3718    SRP072866
3719    ERP000943
3720    ERP106613
3721    ERP106613
3722    SRP190798
Name: SRAStudy, Length: 3723, dtype: object

In [25]:
df_merged.Submission

0        SRA308460
1        SRA308487
2       SRA1039248
3        SRA467326
4        SRA045669
           ...    
3718     SRA404858
3719     ERA087401
3720    ERA1714097
3721    ERA1714097
3722     SRA869444
Name: Submission, Length: 3723, dtype: object