In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr

sns.set_style("darkgrid")
np.random.seed(930525)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

warnings.simplefilter('once')

%matplotlib inline
%load_ext watermark
%watermark --iversions

numpy   1.18.4
seaborn 0.10.1
pandas  1.0.4



In [5]:
import requests

from io import StringIO

In [58]:
from collections import defaultdict

# Ground-truth sheet, fetched from Google Sheets through its CSV export endpoint.
# NOTE: despite its name, `url` holds the HTTP response object, not a URL string.
url = requests.get(
    'https://docs.google.com/spreadsheets/d/1OYgLNTSd1BvFyGk9ps6xEaEk1KGoPx5VIgb_awHUXR4/export?format=csv',
    timeout=60,  # do not hang the kernel forever if the server never answers
)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
url.raise_for_status()
csv_raw = StringIO(url.text)
df_truth = pd.read_csv(csv_raw, engine="python")

# Tab-separated taxonomy table; the first column of each line is the genome accession.
inf_tax_file = "/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/r95.gtdb.tax"

In [59]:
# Read the taxonomy table once instead of spawning two `rg` processes per row.
with open(inf_tax_file) as tax_fh:
    tax_lines = [line.rstrip("\n") for line in tax_fh]

ins_database = []
database_accessions = []

for accession in df_truth.gen_assembly_accession:
    in_database = False
    database_accession = ""
    # Missing values stringify to "nan" and therefore fail the prefix test below.
    accession = str(accession)
    if accession.startswith("GCF"):
        # Look for the accession as given and for its "GCA" spelling.
        # Matching is a literal substring test anywhere in the line; the previous
        # `rg` call treated the accession as a regex, so "." matched any character.
        candidates = (accession, accession.replace("GCF", "GCA"))
        results = [
            line
            for candidate in candidates
            for line in tax_lines
            if candidate in line
        ]
        # NOTE(review): only an unambiguous single hit counts. If both the GCF and
        # GCA spellings are present the genome is treated as absent -- confirm
        # that this is intended.
        if len(results) == 1:
            in_database = True
            # First tab-separated field is the accession as stored in the database.
            database_accession = results[0].split("\t")[0]
    ins_database.append(in_database)
    database_accessions.append(database_accession)

In [60]:
# Attach the per-row lookup results to the truth table (same column order as before).
for column_name, column_values in (
    ('database_accessions', database_accessions),
    ('in_database', ins_database),
):
    df_truth[column_name] = column_values

In [61]:
# Unique download names of every row that was not matched in the database.
missing_from_database = ~df_truth['in_database']
accessions_to_download = set(df_truth.loc[missing_from_database, 'download_names'])

In [62]:
# Drop missing download names (NaN). Rebuilding the set is safe to re-run and
# safe when no NaN is present, whereas `set.remove(np.nan)` raises KeyError then.
accessions_to_download = {
    name for name in accessions_to_download if not pd.isna(name)
}

In [70]:
# Inspect the download names that were not matched in the database.
accessions_to_download

{'GCF_000005845',
 'GCF_000006785',
 'GCF_000006885',
 'GCF_000007265',
 'GCF_000007465',
 'GCF_000007645',
 'GCF_000008005',
 'GCF_000008345',
 'GCF_000010005',
 'GCF_000016305',
 'GCF_000016965',
 'GCF_000153625',
 'GCF_000172575',
 'GCF_000174395',
 'GCF_000182965',
 'GCF_000191785',
 'GCF_000196035',
 'GCF_000307795',
 'GCF_000349975',
 'GCF_000377685',
 'GCF_000743055',
 'GCF_001077675',
 'GCF_003324715',
 'GCF_008329785',
 'GCF_009873455',
 'Neisseria_meningitidis_ATCC_BAA_335.fasta',
 'Pseudomonas_aeruginosa_ATCC_47085.fasta',
 'Staphylococcus_aureus_subsp_aureus_ATCC_BAA_1718.fasta',
 'bsubtilis_pb.fasta',
 'cneoformans_ont.fasta',
 'ecoli_pb.fasta',
 'efaecalis_pb.fasta',
 'lfermentum_ontlumina.fasta',
 'lmonocytogenes_pb.fasta',
 'paeruginosa_pb.fasta',
 'saureus_pb.fasta',
 'scerevisiae_pb.fasta',
 'senterica_pb.fasta'}

In [72]:
strains_folder = "/mnt/btrfs/data/type_1/strains"
for file in accessions_to_download:
    if file.endswith(".fasta"):
        if file.lower() == file:
            !wget https://raw.githubusercontent.com/al-mcintyre/mCaller_analysis_scripts/master/assemblies/{file} -O {strains_folder}/{file}

--2021-02-24 17:18:27--  https://raw.githubusercontent.com/al-mcintyre/mCaller_analysis_scripts/master/assemblies/cneoformans_ont.fasta
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27515738 (26M) [text/plain]
Saving to: ‘/mnt/btrfs/data/type_1/strains/cneoformans_ont.fasta’


2021-02-24 17:18:28 (97.4 MB/s) - ‘/mnt/btrfs/data/type_1/strains/cneoformans_ont.fasta’ saved [27515738/27515738]

--2021-02-24 17:18:28--  https://raw.githubusercontent.com/al-mcintyre/mCaller_analysis_scripts/master/assemblies/saureus_pb.fasta
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sen

In [69]:
strains_folder = "/mnt/btrfs/data/type_1/strains"
for file in accessions_to_download:
    if file.startswith("GCF"):
        file = file.replace("GCF", "GCA")
        print(file)
        !enaDataGet -f fasta {file} -d {strains_folder}
        !mv {strains_folder}/{file}/assembled-molecule.fasta {strains_folder}/{file}.fna
        !rm -rf {strains_folder}/{file}

GCA_000005845
fetching 1 sequences: assembled-molecule
downloaded 1 of 1 sequences
no sequences: unlocalised-scaffold
no sequences: unplaced-scaffold
no sequences: patch
Completed
GCA_000008345
fetching 1 sequences: assembled-molecule
downloaded 1 of 1 sequences
no sequences: unlocalised-scaffold
no sequences: unplaced-scaffold
no sequences: patch
Completed
GCA_000349975
fetching 1 sequences: assembled-molecule
downloaded 1 of 1 sequences
no sequences: unlocalised-scaffold
no sequences: unplaced-scaffold
no sequences: patch
Completed
GCA_000196035
fetching 1 sequences: assembled-molecule
downloaded 1 of 1 sequences
no sequences: unlocalised-scaffold
no sequences: unplaced-scaffold
no sequences: patch
Completed
GCA_000006885
fetching 1 sequences: assembled-molecule
downloaded 1 of 1 sequences
no sequences: unlocalised-scaffold
no sequences: unplaced-scaffold
no sequences: patch
Completed
GCA_000007465
fetching 1 sequences: assembled-molecule
downloaded 1 of 1 sequences
no sequences: unl

In [63]:
# NOTE(review): repeated display of the same set shown in an earlier cell;
# this cell can probably be removed.
accessions_to_download

{'GCF_000005845',
 'GCF_000006785',
 'GCF_000006885',
 'GCF_000007265',
 'GCF_000007465',
 'GCF_000007645',
 'GCF_000008005',
 'GCF_000008345',
 'GCF_000010005',
 'GCF_000016305',
 'GCF_000016965',
 'GCF_000153625',
 'GCF_000172575',
 'GCF_000174395',
 'GCF_000182965',
 'GCF_000191785',
 'GCF_000196035',
 'GCF_000307795',
 'GCF_000349975',
 'GCF_000377685',
 'GCF_000743055',
 'GCF_001077675',
 'GCF_003324715',
 'GCF_008329785',
 'GCF_009873455',
 'Neisseria_meningitidis_ATCC_BAA_335.fasta',
 'Pseudomonas_aeruginosa_ATCC_47085.fasta',
 'Staphylococcus_aureus_subsp_aureus_ATCC_BAA_1718.fasta',
 'bsubtilis_pb.fasta',
 'cneoformans_ont.fasta',
 'ecoli_pb.fasta',
 'efaecalis_pb.fasta',
 'lfermentum_ontlumina.fasta',
 'lmonocytogenes_pb.fasta',
 'paeruginosa_pb.fasta',
 'saureus_pb.fasta',
 'scerevisiae_pb.fasta',
 'senterica_pb.fasta'}

In [74]:
def read_fasta(fh):
    """
    Iterate over the records of a FASTA-formatted stream.

    :param fh: iterable of text lines (for example an open file handle)
    :return: tuples of (title, seq). ``title`` is the header line without the
        leading ">" and without surrounding whitespace; ``seq`` is the record's
        sequence lines joined together with surrounding whitespace removed.
        Input that contains no header line (including empty input) yields
        nothing, so callers can always unpack each item into two values.
    """
    title = None  # None means "no header seen yet"
    chunks = []   # collected pieces of the current record; joined once per record
    for line in fh:
        if line.startswith(">"):
            if title is not None:
                yield title, "".join(chunks)
            title = line[1:].strip()
            chunks = []
        elif title is not None:
            # Lines before the first header belong to no record and are ignored.
            chunks.append(line.strip())
    if title is not None:
        yield title, "".join(chunks)

In [78]:
# NOTE(review): leftover interactive help lookup; it contributes nothing to the
# analysis and can be removed from the final notebook.
?os.mkdir

[0;31mSignature:[0m [0mos[0m[0;34m.[0m[0mmkdir[0m[0;34m([0m[0mpath[0m[0;34m,[0m [0mmode[0m[0;34m=[0m[0;36m511[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mdir_fd[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Create a directory.

If dir_fd is not None, it should be a file descriptor open to a directory,
  and path should be relative; path will then be relative to that directory.
dir_fd may not be implemented on your platform.
  If it is unavailable, using it will raise a NotImplementedError.

The mode argument is ignored on Windows.
[0;31mType:[0m      builtin_function_or_method


In [85]:
from glob import glob
import os

out_strains_folder = "/mnt/btrfs/data/type_1/strains_lin"

os.makedirs(out_strains_folder, exist_ok=True)

# Rewrite every downloaded assembly with one sequence per line, dropping every
# record whose header mentions "plasmid".
for file in sorted(glob(f"{strains_folder}/*")):
    if not os.path.isfile(file):
        # The glob can also return directories, which cannot be opened as FASTA.
        continue
    # splitext keeps the full name when the file has no extension; the previous
    # split(".")[:-1] produced an empty stem in that case.
    stem = os.path.splitext(os.path.basename(file))[0]
    outf = os.path.join(out_strains_folder, stem + ".lin.noplasmid.fna")
    with open(file) as inf, open(outf, "w") as outfile:
        for record in read_fasta(inf):
            # Skip placeholder items (None, or an empty title with an empty
            # sequence) that read_fasta may produce for input without a header.
            if not record or not any(record):
                continue
            header, seq = record
            if "plasmid" not in header:
                outfile.write(f">{header}\n{seq}\n")
                
    

In [79]:
# Spot check: look up a single accession in the taxonomy table.
!grep "GCF_001457555" "/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/r95.gtdb.tax"

GCF_001457555.1	k__Bacteria;p__Fusobacteriota;c__Fusobacteriia;o__Fusobacteriales;f__Fusobacteriaceae;g__Fusobacterium;s__Fusobacterium_polymorphum
