# 200801 Gold standard queries

In [2]:
import os
import pickle
from gzip import GzipFile
from pathlib import Path

import pandas as pd
from tqdm import tqdm

In [3]:
from midas.kmers import KmerSpec
from midas.database.basicdatabase import BasicDatabase
from midas.backports.signaturefile import SignatureFile

from midas_cli.query import query_coords_concurrent, format_query_results

## Notebook config

In [4]:
# Root directory holding all MIDAS input data. The default is the path used
# when this notebook was originally run; set the MIDAS_DATA_DIR environment
# variable to run it on another machine without editing this cell.
_data_dir = Path(os.environ.get('MIDAS_DATA_DIR', '/Users/student/projects/midas/data/'))

# Input locations, all relative to the data root above.
in_files = {
    # Database passed to BasicDatabase
    'database': _data_dir / '2019_20/refseq_curated_1.1_beta_200525_db',
    # Gzip-compressed signature file
    'signatures': _data_dir / '2019_20/refseq_curated_1.1beta_200604.midas-signatures.gz',
    # Directory of query sequences (globbed for *.fasta)
    'seqs': _data_dir / 'gold_standard_seqs_200726/',
    # Pickled threshold objects
    'species_thresholds': _data_dir / 'v1/thresholds/species_thresholds_160901.pickle',
    'genus_thresholds': _data_dir / 'v1/thresholds/genus_thresholds_0.2_160906.pickle',
}

In [5]:
# Output directory, resolved relative to the notebook's working directory.
out_dir = Path('../../data/processed/200801-gold-standard-queries/')
# parents=True: on a fresh checkout the intermediate data/processed
# directories may not exist yet, and a bare mkdir() would raise
# FileNotFoundError. exist_ok=True keeps the cell safe to re-run.
out_dir.mkdir(parents=True, exist_ok=True)

# Files written by this notebook.
out_files = {
    'results': out_dir / '200801-refseq-curated-1.1_beta-gold-standard-query-results.csv',
}

## Read files

### Signatures

In [6]:
# Wrap the gzip-compressed signatures file in a SignatureFile reader.
# The GzipFile handle is intentionally not closed in this cell: `sigfile` is
# still used by later cells (metadata, coordinates, ids).
# NOTE(review): confirm whether SignatureFile needs the stream kept open, and
# close it once the last read is done.
sigfile = SignatureFile(GzipFile(in_files['signatures']))

In [7]:
# Display the metadata stored in the signature file (genome set key/version,
# creation date, k-mer spec) as a provenance record.
sigfile.get_metadata()

{'genome_set': {'name': 'refseq_curated_2020',
  'description': 'Created 2020-05-26 by filtering version 0.9 by inclusion in refseq/assembly/all 1.1',
  'meta': {'parent': {'key_version': '0.9', 'key': 'midas/assembly/curated'},
   'date_created': '2020-05-26'},
  'key_version': '1.1',
  'key': 'midas/assembly/curated'},
 'date_created': '2020-06-04',
 'kmer_spec': {'k': 11, 'prefix': 'ATGAC'},
 'description': 'Signatures for version 1.1 of curated genome set'}

In [8]:
# Read the coordinates collection from the signature file. `sigs` is the
# collection handed to query_coords_concurrent in the Query section.
sigs = sigfile.get_coords_collection()

### Database

In [9]:
# Construct the database object from the configured database path.
db = BasicDatabase(in_files['database'])

In [10]:
# Session used below to look up the genome set and its annotations.
# NOTE(review): the session is never explicitly closed in this notebook.
session = db.get_session()

In [11]:
# Look up the curated genome set, version 1.1. `.one()` raises unless exactly
# one row matches, so a missing or duplicated set fails loudly here.
gset = (
    session.query(db.GenomeSet)
    .filter_by(key='midas/assembly/curated', key_version='1.1')
    .one()
)
# Show the set's metadata for comparison with the signature file metadata.
gset.meta

{'parent': {'key_version': '0.9', 'key': 'midas/assembly/curated'}, 'date_created': '2020-05-26'}

In [12]:
# Index the genome set's annotations by the key of the genome each one refers
# to. tqdm reports progress, since this iteration is slow (about a minute in
# the recorded run).
annotations_dict = {}
for annotation in tqdm(gset.annotations):
    annotations_dict[annotation.genome.key] = annotation

50752it [00:57, 885.82it/s] 


In [13]:
# Order the annotations to follow sigfile.ids, so that position i in
# `annotations` corresponds to position i in the signature file. Raises
# KeyError if a signature ID has no annotation in the genome set.
annotations = []
for genome_key in sigfile.ids:
    annotations.append(annotations_dict[genome_key])

### Thresholds

In [14]:
# Load the pickled species and genus threshold objects.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# threshold files that come from a trusted source.
# Context managers ensure each file handle is closed once loading finishes
# (the previous form left both handles open until garbage collection).
with in_files['species_thresholds'].open('rb') as species_fobj:
    species_thresholds = pickle.load(species_fobj)

with in_files['genus_thresholds'].open('rb') as genus_fobj:
    genus_thresholds = pickle.load(genus_fobj)

## Query

In [15]:
# K-mer specification used to compute query signatures. The literal values
# mirror the 'kmer_spec' entry in the signature file metadata displayed above
# ({'k': 11, 'prefix': 'ATGAC'}); keep them in sync if the signature file changes.
kspec = KmerSpec(11, b'ATGAC')

In [16]:
# Collect every FASTA file in the query sequence directory, in sorted order
# so the file order (and therefore the result order) is deterministic.
query_files = list(in_files['seqs'].glob('*.fasta'))
query_files.sort()
# Display how many query files were found.
len(query_files)

80

In [17]:
# Run all queries against the reference signatures, advancing the progress
# bar by one each time the query function invokes the callback.
with tqdm(total=len(query_files)) as pbar:

    def _advance_progress(i):
        # The callback argument is not needed; only the tick matters.
        return pbar.update()

    scores = query_coords_concurrent(kspec, query_files, sigs, callback=_advance_progress)

100%|██████████| 80/80 [02:40<00:00,  2.00s/it]


### Format results

In [18]:
# Combine the query files, raw scores, genome annotations and the
# species/genus thresholds into a single results table (used as a pandas
# DataFrame in the cells below).
# NOTE(review): `annotations` was ordered by sigfile.ids above — presumably so
# it lines up with the score columns; verify against format_query_results.
results = format_query_results(query_files, scores, annotations, species_thresholds, genus_thresholds)

In [19]:
# Replace each query file path with just its file name.
# Wrapping in Path() keeps this cell idempotent: the column is overwritten in
# place, so on a re-run the values are already plain strings, and calling
# `.name` on a str would raise AttributeError. Path(p).name gives the same
# result for a Path and for an already-reduced file name.
results['query_file'] = results['query_file'].apply(lambda p: Path(p).name)

In [20]:
# Write the results table to the output CSV, omitting the DataFrame index.
results.to_csv(out_files['results'], index=False)

## Review results

Queries without a predicted genus or species:

In [26]:
# Boolean mask: True where the genus prediction, the species prediction, or
# both are missing.
missing_prediction = results[['predicted_genus', 'predicted_species']].isnull().any(axis=1)
results[missing_prediction]

Unnamed: 0,query_file,top_score,predicted_genus,predicted_species,top_genus,genus_threshold,top_species,species_threshold,top_strain,top_description,top_accession
3,16AC1611140-CAP_S15_L001_R1_001 Assembly Conti...,0.847769,Staphylococcus,,Staphylococcus,0.2,lugdunensis,0.853163,HKU09-01,[GCF_000025085.1] Staphylococcus lugdunensis H...,GCF_000025085.1
19,17AC0006313ANBLD_S10_L001_R1_001 Assembly Cont...,0.087383,,,Veillonella,0.2,parvula,0.392589,DSM 2008,[GCF_000024945.1] Veillonella parvula DSM 2008...,GCF_000024945.1
21,17AC0006314_S11_L001_R1_001 Assembly Contigs.f...,0.873626,Pseudomonas,,Pseudomonas,0.2,denitrificans,0.982552,576_PAER,[GCF_001065515.1] Pseudomonas aeruginosa (g-pr...,GCF_001065515.1
22,17AC0012455-1AN_S5_L001_R1_001 Assembly Contig...,0.51589,Streptococcus,,Streptococcus,0.2,constellatus,0.947005,SK53,[GCF_000257785.1] Streptococcus constellatus s...,GCF_000257785.1
23,17AC0012455-1A_S1_L001_R1_001 Assembly Contigs...,0.443076,Streptococcus,,Streptococcus,0.2,constellatus,0.947005,SK53,[GCF_000257785.1] Streptococcus constellatus s...,GCF_000257785.1
39,18AC0018936-1_S12_L001_R1_001 Assembly Contigs...,0.060266,,,Staphylococcus,0.2,epidermidis,0.689851,NIHLM023,[GCF_000276305.1] Staphylococcus epidermidis N...,GCF_000276305.1
42,18AC0018937-BAP_S11_L001_R1_001 Assembly Conti...,0.058114,,,Proteus,0.2,mirabilis,0.449309,FDAARGOS_60,[GCF_000784015.1] Proteus mirabilis (enterobac...,GCF_000784015.1
46,19AC0002347_S3_L001_R1_001 Assembly Contigs.fasta,0.269556,Stenotrophomonas,,Stenotrophomonas,0.2,maltophilia,0.418237,305_SMAL,[GCF_001072195.1] Stenotrophomonas maltophilia...,GCF_001072195.1
48,19AC0002349B1_S10_L001_R1_001 Assembly Contigs...,0.008197,,,Clostridium,0.2,perfringens,0.414225,13,[GCF_000009685.1] Clostridium perfringens str....,GCF_000009685.1
56,19AC0011209-1_S9_L001_R1_001 Assembly Contigs....,0.314936,Burkholderia,,Burkholderia,0.2,cenocepacia,0.832753,HI2424,[GCF_000203955.1] Burkholderia cenocepacia HI2...,GCF_000203955.1
