# 200818 Gold standard queries set 2

In [1]:
import os
import pickle
from gzip import GzipFile
from pathlib import Path

import pandas as pd
from tqdm import tqdm

In [2]:
from midas.kmers import KmerSpec
from midas.database.basicdatabase import BasicDatabase
from midas.backports.signaturefile import SignatureFile

from midas_cli.query import query_coords_concurrent, format_query_results

## Notebook config

In [3]:
# Root of the local MIDAS data tree. The default is the original author's
# location; set the MIDAS_DATA_DIR environment variable to run the notebook
# against a copy stored elsewhere without editing this cell.
_data_dir = Path(os.environ.get('MIDAS_DATA_DIR', '/Users/student/projects/midas/data/'))

# Input paths, keyed by role.
in_files = {
    # Reference genome database opened with BasicDatabase.
    'database': _data_dir / '2019_20/refseq_curated_1.1_beta_200525_db',
    # Gzipped signature file read through SignatureFile.
    'signatures': _data_dir / '2019_20/refseq_curated_1.1beta_200604.midas-signatures.gz',
    # Directory that is globbed for the *.fasta query sequences.
    'seqs': _data_dir / '2019_20/gold_standard_seqs_200817/',
    # Pickled thresholds handed to format_query_results.
    'species_thresholds': _data_dir / 'v1/thresholds/species_thresholds_160901.pickle',
    'genus_thresholds': _data_dir / 'v1/thresholds/genus_thresholds_0.2_160906.pickle',
}

In [4]:
# Directory that receives everything this notebook writes.
out_dir = Path('../../data/processed/200818-gold-standard-queries-set2/')
# parents=True so a fresh checkout (where ../../data/processed may not exist
# yet) does not fail with FileNotFoundError; exist_ok=True keeps re-runs safe.
out_dir.mkdir(parents=True, exist_ok=True)

# Output paths, keyed by role.
out_files = {
    'results': out_dir / '200818-refseq-curated-1.1_beta-gold-standard-queries-set2-results.csv',
}

## Read files

### Signatures

In [5]:
# Open the gzipped signature file as a binary stream, then wrap it in the
# SignatureFile reader used by the cells below.
_signatures_stream = GzipFile(in_files['signatures'])
sigfile = SignatureFile(_signatures_stream)

In [6]:
# Display the metadata recorded in the signature file (genome set, creation
# date, k-mer spec) as a sanity check that the expected file was loaded.
sigfile.get_metadata()

{'genome_set': {'name': 'refseq_curated_2020',
  'description': 'Created 2020-05-26 by filtering version 0.9 by inclusion in refseq/assembly/all 1.1',
  'meta': {'parent': {'key_version': '0.9', 'key': 'midas/assembly/curated'},
   'date_created': '2020-05-26'},
  'key_version': '1.1',
  'key': 'midas/assembly/curated'},
 'date_created': '2020-06-04',
 'kmer_spec': {'k': 11, 'prefix': 'ATGAC'},
 'description': 'Signatures for version 1.1 of curated genome set'}

In [7]:
# Read the signature collection from the file; it is passed to
# query_coords_concurrent in the Query section.
sigs = sigfile.get_coords_collection()

### Database

In [8]:
# Open the reference genome database at the configured path.
db = BasicDatabase(in_files['database'])

In [9]:
# Session used for the database queries below.
session = db.get_session()

In [10]:
# Look up the genome set by key and version; .one() raises unless exactly one
# row matches. The key/version are the ones shown in the signature metadata.
gset = (
    session.query(db.GenomeSet)
    .filter_by(key='midas/assembly/curated', key_version='1.1')
    .one()
)
gset.meta

{'parent': {'key_version': '0.9', 'key': 'midas/assembly/curated'}, 'date_created': '2020-05-26'}

In [11]:
# Index every annotation in the genome set by the key of its genome.
annotations_dict = {}
for annotation in tqdm(gset.annotations):
    annotations_dict[annotation.genome.key] = annotation

50752it [00:41, 1222.05it/s]


In [12]:
# One annotation per signature, in the order the IDs appear in the signature
# file (presumably so annotations line up positionally with `sigs`).
annotations = [
    annotations_dict[genome_key]
    for genome_key in sigfile.ids
]

### Thresholds

In [13]:
# Load the pickled classification thresholds. Each file is opened in a `with`
# block so the handle is closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only use threshold files from
# a trusted source.
with in_files['species_thresholds'].open('rb') as fobj:
    species_thresholds = pickle.load(fobj)

with in_files['genus_thresholds'].open('rb') as fobj:
    genus_thresholds = pickle.load(fobj)

## Query

In [14]:
# K-mer spec used to process the query sequences. The values repeat the
# `kmer_spec` shown in the signature file metadata above (k=11, prefix ATGAC);
# keep them in sync if a different signature file is used.
kspec = KmerSpec(11, b'ATGAC')

In [15]:
# Collect every FASTA file in the query directory, sorted for a stable order,
# and show how many were found.
_fasta_paths = in_files['seqs'].glob('*.fasta')
query_files = sorted(_fasta_paths)
len(query_files)

18

In [16]:
# Query all files against the signature collection, advancing the progress
# bar by one each time the callback fires.
with tqdm(total=len(query_files)) as progress:

    def _advance(i):
        # The callback argument is not needed; just tick the bar.
        return progress.update()

    scores = query_coords_concurrent(kspec, query_files, sigs, callback=_advance)

100%|██████████| 18/18 [00:30<00:00,  1.68s/it]


### Format results

In [17]:
# Combine the query files, their scores, the reference annotations and both
# threshold sets into the results table.
results = format_query_results(
    query_files,
    scores,
    annotations,
    species_thresholds,
    genus_thresholds,
)

In [18]:
# Replace each query path with its file stem (name without extension).
# getattr with a fallback makes the cell safe to re-run: once the column
# already holds plain strings (which have no `.stem`), they are left as they are
# instead of raising AttributeError.
results['query_file'] = results['query_file'].apply(lambda p: getattr(p, 'stem', p))

In [19]:
# Write the results table to the configured CSV path, without the index column.
results.to_csv(out_files['results'], index=False)

## Review results

Queries missing a predicted genus, a predicted species, or both:

In [20]:
# Rows where at least one of the two predictions is null.
missing_genus = results['predicted_genus'].isna()
missing_species = results['predicted_species'].isna()
results[missing_genus | missing_species]

Unnamed: 0,query_file,top_score,predicted_genus,predicted_species,top_genus,genus_threshold,top_species,species_threshold,top_strain,top_description,top_accession
0,16AC1611140BCAP_S11_L001_R1_001,0.662597,Propionibacterium,,Propionibacterium,0.2,acnes,0.914643,HL030PA1,[GCF_000145115.1] Cutibacterium acnes HL030PA1...,GCF_000145115.1
3,17AC0006313-1_S23_L001_R1_001,0.087412,,,Veillonella,0.2,parvula,0.392589,DSM 2008,[GCF_000024945.1] Veillonella parvula DSM 2008...,GCF_000024945.1
4,17AC0012453_S1_L001_R1_001,0.113749,,,Corynebacterium,0.2,simulans,0.575528,PES1,[GCF_001586215.1] Corynebacterium simulans (hi...,GCF_001586215.1
5,17AC0012454_S2_L001_R1_001,0.9811,Aerococcus,,Aerococcus,0.2,viridans,0.990698,ATCC 11563,[GCF_000178435.1] Aerococcus viridans ATCC 115...,GCF_000178435.1
6,17AC0012455-1A_S1_L001_R1_001,0.443076,Streptococcus,,Streptococcus,0.2,constellatus,0.947005,SK53,[GCF_000257785.1] Streptococcus constellatus s...,GCF_000257785.1
