# 210730 validation

In [1]:
from pathlib import Path
import json

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
from gambit.db.fromfile import load_database_from_dir
from gambit.io.seq import SequenceFile, find_kmers_in_files
from gambit.metric import jaccard_sparse_matrix
from gambit.db.models import Taxon
from gambit.classify import classify, find_matches, GenomeMatch
from gambit.export.json import JSONResultsExporter
from gambit.util.dev import install_info

In [4]:
# Record which gambit installation and git commit produced the results below
# (the displayed dict includes the package directory and commit details).
install_info()

{'pkg_dir': PosixPath('/home/jared/git/gambit/gambit/gambit'),
 'repo_dir': PosixPath('/home/jared/git/gambit/gambit'),
 'commit': {'hash': 'c4d923b324948c5a4fea7ca3a8dc5099c77798c8',
  'author': 'Jared Lumpe <mjlumpe@gmail.com>',
  'author_date': '2021-07-28T22:00:47-06:00',
  'commit': 'Jared Lumpe <mjlumpe@gmail.com>',
  'commit_date': '2021-07-28T22:00:47-06:00',
  'subject': 'Edits to consensus_taxon() implementation, docstring, and test'},
 'status': 'Git info retrieved successfully.'}

## Setup

In [5]:
# Date stamp shared by this notebook's name and its output file names.
DATESTR = '210730'
# Notebook identifier; used as the sub-directory name for outputs.
NBNAME = f'{DATESTR}-validation'

In [6]:
# Input locations for this run.
# NOTE(review): these are absolute paths on the author's machine; they must be
# adjusted before the notebook can be re-run elsewhere.
infiles = {
    # Reference database directory.
    'db': Path('/home/jared/projects/gambit/data/databases/refseq-curated/1.0-beta1/'),
    # Validation set directory (holds files.csv and the fasta/ sub-directory).
    'validation': Path('/home/jared/projects/gambit/data/validation/200726-gold-standard-seqs/'),
}

In [7]:
# Directory for intermediate artifacts written by this notebook.
intermediate_out = Path('data-intermediate') / NBNAME
# parents=True: also create 'data-intermediate' on a fresh checkout (a bare
# mkdir() raises FileNotFoundError when the parent is missing).
# exist_ok=True: re-running the cell is a no-op, without a check-then-create race.
intermediate_out.mkdir(parents=True, exist_ok=True)

In [8]:
# Directory for the final CSV outputs written by this notebook.
processed_out = Path('data-processed') / NBNAME
# parents=True: also create 'data-processed' on a fresh checkout (a bare
# mkdir() raises FileNotFoundError when the parent is missing).
# exist_ok=True: re-running the cell is a no-op, without a check-then-create race.
processed_out.mkdir(parents=True, exist_ok=True)

## Load data

### Database

In [9]:
# Load the reference database from the configured directory. Later cells use
# its `genomes`, `signatures`, `sig_indices` and `session` attributes.
db = load_database_from_dir(infiles['db'])

### Query files

In [10]:
# Table describing the query genomes. The `name` column is used below to locate
# each FASTA file, and `set` is carried through to the CSV outputs.
query_df = pd.read_csv(infiles['validation'] / 'files.csv')

In [11]:
# One FASTA file per row of query_df, located at fasta/<name>.fasta inside the
# validation directory. The order follows query_df, so list positions line up
# with query_df rows.
fasta_dir = infiles['validation'] / 'fasta'
query_files = SequenceFile.from_paths(
    [fasta_dir / (name + '.fasta') for name in query_df['name']],
    'fasta',
)

In [12]:
# Number of query genomes; used below to iterate over the rows of the distance matrix.
nqueries = len(query_files)
nqueries

98

## Run query

In [13]:
# Compute signatures for the query files using the database's own k-mer spec
# (presumably so that query and reference signatures are comparable).
# Took roughly a minute for the 98 files in the recorded run.
query_sigs = find_kmers_in_files(db.signatures.kmerspec, query_files, progress=True)

100%|██████████| 98/98 [01:06<00:00,  1.46it/s]


In [14]:
# Pairwise query-vs-reference matrix: later cells index it as
# dmat[query_row, reference_index], with reference indices matching db.genomes.
# distance=True presumably requests Jaccard distances rather than similarities.
dmat = jaccard_sparse_matrix(
    query_sigs,
    db.signatures,
    ref_indices=db.sig_indices,
    distance=True,
    chunksize=1000,
    progress=True,
)

4802000it [00:48, 99240.49it/s]                              


In [15]:
# Classify every query from its row of the distance matrix (strict mode).
# results[k] corresponds to row k of query_df.
results = [
    classify(db.genomes, dmat[query_idx, :], strict=True)
    for query_idx in tqdm(range(nqueries))
]

100%|██████████| 98/98 [00:27<00:00,  3.60it/s]


## Analysis

### Results with warnings/errors

In [16]:
# Positions of the queries for which the classifier reported no predicted taxon.
no_prediction = [
    idx
    for idx, result in enumerate(results)
    if result.predicted_taxon is None
]
no_prediction

[6, 39, 42, 48, 56, 58, 80]

In [17]:
# Positions of the queries whose result carries at least one warning or an error.
has_problem = [
    idx
    for idx, result in enumerate(results)
    if result.warnings or result.error
]
has_problem

[6, 56, 80]

In [18]:
# Print the warnings and the error recorded for each problematic query.
for query_idx in has_problem:
    result = results[query_idx]
    sample_name = query_df.loc[query_idx, 'name']

    print()
    print(query_idx, sample_name)

    print('\twarnings:')
    for message in result.warnings:
        print('\t\t' + message)
    print('\terror:', result.error)


6 17AC0001409_S8
		Query matched 3 inconsistent taxa: 115:Neisseria, 1412:Streptococcus dysgalactiae, 323:Streptococcus. Reporting lowest common ancestor of this set.
	error: Matched taxa have no common ancestor.

56 19AC0011209-1_S9
		Query matched 3 inconsistent taxa: 319:Burkholderia, 1386:Burkholderia cenocepacia, 323:Streptococcus. Reporting lowest common ancestor of this set.
	error: Matched taxa have no common ancestor.

80 16AC1611140BCAP_S11
		Query matched 3 inconsistent taxa: 1527:Cutibacterium acnes, 353:Cutibacterium, 317:Staphylococcus. Reporting lowest common ancestor of this set.
	error: Matched taxa have no common ancestor.


### Alternate predictions

In [19]:
# Hand-picked alternate taxon for each query that failed with "no common
# ancestor". Keys are query row positions (the entries of has_problem); values
# are the taxon keys printed in the warnings above.
alt_taxids = {
    6: 115,   # Neisseria
    56: 323,  # Streptococcus
    80: 317,  # Staphylococcus
}

In [20]:
# For each manually chosen alternate taxon, find the reference genome matched
# to that taxon with the smallest distance to the query, and record it as a
# GenomeMatch keyed by query row.
alt_matches = {}

for query_row, taxon_key in alt_taxids.items():
    # NOTE(review): Query.get() is a legacy API in SQLAlchemy 1.4+; confirm the
    # installed version before switching to Session.get().
    alt_taxon = db.session.query(Taxon).get(taxon_key)

    row_dists = dmat[query_row, :]
    # find_matches() result is indexed by taxon; the value is used below as a
    # collection of reference indices into db.genomes / the dmat columns.
    candidates = find_matches(zip(db.genomes, row_dists))[alt_taxon]

    # Reference index of the closest genome among the candidates.
    best_ref = candidates[np.argmin(row_dists[candidates])]

    alt_matches[query_row] = GenomeMatch(
        genome=db.genomes[best_ref],
        distance=row_dists[best_ref],
        matched_taxon=alt_taxon,
    )

In [21]:
# Inspect the alternate match chosen for each problematic query.
alt_matches

{6: GenomeMatch(genome=<AnnotatedGenome:1:1252 'gambit/refseq-curated'/'refseq/assembly/GCF_000763635.1'>, distance=0.7966102, matched_taxon=<Taxon:115 'Neisseria'>),
 56: GenomeMatch(genome=<AnnotatedGenome:1:4289 'gambit/refseq-curated'/'refseq/assembly/GCF_001069445.1'>, distance=0.7654679, matched_taxon=<Taxon:323 'Streptococcus'>),
 80: GenomeMatch(genome=<AnnotatedGenome:1:2209 'gambit/refseq-curated'/'refseq/assembly/GCF_000025085.1'>, distance=0.7412573, matched_taxon=<Taxon:317 'Staphylococcus'>)}

## Output

### JSON

In [22]:
# One record per query: its name, the full classifier result, and the manually
# selected alternate match (None for queries without one).
json_data = [
    dict(
        query=query_row.name,
        classifier_result=result,
        alt_match=alt_matches.get(query_row.Index),
    )
    for query_row, result in zip(query_df.itertuples(), results)
]

In [23]:
# Exporter used below to write the assembled result records to a JSON file.
exporter = JSONResultsExporter()

In [24]:
# Write all result records to the intermediate output directory.
json_path = intermediate_out / 'result-data.json'
with json_path.open('wt') as json_file:
    exporter.export(json_file, json_data)

### CSV

In [25]:
def int_or_none(values):
    """Convert values to Python ints, mapping missing values to ``None``.

    Parameters
    ----------
    values
        Iterable of scalars. Entries for which ``pd.isnull`` is true become
        ``None``; every other entry is passed through ``int()``.

    Returns
    -------
    pd.Series
        Object-dtype series with a default 0..n-1 index. The object dtype keeps
        the ints from being upcast to float when ``None`` values are present.
    """
    values2 = [None if pd.isnull(v) else int(v) for v in values]
    return pd.Series(values2, dtype=object)

def set_int_or_none(df, column):
    """Replace ``df[column]`` in place with its ``int_or_none`` conversion.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to modify.
    column
        Label of the column to convert.
    """
    # Assign positionally. int_or_none() returns a fresh 0..n-1 index, and
    # assigning a Series aligns on index labels, which would silently write NaN
    # for any frame whose index is not the default RangeIndex.
    df[column] = int_or_none(df[column]).to_numpy()

In [26]:
# Column layout of the primary-match table; the order must match the tuples
# produced by _primary_row().
main_cols = [
    'set',
    'name',
    'predicted.name',
    'predicted.ncbi_id',
    'predicted.rank',
    'predicted.threshold',
    'closest_genome.distance',
    'closest_genome.description',
    'closest_genome.taxon.ncbi_id',
    'closest_genome.taxon.name',
    'closest_genome.taxon.threshold',
]


def _primary_row(query_row, result):
    """Build one primary-match record for a query and its classifier result.

    The "predicted" fields come from the taxon matched by the closest reference
    genome (``closest_match.matched_taxon``) and are all ``None`` when there is
    no such taxon.
    NOTE(review): this is not ``result.predicted_taxon`` (used in the warnings
    analysis), so the two can differ for queries with errors; confirm this is
    intended.
    """
    match = result.closest_match
    taxon = match.matched_taxon

    if taxon is None:
        predicted_fields = (None, None, None, None)
    else:
        predicted_fields = (
            taxon.name,
            taxon.ncbi_id,
            taxon.rank,
            taxon.distance_threshold,
        )

    genome = match.genome
    genome_taxon = genome.taxon

    return (
        query_row.set,
        query_row.name,
        *predicted_fields,
        match.distance,
        genome.description,
        genome_taxon.ncbi_id,
        genome_taxon.name,
        genome_taxon.distance_threshold,
    )


main_df = pd.DataFrame.from_records(
    [_primary_row(q, r) for q, r in zip(query_df.itertuples(), results)],
    columns=main_cols,
)

# Store NCBI IDs as Python ints (or None) instead of floats.
for id_col in ('predicted.ncbi_id', 'closest_genome.taxon.ncbi_id'):
    set_int_or_none(main_df, id_col)

In [27]:
# Table of the manually selected alternate matches, one row per entry of
# alt_matches. The classifier result for the query is not needed here: every
# field comes from the alternate GenomeMatch.
_rows = []

for i, alt_match in alt_matches.items():
    # `i` is the query's row label in query_df (alt_matches is keyed like alt_taxids).
    query = query_df.loc[i]
    alt_taxon = alt_match.matched_taxon
    genome_taxon = alt_match.genome.taxon

    row = (
        query['set'],
        query['name'],
        alt_taxon.name,
        alt_taxon.ncbi_id,
        alt_taxon.rank,
        alt_taxon.distance_threshold,
        alt_match.distance,
        alt_match.genome.description,
        genome_taxon.name,
        genome_taxon.ncbi_id,
        genome_taxon.distance_threshold,
    )

    _rows.append(row)

# Column order must match the tuples built above.
cols = [
    'set',
    'name',
    'alt_prediction.name',
    'alt_prediction.ncbi_id',
    'alt_prediction.rank',
    'alt_prediction.threshold',
    'alt_genome.distance',
    'alt_genome.description',
    'alt_genome.taxon.name',
    'alt_genome.taxon.ncbi_id',
    'alt_genome.taxon.threshold',
]

alt_df = pd.DataFrame.from_records(_rows, columns=cols)

# Store NCBI IDs as Python ints (or None) instead of floats.
set_int_or_none(alt_df, 'alt_prediction.ncbi_id')
set_int_or_none(alt_df, 'alt_genome.taxon.ncbi_id')

In [28]:
# Column layout of the unmatched-query table; the order must match the tuples
# yielded by _unmatched_rows().
missing_cols = [
    'set',
    'name',
    'closest_genome.distance',
    'closest_genome.description',
    'closest_genome.taxon.ncbi_id',
    'closest_genome.taxon.name',
    'closest_genome.taxon.threshold',
    'closest_genome.genus.ncbi_id',
    'closest_genome.genus.name',
    'closest_genome.genus.threshold',
]


def _unmatched_rows(queries, classifier_results):
    """Yield one record per query whose closest match has no matched taxon.

    Each record describes the closest reference genome, its taxon, and the root
    of that taxon's lineage, which is asserted to be of genus rank.
    """
    for query_row, result in zip(queries.itertuples(), classifier_results):
        match = result.closest_match
        if match.matched_taxon is None:
            genome = match.genome
            taxon = genome.taxon

            genus = taxon.root()
            assert genus.rank == 'genus'

            yield (
                query_row.set,
                query_row.name,
                match.distance,
                genome.description,
                taxon.ncbi_id,
                taxon.name,
                taxon.distance_threshold,
                genus.ncbi_id,
                genus.name,
                genus.distance_threshold,
            )


missing_df = pd.DataFrame.from_records(
    list(_unmatched_rows(query_df, results)),
    columns=missing_cols,
)

# Store NCBI IDs as Python ints (or None) instead of floats.
for id_col in ('closest_genome.taxon.ncbi_id', 'closest_genome.genus.ncbi_id'):
    set_int_or_none(missing_df, id_col)

In [29]:
# Write the three result tables to the processed-output directory, without the
# DataFrame index.
csv_outputs = {
    f'{DATESTR}-1.0b-validation-primary-matches.csv': main_df,
    f'{DATESTR}-1.0b-validation-alternate-matches.csv': alt_df,
    f'{DATESTR}-1.0b-validation-unmatched-genus-info.csv': missing_df,
}

for csv_name, frame in csv_outputs.items():
    frame.to_csv(processed_out / csv_name, index=False)