# 211015 FastANI Gold Standard Genome

In [1]:
from pathlib import Path
import json
from subprocess import run, PIPE

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import h5py as h5

## Setup

In [3]:
# Date stamp of this notebook; also the prefix of its name.
DATESTR = '211015'
# Notebook name, used to name this notebook's intermediate output directory.
NBNAME = f'{DATESTR}-fastani-gsg'

In [4]:
# Absolute path of the fastANI executable (inside a conda environment);
# used as the first element of the subprocess command line below.
FASTANI_CMD = '/home/jared/opt/anaconda3/envs/fastani/bin/fastANI'

In [5]:
# Thread count handed to fastANI through its '-t' option.
NTHREADS = 12

In [6]:
# Dataset identifier; names the fastANI working subdirectory and the
# list / result / HDF5 output files created below.
DATASET = '200726_gold_standard'

### Paths

In [7]:
# Input locations for this notebook.
infiles = {
    # CSV table of genomes; read below for its 'set' and 'name' columns.
    'genomes': Path('/home/jared/projects/gambit/data/validation/200726-gold-standard-seqs/files.csv'),
    # Directory holding one '<name>.fasta.gz' file per genome.
    'genomes_dir': Path('/home/jared/projects/gambit/data/validation/200726-gold-standard-seqs/fasta/'),
}

In [8]:
# Scratch area for temporary files, relative to the working directory.
tmpdir = Path('tmp')

# Working directory for fastANI inputs/outputs, one subdirectory per dataset.
fastani_dir = tmpdir / 'fastani' / DATASET
# parents=True: 'tmp' and 'tmp/fastani' may not exist yet (e.g. fresh
# checkout); without it mkdir() raises FileNotFoundError.
fastani_dir.mkdir(parents=True, exist_ok=True)

In [9]:
# Directory for this notebook's intermediate outputs.
intermediate_out = Path('data-intermediate') / NBNAME
# parents=True: create 'data-intermediate' as well if it is missing instead
# of failing with FileNotFoundError.
intermediate_out.mkdir(parents=True, exist_ok=True)

# HDF5 file the assembled fastANI statistics are written to at the end.
outfile = intermediate_out / f'{DATASET}.h5'

## Load data

In [10]:
# Full genome table from the input CSV.
genomes_all = pd.read_csv(infiles['genomes'])

# Keep only the names of genomes whose 'set' is 200726
# (this is what excludes the 200817 set).
genomes = genomes_all[genomes_all['set'] == 200726]['name']

# Number of genomes kept; displayed as the cell output.
ngenomes = genomes.shape[0]
ngenomes

80

In [11]:
# Path of the gzipped FASTA file for each selected genome, in the same order
# as `genomes`.
genome_files = [
    infiles['genomes_dir'].joinpath(f'{name}.fasta.gz')
    for name in genomes
]

## Calculate pairwise ANIs

In [12]:
# Text file listing one genome path per line; it is passed to fastANI below
# as both the reference list (--rl) and the query list (--ql).
list_file = fastani_dir / f'{DATASET}.txt'

with open(list_file, 'w') as f:
    f.writelines(f'{gf}\n' for gf in genome_files)

In [13]:
# Tab-separated fastANI output; its presence is used below to skip the run.
result_file = fastani_dir.joinpath(f'{DATASET}.tsv')

In [14]:
# Run fastANI all-vs-all over the genome list, unless a result file from a
# previous run already exists.
if not result_file.is_file():

    # subprocess needs string arguments; several of these are Path / int.
    cmd = [str(arg) for arg in (
        FASTANI_CMD,
        '-t', NTHREADS,
        '--rl', list_file,
        '--ql', list_file,
        '-o', result_file,
    )]

    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE)
    except BaseException:
        # Interrupted (or the executable could not be launched): remove any
        # partial output so the is_file() check above does not mistake it
        # for a finished run next time.
        result_file.unlink(missing_ok=True)
        raise

    if result.returncode != 0:
        # Same reasoning: never keep output from a failed run.
        result_file.unlink(missing_ok=True)
        # errors='replace' so undecodable bytes in stderr cannot raise and
        # hide the actual fastANI failure.
        print(result.stderr.decode(errors='replace'))
        # Raises subprocess.CalledProcessError for the non-zero exit status.
        result.check_returncode()

## Assemble data

### Read result matrices

In [15]:
# Map each genome file path (as a string, the form written to the list file)
# to its position in `genome_files`.
file_to_index = dict(zip(map(str, genome_files), range(len(genome_files))))

In [16]:
# fastANI output has no header row; name the five columns explicitly.
results = pd.read_csv(
    result_file,
    sep='\t',
    names=['query', 'reference', 'ani', 'mapped', 'query_fragments'],
)

# reported[q, r] is True when the output contains a row for query q against
# reference r; entries without a row stay False / zero in every matrix.
reported = np.zeros((ngenomes, ngenomes), dtype=bool)

# One (query x reference) matrix per numeric output column.
arrays = {
    'ani': np.zeros((ngenomes, ngenomes)),
    'mapped': np.zeros((ngenomes, ngenomes), dtype=int),
    'query_fragments': np.zeros((ngenomes, ngenomes), dtype=int),
}

for row in results.itertuples(index=False):
    q = file_to_index[row.query]
    r = file_to_index[row.reference]

    # Each ordered (query, reference) pair must appear at most once.
    assert not reported[q, r]
    reported[q, r] = True

    for k, a in arrays.items():
        a[q, r] = getattr(row, k)

stats_matrix = {'reported': reported, **arrays}

In [17]:
# Fraction of ordered (query, reference) pairs that have a row in the
# fastANI output.
reported.mean()

0.07921875

### Convert to pair format

In [18]:
# Index pairs of the lower triangle, diagonal included (genome1 >= genome2).
g1, g2 = np.tril_indices(ngenomes)

stats_pw = {'genome1': g1, 'genome2': g2}

for k in ('reported', 'ani', 'mapped', 'query_fragments'):
    a = stats_matrix[k]

    # Both directions of each pair: genome1 as query / genome2 as reference,
    # and the reverse.
    q1r2 = a[g1, g2]
    q2r1 = a[g2, g1]
    stats_pw[f'{k}_q1r2'] = q1r2
    stats_pw[f'{k}_q2r1'] = q2r1

    if k != 'reported':
        # Numeric statistics: average of the two directions.
        stats_pw[f'{k}_mean'] = (q1r2 + q2r1) / 2
    else:
        # Boolean flag: True only when both directions were reported.
        stats_pw['reported_both'] = q1r2 & q2r1

## Write to HDF5 files

In [19]:
# Write the full matrices to group 'matrix' and the per-pair arrays to group
# 'pw', one dataset per dictionary entry. Mode 'w' replaces an existing file.
with h5.File(outfile, 'w') as f:
    for group_name, datasets in (('matrix', stats_matrix), ('pw', stats_pw)):
        grp = f.create_group(group_name)
        for k, a in datasets.items():
            grp.create_dataset(k, data=a)