# 211012 FastANI

In [1]:
import json
import os
from pathlib import Path
from subprocess import run, PIPE

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import h5py as h5

## Setup

In [3]:
# Date stamp that prefixes this notebook's name.
DATESTR = '211012'
# Notebook name; also used as the name of this notebook's output directory.
NBNAME = f'{DATESTR}-fastani'

In [4]:
# Path to the FastANI executable. The default is specific to the original author's machine;
# set the FASTANI_CMD environment variable to point at a different installation without
# editing the notebook.
FASTANI_CMD = os.environ.get('FASTANI_CMD', '/home/jared/opt/anaconda3/envs/fastani/bin/fastANI')

In [5]:
# Number of threads passed to FastANI through its -t option.
NTHREADS = 12

### Paths

In [6]:
# Input files read by this notebook, keyed by a short name.
infiles = {
    'genomes': Path('data-processed/211011-get-genomes-list/211011-gambit-ani-additional-genomes.csv'),
}

In [7]:
# Scratch directory holding the genome files and the raw FastANI inputs/outputs.
tmpdir = Path('tmp')

# Genome FASTA files are looked up here; nothing in the cells shown populates it,
# so the files are presumably downloaded by another notebook.
genomes_dir = tmpdir / 'genomes'

# FastANI list files and result tables are written here.
# parents=True so that a missing tmp/ directory does not raise FileNotFoundError.
fastani_dir = tmpdir / 'fastani'
fastani_dir.mkdir(parents=True, exist_ok=True)

In [8]:
# Output directory for the HDF5 files produced by this notebook.
# parents=True so that a missing data-intermediate/ directory does not raise FileNotFoundError.
intermediate_out = Path('data-intermediate') / NBNAME
intermediate_out.mkdir(parents=True, exist_ok=True)

## Load data

In [9]:
# Genome table; the first two CSV columns together form the row index.
genomes_df = pd.read_csv(
    infiles['genomes'],
    index_col=[0, 1],
)

# Total number of genomes (rows) across all data sets.
ngenomes = len(genomes_df)

In [10]:
# Path of each genome's gzipped FASTA file, derived from its assembly accession.
genomes_df['file'] = [
    genomes_dir.joinpath(accession + '.fasta.gz')
    for accession in genomes_df['assembly_accession']
]

In [11]:
# Values of the first index level; the cells below use them as the data set keys.
data_sets = genomes_df.index.levels[0]

## Calculate pairwise ANIs

In [12]:
# For each data set, write a text file listing its genome files, one path per line.
# The same list is later passed to FastANI as both the query and the reference list.
list_files = dict()

for ds, chunk in genomes_df.groupby('data_set'):
    list_path = fastani_dir / f'{ds}.txt'
    list_files[ds] = list_path
    with open(list_path, 'w') as out:
        out.writelines(f'{genome_file}\n' for genome_file in chunk['file'])

In [13]:
# Path of the FastANI result table for each data set.
result_files = dict()
for ds in data_sets:
    result_files[ds] = fastani_dir / f'{ds}.tsv'

In [14]:
# Run FastANI all-vs-all within each data set. A data set whose result file already exists
# is skipped, so re-running the notebook does not repeat finished work.
for ds, result_file in tqdm(result_files.items()):
    if result_file.is_file():
        continue

    list_file = list_files[ds]

    # FastANI writes to a scratch path that is renamed only after a successful run. This way an
    # interrupted or failed run never leaves a truncated file at result_file, which the
    # is_file() check above would otherwise mistake for a finished result.
    partial_file = result_file.with_name(result_file.name + '.partial')

    cmd = [
        FASTANI_CMD,
        '-t', NTHREADS,
        '--rl', list_file,
        '--ql', list_file,
        '-o', partial_file,
    ]
    cmd = list(map(str, cmd))

    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE)

        if result.returncode != 0:
            # Show FastANI's own error output before raising. errors='replace' keeps a stray
            # non-ASCII byte from raising UnicodeDecodeError and hiding the real failure.
            print(result.stderr.decode('utf-8', errors='replace'))
            result.check_returncode()  # raises CalledProcessError

        partial_file.replace(result_file)
    finally:
        # No-op after a successful rename; removes the scratch file on any failure or interrupt.
        partial_file.unlink(missing_ok=True)

100%|██████████| 2/2 [00:00<00:00, 5526.09it/s]


## Assemble data

### Read result matrices

In [15]:
# For each data set, map a genome file path (as a string, the form it takes in the FastANI
# output) to the genome's second-level index value.
file_to_index = dict()

for ds, chunk in genomes_df.groupby('data_set'):
    file_to_index[ds] = {
        str(genome_file): genome_index
        for (_, genome_index), genome_file in chunk['file'].items()
    }

In [16]:
# For each data set, turn the FastANI result table into square (query x reference) matrices.
# 'reported' marks the pairs that appear in the table; every other matrix stays 0 for pairs
# that FastANI did not report.
stats_matrix = dict()

for ds in data_sets:
    index_of = file_to_index[ds]
    n = len(index_of)
    table = pd.read_csv(
        result_files[ds],
        sep='\t',
        names=['query', 'reference', 'ani', 'mapped', 'query_fragments'],
    )

    seen = np.zeros((n, n), dtype=bool)
    values = {
        'ani': np.zeros((n, n)),
        'mapped': np.zeros((n, n), dtype=int),
        'query_fragments': np.zeros((n, n), dtype=int),
    }

    for rec in table.itertuples(index=False):
        cell = (index_of[rec.query], index_of[rec.reference])
        # Each (query, reference) pair must appear at most once in the table.
        assert not seen[cell]
        seen[cell] = True

        for name, matrix in values.items():
            matrix[cell] = getattr(rec, name)

    stats_matrix[ds] = {'reported': seen, **values}

In [19]:
# Fraction of (query, reference) cells that FastANI reported, per data set.
{
    name: stats['reported'].mean()
    for name, stats in stats_matrix.items()
}

{'konstantinidis_2005': 0.06918367346938775, 'snitkin_2012': 1.0}

### Convert to pair format

In [17]:
# Flatten each data set's matrices into per-pair arrays over the lower triangle (diagonal
# included). For every statistic both directions are kept (genome1 as query / genome2 as
# reference, and the reverse) along with their mean; for 'reported' the combined value is the
# logical AND instead.
# NOTE: the means are computed for every pair, including pairs where one or both directions
# were not reported (and so contribute 0); use 'reported_both' to filter those out.
stats_pw = dict()

for ds in data_sets:
    matrices = stats_matrix[ds]

    n = matrices['reported'].shape[0]
    rows, cols = np.tril_indices(n)

    pairs = dict(genome1=rows, genome2=cols)

    forward = matrices['reported'][rows, cols]
    reverse = matrices['reported'][cols, rows]
    pairs['reported_q1r2'] = forward
    pairs['reported_q2r1'] = reverse
    pairs['reported_both'] = forward & reverse

    for key in ('ani', 'mapped', 'query_fragments'):
        forward = matrices[key][rows, cols]
        reverse = matrices[key][cols, rows]
        pairs[f'{key}_q1r2'] = forward
        pairs[f'{key}_q2r1'] = reverse
        pairs[f'{key}_mean'] = (forward + reverse) / 2

    stats_pw[ds] = pairs

## Write to HDF5 files

In [18]:
# Write one HDF5 file per data set: group 'matrix' holds the square matrices and group 'pw'
# holds the per-pair arrays, with one dataset per key.
for ds in data_sets:
    out_path = intermediate_out / f'{ds}.h5'
    with h5.File(str(out_path), 'w') as h5file:
        for group_name, named_arrays in (('matrix', stats_matrix[ds]), ('pw', stats_pw[ds])):
            group = h5file.create_group(group_name)
            for key, array in named_arrays.items():
                group.create_dataset(key, data=array)