# 210902 Mash distances

In [1]:
from pathlib import Path
import json
from subprocess import run, PIPE
import sys

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import h5py as h5

## Setup

In [3]:
# Date prefix identifying this notebook.
DATESTR = '210902'
# Full notebook name; used below to name this notebook's output directory.
NBNAME = DATESTR + '-mash-distances'

In [4]:
# Absolute path to the mash executable inside a conda environment.
# NOTE(review): machine-specific path; must be adjusted to run elsewhere.
MASH_CMD = '/home/jared/opt/anaconda3/envs/gambit-bioconda/bin/mash'

### Paths

In [5]:
# Input files read by this notebook, keyed by a short name.
# `genomes` is a CSV table of genomes (presumably written by the
# 210902-get-genomes notebook -- confirm).
infiles = dict(
    genomes=Path('data-intermediate/210902-get-genomes/genomes.csv'),
)

In [6]:
# Scratch directory (relative to the working directory); the genome FASTA
# files and all Mash outputs live beneath it.
tmpdir = Path('tmp')

In [7]:
# Working directory for everything Mash-related.
# parents=True so a missing scratch directory is created instead of raising
# FileNotFoundError on a fresh checkout.
mash_dir = tmpdir / 'mash'
mash_dir.mkdir(parents=True, exist_ok=True)

# One distance-matrix file per (sketch size, k) combination is written here.
dists_dir = mash_dir / 'distances'
dists_dir.mkdir(parents=True, exist_ok=True)

In [8]:
# Text file listing one genome file path per line; handed to mash via `-l`.
INPUTS_FILE = mash_dir / 'files.txt'

In [9]:
# Output directory for this notebook's results, named after the notebook.
# Without parents=True this requires `data-intermediate/` to exist already.
intermediate_out = Path('data-intermediate') / NBNAME
intermediate_out.mkdir(exist_ok=True)

## Load data

In [10]:
# Genome table, indexed by its `index` column.
# NOTE(review): later cells look rows up as `genomes.loc[j + 1, ...]`, so the
# index is presumably the consecutive integers 1..N in file order -- confirm.
genomes = pd.read_csv(infiles['genomes'], index_col='index')

In [11]:
# Number of genomes = number of rows in the genome table.
ngenomes = len(genomes)

In [12]:
# Path of the gzipped FASTA file for each genome, in genome-table order.
genome_files = [
    tmpdir / 'genomes' / (accession + '.fa.gz')
    for accession in genomes['assembly_accession']
]

## Parameters

In [13]:
# Sketch sizes to evaluate (passed to mash as `-s`).
SKETCH_SIZES = [500, 1000, 5_000, 10_000]
# k-mer lengths to evaluate (passed to mash as `-k`): odd values 11, 13, ..., 31.
K_VALS = range(11, 32, 2)

In [14]:
# Every (sketch_size, k) combination, ordered by sketch size and then by k.
# This order also fixes the row order of the distance array built later.
PARAM_VALS = [(s, k) for s in SKETCH_SIZES for k in K_VALS]

## Calculate distances

In [15]:
# Write the genome file list consumed by mash: one path per line, in
# genome-table order.
with open(INPUTS_FILE, 'w') as listing:
    listing.writelines(f'{genome_path}\n' for genome_path in genome_files)

In [16]:
# Output file for each (sketch_size, k) combination, e.g. `s500-k11`,
# inside the distances directory.
dist_files = {(s, k): dists_dir / f's{s}-k{k}' for s, k in PARAM_VALS}

In [17]:
# Run `mash triangle` once per (sketch size, k) combination. Results are cached
# on disk, so re-running this cell only computes the combinations still missing.
for (s, k), file in tqdm(dist_files.items()):
    # An output file is only moved into place after mash succeeded (see below),
    # so an existing file means this combination is already complete.
    if file.is_file():
        continue

    cmd = [
        MASH_CMD,
        'triangle',
        '-l', INPUTS_FILE,  # read the list of input genomes from this file
        '-s', s,
        '-k', k,
        '-p', 6,
    ]
    cmd = list(map(str, cmd))

    # Write to a temporary sibling first: if the run fails or is interrupted
    # (e.g. Ctrl-C), no truncated file is left under the final name where the
    # cache check above would mistake it for a finished result.
    partial = file.parent / (file.name + '.partial')

    try:
        with open(partial, 'w') as out:
            result = run(cmd, stdout=out, stderr=PIPE)

        if result.returncode != 0:
            # Show mash's own diagnostics before raising. Decode leniently so
            # that unexpected bytes cannot mask the real error.
            print(result.stderr.decode('utf-8', errors='replace'))
            result.check_returncode()  # raises CalledProcessError

        # Keep mash's stderr next to the result for later inspection.
        with open(file.parent / (file.name + '.stderr'), 'wb') as f:
            f.write(result.stderr)

        partial.replace(file)
    finally:
        # No-op after a successful replace; removes leftovers otherwise.
        partial.unlink(missing_ok=True)

100%|██████████| 44/44 [00:00<00:00, 27004.59it/s]


## Assemble data

In [18]:
def parse_phylip_matrix(f):
    """Lazily parse a lower-triangular, tab-separated distance matrix.

    The first line holds the number of rows. Every following non-blank line
    is ``<id>\\t<v_0>\\t...\\t<v_{i-1}>``: the row with 0-based position ``i``
    carries exactly ``i`` distance values (the first row carries none).

    Parameters
    ----------
    f
        Open text-mode file object positioned at the start of the matrix.

    Yields
    ------
    tuple of (str, list of float)
        Row identifier and that row's distance values, in file order.

    Raises
    ------
    ValueError
        If the header is not an integer, a value is not a float, a row has
        the wrong number of values, or the number of rows does not match the
        header. Because this is a generator, errors surface during iteration.
    """
    header = f.readline()
    n = int(header.strip())

    nrows = 0

    # Iterate the file object directly rather than materialising every line.
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        id_, *fields = line.split('\t')
        if len(fields) != nrows:
            raise ValueError(
                f'Row {nrows} ({id_!r}) has {len(fields)} values, expected {nrows}'
            )

        yield id_, [float(field) for field in fields]
        nrows += 1

    # A truncated or over-long file would otherwise go unnoticed here.
    if nrows != n:
        raise ValueError(f'Header declares {n} rows but {nrows} were found')

In [19]:
# Number of unordered genome pairs (i != j): n choose 2. This is the number of
# entries in the lower triangle of the distance matrix, excluding the diagonal.
npairs = ngenomes * (ngenomes - 1) // 2

In [20]:
# 1-based (row, column) labels of every lower-triangle entry, enumerated row by
# row with columns ascending -- the same order in which the distance files list
# their values.
pair_labels = [
    (row + 1, col + 1)
    for row in range(ngenomes)
    for col in range(row)
]
index1 = [row_label for row_label, _ in pair_labels]
index2 = [col_label for _, col_label in pair_labels]

assert len(index1) == len(index2) == npairs

In [21]:
# One row per parameter combination (in PARAM_VALS order), one column per
# genome pair (in lower-triangle, row-major order).
dists = np.empty((len(PARAM_VALS), npairs), dtype=np.float32)

for row, params in enumerate(tqdm(PARAM_VALS)):
    with open(dist_files[params]) as matrix_file:
        # Position in the flattened triangle where the next matrix row begins.
        offset = 0

        for rank, (label, values) in enumerate(parse_phylip_matrix(matrix_file)):
            # Sanity check: matrix rows must follow the genome table's order.
            assert genomes.loc[rank + 1, 'assembly_accession'] in label

            # Matrix row `rank` contributes `rank` values (the first has none).
            end = offset + rank
            if rank > 0:
                dists[row, offset:end] = values

            offset = end

        # Every slot of this parameter combination's row must have been filled.
        assert offset == npairs

100%|██████████| 44/44 [00:00<00:00, 46.88it/s]


In [22]:
# Column label for each parameter combination, e.g. (500, 11) -> 's500-k11'.
params_to_col = {}
for s, k in PARAM_VALS:
    params_to_col[(s, k)] = f's{s}-k{k}'

# Transposed view of the distances: one row per genome pair, one labelled
# column per parameter combination.
dists_df = pd.DataFrame(
    dists.T,
    columns=[params_to_col[params] for params in PARAM_VALS],
)

## Copy data to HDF5 file

In [23]:
# Persist the results in one HDF5 file:
#   k, sketch_size -- parameters of each row of `distances`
#   index1, index2 -- 1-based genome labels of each column of `distances`
#   distances      -- float32 array, (parameter combinations) x (genome pairs)
with h5.File(str(intermediate_out / 'mash-distances.h5'), 'w') as h5file:
    sketch_sizes = [sketch_size for sketch_size, _ in PARAM_VALS]
    k_values = [k_value for _, k_value in PARAM_VALS]

    integer_datasets = [
        ('k', k_values),
        ('sketch_size', sketch_sizes),
        ('index1', index1),
        ('index2', index2),
    ]
    for dataset_name, values in integer_datasets:
        h5file.create_dataset(dataset_name, dtype=int, data=values)

    h5file.create_dataset('distances', dtype=np.float32, data=dists)