# 211012 Gambit

In [1]:
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd
import h5py as h5
from tqdm import tqdm

In [3]:
from gambit.kmers import KmerSpec
from gambit.signatures.calc import calc_file_signatures
from gambit.io.seq import SequenceFile
from gambit.signatures import SignatureArray
from gambit.signatures.hdf5 import HDF5Signatures
from gambit.metric import jaccarddist_matrix

## Setup

In [4]:
# Date prefix shared by this notebook's name and its output directory.
DATESTR = '211012'
# Notebook identifier; used below to name the intermediate output directory.
NBNAME = f'{DATESTR}-gambit'

In [5]:
# Scratch area for large, regenerable files.
tmpdir = Path('tmp')

# Input genome FASTA files are read from here (see the "Calculate signatures" section).
genomes_dir = tmpdir / 'genomes'

# Signature files computed by this notebook are written under here.
gambit_dir = tmpdir / 'gambit'
# parents=True so a fresh checkout without tmp/ does not fail with FileNotFoundError.
gambit_dir.mkdir(parents=True, exist_ok=True)

In [6]:
# Input files consumed by this notebook, keyed by role.
infiles = {
    # Table of genomes to process; loaded in the "Load data" section.
    'genomes': Path('data-processed/211011-get-genomes-list/211011-gambit-ani-additional-genomes.csv'),
    # Table of k-mer parameter combinations; loaded in the "Load data" section.
    'params': Path('../../data/intermediate/210902-mash-Escherichia-genomes/210917-gambit/params.csv'),
}

In [7]:
# Per-notebook output directory for the pairwise-distance results.
intermediate_out = Path('data-intermediate') / NBNAME
# parents=True so the notebook also runs when data-intermediate/ does not exist yet.
intermediate_out.mkdir(parents=True, exist_ok=True)

## Load data

In [8]:
# Genome table; the first two CSV columns form a two-level row index.
genomes_df = pd.read_csv(infiles['genomes'], index_col=[0, 1])

# Values of the outer index level. Later cells use these as dataset keys
# (`genomes_df.loc[ds]`) and as directory / output file names.
dsets = genomes_df.index.levels[0]

In [9]:
# Parameter table, indexed by (prefix_len, prefix_version, k).
params_df = pd.read_csv(infiles['params'], index_col=['prefix_len', 'prefix_version', 'k'])

# Build one KmerSpec per parameter row. `k` lives in the index, so reset_index()
# exposes it as a regular column next to `prefix`. itertuples() is used instead
# of iterrows() because it keeps each column's dtype and avoids constructing a
# Series per row.
params_df['kspec'] = [
    KmerSpec(row.k, row.prefix)
    for row in params_df.reset_index().itertuples(index=False)
]

# Number of parameter combinations; sizes the per-parameter result arrays later on.
nparams = params_df.shape[0]

## Calculate signatures

In [10]:
# For each dataset, the result of SequenceFile.from_paths() over that dataset's
# gzip-compressed FASTA files, located by assembly accession under genomes_dir.
genome_files = {}
for ds in dsets:
    accessions = genomes_df.loc[ds, 'assembly_accession']
    fasta_paths = (genomes_dir / f'{acc}.fasta.gz' for acc in accessions)
    genome_files[ds] = SequenceFile.from_paths(fasta_paths, 'fasta', 'gzip')

In [11]:
# One signature directory per dataset under gambit_dir, created if missing.
signature_dirs = {}
for ds in dsets:
    sd = gambit_dir / ds
    sd.mkdir(exist_ok=True)
    signature_dirs[ds] = sd

In [12]:
# signature_files[dataset][kspec] -> path of the HDF5 signature file for that
# dataset and k-mer spec, named "<k>-<prefix>.h5" inside the dataset's directory.
signature_files = {}
for ds, sd in signature_dirs.items():
    signature_files[ds] = {
        kspec: sd / f'{kspec.k}-{kspec.prefix_str}.h5'
        for kspec in params_df['kspec']
    }

In [13]:
# Compute and cache signatures for every (dataset, kspec) combination.
# An existing output file is treated as "already done" and skipped, so the cell
# can be re-run cheaply after an interruption.
for ds in dsets:
    for kspec in tqdm(params_df['kspec'], desc=ds):
        file = signature_files[ds][kspec]
        if file.is_file():
            continue

        # NOTE: the worker count is hard-coded; adjust for the machine at hand.
        sigs = calc_file_signatures(kspec, genome_files[ds], concurrency='processes', max_workers=12)

        # Write to a sibling ".partial" file and rename it into place only after
        # the HDF5 file has been closed. Writing directly to `file` would leave a
        # truncated file behind if the run is interrupted, and the is_file() check
        # above would then skip it forever.
        partial = file.with_name(file.name + '.partial')
        with h5.File(partial, 'w') as f:
            HDF5Signatures.create(f, sigs, genomes_df.loc[ds, 'assembly_accession'])
        partial.replace(file)

konstantinidis_2005: 100%|██████████| 192/192 [00:00<00:00, 201831.17it/s]
snitkin_2012: 100%|██████████| 192/192 [00:00<00:00, 170219.06it/s]


## Pairwise distances

In [15]:
# For each dataset, compute Jaccard distances between all genome pairs under
# every parameter combination and store them, with per-genome signature sizes,
# in one HDF5 file per dataset.
for ds in dsets:
    ngenomes = len(genomes_df.loc[ds])

    # Index pairs of the lower triangle (main diagonal included); these select
    # the entries of the square distance matrix that get stored.
    gi1, gi2 = np.tril_indices(ngenomes)
    npairs = gi1.size

    # Scratch square matrix, reused for every kspec, plus the result arrays
    # with one row per parameter combination.
    dists_square = np.empty((ngenomes, ngenomes), dtype=np.float32)
    dists_flat = np.empty((nparams, npairs), dtype=np.float32)
    kmer_counts = np.empty((nparams, ngenomes), dtype=int)

    for i, kspec in enumerate(tqdm(params_df['kspec'], desc=ds)):
        # Read all signatures while the file is open.
        with HDF5Signatures.open(signature_files[ds][kspec]) as h5sigs:
            sigs = h5sigs[:]

        jaccarddist_matrix(sigs, sigs, out=dists_square)
        dists_flat[i] = dists_square[gi1, gi2]

        kmer_counts[i] = sigs.sizes()

    outfile = intermediate_out / f'{ds}.h5'
    with h5.File(outfile, 'w') as f:
        for dset_name, dset_values in (
            ('genome1', gi1),
            ('genome2', gi2),
            ('pw_dists', dists_flat),
            ('kmer_counts', kmer_counts),
        ):
            f.create_dataset(dset_name, data=dset_values)

konstantinidis_2005: 100%|██████████| 192/192 [00:12<00:00, 15.55it/s]
snitkin_2012: 100%|██████████| 192/192 [00:01<00:00, 113.23it/s]
