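# 210917 Gambit

Calculate GAMBIT k-mer signatures for the filtered genome set over a grid of `k` values and prefix sequences, then compute pairwise Jaccard distances between all genomes for each parameter combination.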

In [1]:
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd
import h5py as h5
from tqdm import tqdm

In [3]:
from gambit.kmers import KmerSpec
from gambit.search import calc_file_signatures
from gambit.io.seq import SequenceFile
from gambit.signatures import SignatureArray
from gambit.signatures.hdf5 import HDF5Signatures
from gambit.metric import jaccard_sparse_matrix
from gambit.test import random_seq

## Setup

In [4]:
# Date-stamped notebook identifier; NBNAME is used to name this notebook's output directory.
DATESTR = '210917'
NBNAME = f'{DATESTR}-gambit'

In [5]:
# Scratch area for large, regenerable files.
tmpdir = Path('tmp')

# parents=True: create any missing ancestor (including `tmp` itself) instead of
# raising FileNotFoundError on a fresh checkout; exist_ok=True keeps re-runs harmless.
gambit_dir = tmpdir / 'gambit'
gambit_dir.mkdir(parents=True, exist_ok=True)

# One HDF5 signature file per parameter combination is written here.
signatures_dir = gambit_dir / 'signatures'
signatures_dir.mkdir(parents=True, exist_ok=True)

In [6]:
# Input files consumed by this notebook, keyed by a short name.
infiles = {
    'genomes': Path('data-processed/210902-get-genomes/210902-mash-genomes-filtered.csv'),
}

In [7]:
# Output directory for this notebook's intermediate results.
intermediate_out = Path('data-intermediate') / NBNAME
# parents=True: also create 'data-intermediate' when it does not exist yet,
# instead of raising FileNotFoundError.
intermediate_out.mkdir(parents=True, exist_ok=True)

## Load data

In [8]:
# Genome table; the CSV's 'index' column becomes the DataFrame index.
genomes_df = pd.read_csv(infiles['genomes'], index_col='index')

In [9]:
# One sequence file per assembly accession, located at tmp/genomes/<accession>.fa.gz.
accessions = genomes_df['assembly_accession']
genome_paths = [tmpdir / f'genomes/{accession}.fa.gz' for accession in accessions]

# Wrap the paths with their format ('fasta') and compression ('gzip').
genome_files = SequenceFile.from_paths(genome_paths, 'fasta', 'gzip')

In [10]:
# Number of genomes (one row per genome in genomes_df).
ngenomes = len(genomes_df)

## Parameters

In [11]:
# Prefix lengths to test: 4, 5, 6, 7 (the range end is exclusive).
PREFIX_LENGTHS = range(4, 8)

In [12]:
# k values to test: 7, 9, 11, 13, 15, 17 (odd values only; the range end is exclusive).
K_VALS = range(7, 18, 2)

In [13]:
# The first base prefix is fixed; seven more of the same length are generated below.
BASE_PREFIXES = [b'ATGACTG']

# Seed NumPy's global RNG so the generated prefixes are reproducible
# (assumes random_seq draws from np.random -- TODO confirm).
np.random.seed(0)
_prefix_len = len(BASE_PREFIXES[0])
BASE_PREFIXES.extend(random_seq(_prefix_len) for _ in range(7))

BASE_PREFIXES

[b'ATGACTG',
 b'ATCATTT',
 b'TCTCGAT',
 b'GAAAGCG',
 b'TTGACCC',
 b'CACATAT',
 b'CGTTAGT',
 b'ACTCTTG']

### Combine

In [14]:
# One row per (k, prefix length, base prefix) combination, with k varying slowest.
# Each prefix is the first `prefix_len` bases of its base prefix; `prefix_version`
# is the position of that base prefix within BASE_PREFIXES.
_rows = [
    (prefix_len, version, base_prefix[:prefix_len].decode('ascii'), k)
    for k in K_VALS
    for prefix_len in PREFIX_LENGTHS
    for version, base_prefix in enumerate(BASE_PREFIXES)
]

params_df = pd.DataFrame.from_records(_rows, columns=['prefix_len', 'prefix_version', 'prefix', 'k'])

In [15]:
# Show the parameter grid (the rendered output truncates the middle rows).
params_df

Unnamed: 0,prefix_len,prefix_version,prefix,k
0,4,0,ATGA,7
1,4,1,ATCA,7
2,4,2,TCTC,7
3,4,3,GAAA,7
4,4,4,TTGA,7
...,...,...,...,...
187,7,3,GAAAGCG,17
188,7,4,TTGACCC,17
189,7,5,CACATAT,17
190,7,6,CGTTAGT,17


In [16]:
# One KmerSpec per row of params_df, in row order.
KSPECS = [
    KmerSpec(k, prefix)
    for k, prefix in zip(params_df['k'], params_df['prefix'])
]

In [17]:
# Number of KmerSpec instances (one per row of params_df).
len(KSPECS)

192

## Calculate signatures

In [18]:
# Signature file path for each KmerSpec, named '<k>-<prefix>.h5' inside signatures_dir.
signature_files = {
    spec: signatures_dir / f'{spec.k}-{spec.prefix_str}.h5'
    for spec in KSPECS
}

In [19]:
# Compute and store signatures for every parameter combination. Combinations whose
# output file already exists are skipped, so the cell can be re-run after an interruption.
for i, kspec in enumerate(KSPECS):
    file = signature_files[kspec]
    if file.is_file():
        continue

    # Report which combination is being computed (index, k, prefix).
    print(i, kspec.k, kspec.prefix_str)
    sigs = calc_file_signatures(kspec, genome_files, progress='tqdm', concurrency='processes', max_workers=12)
    sigs = SignatureArray(sigs, dtype=kspec.index_dtype)

    # Write to a temporary sibling path and rename only once the file is complete.
    # Writing straight to `file` would leave a partial file behind if the write were
    # interrupted, and the is_file() check above would then skip it on the next run.
    tmp_file = file.with_name(file.name + '.tmp')
    with h5.File(tmp_file, 'w') as f:
        HDF5Signatures.create(f, kspec, sigs, genomes_df['assembly_accession'])
    tmp_file.replace(file)

## Pairwise distances

In [20]:
# Row and column indices of the strict lower triangle (diagonal excluded):
# one (gi1, gi2) entry per unordered pair of distinct genomes.
gi1, gi2 = np.tril_indices(ngenomes, k=-1)
npairs = gi1.size

In [21]:
_nspecs = len(KSPECS)

# Buffer for one full genome-by-genome distance matrix.
dists_square = np.empty(shape=(ngenomes, ngenomes), dtype='float32')
# One row per KmerSpec, one column per genome pair.
dists_flat = np.empty(shape=(_nspecs, npairs), dtype='float32')
# One row per KmerSpec, one column per genome.
kmer_counts = np.empty(shape=(_nspecs, ngenomes), dtype=int)

In [22]:
# For each parameter combination: load its signatures, compute the all-vs-all
# distance matrix, and keep only the lower-triangle entries plus per-genome sizes.
for spec_idx, kspec in enumerate(tqdm(KSPECS)):
    sig_path = signature_files[kspec]
    with HDF5Signatures.open(sig_path) as h5sigs:
        # Slice while the file is open; presumably this reads all signatures into memory.
        sigs = h5sigs[:]

    # dists_square is overwritten in place on every iteration.
    jaccard_sparse_matrix(sigs, sigs, out=dists_square, distance=True)
    dists_flat[spec_idx] = dists_square[gi1, gi2]

    kmer_counts[spec_idx] = sigs.sizes()

100%|██████████| 192/192 [08:21<00:00,  2.61s/it]


## Save

In [23]:
# Save the parameter grid without the DataFrame index column.
params_df.to_csv(intermediate_out / 'params.csv', index=False)

In [25]:
# Store the pair indices, distances and k-mer counts in a single HDF5 file.
with h5.File(intermediate_out / 'pairwise-dists.h5', 'w') as out_file:
    # Genome indices are written shifted by one (1-based) -- presumably for a
    # downstream consumer that uses 1-based indexing; confirm before changing.
    for dataset_name, values in (
        ('genome1', gi1 + 1),
        ('genome2', gi2 + 1),
        ('pw_dists', dists_flat),
        ('kmer_counts', kmer_counts),
    ):
        out_file.create_dataset(dataset_name, data=values)