# 211206 Pairwise dists

In [1]:
from pathlib import Path
import json

In [7]:
import pandas as pd
from tqdm import tqdm
import h5py as h5

In [19]:
from gambit.kmers import KmerSpec
from gambit.io.seq import SequenceFile
from gambit.metric import jaccarddist_matrix
from gambit.sigs import SignatureArray, SignaturesMeta
from gambit.sigs.calc import calc_file_signatures
from gambit.sigs.hdf5 import HDF5Signatures

## Setup

In [20]:
# Date stamp of this notebook, in the short form used for file/directory names
# and in ISO form for metadata.
DATESTR = '211206'
DATESTR_LONG = '2021-12-06'

# Notebook identifier; used as the name of the intermediate output directory.
NBNAME = f'{DATESTR}-pw-dists'

In [14]:
# Scratch directory for files that are not kept under version control.
tmpdir = Path('tmp')

# Directory that is searched for the per-genome `<accession>.fasta.gz` files.
genomes_dir = tmpdir / 'genomes'
# parents=True: on a fresh checkout `tmp/` itself may not exist yet, and a plain
# mkdir() would then raise FileNotFoundError before anything else can run.
genomes_dir.mkdir(parents=True, exist_ok=True)

# Directory that receives the signature file (not created here).
archive_dir = Path('archive')

In [8]:
# Input files read by this notebook, keyed by a short descriptive name.
infiles = {
    'genomes_table': Path('data-processed/211111-inspect-genomes/genomes.csv'),
}

In [11]:
# Directory for this notebook's intermediate results; parents=True so the
# top-level `data-intermediate/` directory is created too if it is missing.
intermediate_out = Path('data-intermediate') / NBNAME
intermediate_out.mkdir(parents=True, exist_ok=True)

# Output files written by this notebook, keyed by a short descriptive name.
outfiles = dict(
    # The archive directory variable is `archive_dir`; a bare `archive` is undefined.
    signatures=archive_dir / f'211109-ncbi-representative-genomes-ATGAC_11-{DATESTR}.h5',
    pw_dists=intermediate_out / 'pw-dists.h5',
)

## Load data

In [12]:
# Read the genomes table, using the first CSV column as the index. The index
# values serve as the genome UIDs in the file-discovery loop further down.
genomes_df = pd.read_csv(infiles['genomes_table'], index_col=0)

# Number of genomes (rows) in the table; shown as the cell output.
ngenomes = len(genomes_df)
ngenomes

14388

## Find genome files

In [13]:
# Partition the genomes by whether their gzipped FASTA file is present in
# `genomes_dir`. `seq_paths` and `seq_uids` are kept in matching order.
seq_paths = []
seq_uids = []
missing_uids = []

# Series.items() replaces Series.iteritems(), which was deprecated in
# pandas 1.5 and removed in pandas 2.0.
for uid, acc in genomes_df['refseq_acc'].items():
    path = genomes_dir / f'{acc}.fasta.gz'

    if path.is_file():
        seq_paths.append(path)
        seq_uids.append(uid)
    else:
        missing_uids.append(uid)

# Wrap the paths found above, declaring them as gzip-compressed FASTA.
seq_files = SequenceFile.from_paths(seq_paths, 'fasta', 'gzip')

# Cell output: UIDs of genomes that have no local sequence file.
missing_uids

[10898551, 11011431, 11411721, 11411751, 11411881]

These five genomes are the same ones that couldn't be found in the 211115 notebook, so they are left out of the signature and distance calculations below.

## Calculate signatures

In [21]:
# Signature calculation is slow (about 21 minutes in the recorded run), so it
# only runs when the signature file does not exist yet.
if not outfiles['signatures'].is_file():
    kspec = KmerSpec(11, 'ATGAC')

    _sigs = calc_file_signatures(
        kspec,
        seq_files,
        progress=True,
        concurrency='processes',
    )

    # Signature IDs are the RefSeq accessions, looked up in the same order as
    # the UIDs collected alongside `seq_files`.
    _ids = genomes_df.loc[seq_uids, 'refseq_acc']
    _meta = SignaturesMeta(
        name='211109-ncbi-representative-genomes-ATGAC_11',
        extra={
            'author': 'Jared Lumpe',
            'date_created': DATESTR_LONG,
        },
    )

    with h5.File(outfiles['signatures'], 'w') as sigfile:
        HDF5Signatures.create(sigfile, _sigs, _ids, _meta)

    # The signatures are re-opened from the file afterwards, so release the
    # in-memory copy.
    del _sigs

100%|██████████| 14383/14383 [21:29<00:00, 11.15it/s]


In [22]:
# Open the signature file, whether it was just written by the preceding cell or
# was already present from an earlier run.
# NOTE(review): no matching close of `sigs` appears in the visible cells.
sigs = HDF5Signatures.open(outfiles['signatures'])

## Pairwise distances

In [24]:
# Distance matrix of the signature set against itself. The recorded progress
# total (206870689 = 14383**2) indicates every ordered pair is evaluated; the
# recorded run took about 35 minutes.
dmat = jaccarddist_matrix(sigs, sigs, progress=True)

100%|██████████| 206870689/206870689 [35:12<00:00, 97948.03it/s] 


In [33]:
# Store the distance matrix together with the UIDs and RefSeq accessions of the
# genomes that were found on disk.
with h5.File(outfiles['pw_dists'], 'w') as dists_file:
    dists_file.create_dataset('uid', data=seq_uids)

    _accessions = genomes_df.loc[seq_uids, 'refseq_acc']
    dists_file.create_dataset('accession', data=list(_accessions))

    dists_file.create_dataset('pw_dists', data=dmat)