# 211003 Find k-mers

In [1]:
from pathlib import Path
import json

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import h5py as h5

In [3]:
from gambit.kmers import KmerSpec, find_kmers
from gambit.io.seq import SequenceFile
from gambit.signatures.calc import calc_file_signatures, calc_signature
from gambit.signatures.hdf5 import HDF5Signatures

## Setup

In [4]:
# Date stamp identifying this notebook; the notebook name is derived from it.
DATESTR = '211003'
NBNAME = f'{DATESTR}-find-kmers'

In [5]:
# Input locations, keyed by a short label.
infiles = {
    'genomes': Path('/home/jared/projects/gambit/data/validation/200726-gold-standard-seqs/'),
}

In [6]:
# Directory holding this notebook's intermediate output files.
data_intermediate = Path('data-intermediate') / NBNAME
# parents=True so the cell also works when 'data-intermediate/' itself does not exist yet
# (e.g. on a fresh checkout); exist_ok=True keeps re-runs harmless.
data_intermediate.mkdir(parents=True, exist_ok=True)

# Output files written by this notebook, keyed by a short label.
outfiles = dict(
    assembled_sigs=data_intermediate / 'assembled-signatures.h5',
    fastq_data=data_intermediate / 'fastq-data.h5',
)

In [7]:
# Scratch directory (relative to the working directory); created if it does not exist yet.
tmpdir = Path('tmp')
tmpdir.mkdir(exist_ok=True)

In [8]:
# K-mer specification shared by the assembly signature calculation and the FASTQ k-mer search.
# NOTE(review): presumably k=11 with prefix 'ATGAC' -- confirm against KmerSpec's signature.
KSPEC = KmerSpec(11, 'ATGAC')

## Get genomes

In [9]:
# Load the genome table from 'files.csv' in the genomes input directory.
genomes_df = pd.read_csv(infiles['genomes'] / 'files.csv')

In [10]:
# Keep only the genomes whose 'set' value is 200726, and only their 'name' column.
# .copy() makes the selection an independent frame, so that adding columns to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
genomes_df = genomes_df.loc[genomes_df['set'] == 200726, ['name']].copy()

In [11]:
# FASTA file for every genome in the table; all of them must exist.
fasta_paths = [
    infiles['genomes'] / 'fasta' / (name + '.fasta')
    for name in genomes_df['name']
]
fasta_files = SequenceFile.from_paths(fasta_paths, 'fasta')
assert all(f.path.is_file() for f in fasta_files)

# Gzip-compressed FASTQ file for every genome in the table; all of them must exist.
fastq_paths = [
    infiles['genomes'] / 'fastq' / (name + '_L001_R1_001.fastq.gz')
    for name in genomes_df['name']
]
fastq_files = SequenceFile.from_paths(fastq_paths, 'fastq', 'gzip')
assert all(f.path.is_file() for f in fastq_files)

In [12]:
# Store each genome's sequence file paths (as strings) next to its name.
genomes_df['fasta_file'] = [str(seqfile.path) for seqfile in fasta_files]
genomes_df['fastq_file'] = [str(seqfile.path) for seqfile in fastq_files]

# Number of genomes (rows in the table).
ngenomes = len(genomes_df)

In [13]:
# Save the genome table (names plus FASTA/FASTQ paths) to the scratch directory, without the index.
genomes_df.to_csv(tmpdir / 'genome_files.csv', index=False)

## Assembled signatures

In [14]:
# Calculate k-mer signatures from the assembled FASTA files using KSPEC, with a progress bar.
# The result is indexed by file position later on (fasta_sigs[i] pairs with fastq_files[i]).
fasta_sigs = calc_file_signatures(KSPEC, fasta_files, progress=True, concurrency='processes')

100%|██████████| 80/80 [00:01<00:00, 43.28it/s]


In [15]:
# Write the assembled-genome signatures, together with the genome names, to an HDF5 file.
# Mode 'w' creates the file, replacing any existing one.
with h5.File(outfiles['assembled_sigs'], 'w') as f:
    HDF5Signatures.create(f, fasta_sigs, genomes_df['name'])

## Fastq k-mer search

### Code

In [16]:
# Multiplying a PHRED score Q by this factor gives the natural log of the corresponding
# error probability: p = 10 ** (-Q / 10) = exp(Q * PHRED_TO_NAT).
PHRED_TO_NAT = -np.log(10) / 10

def phredsum(q):
    """Combine PHRED scores by adding up their error probabilities.

    The scores in ``q`` are converted to natural-log error probabilities, summed in log
    space, and the total is converted back to the PHRED scale.
    """
    log_probs = PHRED_TO_NAT * np.asarray(q)
    log_total = np.logaddexp.reduce(log_probs)
    return log_total / PHRED_TO_NAT

def get_phred(record):
    """Return the record's ``'phred_quality'`` letter annotations as a numpy array."""
    scores = record.letter_annotations['phred_quality']
    return np.asarray(scores)

In [17]:
class PhredAccumulator:
    """Per-index histogram of binned scores.

    For every index passed to :meth:`add`, an array of ``nbins`` counters is kept; each call
    increments the counter of the bin that the given score falls into.

    Parameters
    ----------
    bin_edges
        Bin boundaries in ascending order (required by ``np.searchsorted``). ``n`` edges
        define ``n + 1`` bins: a score goes into bin ``i``, where ``i`` is the number of
        edges less than or equal to it.
    dtype
        Numpy dtype of the counters.
    """

    def __init__(self, bin_edges, dtype=np.dtype('u2')):
        self.bin_edges = np.asarray(bin_edges)
        self.nbins = len(self.bin_edges) + 1
        self.dtype = np.dtype(dtype)
        # Maps index -> 1-D array of ``nbins`` counters
        self.dict = dict()
        # Largest count an integer counter can hold; None disables the overflow check
        # for non-integer dtypes.
        if np.issubdtype(self.dtype, np.integer):
            self._max_count = np.iinfo(self.dtype).max
        else:
            self._max_count = None

    def get_bin(self, score):
        """Return the bin number for ``score`` (the count of bin edges ``<= score``)."""
        return np.searchsorted(self.bin_edges, score, side='right')

    def add(self, index, score):
        """Count one occurrence of ``score`` for ``index``.

        Raises
        ------
        OverflowError
            If the counter is already at the maximum value of the integer dtype. The check
            happens before incrementing, so the stored count is never wrapped around.
        """
        b = self.get_bin(score)

        arr = self.dict.get(index)
        if arr is None:
            arr = self.dict[index] = np.zeros(self.nbins, dtype=self.dtype)

        # Numpy integer arrays wrap silently on overflow, so check explicitly (and not with
        # an assert, which would be stripped under ``python -O``).
        if self._max_count is not None and arr[b] >= self._max_count:
            raise OverflowError(
                f'count for index {index}, bin {b} exceeds maximum of dtype {self.dtype}'
            )

        arr[b] += 1

    def to_arrays(self):
        """Return ``(indices, counts)``.

        ``indices`` is the sorted array of all indices seen so far and ``counts`` is a
        ``(len(indices), nbins)`` array whose rows hold the counters of the matching index.
        """
        indices = np.fromiter(self.dict, dtype=int)
        indices.sort()

        counts = np.empty((len(indices), self.nbins), dtype=self.dtype)
        for row, index in enumerate(indices):
            counts[row, :] = self.dict[index]

        return indices, counts

In [18]:
def accumulate_matches(kspec, record, accumulators):
    """Add the aggregated quality scores of every k-mer match in a read to the accumulators.

    Parameters
    ----------
    kspec
        K-mer specification passed to ``find_kmers``.
    record
        Sequence record providing ``.seq`` and PHRED quality annotations (see ``get_phred``).
    accumulators
        Iterable of ``(agg_func, accumulator)`` pairs. For each match, ``agg_func`` is applied
        to the PHRED scores at the match's positions and the result is passed to
        ``accumulator.add()`` together with the k-mer index. Any iterable is accepted,
        including one-shot iterators such as ``zip(...)``.
    """
    # The pairs are iterated once per match. Materialize them first: a one-shot iterator
    # (e.g. a zip object) would be exhausted after the first match, silently dropping every
    # later match in the read.
    accumulators = list(accumulators)
    phred = get_phred(record)

    for match in find_kmers(kspec, record.seq):
        try:
            index = match.kmer_index()
        except ValueError:
            # No index for this match -- skip it.
            # NOTE(review): presumably raised for k-mers with invalid nucleotides -- confirm.
            continue

        # NOTE(review): full_indices() presumably selects all positions of the match in the
        # read -- confirm against gambit's API.
        scores = phred[match.full_indices()]

        for agg_func, accum in accumulators:
            accum.add(index, agg_func(scores))

### Run

In [19]:
# Bin edges for aggregated quality scores: the integers 1 through 30.
PHRED_BINS = range(1, 30 + 1)

# Aggregations applied to the PHRED scores of each k-mer match. Names and functions are
# parallel lists: the name at position i labels the function at position i.
AGG_NAMES, AGG_FUNCS = map(list, zip(
    ('phredsum', phredsum),
    ('min', np.min),
))

In [20]:
# Create the FASTQ results file on the first run only, storing the PHRED bin edges as a
# file attribute. An existing file is left untouched so results from earlier runs are kept.
if not outfiles['fastq_data'].is_file():
    with h5.File(outfiles['fastq_data'], 'w') as h5file:
        h5file.attrs['phred_bins'] = PHRED_BINS

In [21]:
# Search every FASTQ file for k-mers and store, per genome, the binned quality-score counts
# of each k-mer found. Results are written to one HDF5 group per genome, so an interrupted
# run can be resumed: genomes that already have a group are skipped.
with h5.File(outfiles['fastq_data'], 'r+') as h5file:
    for i, file in enumerate(tqdm(fastq_files, desc='Files')):
        name = genomes_df['name'].iloc[i]

        if name in h5file:
            continue

        # Signature of the matching assembly (same position as the FASTQ file)
        fasta_sig = fasta_sigs[i]

        # One accumulator per aggregation function
        accums = [PhredAccumulator(PHRED_BINS) for _ in AGG_FUNCS]
        # Built once per file as a list: the pairs are iterated for every k-mer match of
        # every read, and a list can be iterated repeatedly, unlike a bare zip() iterator.
        agg_pairs = list(zip(AGG_FUNCS, accums))

        for record in tqdm(file.parse(), leave=False, desc='Reads'):
            accumulate_matches(KSPEC, record, agg_pairs)

        group = h5file.create_group(name)

        for aname, accum in zip(AGG_NAMES, accums):
            indices, counts = accum.to_arrays()
            # Flag the k-mers that are also present in the assembly's signature
            # (np.isin replaces the deprecated np.in1d; identical for 1-D input)
            in_assembly = np.isin(indices, fasta_sig)

            group.create_dataset(aname + '_indices', data=indices)
            group.create_dataset(aname + '_counts', data=counts)
            group.create_dataset(aname + '_in_assembly', data=in_assembly)

Files:   0%|          | 0/80 [00:00<?, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]

Reads: 0it [00:00, ?it/s]