# 210717 Calculate signatures

In [None]:
from pathlib import Path
import json

In [2]:
import numpy as np
from tqdm import tqdm

In [3]:
from gambit.kmers import KmerSpec
from gambit.io.seq import SequenceFile, find_kmers_in_file
from subprocess import run, DEVNULL

## Setup

In [4]:
# Input files read by this notebook, keyed by a short name.
infiles = {
    'seq_urls': Path('data-intermediate/210717-find-sequence-urls/seq-urls.json'),
}

In [5]:
# Scratch locations: downloaded sequence files and the signature arrays
# calculated from them both live under tmp/.
tmpdir = Path('tmp')
downloads_dir = tmpdir / 'download'
sigs_dir = tmpdir / 'sigs'

# Create both directories (and tmp/ itself) if they do not exist yet.
for workdir in (downloads_dir, sigs_dir):
    workdir.mkdir(parents=True, exist_ok=True)

## Load data

### Sequence URLs

In [6]:
# Parse the JSON file of sequence records; each record's 'url' key is used below.
with infiles['seq_urls'].open() as urls_file:
    seq_info = json.load(urls_file)

In [7]:
# Number of entries loaded from the sequence URL file.
len(seq_info)

50752

## Define work items and signature calculation

In [8]:
def _make_item(number, entry):
    """Build the (url, downloaded sequence file, signature file) triple for one entry."""
    download = SequenceFile(downloads_dir / f'{number}.fa.gz', 'fasta', 'gzip')
    signature_path = sigs_dir / f'{number}.npy'
    return (entry['url'], download, signature_path)


# One work item per entry of seq_info; files are named by 1-based position in the list.
items_all = [_make_item(number, entry) for number, entry in enumerate(seq_info, start=1)]

In [9]:
# K-mer specification shared by every signature calculation in this notebook.
# NOTE(review): presumably 11 is the k-mer length and 'ATGAC' the prefix sequence —
# confirm against gambit.kmers.KmerSpec.
kspec = KmerSpec(11, 'ATGAC')

In [10]:
def get_signature(url, seqfile, sigfile):
    """Download one sequence file and save its k-mer signature to ``sigfile``.

    Does nothing if ``sigfile`` already exists, so an interrupted run can be
    resumed by simply calling this again for every item.

    Parameters
    ----------
    url
        Address handed to ``wget`` to fetch the sequence.
    seqfile
        ``SequenceFile`` whose ``path`` is where the download is written. The
        downloaded file is removed before returning, whether or not the
        calculation succeeded.
    sigfile
        ``Path`` of the ``.npy`` file the signature array is saved to.

    Raises
    ------
    subprocess.CalledProcessError
        If ``wget`` exits with a non-zero status.
    """
    if sigfile.is_file():
        return

    # The array is written to a sibling file and renamed into place only once
    # complete. Writing directly to ``sigfile`` could leave a truncated file
    # behind on interruption, which the check above would then treat as done.
    partial = sigfile.with_name(sigfile.name + '.partial')

    try:
        run(['wget', url, '-O', str(seqfile.path)], check=True)

        sigs = find_kmers_in_file(kspec, seqfile)

        # Save through an open handle: given a file name, np.save would append
        # '.npy' to the partial file's name.
        with open(partial, 'wb') as fh:
            np.save(fh, sigs)
        partial.replace(sigfile)

    finally:
        # Remove the download and any half-written output, including when the
        # download or the k-mer search failed ('wget -O' creates the output
        # file even if the transfer fails).
        for leftover in (seqfile.path, partial):
            try:
                leftover.unlink()
            except FileNotFoundError:
                pass

## Download and calculate signatures

In [11]:
from concurrent.futures import ThreadPoolExecutor, as_completed

In [12]:
# Run get_signature for every work item on a pool of 10 worker threads.
failures = []

with ThreadPoolExecutor(max_workers=10) as pool:
    futures = [pool.submit(get_signature, *item) for item in items_all]
    item_by_future = dict(zip(futures, items_all))

    for future in tqdm(as_completed(futures), total=len(futures)):
        # as_completed() also yields futures whose call raised. The exception
        # is only stored on the future, so it has to be checked explicitly or
        # failed items would be counted as done without any indication.
        error = future.exception()
        if error is not None:
            url = item_by_future[future][0]
            failures.append((url, error))

# Report failures instead of stopping, so one bad URL does not abort the batch;
# failed items have no signature file and are retried on the next run.
if failures:
    print(f'{len(failures)} of {len(futures)} items failed; first few:')
    for url, error in failures[:10]:
        print(f'  {url}: {error!r}')

100%|██████████| 50752/50752 [02:11<00:00, 385.36it/s]   
