In [9]:
import tarfile
from io import BytesIO

import numpy as np

In [6]:
# Mapping meant to collect sweep signatures; it starts out empty.
sweeps = {}

Grab all the empirical signatures:

In [12]:
# Load every non-directory member of the empirical archive as a NumPy array and
# keep all of them in `sweeps`, keyed by the member's name inside the archive.
# (Without the store, each array overwrites the previous one and only the last
# member is still available once the loop finishes.)
with tarfile.open(snakemake.input["empirical"]) as tar:
    members = tar.getmembers()
    for member in members:
        if member.isdir():
            continue
        # np.load wants a seekable file object, so buffer the member in memory.
        array_file = BytesIO(tar.extractfile(member).read())
        # Move the last axis to the front: (a, b, c) -> (c, a, b).
        np_array = np.load(array_file).transpose([2, 0, 1])
        sweeps[member.name] = np_array

In [15]:
# Display whatever tar member the name `member` currently refers to.
member

In [14]:
# Display the array currently bound to `np_array`.
np_array

In [8]:
# Display the list of tar members.
# NOTE(review): this prints the whole list; consider `members[:5]` for large archives.
members

In [None]:
# Walk every archive in `ms_files`, parse each non-directory member with
# `ms_to_numpy`, and feed the genotype sums over axis 1 into `sfs`.
# NOTE(review): `tqdm`, `ms_files`, `TextIOWrapper`, `ms_to_numpy` and `sfs`
# are not defined in the visible cells; they must exist in the kernel.
for ms_file in tqdm.notebook.tqdm(ms_files):
    with tarfile.open(ms_file) as archive:
        members = archive.getmembers()
        for member in members:
            if not member.isdir():
                # The extracted member is binary; wrap it so it reads as text.
                fileobj = TextIOWrapper(archive.extractfile(member))
                positions, genotypes = ms_to_numpy(fileobj)
                sfs.update(genotypes.sum(axis=1))

In [None]:
# For each uuid, pull "<uuid>.npy" out of `data` and load it with the last
# axis moved to the front.
# NOTE(review): `self`, `data` and `tqdm` are not defined in the visible cells;
# this looks like it was lifted from a method body -- confirm before running.
uuids = self.df.uuid
for uuid in tqdm.notebook.tqdm(uuids):
    name = uuid + ".npy"
    # Buffer the member in memory so np.load gets a seekable file object.
    array_file = BytesIO(data.extractfile(name).read())
    loaded = np.load(array_file)
    np_array = loaded.transpose([2, 0, 1])

----

In [1]:
from pathlib import Path
from random import Random, choice

import numpy as np
import pandas as pd

from tqdm import tqdm_notebook

In [2]:
loc_size_bp = 1e6
DIMENSION = 21
FEATURES = ['pi', 'snps', 'haps', 'H1', 'H12', 'H2overH1', 'tajD']

# DIMENSION window sizes, evenly spaced on a log scale between 1000 and
# loc_size_bp / ((DIMENSION + 1) / 2), rounded to whole numbers and stored
# largest first.
smallest_window = 1000
largest_window = loc_size_bp / ((DIMENSION + 1) / 2)
log_window_sizes = np.linspace(np.log(smallest_window), np.log(largest_window), DIMENSION)
WINDOW_SIZES = [np.round(size) for size in np.exp(log_window_sizes)][::-1]

In [3]:
def tidify(path):
    """Turn one .npy array, or the mean of several, into a tidy table.

    Parameters
    ----------
    path : path-like, or list/tuple of path-likes
        A single ``.npy`` file, or a sequence of ``.npy`` files whose arrays
        are averaged element-wise before being tidied. Arrays in a sequence
        are summed into a ``(DIMENSION, DIMENSION, len(FEATURES))``
        accumulator, so they must be compatible with that shape.

    Returns
    -------
    pandas.DataFrame
        One row per array element, with columns ``window_size`` (axis-0 index
        looked up in ``WINDOW_SIZES``), ``position`` (axis-1 index),
        ``feature`` (axis-2 index looked up in ``FEATURES``) and ``value``.

    Raises
    ------
    ValueError
        If ``path`` is an empty sequence. Averaging zero arrays would divide
        by zero and silently produce a table full of NaN.
    """
    if isinstance(path, (list, tuple)):
        if not path:
            raise ValueError(
                "tidify() needs at least one .npy path to average; got an empty sequence."
            )
        total = np.zeros(shape=(DIMENSION, DIMENSION, len(FEATURES)))
        for arr_path in tqdm_notebook(path):
            total += np.load(arr_path)
        x = total / len(path)
    else:
        x = np.load(path)
    # ndenumerate yields ((i, j, k), value) pairs; unzip them into two tuples.
    coords, values = zip(*np.ndenumerate(x))
    df = (
        pd
        .DataFrame(coords, columns=['window_size', 'position', 'feature'])
        .assign(value=values)
    )
    # Replace the raw axis indices with their human-readable labels.
    df = df.assign(
        window_size=[WINDOW_SIZES[i] for i in df.window_size],
        feature=[FEATURES[i] for i in df.feature]
    )
    return df

In [4]:
EMPIRICAL_DIR = Path('../output/drosophila/empirical-window-npy/')
EXAMPLE_SEED = 0  # fixed so the randomly drawn examples are the same on every run


def _first_match(directory, pattern):
    """Return the lexicographically first path in `directory` matching `pattern`."""
    matches = sorted(directory.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No file matching {pattern!r} found in {directory}")
    return matches[0]


# Glob results come back in no guaranteed order; sorting keeps the averaging
# order and the seeded draws below identical from run to run.
weak_sweeps = sorted(Path('../../../raw-data/20210521_data/hardsweeps/s0.01-2133504-final/data').glob('*.npy'))
strong_sweeps = sorted(Path('../../../raw-data/20210521_data/hardsweeps/s100.0-2133505-final/data').glob('*.npy'))

# Label -> a single .npy path, or a list of paths to be averaged.
to_tidify = {
    'Ace': _first_match(EMPIRICAL_DIR, 'sweep-ace*'),
    'Cyp': _first_match(EMPIRICAL_DIR, 'sweep-cyp*'),
    'CHKoV': _first_match(EMPIRICAL_DIR, 'sweep-chkov*'),
    'Average hard sweep\n(s=0.01)': weak_sweeps,
    'Average hard sweep\n(s=100)': strong_sweeps,
}

# Add a few individual simulations next to the averages. A dedicated seeded
# generator makes the picks reproducible without touching the global RNG state.
num_hard_sweep_examples = 2
example_rng = Random(EXAMPLE_SEED)
for example in range(1, num_hard_sweep_examples + 1):
    to_tidify[f'Hard sweep example {example}\n(s=0.01)'] = example_rng.choice(weak_sweeps)
    to_tidify[f'Hard sweep example {example}\n(s=100)'] = example_rng.choice(strong_sweeps)

In [5]:
# Tidy every case, tag its rows with the case label, then stack all the tables.
dfs = [tidify(path).assign(case=case) for case, path in to_tidify.items()]
signals = pd.concat(dfs)

In [6]:
# The table is written tab-separated (despite the .csv suffix) and without the index.
signals_path = Path('../output/signals/signals.csv')
# to_csv does not create missing directories, so make sure the target one exists.
signals_path.parent.mkdir(parents=True, exist_ok=True)
signals.to_csv(signals_path, sep='\t', index=False)