In [2]:
import tarfile
from pathlib import Path
from io import BytesIO
import re

import numpy as np
import pandas as pd
import tqdm.notebook

In [3]:
# Maps a dataset name to its loaded signal array; populated by the loading cell below.
sweeps = {}

Get all empirical control sweeps:

In [4]:
# Load every array stored in the empirical tar archive into `sweeps`, keyed by
# the member's file name without its extension.
with tarfile.open(snakemake.input["empirical"]) as tar:
    for member in tar.getmembers():
        if member.isdir():
            continue
        extracted = tar.extractfile(member)
        if extracted is None:
            # extractfile() returns None for members that are neither regular
            # files nor links (e.g. device nodes); there is nothing to load.
            continue
        # np.load is handed an in-memory copy rather than the tar stream itself
        # (presumably so the loaded data stays usable once the archive is closed).
        with extracted:
            array_file = BytesIO(extracted.read())
        sweeps[Path(member.name).stem] = np.load(array_file)

Average the simulated hard sweeps for each selection coefficient:

In [5]:
# Maps a selection coefficient (float) to the averaged simulated signal.
hard_sweeps = {}
# Captures the text between "hard_s" and the last "-" in a folder name
# (the group is greedy); that text is parsed as a float further down.
s_pattern = re.compile(r"hard_s(.+)-")

In [6]:
# For every simulation folder whose name encodes a hard-sweep selection
# coefficient, load all arrays from its data/*.tar archives and store their
# element-wise mean in `hard_sweeps`, keyed by that coefficient.
for p in Path(snakemake.config["additional_sim_folder"]).iterdir():
    match = s_pattern.search(p.name)
    if match is None:
        # Not a hard-sweep simulation folder.
        continue
    selstren = float(match.group(1))
    # A sorted list (instead of the raw glob generator) makes the stacking
    # order independent of the filesystem and lets tqdm show a total.
    data_files = sorted((p / "data").glob("*.tar"))
    arrays = []
    for file in tqdm.notebook.tqdm(data_files, desc=p.name):
        with tarfile.open(file) as f:
            for member in f.getmembers():
                if member.isdir():
                    continue
                extracted = f.extractfile(member)
                if extracted is None:
                    # extractfile() returns None for members that are neither
                    # regular files nor links; there is nothing to load.
                    continue
                # Load from an in-memory copy rather than from the tar stream.
                with extracted:
                    arrays.append(np.load(BytesIO(extracted.read())))
    if not arrays:
        # np.stack would fail here anyway; say which folder is the culprit.
        raise ValueError(f"No simulated arrays found under {p / 'data'}")
    hard_sweeps[selstren] = np.stack(arrays).mean(axis=0)

### Turn signals into tidy tables

In [15]:
from utils.project_parameters import summary_statistic_order, smallest_window, locus_size, data_dimension
from utils.prepare_data import save_data
from utils.feature_calculation import get_windows

# Window sizes (and the centre-position lookup) for the configured locus.
window_sizes, center_pos_dict = get_windows(
    locus_size, data_dimension, start_pos=1, smallest_window=smallest_window
)
# Reversed because of the way NumPy assigns dimension labels: tidify() below
# looks sizes up by their index along the signal's first axis.
window_sizes = [*reversed(window_sizes)]

In [16]:
def tidify(signal, name, s=None):
    """Flatten a 3-D signal array into a tidy (long-format) DataFrame.

    One row is produced per array element, in C (row-major) order. The index
    along axis 0 is translated through the module-level ``window_sizes`` and
    the index along axis 2 through ``summary_statistic_order``; the index
    along axis 1 is kept as-is in the ``position`` column.

    Parameters
    ----------
    signal : array-like
        Three-dimensional array indexed as (window size, position, feature).
    name : str
        Label written to the ``dataset`` column of every row.
    s : optional
        Value written to the ``selection_coefficient`` column of every row
        (``None`` by default).

    Returns
    -------
    pandas.DataFrame
        Columns ``window_size``, ``position``, ``feature``, ``value``,
        ``dataset`` and ``selection_coefficient``. An empty signal yields an
        empty table with the same columns.

    Raises
    ------
    ValueError
        If ``signal`` is not three-dimensional.
    """
    signal = np.asarray(signal)
    if signal.ndim != 3:
        raise ValueError(
            "signal must be 3-dimensional (window_size, position, feature), "
            f"got shape {signal.shape}"
        )
    # np.indices gives one index grid per axis; flattening each grid in C order
    # lines the indices up with signal.ravel() (same order as np.ndenumerate).
    window_idx, position_idx, feature_idx = (
        grid.ravel().tolist() for grid in np.indices(signal.shape)
    )
    df = pd.DataFrame(
        {
            'window_size': [window_sizes[i] for i in window_idx],
            'position': position_idx,
            'feature': [summary_statistic_order[i] for i in feature_idx],
            'value': signal.ravel(),
        }
    )
    return df.assign(dataset=name, selection_coefficient=s)

In [17]:
# One tidy table per empirical dataset, followed by one per simulated
# selection coefficient, all concatenated into a single frame.
final_dataframes = [
    *(tidify(data, name) for name, data in sweeps.items()),
    *(tidify(data, 'simulated-hard-sweep', s) for s, data in hard_sweeps.items()),
]
result = pd.concat(final_dataframes)

In [19]:
# Hand the combined tidy table to the project's save_data helper, targeting the
# first output path declared by the Snakemake rule.
save_data(result, snakemake.output[0])