In [2]:
import tarfile
import re
from pathlib import Path
from itertools import chain
from collections import defaultdict

import pandas as pd

In [3]:
# Counters collected while merging; a defaultdict(int) lets every key start
# at 0 so the cells below can simply do `info_dict[key] += 1`.
info_dict = defaultdict(int)

# Folder <raw_sim_folder>/<sim> inside the configured raw-simulation root.
raw_sim_root = Path(snakemake.config["raw_sim_folder"])
source_folder = raw_sim_root / snakemake.wildcards["sim"]

In [5]:
# Every entry in the raw-simulation root whose name starts with the simulation
# ID is treated as a source folder for this simulation.
# NOTE(review): the pattern "<sim>*" also matches folders of a different
# simulation whose ID merely shares this prefix (e.g. "sim1" vs "sim10") —
# confirm the folder naming scheme rules that out.
sources = sorted(Path(snakemake.config["raw_sim_folder"]).glob(snakemake.wildcards["sim"] + "*"))

# The matches are materialised as sorted lists instead of lazy `chain` objects:
#   * a chain is a one-shot iterator, so re-running a consuming cell would
#     silently see no inputs and write an empty archive;
#   * glob yields entries in filesystem order, so sorting makes the order of
#     members/rows in the combined outputs reproducible.
data_tars = sorted(chain.from_iterable(source.glob('data/*.tar') for source in sources))
logdata_tars = sorted(chain.from_iterable(source.glob('logdata/*.tar') for source in sources))
feature_tars = sorted(chain.from_iterable(source.glob('features/*.tar.gz') for source in sources))
parameter_tsvs = sorted(chain.from_iterable(source.glob('parameters/*.tsv') for source in sources))

In [5]:
def get_sim_id(path, sim=None):
    """Build a "<folder suffix>-<batch number>" identifier for a batch file.

    The folder suffix is the name of the folder two levels above ``path``
    (``path.parents[1]``) with the leading simulation ID, and one following
    "-", removed. The batch number is the first run of digits in
    ``path.stem``.

    Parameters
    ----------
    path : pathlib.Path
        Path of the form ``<source folder>/<kind>/<file>``.
    sim : str, optional
        Simulation ID to strip from the folder name. Defaults to
        ``snakemake.wildcards["sim"]``.

    Returns
    -------
    str
        The folder suffix and the batch number joined by "-".

    Raises
    ------
    ValueError
        If the file name contains no digits.
    """
    if sim is None:
        sim = snakemake.wildcards["sim"]

    stem = path.parents[1].name
    # Remove the simulation ID only where it is the leading part of the folder
    # name; str.replace would also delete any later occurrence of the ID.
    if stem.startswith(sim):
        stem = stem[len(sim):]
    if stem.startswith("-"):
        stem = stem[1:]

    match = re.search(r"[0-9]+", path.stem)
    if match is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise ValueError(f"No batch number found in file name: {path}")
    return "-".join([stem, match.group()])

In [6]:
# Merge the members of every per-batch data archive into one uncompressed tar.
with tarfile.open(snakemake.output["data"], 'w') as dest:
    for data_tar in data_tars:
        # The context manager closes the source archive even if copying a
        # member raises; the previous explicit close() was skipped on error.
        with tarfile.open(data_tar, 'r') as tar:
            for member in tar.getmembers():
                if member.isdir():
                    continue
                fileobj = tar.extractfile(member)
                # Flatten the layout: keep only the file name, drop directories.
                member.name = Path(member.name).name
                dest.addfile(member, fileobj)
                # Counted for the summary report written at the end.
                info_dict['npy_files'] += 1

In [6]:
# Merge the members of every per-batch logdata archive into one uncompressed tar.
with tarfile.open(snakemake.output["logdata"], 'w') as dest:
    for data_tar in logdata_tars:
        # The context manager closes the source archive even if copying a
        # member raises; the previous explicit close() was skipped on error.
        with tarfile.open(data_tar, 'r') as tar:
            for member in tar.getmembers():
                if member.isdir():
                    continue
                fileobj = tar.extractfile(member)
                # Flatten the layout: keep only the file name, drop directories.
                member.name = Path(member.name).name
                dest.addfile(member, fileobj)
                # Counted for the summary report written at the end.
                info_dict['log_npy_files'] += 1

In [7]:
# Merge the members of every per-batch feature archive into one gzipped tar.
with tarfile.open(snakemake.output["features"], 'w:gz') as dest:
    for data_tar in feature_tars:
        # The identifier depends only on the archive path, so compute it once
        # per archive rather than once per member.
        sim_id = get_sim_id(data_tar)
        # The context manager closes the source archive even if copying a
        # member raises; the previous explicit close() was skipped on error.
        with tarfile.open(data_tar, 'r:gz') as tar:
            for member in tar.getmembers():
                if member.isdir():
                    continue
                fileobj = tar.extractfile(member)
                # Flatten the layout and prefix the file name with the archive's
                # identifier.
                member.name = sim_id + "-" + Path(member.name).name
                dest.addfile(member, fileobj)
                # Counted for the summary report written at the end.
                info_dict['feature_files'] += 1

In [8]:
# Read each per-batch parameter table, prefix its data_id values with the
# batch identifier, and stack all tables into one frame.
param_dfs = []

for tsv_path in parameter_tsvs:
    batch_prefix = get_sim_id(tsv_path)
    # data_id is read as text so it can be joined to the prefix unchanged.
    batch_params = pd.read_table(tsv_path, dtype={"data_id": str})
    # One copy of the prefix per row, joined to the original data_id with "-".
    prefix_column = pd.Series([batch_prefix] * len(batch_params))
    prefixed_ids = prefix_column.str.cat(batch_params.data_id, sep='-')
    param_dfs.append(batch_params.assign(data_id=prefixed_ids))

# Renumber the rows 0..n-1 across all batches.
params = pd.concat(param_dfs).reset_index(drop=True)

In [9]:
# Save the combined parameter table as tab-separated text without the row index.
params.to_csv(
    snakemake.output["parameters"],
    sep="\t",
    index=False,
)

In [10]:
# Number of parameter rows overall, and how many of them have status "ok".
info_dict["total_simulations"] = len(params)
status_is_ok = params.simulation_status == "ok"
info_dict["successful_simulations"] = int(status_is_ok.sum())

In [11]:
# Assemble the human-readable summary, one sentence per line.
lines = []

# Header and simulation counts.
lines.append(f"Combining files for simulation with ID {snakemake.wildcards['sim']}.")
lines.append(f"Total number of simulation parameters: {info_dict['total_simulations']}.")
lines.append(f"Successful simulations: {info_dict['successful_simulations']}.")

# Expected versus actual file counts for each merged archive.
lines.append(f"There should be one NPY file per successful simulation. Number of NPY files: {info_dict['npy_files']}.")
lines.append(f"There should be one log-transformed NPY per successful simulation. Number of log-transformed NPY files: {info_dict['log_npy_files']}.")
lines.append(f"There should be two feature files per successful simulation. Number of feature files: {info_dict['feature_files']}.")

In [12]:
# Write the summary lines to the info output, newline-separated
# (no trailing newline after the last line).
report_text = "\n".join(lines)
with open(snakemake.output["info"], "w") as report_file:
    report_file.write(report_text)