In [2]:
import pandas as pd
from pandas.api.types import is_numeric_dtype

In [3]:
def get_distribution(s):
    """Return a short text summary of how the values of a column are distributed.

    Parameters
    ----------
    s : pandas.Series
        The column to summarise; its ``name`` decides whether it is treated
        as an identifier-like column.

    Returns
    -------
    str
        One of:
        * ``"<n> unique values."`` for the identifier-like columns listed
          below, and for non-numeric columns whose non-missing values never
          repeat;
        * ``"<len>x NaN"`` for numeric columns without any non-missing value;
        * ``"<count>x <value>"`` when exactly one distinct non-missing value
          occurs (``count`` ignores missing entries);
        * the ``describe()`` table for other numeric columns;
        * the ``value_counts()`` table for other non-numeric columns.
    """
    n_distinct = s.nunique()

    # Columns that are only ever summarised by their number of distinct values.
    if s.name in ("demography", "seed", "swept_frequencies"):
        return f"{n_distinct} unique values."

    present_mask = s.notna()
    present = s.loc[present_mask]
    numeric = is_numeric_dtype(s)

    if numeric and n_distinct == 0:
        return f"{len(s)}x NaN"
    # For non-numeric data the uniqueness check comes before the
    # single-value check, so a lone value is reported as "1 unique values.".
    if not numeric and present.is_unique:
        return f"{n_distinct} unique values."
    if n_distinct == 1:
        return f"{present_mask.sum()}x {present.unique()[0]}"
    if numeric:
        return s.describe().to_string()
    return s.value_counts().to_string()

In [4]:
def get_columns_report(df):
    """Build a text report with one section per column of ``df``.

    Columns are visited in sorted order. Each section consists of the column
    name, the summary returned by ``get_distribution`` for that column, and
    an empty separator line. All pieces are joined with newlines.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    str
        The joined report; an empty string when ``df`` has no columns.
    """
    pieces = []
    for name in sorted(df.columns):
        pieces.extend((name, get_distribution(df[name]), ""))
    return "\n".join(pieces)

In [5]:
def get_df_report(df, status, regime):
    """Return the report lines for one simulation status within one regime.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single regime; must have a ``simulation_status`` column
        unless ``status`` is ``"all"``.
    status : str
        Value of ``simulation_status`` to select, or the special value
        ``"all"`` to report on every row of ``df``.
    regime : str
        Regime label, used only in the heading.

    Returns
    -------
    list of str
        Upper-cased heading, a dashed underline of the same length, a blank
        line, the per-column report, and a trailing blank line.
    """
    if status == "all":
        subset = df
    else:
        subset = df.loc[df.simulation_status == status]
    heading = f"{status} {regime} simulations"
    return [
        heading.upper(),
        "-" * len(heading),
        "",
        get_columns_report(subset),
        "",
    ]

In [6]:
# Load the tab-delimited table given as the first Snakemake input.
# `swept_frequencies` is forced to text so pandas does not infer another dtype.
data = pd.read_table(
    filepath_or_buffer=snakemake.input[0],
    dtype=dict(swept_frequencies=str),
)

In [7]:
# Report text is collected line by line here and joined when it is written out.
lines = list()

In [8]:
# Report every column that holds one single value across all rows (NaN
# counts as a value because of dropna=False), then remove those columns from
# `data` so they are not repeated in the per-regime reports.
#
# `regime` and `simulation_status` are still listed here when they are
# constant, but they are NOT dropped: the per-regime / per-status reports
# select rows through `data.regime` and `df.simulation_status`, and dropping
# them made the notebook fail with an AttributeError whenever the input
# contained a single regime or a single simulation status.
grouping_cols = ("regime", "simulation_status")
constant_cols = data.columns[data.nunique(dropna=False) == 1].tolist()
title = "CONSTANT PARAMETERS"
lines.append(title)
lines.append("-"*len(title))
lines.append("")
for col in constant_cols:
    lines.append(col)
    s = data[col]
    lines.append(f"{len(s)}x {s.unique()[0]}")
    lines.append("")
# Drop all reported columns in one go, except the ones needed for grouping.
data = data.drop(columns=[c for c in constant_cols if c not in grouping_cols])
lines.append("\n")

In [10]:
# One sub-table per distinct value of the `regime` column.
regimes = {}
for regime in data["regime"].unique():
    regimes[regime] = data.loc[data["regime"] == regime]

# For every regime, add one report per simulation status found in it; a
# combined "all" report is only added when there is more than one status.
for regime, df in regimes.items():
    statuses = df["simulation_status"].unique()
    for status in statuses:
        lines.extend(get_df_report(df, status, regime))
    if len(statuses) > 1:
        lines.extend(get_df_report(df, "all", regime))

In [11]:
# Join the collected lines and write them to the first Snakemake output.
text = "\n".join(lines)
with open(snakemake.output[0], mode="w") as report_file:
    report_file.write(text)