# Premise

Why is there 1700 datasets????

In [4]:
import os

import numpy as np
import pandas as pd

from grant import grant
import seaborn as sns
import matplotlib.pyplot as plt

from mtist import master_dataset_generation as mdg
from mtist import assemble_mtist as am
from mtist import mtist_utils as mu
from mtist import infer_mtist as im

import matplotlib.colors as mcolors
from matplotlib.colors import ListedColormap

%config InlineBackend.figure_format='retina'
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from mtist.master_dataset_generation import MASTER_DATASET_DEFAULTS
from mtist.mtist_utils import GLOBALS

from mtist.mtist_utils import load_ground_truths

## Checking number of master datasets

In [11]:
### Gather current conditions ###
random_seeds = MASTER_DATASET_DEFAULTS.random_seeds
tend = MASTER_DATASET_DEFAULTS.tend
dt = MASTER_DATASET_DEFAULTS.dt
sample_freq = MASTER_DATASET_DEFAULTS.sample_freq
noises = MASTER_DATASET_DEFAULTS.noises

# Initialize conditions
conditions = []
for seed in random_seeds:
    for noise in noises:
        conditions.append((seed, noise))
        
# Load ground truths
aijs, grs = load_ground_truths(GLOBALS.GT_DIR)
gt_names = [
    "3_sp_gt_1",
    "3_sp_gt_2",
    "3_sp_gt_3",
    "10_sp_gt_1",
    "10_sp_gt_2",
    "10_sp_gt_3",
    "100_sp_gt",
]

### DO THE SIMULATIONS ###
# Index then by name, seed, noise
results = []
for name, aij, gr in zip(gt_names, aijs.values(), grs.values()):
    for seed, noise in conditions:
        results.append((name, seed, noise))

In [13]:
len(results)

1050

***This checks out***

## Checking MTIST assembled datasets

In [15]:
master_meta = pd.read_csv(
        os.path.join(GLOBALS.MASTER_DATASET_DIR, "master_metadata.csv")
    ).set_index("master_did")

In [17]:
# Collect indices for the datasets per `name`, `noise`
grp = master_meta.groupby(["name", "noise"])

name_noise_dict = {}
for (name, noise), df in grp:
    name_noise_dict[(name, noise)] = df.index

# Distribute the n_timeseries throughout
# the noise/ground truth combinations
n_timeseries_params = [5, 10, 50]

name_noise_nts_dict = {}
for name, noise in name_noise_dict.keys():

    # In this inner loop, make name_noise_nts an expanded
    # version of name_noise_dict that now also includes the
    # "n_timeseries variable"
    for each_n_timeseries in n_timeseries_params:
        name_noise_nts_dict[(name, noise, each_n_timeseries)] = name_noise_dict[name, noise][
            0:each_n_timeseries
        ]

# Finally, duplicate out the conditions for all parameters
sampling_scheme_params = ["even", "random", "seq"]
sampling_freq_params = [5, 10, 15]

full_conditions_dict = {}
for name, noise, nts in name_noise_nts_dict.keys():
    for ss in sampling_scheme_params:
        for sf in sampling_freq_params:

            # Just copy those indices since they'll be the same for each
            # combination of sample_scheme and sample_frequency
            full_conditions_dict[name, noise, nts, ss, sf] = name_noise_nts_dict[
                name, noise, nts
            ].copy()


In [19]:
len(full_conditions_dict)

567

In [20]:
def generate_metadata_too_big():
    # fns = glob.glob(os.path.join(GLOBALS.MTIST_DATASET_DIR, "dataset_*.csv"))
    fns = [os.path.join(GLOBALS.MTIST_DATASET_DIR, f"dataset_{i}.csv") for i in range(1701)]

    meta = pd.DataFrame([])
    i = 0
    for fn in fns:
        i = i + 1

        # This try/except block here is because some of my
        df = pd.read_csv(fn).drop(columns="Unnamed: 0")

        # Gather metadata
        sd = df["seq_depth"].unique()[0]
        did = df["did"].unique()[0]
        n_species = df["n_species"].unique()[0]
        noise = df["noise"].unique()[0]
        gt = df["ground_truth"].unique()[0]
        ss = df["sampling_scheme"].unique()[0]
        n_timepoints = df["n_timepoints"].unique()[0]
        n_timeseries = len(df["timeseries_id"].unique())

        # FIRST, crosscheck
        n_sp_crosscheck = df.columns.str.contains("species_").sum()
        n_tp_crosscheck = np.unique([len(subset) for (_, subset) in df.groupby("timeseries_id")])[0]

        assert (
            n_sp_crosscheck == n_species
        ), f"n_sp crosscheck failure: from df {n_species}, from crosscheck {n_sp_crosscheck}"

        assert (
            n_tp_crosscheck == n_timepoints
        ), f"n_tp crosscheck failure: from df {n_timepoints}, from crosscheck {n_tp_crosscheck}"

        # Check to make sure these "unique" arrays are all len() == 1
        to_check = [
            "seq_depth",
            "did",
            "n_species",
            "noise",
            "ground_truth",
            "n_timepoints",
            "sampling_scheme",
        ]
        for each in to_check:
            assert len(df[each].unique()) == 1, f"unique array len of {each} is not 1"

        # Create the next meta row
        cur_meta_row = pd.DataFrame(
            [did, n_species, gt, noise, n_timeseries, n_timepoints, ss, sd],
            index=[
                "did",
                "n_species",
                "ground_truth",
                "noise",
                "n_timeseries",
                "n_timepoints",
                "sampling_scheme",
                "seq_depth",
            ],
        ).T

        # Combine
        meta = pd.concat((meta, cur_meta_row))

    # meta = meta.set_index("did").sort_index()

    return meta


In [23]:
test = generate_metadata_too_big()

In [24]:
test

Unnamed: 0,did,n_species,ground_truth,noise,n_timeseries,n_timepoints,sampling_scheme,seq_depth
0,0,100,100_sp_gt,0.01,5,5,even,high
0,1,100,100_sp_gt,0.01,5,10,even,high
0,2,100,100_sp_gt,0.01,5,15,even,high
0,3,100,100_sp_gt,0.01,5,5,random,high
0,4,100,100_sp_gt,0.01,5,10,random,high
...,...,...,...,...,...,...,...,...
0,1696,3,3_sp_gt_3,0.1,50,10,random,low
0,1697,3,3_sp_gt_3,0.1,50,15,random,low
0,1698,3,3_sp_gt_3,0.1,50,5,seq,low
0,1699,3,3_sp_gt_3,0.1,50,10,seq,low


In [27]:
test.groupby(['ground_truth', 'noise', 'n_timeseries', 'n_timepoints', 'sampling_scheme', 'seq_depth']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,did,n_species
ground_truth,noise,n_timeseries,n_timepoints,sampling_scheme,seq_depth,Unnamed: 6_level_1,Unnamed: 7_level_1
100_sp_gt,0.01,5,5,even,high,1,1
100_sp_gt,0.01,5,5,even,low,2,2
100_sp_gt,0.01,5,5,random,high,1,1
100_sp_gt,0.01,5,5,random,low,2,2
100_sp_gt,0.01,5,5,seq,high,1,1
...,...,...,...,...,...,...,...
3_sp_gt_3,0.10,50,15,even,low,2,2
3_sp_gt_3,0.10,50,15,random,high,1,1
3_sp_gt_3,0.10,50,15,random,low,2,2
3_sp_gt_3,0.10,50,15,seq,high,1,1


HUH! Seems like there are two lows. let me fix that.
