In [29]:
# packages

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import hsv_to_rgb
import pandas as pd
import os
from itertools import combinations
import h5py

import sys
sys.path.append("../src")

from analysis import *
from inference import *

In [30]:
import re

def natural_sort(l): 
    convert = lambda text: int(text) if text.isdigit() else text.lower() 
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] 
    return sorted(l, key=alphanum_key)

In [None]:
datapath = "../experiment_outputs/growth_scale_0.1_env_noise0.1"
log = h5py.File(f"{datapath}/data_generation_log.h5", "r")

print(f"n_species = {log.attrs['n_species']}")
print(f"avg_samp_dt = {log.attrs['avg_samp_dt']}")
print(f"env_noise = {log.attrs['env_noise']}")
print(f"meas_noise_list = {log.attrs['meas_noise_list']}")
print(f"n_params_seeds = {log.attrs['n_params_seeds']}")

n_species = [ 3  5 10 20]
avg_samp_dt = [3.  1.5 1.  0.6 0.3]
env_noise = 0.1
meas_noise_list = [0.1]
n_params_seeds = 10


In [None]:
def get_files(datapath, n_sp, env_noise, meas_noise, avg_samp_dt, filetype="dataset", ext="csv"):
    params_seeds = [i.split("param_seed")[1] for i in os.listdir(f"{datapath}/{n_sp}_sp")]

    datafiles = []

    for p in params_seeds:
        datafiles.append(f"{datapath}/{n_sp}_sp/param_seed{p}/meas_noise{meas_noise}/t_samp{avg_samp_dt}/{filetype}{n_sp}_sp{p}_env_noise{env_noise}.{ext}")
    return datafiles

In [None]:
print(f"Numbers of sampling points: {log.attrs['n_samples']}")
print(f"Average sampling intervals: {log.attrs['avg_samp_dt'].round(3)}")
print(f"Number of initial conditions: {log.attrs['n_init_cond']}")
print(f"Number of repetitions: {log.attrs['repetitions']}")
print(f"Environmental noise: {log.attrs['env_noise']}")
print(f"Amounts of measurement noise: {log.attrs['meas_noise_list']}")

env_noise = log.attrs['env_noise']

Numbers of sampling points: [ 11  21  31  51 101]
Average sampling intervals: [3.  1.5 1.  0.6 0.3]
Number of initial conditions: 30
Number of repetitions: 1
Environmental noise: 0.1
Amounts of measurement noise: [0.1]


In [None]:
def calculate_es_score(true_aij, inferred_aij) -> float:
    """GRANT'S edited version to calculate ED score

    Calculate the ecological direction (EDₙ) score (n := number of species in ecosystem).

    Parameters
    ===============
    truth: ndarray(axis0=species_names, axis1=species_names), the ecosystem coefficient matrix used to generate data
    inferred: ndarray(axis0=species_names, axis1=species_names), the inferred ecosystem coefficient matrix
    Returns
    ===============
    ES_score: float
    """

    truth = pd.DataFrame(true_aij).copy()
    inferred = pd.DataFrame(inferred_aij).copy()

    # consider inferred coefficients
    mask = inferred != 0

    # compare sign: agreement when == -2 or +2, disagreement when 0
    nonzero_sign = np.sign(inferred)[mask] + np.sign(truth)[mask]
    corr_sign = (np.abs(nonzero_sign) == 2).sum().sum()
    opposite_sign = (np.abs(nonzero_sign) == 0).sum().sum()

    # count incorrect non-zero coefficients
    wrong_nz = (truth[mask] == 0).sum().sum()

    # combine
    unscaled_score = corr_sign - opposite_sign

    # scale by theoretical extrema
    truth_nz_counts = (truth != 0).sum().sum()
    truth_z_counts = len(truth.index) ** 2 - truth_nz_counts
    theoretical_min = -truth_nz_counts
    theoretical_max = truth_nz_counts

    ES_score = (unscaled_score - theoretical_min) / (theoretical_max - theoretical_min)

    return ES_score

# Infer and score

for n_sp in log.attrs["n_species"]:
    for avg_samp_dt in log.attrs["avg_samp_dt"]:
        for meas_noise in log.attrs["meas_noise_list"]:
            datafiles = get_files(datapath, n_sp, env_noise, meas_noise, avg_samp_dt)
            metadatafiles = get_files(datapath, n_sp, env_noise, meas_noise, avg_samp_dt, "metadata", "txt")

            for file_idx in range(len(datafiles)):
                datafile = datafiles[file_idx]
                metadatafile = metadatafiles[file_idx]
                metadict = get_meta(open(metadatafile, "r").read().split("\n"))
                
                df = pd.read_csv(datafile, index_col=0)
                
                param_columns = [f"r{i}" for i in range(1, n_sp+1)] + \
                [f"A{i},{j}" for i in range(1, n_sp+1) for j in range(1, n_sp+1)]
                cols = ["n_init_cond"] + list(df.columns[1:4]) + param_columns + ["MSPD", "CSR", "ES"]

                infer_out = pd.DataFrame(columns=cols)

                pd.options.mode.chained_assignment = None
                
                p = metadict["parameters"]
                r = p[:n_sp]
                A = p[n_sp:].reshape((n_sp,n_sp))

                for i in tqdm(range(len(df.init_cond_idx.unique()))):
                    combs = list(combinations(df.init_cond_idx.unique(), i+1))
                    np.random.shuffle(combs)
                    for comb in combs[:100]:
                        df_comb = df[df.init_cond_idx.isin(comb)]
                        r_est, A_est = fit_ridge_cv(df_comb)
                        p_est = np.concatenate((r_est, A_est.flatten()))
                        MSPD = ((p-p_est)**2).mean()
                        CSR = (np.sign(A_est)==np.sign(A)).mean()
                        ES = calculate_es_score(A, A_est)
                        infer_out.loc[len(infer_out)] = [i+1, comb, avg_samp_dt, meas_noise] + list(p_est) + [MSPD, CSR, ES]

                infer_out.to_csv(datafile.split('dataset')[0]+"/inference"+datafile.split("dataset")[1])

In [None]:
n_sp = 10
avg_samp_dt = 3.
env_noise = 0.1
meas_noise = 0.1

In [None]:
datafiles = get_files(datapath, n_sp, env_noise, meas_noise, avg_samp_dt)
pd.read_csv(datafiles[0], index_col=0)

Unnamed: 0,dataset,init_cond_idx,t_samp_dist_idx,measurement_noise,replicate,time,dt,sp1,sp2,sp3,sp4,sp5,sp6,sp7,sp8,sp9,sp10
0,0.0,0.0,0.0,0.1,0.0,0.0,3.0,0.020463,0.029830,0.014310,0.022106,0.007418,0.008575,0.005520,0.041375,0.003918,0.012350
1,0.0,0.0,0.0,0.1,0.0,3.0,3.0,0.050150,0.083832,0.039007,0.105712,0.017638,0.041970,0.033069,0.126822,0.069387,0.017040
2,0.0,0.0,0.0,0.1,0.0,6.0,3.0,0.118098,0.082531,0.078010,0.152139,0.033296,0.049153,0.110442,0.094341,0.200847,0.044018
3,0.0,0.0,0.0,0.1,0.0,9.0,3.0,0.180420,0.068103,0.148239,0.194660,0.046932,0.070541,0.134549,0.053704,0.156831,0.120309
4,0.0,0.0,0.0,0.1,0.0,12.0,3.0,0.253067,0.061037,0.170874,0.211621,0.075105,0.142457,0.173067,0.042395,0.179181,0.190378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,29.0,29.0,0.0,0.1,0.0,18.0,3.0,0.215978,0.119305,0.197424,0.223782,0.062158,0.122119,0.239858,0.072690,0.109992,0.248139
326,29.0,29.0,0.0,0.1,0.0,21.0,3.0,0.177467,0.075213,0.169093,0.253530,0.096993,0.137520,0.205123,0.106396,0.138747,0.234307
327,29.0,29.0,0.0,0.1,0.0,24.0,3.0,0.175498,0.099151,0.200643,0.353971,0.110002,0.101953,0.238100,0.097520,0.178762,0.228997
328,29.0,29.0,0.0,0.1,0.0,27.0,3.0,0.206518,0.104705,0.195028,0.253211,0.072336,0.102288,0.192780,0.060564,0.166390,0.233049


In [None]:
import math

def n_comb(n, k):
    return math.factorial(n)/(math.factorial(n-k)*math.factorial(k))

In [34]:
combs = list(combinations(df.dataset.unique(), 3))

In [56]:
combs

[array([ 5.,  3., 17.])]

In [55]:
tuple(comb) in combs

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [57]:
# Infer and score

for n_sp in log.attrs["n_species"]:
    for avg_samp_dt in log.attrs["avg_samp_dt"]:
        for meas_noise in log.attrs["meas_noise_list"]:
            datafiles = get_files(datapath, n_sp, env_noise, meas_noise, avg_samp_dt)
            metadatafiles = get_files(datapath, n_sp, env_noise, meas_noise, avg_samp_dt, "metadata", "txt")

            for file_idx in range(len(datafiles)):
                datafile = datafiles[file_idx]
                metadatafile = metadatafiles[file_idx]
                metadict = get_meta(open(metadatafile, "r").read().split("\n"))
                
                df = pd.read_csv(datafile, index_col=0)
                
                param_columns = [f"r{i}" for i in range(1, n_sp+1)] + \
                [f"A{i},{j}" for i in range(1, n_sp+1) for j in range(1, n_sp+1)]
                cols = ["n_dset"] + list(df.columns[1:4]) + param_columns + ["MSPD", "CSR", "ES"]

                infer_out = pd.DataFrame(columns=cols)

                pd.options.mode.chained_assignment = None
                
                p = metadict["parameters"]
                r = p[:n_sp]
                A = p[n_sp:].reshape((n_sp,n_sp))

                for i in tqdm(range(len(df.dataset.unique()))):
                    if n_comb(len(df.dataset.unique()), i+1) < 10000:
                        combs = list(combinations(df.dataset.unique(), i+1))
                        np.random.shuffle(combs)
                        combs = combs[:100]
                    else:
                        combs = []
                        while len(combs) < 100:
                            comb = tuple(np.random.choice(df.dataset.unique(), i+1, replace=False))
                            if comb not in combs:
                                combs.append(comb)
                    for comb in combs:
                        comb = np.random.choice(df.dataset.unique(), i+1, replace=False)
                        df_comb = df[df.dataset.isin(comb)]
                        r_est, A_est = fit_ridge_cv(df_comb)
                        p_est = np.concatenate((r_est, A_est.flatten()))
                        MSPD = ((p-p_est)**2).mean()
                        CSR = (np.sign(A_est)==np.sign(A)).mean()
                        ES = calculate_es_score(A, A_est)
                        infer_out.loc[len(infer_out)] = [i+1, comb, avg_samp_dt, meas_noise] + list(p_est) + [MSPD, CSR, ES]

                infer_out.to_csv(datafile.split('dataset')[0]+"/inference"+datafile.split("dataset")[1])

100%|██████████| 30/30 [01:32<00:00,  3.07s/it]
100%|██████████| 30/30 [01:25<00:00,  2.86s/it]
100%|██████████| 30/30 [01:26<00:00,  2.89s/it]
100%|██████████| 30/30 [01:23<00:00,  2.77s/it]
100%|██████████| 30/30 [01:20<00:00,  2.68s/it]
100%|██████████| 30/30 [01:21<00:00,  2.70s/it]
100%|██████████| 30/30 [01:28<00:00,  2.96s/it]
100%|██████████| 30/30 [01:26<00:00,  2.90s/it]
100%|██████████| 30/30 [01:33<00:00,  3.12s/it]
100%|██████████| 30/30 [01:44<00:00,  3.49s/it]
100%|██████████| 30/30 [01:35<00:00,  3.17s/it]
100%|██████████| 30/30 [01:26<00:00,  2.89s/it]
100%|██████████| 30/30 [01:32<00:00,  3.08s/it]
100%|██████████| 30/30 [01:28<00:00,  2.95s/it]
100%|██████████| 30/30 [01:26<00:00,  2.88s/it]
100%|██████████| 30/30 [01:27<00:00,  2.93s/it]
100%|██████████| 30/30 [01:28<00:00,  2.95s/it]
100%|██████████| 30/30 [01:23<00:00,  2.80s/it]
100%|██████████| 30/30 [01:24<00:00,  2.80s/it]
100%|██████████| 30/30 [01:40<00:00,  3.35s/it]
100%|██████████| 30/30 [01:30<00:00,  3.