In [8]:
# packages

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import hsv_to_rgb
import pandas as pd
import os
from itertools import combinations
import h5py

import sys
sys.path.append("../src")

from analysis import *
from inference import *

In [9]:
import re

def natural_sort(l):
    """Return the strings in ``l`` sorted in natural (human) order.

    Runs of ASCII digits are compared by numeric value and all other text is
    compared case-insensitively, so ``"file2"`` sorts before ``"file10"``.

    Parameters
    ----------
    l : iterable of str
        Strings to sort. The input is not modified.

    Returns
    -------
    list of str
        A new list with the same elements in natural order.
    """
    def alphanum_key(key):
        # re.split with a capturing group always alternates text, digits, text, ...
        # so odd positions are ASCII digit runs and even positions are plain text.
        # Typing tokens by position (instead of str.isdigit) avoids calling int()
        # on non-ASCII digit characters such as "²", which int() rejects.
        tokens = re.split('([0-9]+)', key)
        return [int(tok) if idx % 2 else tok.lower() for idx, tok in enumerate(tokens)]

    return sorted(l, key=alphanum_key)

In [10]:
# Root folder of the generated experiment and its data-generation log.
datapath = "../experiment_outputs/growth_rep_env_noise0.1"
log = h5py.File(f"{datapath}/data_generation_log.h5", "r")

# Echo the logged generation settings, one "name = value" line per attribute.
for attr_name in ("n_species", "avg_samp_dt", "env_noise", "meas_noise_list", "n_params_seeds"):
    print(f"{attr_name} = {log.attrs[attr_name]}")

n_species = [10]
avg_samp_dt = [3.  1.5 1. ]
env_noise = 0.1
meas_noise_list = [0.1]
n_params_seeds = 5


In [11]:
def get_files(datapath, n_sp, env_noise, meas_noise, avg_samp_dt, filetype="dataset", ext="csv"):
    """Build one file path per parameter seed found under ``{datapath}/{n_sp}_sp``.

    Each sub-entry named ``param_seed<seed>`` yields the path
    ``{datapath}/{n_sp}_sp/param_seed<seed>/meas_noise{meas_noise}/t_samp{avg_samp_dt}/{filetype}{n_sp}_sp<seed>_env_noise{env_noise}.{ext}``.
    The paths are only constructed; their existence is not checked.

    Parameters
    ----------
    datapath : str
        Root folder of the experiment outputs.
    n_sp, env_noise, meas_noise, avg_samp_dt
        Values interpolated verbatim into the path.
    filetype : str, default "dataset"
        File name prefix.
    ext : str, default "csv"
        File extension, without the dot.

    Returns
    -------
    list of str
        Paths ordered by seed (numeric seeds in numeric order), so that two
        calls with different ``filetype`` return index-aligned lists.

    Raises
    ------
    OSError
        If ``{datapath}/{n_sp}_sp`` cannot be listed.
    """
    prefix = "param_seed"
    species_dir = f"{datapath}/{n_sp}_sp"

    # Skip stray entries (e.g. ".DS_Store") that are not seed folders; splitting
    # them on the prefix would otherwise fail with an IndexError.
    seeds = [entry[len(prefix):] for entry in os.listdir(species_dir) if entry.startswith(prefix)]

    # os.listdir returns entries in arbitrary order; sort for a deterministic result,
    # numeric seeds first in numeric order ("2" before "10").
    seeds.sort(key=lambda s: (not s.isdecimal(), int(s) if s.isdecimal() else 0, s))

    return [
        f"{species_dir}/param_seed{p}/meas_noise{meas_noise}/t_samp{avg_samp_dt}/{filetype}{n_sp}_sp{p}_env_noise{env_noise}.{ext}"
        for p in seeds
    ]

In [12]:
# Environmental noise level, read from the log and passed to get_files in the inference cell.
env_noise = log.attrs['env_noise']

# Human-readable summary of the logged settings, printed as "label: value".
for label, value in (
    ("Numbers of sampling points", log.attrs['n_samples']),
    ("Average sampling intervals", log.attrs['avg_samp_dt'].round(3)),
    ("Number of initial conditions", log.attrs['n_init_cond']),
    ("Number of repetitions", log.attrs['repetitions']),
    ("Environmental noise", env_noise),
    ("Amounts of measurement noise", log.attrs['meas_noise_list']),
):
    print(f"{label}: {value}")

Numbers of sampling points: [11 21 31]
Average sampling intervals: [3.  1.5 1. ]
Number of initial conditions: 5
Number of repetitions: 20
Environmental noise: 0.1
Amounts of measurement noise: [0.1]


In [13]:
def calculate_es_score(true_aij, inferred_aij) -> float:
    """Score how well the signs of inferred coefficients match the true ones.

    This is the ES score (the original notes also call it the ecological
    direction, ED, score). Only entries where ``inferred_aij`` is non-zero are
    scored: each one counts +1 if its sign equals the sign of the true
    coefficient, -1 if the signs are opposite, and 0 if the true coefficient is
    zero (spurious non-zero estimates are not penalised). With ``n_true`` the
    number of non-zero true coefficients, the raw count lies in
    ``[-n_true, n_true]`` and is rescaled to ``[0, 1]``::

        ES = (n_agree - n_opposite + n_true) / (2 * n_true)

    Parameters
    ----------
    true_aij : array-like, shape (n_species, n_species)
        Coefficient matrix used to generate the data.
    inferred_aij : array-like, shape (n_species, n_species)
        Inferred coefficient matrix.

    Returns
    -------
    float
        The score in ``[0, 1]``; 0.5 when nothing was inferred. NaN when
        ``true_aij`` has no non-zero entry, because the scale is undefined.

    Raises
    ------
    ValueError
        If the two matrices do not have the same shape.
    """
    truth = np.asarray(true_aij, dtype=float)
    inferred = np.asarray(inferred_aij, dtype=float)
    if truth.shape != inferred.shape:
        raise ValueError(
            f"shape mismatch: true_aij has shape {truth.shape}, inferred_aij has shape {inferred.shape}"
        )

    # Only coefficients that were actually inferred take part in the comparison.
    inferred_nz = inferred != 0

    # Sum of signs: +/-2 means same sign, 0 means opposite signs,
    # +/-1 means the true coefficient is zero (neither rewarded nor penalised).
    sign_sum = np.sign(inferred[inferred_nz]) + np.sign(truth[inferred_nz])
    n_agree = int(np.count_nonzero(np.abs(sign_sum) == 2))
    n_opposite = int(np.count_nonzero(sign_sum == 0))

    # The theoretical extrema of (n_agree - n_opposite) are -n_true and +n_true.
    n_true = int(np.count_nonzero(truth != 0))
    if n_true == 0:
        # No true interaction to recover: the scaled score is undefined (0 / 0).
        return float("nan")

    return float((n_agree - n_opposite + n_true) / (2 * n_true))

In [15]:
# NOTE(review): `df` is only assigned by pd.read_csv inside the "Infer and score" loop
# cell further down, and this cell's execution count (15) is later than that cell's (14).
# It therefore shows the last dataset that loop loaded and raises NameError on a fresh
# top-to-bottom run.
df

Unnamed: 0,dataset,init_cond_idx,t_samp_dist_idx,measurement_noise,replicate,time,dt,sp1,sp2,sp3,sp4,sp5,sp6,sp7,sp8,sp9,sp10
0,0.0,0.0,2.0,0.1,0.0,0.0,1.0,0.026325,0.005365,0.023963,0.013102,0.024119,0.015263,0.013260,0.023745,0.024693,0.021861
1,0.0,0.0,2.0,0.1,0.0,1.0,1.0,0.041217,0.007788,0.053101,0.026880,0.030246,0.023798,0.027603,0.021556,0.037407,0.035245
2,0.0,0.0,2.0,0.1,0.0,2.0,1.0,0.092971,0.011043,0.107373,0.052974,0.052923,0.024794,0.061179,0.025030,0.058830,0.035552
3,0.0,0.0,2.0,0.1,0.0,3.0,1.0,0.141262,0.015159,0.204517,0.081370,0.103300,0.031650,0.076802,0.027765,0.059759,0.037350
4,0.0,0.0,2.0,0.1,0.0,4.0,1.0,0.225501,0.017266,0.275799,0.141263,0.148743,0.038659,0.115082,0.040491,0.065005,0.041596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3095,99.0,4.0,2.0,0.1,19.0,26.0,1.0,0.363928,0.005563,0.442523,0.256032,0.301295,0.141893,0.034704,0.092784,0.159031,0.055561
3096,99.0,4.0,2.0,0.1,19.0,27.0,1.0,0.400128,0.005218,0.408842,0.264956,0.347910,0.140371,0.031851,0.112886,0.179235,0.067466
3097,99.0,4.0,2.0,0.1,19.0,28.0,1.0,0.406289,0.004069,0.555427,0.217759,0.284295,0.167293,0.032265,0.086926,0.157854,0.066552
3098,99.0,4.0,2.0,0.1,19.0,29.0,1.0,0.429302,0.004914,0.495994,0.275562,0.335936,0.144133,0.033661,0.093785,0.172065,0.059546


In [14]:
# Infer and score
#
# For every (n_species, sampling interval, measurement noise) setting in the log,
# fit each dataset on subsets of its initial conditions, compare the estimates with
# the true parameters from the dataset's metadata file, and write one "inference"
# CSV next to each dataset.

# At most this many initial-condition subsets are fitted per subset size.
MAX_COMBS_PER_SIZE = 100

# Silence pandas' chained-assignment warning for the fits below (global pandas option).
pd.options.mode.chained_assignment = None

for n_sp in log.attrs["n_species"]:
    # Names of the flattened parameter vector: r1..r{n_sp}, then A flattened row by row.
    param_columns = [f"r{i}" for i in range(1, n_sp+1)] + \
        [f"A{i},{j}" for i in range(1, n_sp+1) for j in range(1, n_sp+1)]

    for avg_samp_dt in log.attrs["avg_samp_dt"]:
        for meas_noise in log.attrs["meas_noise_list"]:
            datafiles = get_files(datapath, n_sp, env_noise, meas_noise, avg_samp_dt)
            metadatafiles = get_files(datapath, n_sp, env_noise, meas_noise, avg_samp_dt, "metadata", "txt")

            # The two lists come from the same directory listing and are paired by position.
            for datafile, metadatafile in zip(datafiles, metadatafiles):
                # Close the metadata file as soon as it has been read.
                with open(metadatafile, "r") as meta_fh:
                    metadict = get_meta(meta_fh.read().split("\n"))

                df = pd.read_csv(datafile, index_col=0)

                # NOTE(review): the three names taken from df.columns[1:4] label the values
                # (comb, avg_samp_dt, meas_noise) written below; in the dataset shown above the
                # middle one is "t_samp_dist_idx" although it receives avg_samp_dt - confirm
                # this labelling is intended before relying on it downstream.
                cols = ["n_init_cond"] + list(df.columns[1:4]) + param_columns + ["MSPD", "CSR", "ES"]

                # True parameters: first n_sp entries, then the n_sp x n_sp matrix A.
                p = metadict["parameters"]
                r = p[:n_sp]
                A = p[n_sp:].reshape((n_sp,n_sp))

                init_conds = df.init_cond_idx.unique()

                # Collect rows in a list and build the frame once, instead of growing
                # a DataFrame row by row (which re-allocates on every insertion).
                rows = []

                # NOTE(review): tqdm is not imported in the visible import cell; presumably it
                # comes in through one of the wildcard imports - confirm.
                for i in tqdm(range(len(init_conds))):
                    # All subsets of i+1 initial conditions, in random order.
                    # NOTE(review): no seed is set, so the sampled subsets differ between runs.
                    combs = list(combinations(init_conds, i+1))
                    np.random.shuffle(combs)
                    for comb in combs[:MAX_COMBS_PER_SIZE]:
                        df_comb = df[df.init_cond_idx.isin(comb)]
                        r_est, A_est = fit_ridge_cv(df_comb)
                        p_est = np.concatenate((r_est, A_est.flatten()))
                        # Mean squared difference between true and estimated parameters.
                        MSPD = ((p-p_est)**2).mean()
                        # Fraction of entries of A whose sign is estimated correctly.
                        CSR = (np.sign(A_est)==np.sign(A)).mean()
                        ES = calculate_es_score(A, A_est)
                        rows.append([i+1, comb, avg_samp_dt, meas_noise] + list(p_est) + [MSPD, CSR, ES])

                infer_out = pd.DataFrame(rows, columns=cols)

                # Same folder as the dataset, with the "dataset" file-name prefix replaced by
                # "inference". Working on the base name avoids the doubled "/" and keeps the
                # path correct even if a parent folder name contains the word "dataset".
                out_name = os.path.basename(datafile).replace("dataset", "inference", 1)
                infer_out.to_csv(os.path.join(os.path.dirname(datafile), out_name))

100%|██████████| 5/5 [00:04<00:00,  1.12it/s]
100%|██████████| 5/5 [00:03<00:00,  1.61it/s]
100%|██████████| 5/5 [00:03<00:00,  1.55it/s]
100%|██████████| 5/5 [00:03<00:00,  1.60it/s]
100%|██████████| 5/5 [00:03<00:00,  1.58it/s]
100%|██████████| 5/5 [00:03<00:00,  1.39it/s]
100%|██████████| 5/5 [00:03<00:00,  1.29it/s]
100%|██████████| 5/5 [00:04<00:00,  1.14it/s]
100%|██████████| 5/5 [00:04<00:00,  1.15it/s]
100%|██████████| 5/5 [00:03<00:00,  1.26it/s]
100%|██████████| 5/5 [00:04<00:00,  1.25it/s]
100%|██████████| 5/5 [00:03<00:00,  1.39it/s]
100%|██████████| 5/5 [00:04<00:00,  1.00it/s]
100%|██████████| 5/5 [00:04<00:00,  1.05it/s]
100%|██████████| 5/5 [00:04<00:00,  1.21it/s]
