In [14]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import shutil
import sys

import numpy as np
import pandas as pd
sys.path.append("../../../code/statistical_analysis/extinction_analysis/")
from simulator import Simulator, ExtinctionOrder

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=40, use_memory_fs=False)

import seaborn as sns

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import auc

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [15]:
# --- simulation settings -------------------------------------------------
sim_type = "diploids_first"  # also selects the extinction order and names the outputs
rewiring_prob = 0.5
sim_num = 100

# --- output locations ----------------------------------------------------
working_dir = f"../../../data/statistical_analysis/extinction_analysis/rewiring_prob_{rewiring_prob}"
os.makedirs(working_dir, exist_ok=True)
extinction_simluations_output_path = (
    f"{working_dir}/rewiring_prob_{rewiring_prob}_united_{sim_type}_order_extinction_simulations.csv"
)

# --- input feature tables (one per network representation) ---------------
_features_root = "../../../data/features/plant"
weighted_features_data = f"{_features_root}/weighted/features_with_classification.csv"
binary_features_data = f"{_features_root}/binary/features_with_classification.csv"
binarized_features_data = f"{_features_root}/binarized_weighted/features_with_classification.csv"

# --- input network directories -------------------------------------------
all_networks_path = "../../../data/networks/all/"
binary_networks_path = all_networks_path + "binary/"
weighted_networks_path = all_networks_path + "weighted/"
binarized_networks_path = all_networks_path + "binarized_weighted/"

# For each network representation: the directory holding its networks and
# the classification/features table that goes with it.
networks = {
    "weighted": dict(networks=weighted_networks_path, data=weighted_features_data),
    "binary": dict(networks=binary_networks_path, data=binary_features_data),
    "binarized_weighted": dict(networks=binarized_networks_path, data=binarized_features_data),
}

# conduct extinction simulations across all networks with rewiring

In [16]:
# Run the extinction simulations for every network that has no per-network
# output yet -- unless the united simulations file already exists, in which
# case it is simply loaded.
simulators = {}
extinction_simulations = None  # stays None when simulations are generated in this run
if not os.path.exists(extinction_simluations_output_path):
    # Map the configured simulation type to an extinction order. Any value other
    # than "random" / "polyploids_first" falls back to diploids-first.
    ext_order = {
        "random": ExtinctionOrder.random,
        "polyploids_first": ExtinctionOrder.polyploids_first,
    }.get(sim_type, ExtinctionOrder.diploids_first)

    # Collect one record per pending network and build the task table once,
    # instead of re-concatenating a DataFrame on every iteration.
    pending_tasks = []
    for _type in networks:
        networks_dir = networks[_type]["networks"]
        output_dir = f"{working_dir}/{_type}/{sim_type}/"
        os.makedirs(output_dir, exist_ok=True)
        for path in os.listdir(networks_dir):
            # Only network CSVs are tasks; this also skips .ipynb_checkpoints and
            # matches the filter used when the network statistics are computed.
            if not path.endswith(".csv"):
                continue
            output_path = f"{output_dir}{path}"
            if not os.path.exists(output_path):
                pending_tasks.append({"network_path": f"{networks_dir}{path}",
                                      "output_path": output_path,
                                      "network_type": _type,
                                      "num_sim": sim_num})
        # One simulator per network type, since each type has its own classification table.
        simulators[_type] = Simulator(classification_path=networks[_type]["data"],
                                      ext_order=ext_order,
                                      rewiring_flag=True,
                                      rewiring_probability=rewiring_prob)

    inputs_for_exec = pd.DataFrame(pending_tasks,
                                   columns=["network_path", "output_path", "network_type", "num_sim"])
    print(f"# tasks =  {inputs_for_exec.shape[0]:,}")
    if inputs_for_exec.shape[0] > 0:
        # Each task is handed to the simulator of its network type; presumably
        # write_simulations stores the simulations at output_path -- verify against Simulator.
        inputs_for_exec.parallel_apply(
            lambda record: simulators[record.network_type].write_simulations(network_path=record.network_path,
                                                                             output_path=record.output_path,
                                                                             nsim=record.num_sim),
            axis=1)
else:
    extinction_simulations = pd.read_csv(extinction_simluations_output_path)

In [17]:
# When the united simulations file was not loaded, assemble it from the
# per-network simulation outputs of every network type.
if extinction_simulations is None:
    simulations_by_type = []
    for _type in networks:
        output_dir = f"{working_dir}/{_type}/{sim_type}/"
        datasets_simulations = []
        for path in os.listdir(output_dir):
            try:
                # The network id is the last "_"-separated token of the file name.
                network_index = int(path.split("_")[-1].replace(".csv", ""))
                simulations = pd.read_csv(os.path.join(output_dir, path))
            except (ValueError, OSError) as error:
                # Still best-effort, but say what was dropped instead of hiding it:
                # ValueError covers unparsable names and empty/malformed CSVs,
                # OSError covers unreadable entries such as directories.
                print(f"skipping {os.path.join(output_dir, path)}: {error}")
                continue
            simulations["network"] = network_index
            datasets_simulations.append(simulations)
        print(f"# datasets with simulations for networks of type {_type} = {len(datasets_simulations):,}")
        if not datasets_simulations:
            # pd.concat raises on an empty list; a type without outputs is skipped.
            continue
        all_simulations = pd.concat(datasets_simulations)
        all_simulations["network_type"] = _type
        simulations_by_type.append(all_simulations)
    if not simulations_by_type:
        raise ValueError(f"no extinction simulation outputs were found under {working_dir}")
    extinction_simulations = pd.concat(simulations_by_type)
    extinction_simulations.to_csv(extinction_simluations_output_path)

In [18]:
# Keep every column except the "Unnamed..." ones that appear when a CSV
# written together with its index is read back.
_named_columns = [name for name in extinction_simulations.columns if not name.startswith("Unnamed")]
extinction_simulations = extinction_simulations[_named_columns]

## process data

In [19]:
# Order rows by network, simulation and iteration; the running species counts
# computed per simulation further down depend on this row order.
# The sorted frame is rebound rather than sorted in place, because the current
# frame is itself the result of a column selection (in-place edits on such a
# frame are the classic SettingWithCopy hazard).
extinction_simulations = extinction_simulations.sort_values(
    ["network_type", "network", "simulation_index", "primary_iteration", "cascade_iteration"]
)

In [20]:
# Build one row per network with its plant and pollinator sets and their sizes.
# Networks are read from the directories configured in `networks`, i.e. the same
# ones the simulations were run on.
_network_stats_records = []
for _type in networks:
    networks_dir = networks[_type]["networks"]
    for path in os.listdir(networks_dir):
        if not path.endswith(".csv"):
            continue
        network_path = f"{networks_dir}{path}"
        network_id = int(path.replace(".csv", ""))
        # The first (unnamed) CSV column holds the plant names and becomes the index;
        # the remaining columns are treated as pollinators.
        network = pd.read_csv(network_path).rename(columns={"Unnamed: 0": "Plant"}).set_index("Plant")
        plants = set(network.index)
        pollinators = set(network.columns) - {"Plant"}
        _network_stats_records.append({"network_type": _type,
                                       "network_id": network_id,
                                       "plants": plants,
                                       "nplants": len(plants),
                                       "pollinators": pollinators,
                                       "npollinators": len(pollinators)})
# A single construction from records gives a unique 0..n-1 index (concatenating
# one-row frames left every row labelled 0) and a well-formed empty frame when
# no network is found.
network_to_stats = pd.DataFrame(_network_stats_records,
                                columns=["network_type", "network_id", "plants", "nplants",
                                         "pollinators", "npollinators"])

In [21]:
def process_extinction_simulation(df: pd.DataFrame) -> pd.DataFrame:
    """Add running survivor counts and extinction proportions to one simulation.

    Parameters
    ----------
    df : pd.DataFrame
        Rows of a single simulation of a single network, one row per extinction,
        in the order the extinctions should be accumulated. Must contain the
        columns ``network_type``, ``network``, ``simulation_index`` and
        ``extinction_level`` ("plant" or "pollinator").

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with four added columns: ``n_plants`` and
        ``n_pollinators`` (species remaining after each row),
        ``prop_primary_extinctions`` (1 - remaining plants / initial plants) and
        ``prop_secondary_extinctions`` (1 - remaining pollinators / initial
        pollinators). If the simulation has more rows than the network has
        species (plus one), a message is printed and ``df`` is returned as is,
        without the added columns.

    Raises
    ------
    IndexError
        If ``network_to_stats`` has no entry for this network.
    ValueError
        If more plants or pollinators go extinct than the network contains.
    """
    network_type = df.network_type.values[0]
    network_id = df.network.values[0]
    net_stats = network_to_stats.loc[(network_to_stats.network_type == network_type)
                                     & (network_to_stats.network_id == network_id)]
    if net_stats.empty:
        # Same exception type the bare `.values[0]` lookup would raise, but explicit.
        raise IndexError(f"no network statistics available for {network_type} network {network_id}")
    total_plants = net_stats.nplants.values[0]
    total_pollinators = net_stats.npollinators.values[0]

    # Sanity check: a simulation cannot contain more extinctions than species.
    if df.shape[0] - 1 > total_plants + total_pollinators:
        print(f"in {network_type} network {network_id} with {total_plants} plants and {total_pollinators} pollinators, {df.shape[0]} extinctions were simulated")
        return df

    # Species remaining after each row = initial count minus the running number of
    # extinctions at that level (positional arrays, so the frame's index is irrelevant).
    plants_left = total_plants - (df.extinction_level == "plant").cumsum().to_numpy()
    pollinators_left = total_pollinators - (df.extinction_level == "pollinator").cumsum().to_numpy()

    overdrawn = (plants_left < 0) | (pollinators_left < 0)
    if overdrawn.any():
        first = overdrawn.argmax()  # first row at which a count drops below zero
        raise ValueError(f"more extinctions than species with {plants_left[first]} plants and {pollinators_left[first]} pollinators for simulation {df.simulation_index.values[0]} of network {all_networks_path}{network_type}/{network_id}.csv")

    # assign() returns a new frame, so the group passed in by groupby is not mutated.
    return df.assign(n_plants=plants_left,
                     n_pollinators=pollinators_left,
                     prop_primary_extinctions=1 - (plants_left / total_plants),
                     prop_secondary_extinctions=1 - (pollinators_left / total_pollinators))

# Process every simulation (one group per network type, network and
# simulation index) in parallel.
_simulation_keys = ["network_type", "network", "simulation_index"]
extinction_data = (
    extinction_simulations
    .groupby(_simulation_keys)
    .parallel_apply(process_extinction_simulation)
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3143), Label(value='0 / 3143'))), …

In [22]:
# Discard the index produced by the grouped apply (drop=True: it is not kept
# as columns) and give the rows a fresh default integer index.
extinction_data = extinction_data.reset_index(drop=True)

In [23]:
# Keep only the columns of the processed table, in a fixed order:
# identifiers, iteration counters, extinction description, then the derived
# counts and proportions.
_output_columns = [
    "network_type", "network", "simulation_index",
    "primary_iteration", "cascade_iteration",
    "extinction_type", "extinction_level", "extinct_taxon",
    "n_plants", "n_pollinators",
    "prop_primary_extinctions", "prop_secondary_extinctions",
]
extinction_data = extinction_data[_output_columns]

In [24]:
# Surviving share of pollinators = complement of the secondary-extinction share.
# assign() builds a new frame instead of writing a column into a frame that was
# itself produced by a column selection (the SettingWithCopy hazard).
extinction_data = extinction_data.assign(
    prop_secondary_survival=1 - extinction_data["prop_secondary_extinctions"]
)

In [25]:
# Persist the processed table. NOTE: this is the same path the united raw
# simulations are cached under, so a later run that finds this file loads
# these processed rows as its `extinction_simulations` input.
# The index is written too, so it comes back as an "Unnamed: 0" column on reload.
extinction_data.to_csv(extinction_simluations_output_path)

In [26]:
# Remove the per-network simulation outputs of every network type.
# shutil.rmtree replaces the shell call `rm -rf <dir>`: it does not depend on a
# POSIX shell and is not affected by spaces or shell metacharacters in the path.
for _type in networks:
    output_dir = f"{working_dir}/{_type}/{sim_type}/"
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        pass  # nothing to clean up, as with `rm -rf` on a missing directory
    except OSError as error:
        # Cleanup stays best-effort, but a failure is reported rather than lost.
        print(f"could not remove {output_dir}: {error}")