In [38]:
import networkx as nx
import os
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import glob
from pandarallel import pandarallel
# Set up pandarallel before any `parallel_apply` call below. The worker count is hard-coded
# to 30 and is not derived from the machine's CPU count, so adjust it when running elsewhere.
pandarallel.initialize(progress_bar=True, nb_workers=30, use_memory_fs=False)

import warnings
# NOTE: this silences every warning category for the rest of the session, including
# deprecation and dtype warnings raised by pandas/networkx in the cells below.
warnings.filterwarnings("ignore")

INFO: Pandarallel will run on 30 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [39]:
# Toggle between the empirical networks ("all") and the null-model networks ("null").
compute_on_null = False

networks_dir = f"../../data/networks/{'null' if compute_on_null else 'all'}/"
plant_features_dir = "../../data/features/plant/"
networks_types = ["binary", "weighted", "binarized_weighted"]

# One frame per network type, listing every CSV found below that type's directory and
# tagged with the type it belongs to; the per-type frames are then stacked.
networks_paths_data = pd.concat(
    [
        pd.DataFrame(
            {"path": glob.glob(f'{networks_dir}{net_type}/**/**/**.csv', recursive=True)}
        ).assign(net_type=net_type)
        for net_type in networks_types
    ]
)
print(f"# networks = {networks_paths_data.shape[0]:,}")

# networks = 1,257


In [40]:
def get_network(network_path):
    """Load a plant x pollinator interaction matrix from CSV as a bipartite graph.

    The CSV is read with plants as rows and pollinators as columns. The plant
    labels come from the "Plant" column, or from "Unnamed: 0" when no "Plant"
    column exists. Every cell with a value strictly greater than 0 becomes an
    edge whose weight is that cell's value; cells that are 0, negative or NaN
    produce no edge.

    Parameters
    ----------
    network_path : str
        Path of the CSV file holding the interaction matrix.

    Returns
    -------
    networkx.Graph
        Graph whose plant nodes carry ``bipartite=0`` and whose pollinator
        nodes carry ``bipartite=1``.

    Raises
    ------
    ValueError
        If a plant label appears more than once, since the rows could not be
        mapped to distinct nodes.
    """
    ep_network = pd.read_csv(network_path)
    if "Plant" not in ep_network.columns:
        ep_network = ep_network.rename(columns={"Unnamed: 0": "Plant"})
    ep_network = ep_network.set_index("Plant")
    if not ep_network.index.is_unique:
        # Fail with a clear message instead of an ambiguous-truth-value error
        # from comparing a multi-row lookup against 0.
        duplicated = ep_network.index[ep_network.index.duplicated()].unique().tolist()
        raise ValueError(f"duplicated plant labels in {network_path}: {duplicated}")
    plants = ep_network.index.tolist()
    pollinators = list(ep_network.columns)
    # One array pass replaces a label-based lookup per (plant, pollinator) cell.
    weights = ep_network.to_numpy()
    G = nx.Graph()
    G.add_nodes_from(plants, bipartite=0)
    G.add_nodes_from(pollinators, bipartite=1)
    # np.nonzero walks the matrix row by row, i.e. plants outer, pollinators inner.
    G.add_weighted_edges_from(
        [(plants[i], pollinators[j], weights[i, j])
         for i, j in zip(*np.nonzero(weights > 0))])
    return G

def draw_network(G):
    """Plot G with labelled nodes on a Kamada-Kawai layout and show the figure."""
    # NOTE(review): get_network adds edges through add_weighted_edges_from without naming
    # the attribute, so 'Value' may not match any edge attribute on those graphs - confirm
    # which attribute the layout is meant to use.
    node_positions = nx.kamada_kawai_layout(G, weight='Value')
    nx.draw_networkx(
        G,
        pos=node_positions,
        node_size=200,
        with_labels=True,
    )
    plt.show()

In [41]:
# The network id is the file name without its extension. splitext removes only the
# trailing extension (a plain replace would also strip ".csv" from the middle of a name),
# and an in-process map avoids starting a worker pool for a trivial string operation.
networks_paths_data["network_id"] = networks_paths_data["path"].map(
    lambda p: os.path.splitext(os.path.basename(p))[0]
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=42), Label(value='0 / 42'))), HBox…

In [42]:
def get_hubiness_scores(path: str) -> dict:
    """Return the HITS hub score of every node in the network stored at `path`.

    The network is loaded with get_network and passed to nx.hits; only the
    first element of the returned pair (the hub scores) is kept.

    Best effort: any exception raised while loading or scoring is reported on
    stdout and np.nan is returned instead of a dict, so callers must handle
    missing values.
    """
    try:
        graph = get_network(path)
        hub_scores, _authority_scores = nx.hits(graph)
    except Exception as e:
        print(f"couldn't compute hubbiness for {path} due to error {e}")
        return np.nan
    else:
        return hub_scores
    
# One HITS computation per network file, spread across the pandarallel workers; each cell
# of the new column holds either a {node: hub score} dict or NaN when scoring failed.
hubbiness_by_network = networks_paths_data.parallel_apply(
    lambda network_row: get_hubiness_scores(network_row["path"]),
    axis=1,
)
networks_paths_data["hubbiness"] = hubbiness_by_network

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=42), Label(value='0 / 42'))), HBox…

In [43]:
# Keep only the identifiers and the per-network score mapping for the expansion step.
hubbiness_columns = ["network_id", "net_type", "hubbiness"]
hubbiness_data_across_networks = networks_paths_data[hubbiness_columns]

In [44]:
# Expand every network's {node: hub score} mapping into a long-format frame with one row
# per node. The "plant" column receives every node label returned by the scoring step.
# Networks whose scoring failed (NaN instead of a dict) are reported and skipped.
hubbiness_data = []
for row in hubbiness_data_across_networks.itertuples(index=False):
    if pd.isna(row.hubbiness):
        print(f"no hubbiness data for {row.net_type}-{row.network_id}")
        continue
    node_scores = pd.DataFrame.from_dict(row.hubbiness, orient="index")
    node_scores = node_scores.reset_index()
    node_scores = node_scores.rename(columns={"index": "plant", 0: "hubbiness_score"})
    node_scores["network_type"] = row.net_type
    node_scores["network_id"] = row.network_id
    hubbiness_data.append(node_scores)

In [45]:
# Stack the per-network frames row-wise. NOTE: this rebinds `hubbiness_data` from a list
# to a DataFrame, so the cell cannot be re-run without first re-running the cell that
# builds the list.
hubbiness_data = pd.concat(hubbiness_data, axis=0)
hubbiness_data.head(n=5)

Unnamed: 0,plant,hubbiness_score,network_type,network_id
0,Aechmea cylindrata,-1.600884e-18,binary,0
1,Nidularium campo-alegrensis,0.07072577,binary,0
2,Vriesea altodaserrae,0.311028,binary,0
3,Vriesea guttata,0.07072577,binary,0
4,Vriesea heterostachys,0.07072577,binary,0


In [46]:
def get_ranked_hubbiness_df(df: pd.DataFrame) -> pd.DataFrame:
    """Add a rank-based, max-normalised version of the hubbiness score.

    The `hubbiness_score` values are ranked with pandas' default ranking
    (ascending, ties share their average rank) and divided by the largest
    rank, so the top-ranked row(s) get exactly 1.0. The result is stored in
    `standardized_hubbiness_score`.

    Parameters
    ----------
    df : pd.DataFrame
        Rows of a single network; must contain a `hubbiness_score` column.

    Returns
    -------
    pd.DataFrame
        A new frame with the extra column. The input frame is left untouched,
        so the function is safe to use inside a groupby apply. An empty input
        is returned as is, without the extra column.
    """
    if df.shape[0] == 0:
        return df
    feature = "hubbiness_score"
    ranks = df[feature].rank()
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(**{f"standardized_{feature}": ranks / ranks.max()})

# Rank scores within each (network_type, network_id) group, so ranks are relative to the
# other nodes of the same network only.
hubbiness_data = (
    hubbiness_data
    .groupby(["network_type", "network_id"])
    .parallel_apply(get_ranked_hubbiness_df)
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=42), Label(value='0 / 42'))), HBox…

In [47]:
# Replace whatever index the grouped apply left behind with a fresh 0..n-1 range;
# drop=True discards the old index rather than turning it into columns.
hubbiness_data = hubbiness_data.reset_index(drop=True)

In [48]:
# Sanity check: the highest standardized score over all networks must be exactly 1.
assert hubbiness_data["standardized_hubbiness_score"].max() == 1

In [49]:
# network_id was derived from file names and is therefore text; cast it to int before
# merging on it. The cast raises if any id is not numeric.
hubbiness_data["network_id"] = hubbiness_data["network_id"].astype(int)

In [51]:
# Attach the hubbiness scores to each network type's features.csv (rewritten in place).
# The cell is safe to re-run: score columns already present in the file, and `_x`/`_y`
# duplicates left by an earlier merge, are dropped before merging so that the freshly
# computed values replace them instead of producing suffixed copies.
merge_keys = ["network_type", "network_id", "Plant"]
for net_type in hubbiness_data.network_type.unique():
    features_path = f"{plant_features_dir}{net_type}/features.csv"
    features_df = pd.read_csv(features_path)
    if "Plant" not in features_df.columns:
        # The plant labels were saved as an unnamed first column.
        features_df = features_df.rename(columns={"Unnamed: 0": "Plant"})
    if "network" in features_df.columns:
        features_df["network_id"] = features_df.network
    relevant_hubbiness_data = hubbiness_data.query(f"network_type == '{net_type}'").rename(columns={"plant": "Plant"})
    score_columns = [c for c in relevant_hubbiness_data.columns if c not in merge_keys]
    stale_columns = [
        c for c in features_df.columns
        if c in score_columns or (c.endswith(("_x", "_y")) and c[:-2] in score_columns)
    ]
    features_df = features_df.drop(columns=stale_columns)
    # Left join keeps every feature row; rows without a computed score get NaN.
    features_df = features_df.merge(relevant_hubbiness_data, on=merge_keys, how="left")
    # Drop stray index columns left by earlier to_csv calls that wrote the index.
    features_df = features_df.drop([c for c in features_df.columns if "Unnamed" in c], axis=1)
    features_df.to_csv(features_path)

In [52]:
# Attach the hubbiness scores to each network type's features_with_classification.csv
# (rewritten in place). A file that already has `hubbiness_score` is not re-merged.
# Cleanup of stray index columns and merge-suffix leftovers now runs on every pass;
# it used to run only after a merge, so re-runs wrote one more stray index column each
# time and kept the `hubbiness_score_x`/`_y` duplicates.
merge_keys = ["network_type", "network_id", "Plant"]
for net_type in hubbiness_data.network_type.unique():
    features_path = f"{plant_features_dir}{net_type}/features_with_classification.csv"
    features_df = pd.read_csv(features_path)
    if "Plant" not in features_df.columns:
        # Rename before the cleanup below, so that plant labels saved as an unnamed
        # first column are not dropped together with the stray index columns.
        features_df = features_df.rename(columns={"Unnamed: 0": "Plant"})
    if "hubbiness_score_x" in features_df.columns:
        # Recover the score from the suffixed copy produced by an earlier merge.
        features_df["hubbiness_score"] = features_df.hubbiness_score_x
    if "hubbiness_score" not in features_df.columns:
        if "network" in features_df.columns:
            features_df["network_id"] = features_df.network
        relevant_hubbiness_data = hubbiness_data.query(f"network_type == '{net_type}'").rename(columns={"plant": "Plant"})
        # Left join keeps every feature row; rows without a computed score get NaN.
        features_df = features_df.merge(relevant_hubbiness_data, on=merge_keys, how="left")
    leftover_columns = [
        c for c in features_df.columns
        if "Unnamed" in c or c.startswith("hubbiness_score_")
    ]
    features_df = features_df.drop(columns=leftover_columns)
    features_df.to_csv(features_path)