In [3]:
import pandas as pd
import numpy as np
import os
import re
from subprocess import *
from collections import defaultdict

In [1]:
# Feature family handled by this run. It selects the features directory
# below; other cells also branch on it ("plant" vs. "network").
features_type = "network"

# Feature names of interest, keyed by feature family.
features_of_interest = {
    "plant": [
        "partner.diversity",
        "d",
        "normalised.degree",
        "weighted.betweenness",
        "weighted.closeness",
    ],
    "network": [
        "connectance",
        "NODF",
        "modularity",
        "robustness",
        "robustness_mean",
        "robustness.LL",
    ],
}

# Network flavours; each one has its own sub-directory under features_dir.
network_types = ["weighted", "binarized_weighted", "binary"]

# Directories (both keep their trailing slash).
features_dir = f"../../data/features/{features_type}/"
networks_dir = "../../data/networks/all/"

# Ploidy classification tables, one per feature family.
plant_classification_path = "../../data/ploidy_classification/plant_classification.csv"
network_classification_path = "../../data/ploidy_classification/network_classification.csv"


In [4]:
# Gather, across all network types, the job log files (*.out) and the
# per-network feature tables (*.csv whose file name does not contain "null").
output_paths, features_paths, to_submit = [], [], []
for nt in network_types:
    nt_root = f"{features_dir}{nt}/"
    outdir = f"{nt_root}jobs_output/"
    resdir = f"{nt_root}features_by_network/"
    jobsdir = f"{nt_root}jobs/"  # not used in this cell

    # Job logs.
    output_paths.extend(
        f"{outdir}{name}" for name in os.listdir(outdir) if name.endswith(".out")
    )
    # Feature tables; any CSV with "null" in its name is skipped.
    features_paths.extend(
        f"{resdir}{name}"
        for name in os.listdir(resdir)
        if name.endswith(".csv") and "null" not in name
    )
print(f"# output_paths = {len(output_paths):,}\n# result paths = {len(features_paths):,}")

# output_paths = 785
# result paths = 801


In [5]:
# Load every per-network feature table, tag each row with its network type,
# and write the combined table plus one table per network type.
feature_frames = []
for p in features_paths:
    # Paths are built as <features_dir><network_type>/features_by_network/<file>,
    # so the third component from the end is the network type.
    nt = p.split("/")[-3]
    df = pd.read_csv(p)
    sd_cols = [col for col in df.columns if col.startswith("standardized_")]
    for c in sd_cols:
        # Keep standardized values inside [-10000, 10000] and blank out the
        # rest; NaN stays NaN. Vectorised equivalent of a per-element check.
        df[c] = df[c].where(df[c].between(-10000, 10000))
    df["network_type"] = nt
    feature_frames.append(df)

if not feature_frames:
    # pd.concat([]) would raise a far less helpful ValueError.
    raise ValueError(f"no feature files were found under {features_dir}")
features = pd.concat(feature_frames)

if features_type == "plant":
    # When no "Plant" column exists, fall back to the unnamed first column.
    if "Plant" not in features.columns:
        features = features.rename(columns={"Unnamed: 0": "Plant"})
    features["Plant"] = features["Plant"].str.lower()

# NOTE(review): this only refreshes an *existing* "network_id" column; the
# guard may have been meant to test for "network" instead - confirm.
if "network_id" in features.columns:
    # The previous check `dtype == str` is never true for an object column, so
    # the ".csv" suffix was never stripped. regex=False makes the "." literal
    # regardless of the installed pandas version's default.
    if pd.api.types.is_string_dtype(features["network"]):
        features["network_id"] = features["network"].str.replace(".csv", "", regex=False).astype(int)
    else:
        features["network_id"] = features["network"]

# The index is written as well (no index=False), exactly as before.
features.to_csv(f"{features_dir}/all_features.csv")
for nt in network_types:
    nt_features = features.loc[features.network_type == nt]
    print(f"# {nt} networks for analysis = {len(nt_features.network.unique()):,}")
    nt_features.to_csv(f"{features_dir}/{nt}/features.csv")

# weighted networks for analysis = 377
# binarized_weighted networks for analysis = 377
# binary networks for analysis = 47


# Add ploidy classification

In [6]:
# Read the classification table that matches the feature family and pick the
# merge keys: network features join on "network_index" on both sides, anything
# else joins features "Plant" to classification "original_name".
classification_df = pd.read_csv(plant_classification_path if features_type == "plant" else network_classification_path)
classification_merge_cols = ["network_index"] if features_type == "network" else ["original_name"]
features_merge_cols = ["network_index"] if features_type == "network" else ["Plant"]

# Derive the integer network index from the "network" column.
if pd.api.types.is_numeric_dtype(features["network"]):
    # Already numeric: the .str accessor would fail here, so just cast.
    features["network_index"] = features["network"].astype(int)
else:
    # Strip the literal ".csv"; regex=False keeps "." from acting as a regex
    # wildcard and silences the pandas warning about the changing default.
    features["network_index"] = features["network"].str.replace(".csv", "", regex=False).astype(int)

  features["network_index"] = features.network.str.replace(".csv","").astype(int)


In [7]:
# For each network type, left-join the classification data onto the features
# and write the result next to the per-type feature table.
for nt in network_types:
    df = features.loc[features.network_type == nt]

    # If the classification table is split by network type, keep only the rows
    # for this type and drop the column so the merge does not duplicate it.
    relevant_classification_data = classification_df
    if "network_type" in classification_df.columns:
        relevant_classification_data = classification_df.query(f"network_type == '{nt}'").drop(["network_type"], axis=1)

    df = df.merge(relevant_classification_data,
                  left_on=features_merge_cols,
                  right_on=classification_merge_cols,
                  how="left")

    print(f"\nnetwork_type = {nt}")
    # Non-null count per classification column that made it into the merge.
    # A list is used instead of a set: set indexers are deprecated / rejected
    # by pandas, and a list keeps the report in classification column order.
    shared_cols = [col for col in classification_df.columns if col in df.columns]
    print(df[shared_cols].notna().sum())
    df.to_csv(f"{features_dir}/{nt}/features_with_classification.csv", index=False)

  print(df[set(classification_df.columns)&set(df.columns)].notna().sum())



network_type = weighted
num_plants                   377
is_polyploid_poly_frac       377
num_classified               377
num_diploids                 377
network_index                377
network_type                 377
is_polyploid_missing_frac    377
num_resolved                 377
num_polyploids               377
dtype: int64

network_type = binarized_weighted
num_plants                   377
is_polyploid_poly_frac       377
num_classified               377
num_diploids                 377
network_index                377
network_type                 377
is_polyploid_missing_frac    377
num_resolved                 377
num_polyploids               377
dtype: int64

network_type = binary
num_plants                   47
is_polyploid_poly_frac       47
num_classified               47
num_diploids                 47
network_index                47
network_type                 47
is_polyploid_missing_frac    47
num_resolved                 47
num_polyploids               47
dtype: in

  print(df[set(classification_df.columns)&set(df.columns)].notna().sum())
  print(df[set(classification_df.columns)&set(df.columns)].notna().sum())


In [8]:
# Tidy each merged table in place: drop index/standardized columns, resolve
# the "_x"/"_y" suffixes pandas adds to overlapping merge columns in favour of
# the "_y" (right-hand, classification) version, report coverage, and re-save.
for nt in network_types:
    classified_path = f"{features_dir}/{nt}/features_with_classification.csv"
    df = pd.read_csv(classified_path)
    df = df[[c for c in df.columns if "Unnamed" not in c and "standardized_" not in c]]
    # Strip only the trailing "_y"; str.replace would also remove any "_y"
    # that appears in the middle of a column name.
    df = df.rename(columns={c: c[:-len("_y")] for c in df.columns if c.endswith("_y")})
    df = df[[c for c in df.columns if not c.endswith("_x")]]
    # NOTE(review): relevant_classification_data is whatever the merge cell's
    # loop left behind, so this cell needs that cell to have run first.
    # Fraction of rows with a non-null value per classification column.
    display(df[[c for c in relevant_classification_data.columns if c in df.columns]].notna().sum() / df.shape[0])
    df.to_csv(classified_path, index=False)

network_index                1.0
is_polyploid_poly_frac       1.0
is_polyploid_missing_frac    1.0
num_plants                   1.0
num_resolved                 1.0
num_classified               1.0
num_polyploids               1.0
num_diploids                 1.0
dtype: float64

network_index                1.0
is_polyploid_poly_frac       1.0
is_polyploid_missing_frac    1.0
num_plants                   1.0
num_resolved                 1.0
num_classified               1.0
num_polyploids               1.0
num_diploids                 1.0
dtype: float64

network_index                1.0
is_polyploid_poly_frac       1.0
is_polyploid_missing_frac    1.0
num_plants                   1.0
num_resolved                 1.0
num_classified               1.0
num_polyploids               1.0
num_diploids                 1.0
dtype: float64