In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
features_type = "plant"

features_of_interest = {"plant": ["partner.diversity", "d", "normalised.degree", "weighted.betweenness", "weighted.closeness"],
                        "network": ["connectance", "NODF", "modularity", "robustness", "robustness_mean", "robustness.LL"]}

features_dir = f"../../data/features/{features_type}/"
networks_dir = f"../../data/networks/all/"
network_types = ["weighted", "binarized_weighted", "binary"]

plant_classification_path = f"../../data/ploidy_classification/plant_classification.csv"
network_classification_path = f"../../data/ploidy_classification/network_classification.csv"


In [6]:
output_paths = []
features_paths = []
to_submit = []
for nt in network_types:
    outdir = f"{features_dir}{nt}/jobs_output/"
    resdir = f"{features_dir}{nt}/features_by_network/"
    jobsdir = f"{features_dir}{nt}/jobs/"
    unsubmitted = set([p.replace(".sh","") for p in os.listdir(jobsdir)])-set([p.replace("_features.csv","") for p in os.listdir(resdir) if "null" not in p])
    to_submit += [f"{jobsdir}{j}.sh" for j in unsubmitted]
    l1 = [f"{outdir}{p}" for p in os.listdir(outdir) if p.endswith(".out")]
    l2 = [f"{resdir}{p}" for p in os.listdir(resdir) if p.endswith(".csv") and "null" not in p]
    output_paths += l1
    features_paths += l2
print(f"# output_paths = {len(output_paths):,}\n# result paths = {len(features_paths):,}")
print(f"# unsubmitted jobs = {len(to_submit):,}")

# output_paths = 0
# result paths = 710
# unsubmitted jobs = 192


In [12]:
import re
from subprocess import *
from collections import defaultdict

job_path_regex = re.compile("Submit_arguments\s=.*?(\/.*?)\s")

jobs_log = str(Popen(["qstat", "-u", "halabikeren"], stdout=PIPE).communicate()[0]).split("\\n")[5:]
jobs_ids = [item.split(".")[0] for item in jobs_log if len(item.split(".")[0]) > 1]
job_path_to_id = defaultdict(list)
for job_id in jobs_ids:
    try:
        job_log = str(Popen(["qstat", "-f", job_id], stdout=PIPE).communicate()[0]).replace("\\n","").replace("\\t","")
        job_path = job_path_regex.search(job_log).group(1)
        job_path_to_id[job_path].append(job_id)
    except:
        continue

In [20]:
for jp in to_submit:
    if jp not in job_path_to_id:
        print(jp)
        # res=os.system(f"qsub -q itaym {jp}")

In [21]:
failed = []
failed_mem = []
failed_too_small = []
to_rerun = []

net_path_re = re.compile("network  (.*?) does not exist")

for p in output_paths:
    res_path = p.replace("jobs_output", "features_by_network").replace(".out", "_features.csv")
    job_path = p.replace("jobs_output", "jobs").replace(".out", ".sh")
    
    if job_path in job_path_to_id:
        continue

    with open(p, "r") as f:
        c=f.read()
    if "duration" not in c and not os.path.exists(res_path):
        failed.append(p)
    if "PBS: job killed: mem" in c:
        failed_mem.append(p)
    if "too small" in c:
        failed_too_small.append(p)
    if "does not exist" in c:
        net_path = net_path_re.search(c).group(1).replace("null", "all").replace("/NA", ".csv")
        net = pd.read_csv(net_path)
        if net.shape[0] < 3 or net.shape[1] < 3:
            failed_too_small.append(p)
        else:
            to_rerun.append(p)
failed_other = set(failed)-set(failed_mem)-set(failed_too_small)-set(to_rerun)       
print(f"# failed by unknown reason = {len(failed_other):,}\n# failed by memory = {len(failed_mem):,}\n# failed by size = {len(failed_too_small):,}\n# jobs_to_rerun = {len(to_rerun)}")

# failed by unknown reason = 0
# failed by memory = 0
# failed by size = 0
# jobs_to_rerun = 0


In [22]:
jobs=[]
for j in to_rerun:
    jobs.append(j.replace(".out",".sh").replace("_output",""))
print(os.getcwd())
print(jobs)

/groups/itay_mayrose/halabikeren/plant_pollinator_networks/notebooks/feature_extraction
[]


In [23]:
missing = {}
nmissing = 0
for nt in network_types:
    networks = [p.replace(".csv", "") for p in os.listdir(f"{networks_dir}{nt}/") if p.endswith(".csv")]
    results_dir = f"{features_dir}{nt}/features_by_network/"
    resulting_networks = [p.replace("_features.csv","") for p in os.listdir(results_dir) if "null" not in p and p.endswith(".csv")]
    missing[nt] = set(networks)-set(resulting_networks)
    nmissing += len(missing[nt])
print(f"# missing output = {nmissing}")

# missing output = 18


In [9]:
failed_mem_jobs = [p.replace("_output", "").replace(".out", ".sh") for p in failed_mem]
for nt in network_types:
    net_id_to_job = {}
    jobs_dir = f"../../data/features/{features_type}/{nt}/jobs/"
    job_paths = [f"{jobs_dir}/{path}" for path in os.listdir(jobs_dir) if path.endswith(".sh")]
    net_id_regex = re.compile("(\d*)_features.csv", re.MULTILINE)
    for path in job_paths:
        try:
            job_id = int(os.path.basename(path).replace(".sh",""))
            with open(path, "r") as f:
                net_id = int(net_id_regex.search(f.read()).group(1))
            net_id_to_job[net_id] = job_id 
        except:
            continue

    jobs = []
    for net_id in missing[nt]:
        job_id = net_id_to_job[int(net_id)]
        job_path = f"{jobs_dir}{job_id}.sh"
        if job_path not in failed_mem_jobs:
            jobs.append(job_path)
    print(f"# jobs for {nt} = {len(jobs)}")
    print(jobs)
    print("\n")

# jobs for binary = 3
['../../data/features/network/binary/jobs/155.sh', '../../data/features/network/binary/jobs/96.sh', '../../data/features/network/binary/jobs/0.sh']


# jobs for weighted = 15
['../../data/features/network/weighted/jobs/104.sh', '../../data/features/network/weighted/jobs/73.sh', '../../data/features/network/weighted/jobs/72.sh', '../../data/features/network/weighted/jobs/2.sh', '../../data/features/network/weighted/jobs/383.sh', '../../data/features/network/weighted/jobs/79.sh', '../../data/features/network/weighted/jobs/388.sh', '../../data/features/network/weighted/jobs/483.sh', '../../data/features/network/weighted/jobs/260.sh', '../../data/features/network/weighted/jobs/231.sh', '../../data/features/network/weighted/jobs/387.sh', '../../data/features/network/weighted/jobs/269.sh', '../../data/features/network/weighted/jobs/4.sh', '../../data/features/network/weighted/jobs/357.sh', '../../data/features/network/weighted/jobs/386.sh']


# jobs for binarized_weight

In [147]:
# for nt in missing:
#     for m in missing[nt]:
#         jp = f"{features_dir}{nt}/jobs/{m}.sh"
#         with open(jp, "r") as f:
#             c=f.read()
#         c=c.replace("/_features/", "/features/").replace("/features/_by_network/", "/features_by_network/")
#         with open(jp, "w") as f:
#             f.write(c)

In [148]:
# for nt in missing:
#     for m in missing[nt]:
#         jp = f"{features_dir}{nt}/jobs/{m}.sh"
#         res = os.system(f"qsub -q itay_75 {jp}")

In [21]:
for jop in failed_mem:
    jp = jop.replace("jobs_output", "jobs").replace(".out", ".sh")
    with open(jp, "r") as f:
        c=f.read()
    c=c.replace("/features", "/_features/").replace("15gb", "30gb").replace("50gb", "80gb").replace("30gb", "50gb").replace("20gb", "30gb").replace("15gb", "20gb").replace("10gb", "15gb").replace("4gb", "10gb")
    with open(jp, "w") as f:
        f.write(c)
    if os.path.exists(jop):
        os.remove(jop)
    # res=os.system(f"qsub -q itay_75 {jp}")
    print(jp)

In [40]:
for f in to_rerun:
    if os.path.exists(f):
        os.remove(f)
    jp = f.replace("jobs_output", "jobs").replace(".out", ".sh")
    res = os.system(f"qsub -q itay_75 {jp}")

In [7]:
features = []
for p in features_paths:
    nt = p.split("/")[-3]
    df = pd.read_csv(p)
    sd_cols = [col for col in df.columns if col.startswith("standardized_")]
    for c in sd_cols:
        df[c] = df[c].apply(lambda x: np.nan if x < -10000 or x > 10000 else x)
    df["network_type"] = nt
    features.append(df)
features = pd.concat(features)
if features_type == "plant":
    if "Plant" not in features.columns:
        features = features.rename(columns={"Unnamed: 0": "Plant"})
    features.Plant = features.Plant.str.lower()
if "network_id" in features.columns:
    if features["network"].dtype == str:
        features["network_id"] = features["network"].str.replace(".csv","").astype(int)
    else:
        features["network_id"] = features["network"] 
features.to_csv(f"{features_dir}/all_features.csv")
for nt in network_types:
    nt_features = features.loc[features.network_type == nt]
    print(f"# {nt} networks for analysis = {len(nt_features.network.unique()):,}")
    nt_features.to_csv(f"{features_dir}/{nt}/features.csv")

# weighted networks for analysis = 710


# add classification

In [8]:
classification_df = pd.read_csv(plant_classification_path if features_type == "plant" else network_classification_path)
classification_merge_cols = ["network_id"] if features_type == "network" else ["original_name"]
features_merge_cols = ["network_id"] if features_type == "network" else ["Plant"]

In [9]:
features["network_id"] = features.network.str.replace(".csv","").astype(int)

  features["network_id"] = features.network.str.replace(".csv","").astype(int)


In [10]:
for nt in network_types:
    df = features.loc[features.network_type == nt]
    relevant_classification_data = classification_df
    if "network_type" in classification_df.columns:
        relevant_classification_data = classification_df.query(f"network_type == '{nt}'").drop(["network_type"], axis=1)
    df = df.merge(relevant_classification_data, 
                  left_on=features_merge_cols,
                  right_on=classification_merge_cols,
                  how="left")
    print(df[set(classification_df.columns)&set(df.columns)].notna().sum())
    df.to_csv(f"{features_dir}/{nt}/features_with_classification.csv", index=False)   

  print(df[set(classification_df.columns)&set(df.columns)].notna().sum())


matched_name                 5636
is_polyploid_by_matched         0
is_polyploid                 5755
polyploid_age                5754
is_polyploid_by_original     5490
polyploid_age_by_matched        0
is_polyploid_by_resolved     5604
polyploid_age_by_original    5489
original_name                5755
resolved_name                5636
polyploid_age_by_resolved    5603
dtype: int64


In [11]:
for nt in network_types:
    df = pd.read_csv(f"{features_dir}/{nt}/features_with_classification.csv")
    df = df[[c for c in df.columns if "Unnamed" not in c and not "standardized_" in c]]
    df = df.rename(columns={c: c.replace("_y","") for c in df.columns if c.endswith("_y")})
    df = df[[c for c in df.columns if not c.endswith("_x")]]
    display(df[[c for c in relevant_classification_data.columns if c in df.columns]].notna().sum() / df.shape[0])
    df.to_csv(f"{features_dir}/{nt}/features_with_classification.csv", index=False)   

resolved_name                0.557413
matched_name                 0.557413
original_name                0.569182
is_polyploid_by_original     0.542973
polyploid_age_by_original    0.542874
is_polyploid_by_matched      0.000000
polyploid_age_by_matched     0.000000
is_polyploid_by_resolved     0.554248
polyploid_age_by_resolved    0.554149
is_polyploid                 0.569182
polyploid_age                0.569083
dtype: float64