In [29]:
import os
import re

import numpy as np
import pandas as pd

In [30]:
# Which feature table this notebook processes; selects the features sub-directory
# and, further down, which classification file and merge columns are used.
features_type = "plant"

# Feature column names of interest, keyed by feature type.
features_of_interest = {
    "plant": [
        "partner.diversity",
        "d",
        "normalised.degree",
        "weighted.betweenness",
        "weighted.closeness",
    ],
    "network": [
        "connectance",
        "NODF",
        "modularity",
        "robustness",
        "robustness_mean",
        "robustness.LL",
    ],
}

# Input/output locations (relative to the notebook); both end with "/".
features_dir = f"../../data/features/{features_type}/"
networks_dir = "../../data/networks/all/"
network_types = ["binary", "weighted", "binarized_weighted"]

# Classification tables, one per feature type.
plant_classification_path = "../../data/ploidy_classification/plant_classification.csv"
network_classification_path = "../../data/ploidy_classification/network_classification.csv"


In [31]:
# Gather, across all network types:
#   * output_paths   - job log files (*.out) found in jobs_output/
#   * features_paths - result CSVs in features_by_network/ (names containing "null" are skipped)
#   * to_submit      - job scripts in jobs/ for which no same-named file exists in jobs_output/
output_paths, features_paths, to_submit = [], [], []
for nt in network_types:
    outdir = f"{features_dir}{nt}/jobs_output/"
    resdir = f"{features_dir}{nt}/features_by_network/"
    jobsdir = f"{features_dir}{nt}/jobs/"

    # Compare bare job names: script names without ".sh" vs. log names without ".out".
    job_names = {name.replace(".sh", "") for name in os.listdir(jobsdir)}
    logged_names = {name.replace(".out", "") for name in os.listdir(outdir)}
    to_submit.extend(f"{jobsdir}{job}.sh" for job in job_names - logged_names)

    output_paths.extend(
        f"{outdir}{name}" for name in os.listdir(outdir) if name.endswith(".out")
    )
    features_paths.extend(
        f"{resdir}{name}"
        for name in os.listdir(resdir)
        if name.endswith(".csv") and "null" not in name
    )

print(f"# output_paths = {len(output_paths):,}\n# result paths = {len(features_paths):,}")
print(f"# unsubmitted jobs = {len(to_submit):,}")

# output_paths = 1,255
# result paths = 1,253
# unsubmitted jobs = 2


In [32]:
import re
from subprocess import *
from collections import defaultdict

# Build job_path_to_id: script path -> list of scheduler job ids currently known to
# qstat for this user, so later cells can avoid touching jobs that are still queued/running.

# Raw string so that "\s" is a regex escape rather than an (invalid) Python string escape.
# Captures the first "/"-initial token that follows "Submit_arguments =".
# NOTE(review): for a script submitted by relative path ("../../data/...") this would
# capture from the first "/" (i.e. "/../data/..."), which would not equal the relative
# paths held in to_submit - confirm against real `qstat -f` output.
job_path_regex = re.compile(r"Submit_arguments\s=.*?(\/.*?)\s")

# Decode the bytes returned by qstat instead of parsing their repr().
qstat_listing = Popen(["qstat", "-u", "halabikeren"], stdout=PIPE).communicate()[0].decode(errors="replace")
# The first five lines are skipped - presumably the qstat table header; TODO confirm.
jobs_log = qstat_listing.split("\n")[5:]
# The job id is whatever precedes the first "." on a line; blank/one-character remnants are dropped.
jobs_ids = [item.split(".")[0] for item in jobs_log if len(item.split(".")[0]) > 1]

job_path_to_id = defaultdict(list)
for job_id in jobs_ids:
    try:
        raw_job_log = Popen(["qstat", "-f", job_id], stdout=PIPE).communicate()[0]
    except OSError:
        # Best effort, as before: a job we cannot query is skipped rather than aborting the scan.
        continue
    # Join the record into one line (newlines/tabs are presumably wrapped-line continuations).
    job_log = raw_job_log.decode(errors="replace").replace("\n", "").replace("\t", "")
    path_match = job_path_regex.search(job_log)
    if path_match is None:
        # No recognisable submit path in this record (e.g. the job finished in the meantime).
        continue
    job_path = path_match.group(1)
    job_path_to_id[job_path].append(job_id)

In [33]:
# Submit every script that has no output file yet, unless its path is already
# registered in job_path_to_id (i.e. the scheduler already lists it).
not_yet_queued = [script for script in to_submit if script not in job_path_to_id]
for jp in not_yet_queued:
    print(jp)
    res = os.system(f"qsub -q itay_75 {jp}")

../../data/features/plant/weighted/jobs/536.sh
../../data/features/plant/binarized_weighted/jobs/536.sh


In [34]:
# Classify finished jobs by inspecting their log files.
#   failed           - log has no "duration" marker and no result CSV exists
#   failed_mem       - log contains the PBS memory-kill message
#   failed_too_small - log contains "too small"
#   to_rerun         - log contains "does not exist"
# Jobs whose script path is still listed in job_path_to_id are skipped.
failed, failed_mem, failed_too_small, to_rerun = [], [], [], []
for out_path in output_paths:
    script_path = out_path.replace("jobs_output", "jobs").replace(".out", ".sh")
    if script_path in job_path_to_id:
        continue

    features_csv = out_path.replace("jobs_output", "features_by_network").replace(".out", "_features.csv")
    with open(out_path, "r") as log_file:
        log_text = log_file.read()

    completed = "duration" in log_text or os.path.exists(features_csv)
    if not completed:
        failed.append(out_path)
    if "PBS: job killed: mem" in log_text:
        failed_mem.append(out_path)
    if "too small" in log_text:
        failed_too_small.append(out_path)
    if "does not exist" in log_text:
        to_rerun.append(out_path)

# Failures not explained by any of the recognised log messages.
failed_other = set(failed).difference(failed_mem, failed_too_small, to_rerun)
print(f"# failed by unknown reason = {len(failed_other):,}\n# failed by memory = {len(failed_mem):,}\n# failed by size = {len(failed_too_small):,}\njobs_to_rerun = {len(to_rerun)}")

# failed by unknown reason = 0
# failed by memory = 2
# failed by size = 0
jobs_to_rerun = 0


In [35]:
# For each network type, find the networks that have an input CSV but no
# "<name>_features.csv" result (result files containing "null" are ignored).
missing = {}
for nt in network_types:
    expected_networks = {
        name.replace(".csv", "")
        for name in os.listdir(f"{networks_dir}{nt}/")
        if name.endswith(".csv")
    }
    results_dir = f"{features_dir}{nt}/features_by_network/"
    produced_networks = {
        name.replace("_features.csv", "")
        for name in os.listdir(results_dir)
        if "null" not in name and name.endswith(".csv")
    }
    missing[nt] = expected_networks - produced_networks

nmissing = sum(len(names) for names in missing.values())
print(f"# missing output = {nmissing}")

# missing output = 4


In [36]:
# Show, per network type, the set of network names that have no features file yet.
missing

{'binary': set(),
 'weighted': {'290', '536'},
 'binarized_weighted': {'290', '536'}}

In [37]:
# Repair the job scripts of the missing networks in place: any "/_features/" path
# component is turned back into "/features/", and a resulting
# "/features/_by_network/" is collapsed into "/features_by_network/".
for nt, absent_networks in missing.items():
    for m in absent_networks:
        jp = f"{features_dir}{nt}/jobs/{m}.sh"
        with open(jp, "r") as script:
            script_text = script.read()
        repaired_text = script_text.replace("/_features/", "/features/")
        repaired_text = repaired_text.replace("/features/_by_network/", "/features_by_network/")
        with open(jp, "w") as script:
            script.write(repaired_text)

In [38]:
# Resubmit the job script of every network that still has no features file.
scripts_to_resubmit = [
    f"{features_dir}{nt}/jobs/{m}.sh"
    for nt, absent_networks in missing.items()
    for m in absent_networks
]
for jp in scripts_to_resubmit:
    res = os.system(f"qsub -q itay_75 {jp}")

In [44]:
# Raise the memory request in the job script of every job that PBS killed for
# exceeding its memory, and delete the stale log so the job counts as unsubmitted again.
#
# Each request is moved up exactly one step of this ladder. The previous chained
# str.replace calls cascaded (15gb -> 30gb -> 50gb) and also matched inside larger
# numbers ("4gb" inside "14gb"); a single regex pass avoids both.
# NOTE(review): 15gb -> 30gb follows the first rule of the old chain; confirm that
# 30gb (rather than 20gb or the cascaded 50gb) is the intended next step.
MEMORY_UPGRADES = {
    "4gb": "10gb",
    "10gb": "15gb",
    "15gb": "30gb",
    "20gb": "30gb",
    "30gb": "50gb",
    "50gb": "80gb",
}
# (?<!\d) keeps e.g. "14gb" from being read as "4gb".
memory_request_regex = re.compile(r"(?<!\d)(?:" + "|".join(map(re.escape, MEMORY_UPGRADES)) + r")")

for jop in failed_mem:
    jp = jop.replace("jobs_output", "jobs").replace(".out", ".sh")
    with open(jp, "r") as f:
        c = f.read()
    # Only the memory request is edited. The former "/features" -> "/_features/"
    # replacement is dropped: it rewrote every data path inside the script
    # (e.g. "/features_by_network" became "/_features/_by_network").
    c = memory_request_regex.sub(lambda found: MEMORY_UPGRADES[found.group(0)], c)
    with open(jp, "w") as f:
        f.write(c)
    if os.path.exists(jop):
        os.remove(jop)
    # Submission is intentionally left disabled here; the script path is only printed.
    # res=os.system(f"qsub -q itay_75 {jp}")
    print(jp)

../../data/features/plant/weighted/jobs/290.sh
../../data/features/plant/binarized_weighted/jobs/290.sh


In [40]:
# Re-run jobs whose log reported "does not exist": drop the old log file,
# then submit the matching job script again.
for stale_log in to_rerun:
    jp = stale_log.replace("jobs_output", "jobs").replace(".out", ".sh")
    if os.path.exists(stale_log):
        os.remove(stale_log)
    res = os.system(f"qsub -q itay_75 {jp}")

In [51]:
# Combine the per-network feature CSVs into one table, tagged with the network type,
# and write it out both as a whole and split per network type.

# Standardized values outside [-LIMIT, LIMIT] are treated as invalid and blanked out.
STANDARDIZED_ABS_LIMIT = 10000

feature_frames = []
for p in features_paths:
    # Paths look like <features_dir><network_type>/features_by_network/<file>.csv,
    # so the network type is the third component from the end.
    nt = p.split("/")[-3]
    df = pd.read_csv(p)
    sd_cols = [col for col in df.columns if col.startswith("standardized_")]
    for c in sd_cols:
        out_of_range = (df[c] < -STANDARDIZED_ABS_LIMIT) | (df[c] > STANDARDIZED_ABS_LIMIT)
        # Vectorized equivalent of the former row-wise apply: out-of-range -> NaN.
        df[c] = df[c].mask(out_of_range)
    df["network_type"] = nt
    feature_frames.append(df)

features = pd.concat(feature_frames)
features = features.rename(columns={"Unnamed: 0": "Plant"})
if "network" in features.columns:
    # regex=False: ".csv" must be removed literally; as a regex the "." would match
    # any character (and pandas warns about the changing default).
    features["network"] = features["network"].str.replace(".csv", "", regex=False).astype(int)

features.to_csv(f"{features_dir}/all_features.csv")
for nt in network_types:
    features.loc[features.network_type == nt].to_csv(f"{features_dir}/{nt}/features.csv")

  features["network"] = features["network"].str.replace(".csv","").astype(int)


# Add ploidy classification to the features

In [47]:
# Pick the classification table and the merge/value columns for the current features_type.
# The table is chosen by the "plant" test, the column lists by the "network" test.
if features_type == "plant":
    classification_df = pd.read_csv(plant_classification_path)
else:
    classification_df = pd.read_csv(network_classification_path)

if features_type == "network":
    # Network-level features are matched on (network type, network id).
    classification_merge_cols = ["network_type", "network_id"]
    features_merge_cols = ["network_type", "network"]
    classification_cols = [
        "conservative_is_polyploid_poly_frac",
        "conservative_is_polyploid_missing_frac",
        "num_plants",
        "num_resolved",
        "num_classified",
        "num_polyploids",
        "num_diploids",
    ]
else:
    # Otherwise features are matched on the plant name.
    classification_merge_cols = ["original_name"]
    features_merge_cols = ["Plant"]
    classification_cols = ["conservative_is_polyploid_by_resolved"]

In [53]:
# Per network type: left-join the classification columns onto the features
# and write the result next to the plain features file.
for nt in network_types:
    rows_of_type = features.network_type == nt
    classification_subset = classification_df[classification_merge_cols + classification_cols]
    df = features.loc[rows_of_type].merge(
        classification_subset,
        how="left",
        left_on=features_merge_cols,
        right_on=classification_merge_cols,
    )
    df.to_csv(f"{features_dir}/{nt}/features_with_classification.csv")