In [9]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("Fetch_Eclair_Preds")

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
import glob

# Paramaters for file locations

In [10]:
# Base directory for whole project
base_dir = "/home/sdw95/3D_SARS2"

# Original ECLAIR Prediction Output Location
orig_dir = "/home/sdw95/Collaborators/Eclair_Runs/2021_01_17_COVID19_Human_Interactome_Use_PDB_Structures/Predictions"

# Destination directory to store ECLAIR predictions locally
dest_dir = "{0}/Data/Eclair_Predictions".format(base_dir)

# Collect / Parse all Raw Eclair Predictions

In [11]:
# Store ires summary
summary = []

# Iterate over all predictions in the input directory
for f in tqdm_notebook(glob.glob(orig_dir + "/*.pkl")):
    # Read ECLAIR Predictions
    preds = pd.read_pickle(f)
    
    # Label by confidence tier
    def label_tier(p):
        if(p < 0.12):
            return "Very Low"
        elif(p < 0.24):
            return "Low"
        elif(p < 0.36):
            return "Medium"
        elif(p < 0.48):
            return "High"
        else:
            return "Very High"
    # FUNCTION END
    preds["Tier"] = preds["Pred"].map(label_tier)
    
    # Reorder so COVID protein is P1 (for consistency with other naming)
    p1, p2 = preds[["P1", "P2"]].values[0]
    if(not "COVID" in p1):
        preds["P1"] = p2
        preds["P2"] = p1
        p1, p2 = p2, p1
        preds["Prot"] = 1 - preds["Prot"]
        preds = preds.sort_values(["Prot", "Pos"])
    
    # Save Preds to dest folder
    preds.to_csv("{0}/{1}_{2}.txt".format(dest_dir, p1, p2), sep="\t", index=None)
    
    # Store High / Very High Ires for Compiled Summary
    p1_ires = sorted(preds[(preds["Pred"] >= 0.36)&(preds["Prot"] == 0)]["Pos"].to_list())
    p2_ires = sorted(preds[(preds["Pred"] >= 0.36)&(preds["Prot"] == 1)]["Pos"].to_list())
    
    summary.append([p1, p2, "ECLAIR", sum(preds["Prot"] == 0), len(p1_ires), ",".join([str(x) for x in p1_ires]), sum(preds["Prot"] == 1), len(p2_ires), ",".join([str(x) for x in p2_ires])])

# Create final DF
summary = pd.DataFrame(summary, columns=["P1", "P2", "Source", "P1_Len", "P1_N_Ires", "P1_Ires", "P2_Len", "P2_N_Ires", "P2_Ires"])

HBox(children=(IntProgress(value=0, max=332), HTML(value=u'')))




In [12]:
summary.head()

Unnamed: 0,P1,P2,Source,P1_Len,P1_N_Ires,P1_Ires,P2_Len,P2_N_Ires,P2_Ires
0,COVID19nsp7,P11233,ECLAIR,83,14,7891112151625262734374960,206,11,374356159160161162163165169173
1,COVID19nsp4,P62072,ECLAIR,500,87,"43,44,46,49,54,55,56,60,61,63,64,65,66,67,68,6...",90,45,"9,12,13,14,15,16,17,18,19,20,22,23,24,31,32,37..."
2,COVID19orf9b,Q7KZI7,ECLAIR,97,20,"44,45,46,48,49,50,52,53,56,57,59,60,61,64,65,6...",788,70,"51,57,61,63,87,88,90,95,98,99,102,135,138,139,..."
3,COVID19orf9c,O43292,ECLAIR,73,25,"6,9,13,27,28,30,31,33,34,35,43,49,50,53,54,55,...",621,0,
4,COVID19orf8,P29122,ECLAIR,121,8,3940427681104105111,969,21,"222,232,237,279,282,283,284,286,298,316,320,32..."


In [13]:
# Save summary file
summary.to_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t", index=None)