In [1]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("01_Run_ECLAIR")

from config import *

This notebook contains the command that WOULD run the ECLAIR pipeline on the provided set of interactions. The full ECLIAR pipeline is not currently provided as a standalone piece of software. For demonstrative purposes, the raw ECLAIR prediciton outputs for a handful of 

Raw ECLAIR outputs are saved as .pkl objects and separated out as [P1]\_[P2]\_0.pkl and [P1]\_[P2]\_1.pkl (for the interface predictions on the P1-P2 interaction for P1 and P2 respectively). The parsing done here merges the two files into one and additionally adds a confidence tier interpretation to the raw prediciton values (only High and Very High) predictions are
retained in the binary interface prediction output.

- Inputs:
  - [P1]\_[P1]\_[prot].pkl (Raw Eclair Prediction outputs)


- Outputs:
  - [P1]\_[P1].txt (Parsed Eclair Prediciton outputs)
  - Interface_Summary.txt


- Dependencies:
  - Should be run AFTER all interacitons have been fed through ECLAIR pipeline for interface prediction
    - **NOTE:** The ECLAIR pipeline is not incluuded in this repository, and treats any output from this pipeline as a static result that is already available

# Collect / Parse all Raw Eclair Predictions

In [3]:
# Store ires summary
summary = []

# Iterate over all predictions in the input directory
for f in tqdm_notebook(glob.glob(raw_eclair_dir + "/*.pkl")):
    # Read ECLAIR Predictions
    preds = pd.read_pickle(f)
    
    # Label by confidence tier
    def label_tier(p):
        if(p < 0.12):
            return "Very Low"
        elif(p < 0.24):
            return "Low"
        elif(p < 0.36):
            return "Medium"
        elif(p < 0.48):
            return "High"
        else:
            return "Very High"
    # FUNCTION END
    preds["Tier"] = preds["Pred"].map(label_tier)
    
    # Reorder so COVID protein is P1 (for consistency with other naming)
    p1, p2 = preds[["P1", "P2"]].values[0]
    if(not "COVID" in p1):
        preds["P1"] = p2
        preds["P2"] = p1
        p1, p2 = p2, p1
        preds["Prot"] = 1 - preds["Prot"]
        preds = preds.sort_values(["Prot", "Pos"])
    
    # Save Preds to dest folder
    preds.to_csv("{0}/{1}_{2}.txt".format(eclair_dir, p1, p2), sep="\t", index=None)
    
    # Store High / Very High Ires for Compiled Summary
    p1_ires = sorted(preds[(preds["Pred"] >= 0.36)&(preds["Prot"] == 0)]["Pos"].to_list())
    p2_ires = sorted(preds[(preds["Pred"] >= 0.36)&(preds["Prot"] == 1)]["Pos"].to_list())
    
    summary.append([p1, p2, "ECLAIR", sum(preds["Prot"] == 0), len(p1_ires), ",".join([str(x) for x in p1_ires]), sum(preds["Prot"] == 1), len(p2_ires), ",".join([str(x) for x in p2_ires])])

# Create final DF
summary = pd.DataFrame(summary, columns=["P1", "P2", "Source", "P1_Len", "P1_N_Ires", "P1_Ires", "P2_Len", "P2_N_Ires", "P2_Ires"])

HBox(children=(IntProgress(value=0, max=10), HTML(value=u'')))




In [4]:
summary.head()

Unnamed: 0,P1,P2,Source,P1_Len,P1_N_Ires,P1_Ires,P2_Len,P2_N_Ires,P2_Ires
0,COVID19nsp5,Q92769,ECLAIR,306,7,139141142169189218278,488,40,"24,71,105,145,166,167,186,190,191,202,203,213,..."
1,COVID19nsp15,P62330,ECLAIR,346,13,10232650103104290291312314315317332,175,49,"15,23,27,31,34,37,38,39,40,41,42,43,45,46,47,4..."
2,COVID19E,Q8IWA5,ECLAIR,75,0,,706,0,
3,COVID19nsp14,P12268,ECLAIR,527,28,"8,9,114,142,144,145,146,147,148,165,168,245,24...",514,70,"36,41,94,117,120,139,140,141,142,144,161,165,2..."
4,COVID19nsp2,O14975,ECLAIR,638,74,"6,8,10,11,12,13,14,15,57,58,60,61,64,65,66,71,...",620,33,"145,226,227,228,229,230,231,292,295,318,353,37..."


In [6]:
# Save summary file
summary.to_csv("{0}/Interface_Summary.txt".format(output_dir), sep="\t", index=None)