In [4]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("Select_Models")

from config import *
from helper_functions import fasta2dict, NWSeqAlignment, alignPrint, batchUniProtAPI


import requests
import pandas as pd
pd.options.display.max_columns = 999

import numpy as np
import helper as my

import glob
import os
import sys

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

from mjm_tools import zip_res_range, unzip_res_range, open_pdb

from collections import defaultdict

# Paramaters for file locations

In [5]:
# Base directory for whole project
base_dir = "/home/sdw95/3D_SARS2"

# Load Interactions / Predicted ECLAIR Pred Data

In [10]:
# Read Interactions
interactions = pd.read_csv("{0}/Data/Interactions.txt".format(base_dir), sep="\t")

# List of Human Interactors
human_unis = set(interactions["P2"])

In [19]:
# Read ECLAIR Preds
inter2preds = glob.glob("{0}/Data/Eclair_Predictions/*".format(base_dir))
inter2preds = {os.path.basename(x).split(".")[0]:pd.read_csv(x, sep="\t") for x in inter2preds}

# Select Best Available Viral Structures

In [43]:
# Select Homology Models for COVID Proteins
#
# NOTE: This is done is a fairly hacky way by fetching the info from the ECLAIR feature set
#       Homology models were originally created and stored here...
#
#       /home/sdw95/Collaborators/Lab_Member_Requests/Haiyuan/2020_03_27_COVID19_3DInteractome/modelling_test/{0}.*pdb
#
#       Since the code / data for this project got split up in multiple locations, I fetch the models from the ECLAIR
#       feature pipeline to be positive I'm using the same models used in the predictions
#
interactions["P1 PDB"] = interactions["P1"].map(lambda x: glob.glob("/home/adr66/eclair/data/modbase/models/hash/{0}.pdb".format(x.replace("COVID19", "")))[0] if glob.glob("/home/adr66/eclair/data/modbase/models/hash/{0}.pdb".format(x.replace("COVID19", ""))) else np.nan)
interactions.ix[interactions["P1"] == "COVID19nsp5C145A", "P1 PDB"] = glob.glob("/home/adr66/eclair/data/modbase/models/hash/nsp5_C145A.pdb")[0]

# Manually drop low quality homology models
# These 4 homology models were generated originally, but later
# decided to be removed from the analysis because the template
# quality did not pass our thresholds
interactions.ix[interactions["P1"].map(lambda x: "nsp2" in x or "nsp4" in x or "orf6" in x or "orf9c" in x), "P1 PDB"] = np.nan

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [44]:
# Replace Homology Models with PDB Structures where available
# For Viral proteins, replace any of the structures with newer PDB models where available
#
# NOTE: Code to select PDB structures for COVID proteins not provided here
#
prot2pdbs = defaultdict(set)
for f in glob.glob("/home/sdw95/3D_SARS2/Misc_Notebooks/2021_01_13_SARS2_PDB_Sifts_Mapping/SIFTS_Mapped_PDBs/*.pdb"):
    prot = os.path.basename(f).split("_")[0]
    prot2pdbs[prot].add(f)

def do(p1, p2, pdb):
    if(p1 in prot2pdbs):
        # Special case to select best structure for N protein
        # (This is the only one that has multiple PDB structures with unique coverages)
        if(p1 == "COVID19N"):
            pair = "_".join(sorted([p1, p2]))
            preds = inter2preds[pair]
            preds = preds[preds["Prot"] == (p1 > p2)]
            
            w1 = sum(preds[preds["Pos"].map(lambda x: 44 <= x <= 180)]["Pred"])
            w2 = sum(preds[preds["Pos"].map(lambda x: 251 <= x <= 364)]["Pred"])
            if(w1 > w2):
                return '/home/sdw95/3D_SARS2/Misc_Notebooks/2021_01_13_SARS2_PDB_Sifts_Mapping/SIFTS_Mapped_PDBs/COVID19N_7ACT_A.pdb'
            else:
                return '/home/sdw95/3D_SARS2/Misc_Notebooks/2021_01_13_SARS2_PDB_Sifts_Mapping/SIFTS_Mapped_PDBs/COVID19N_6WZQ_A.pdb'
        return list(prot2pdbs[p1])[0]
    else:
        return pdb
interactions["P1 PDB"] = interactions[["P1", "P2", "P1 PDB"]].apply(lambda x: do(*x), axis=1)

# Select Best Available Human Structure

In [172]:
# Read SIFTS Data (to use for mapping eclair predictions onto structure
# for selecting structure with the best cummulative ECLAIR score)
sifts = pd.read_csv("/home/resources/sifts/parsed_files/pdbresiduemapping.txt", sep="\t")

In [173]:
# Select Best PDB Model for Human Protein (best overlap with ECLAIR Predictions)
def get_best_pdb(cov, uni):
    # First Fetch ECLAIR Predictions
    pair = "_".join([cov, uni])
    preds = inter2preds[pair]
    preds = preds[preds["Prot"] == (uni > cov)].set_index("Pos")["Pred"].to_dict()
    
    # Fetch all SIFTS mappings for this protein
    tmp = sifts[sifts["UniProt"] == uni][["PDB", "Chain", "UniProt", "MappableResInPDBChainOnUniprotBasis"]].copy()
    
    # To filter out PDB-like structures (difficult to work with)
    real_pdbs = [x for x in tmp["PDB"].unique() if os.path.exists("/home/resources/pdb/data/{0}/pdb{1}.ent.gz".format(x.lower()[1:3], x.lower()))]
    tmp = tmp[tmp["PDB"].isin(real_pdbs)]
    
    # NOTE: There's a manual edit here because some code needed to be re-run
    #       and to update the summary files, and the best PDB changed since docking
    #
    # Manually select PDB / Chain for one protein where the selected chain
    # changed between the original run and the current date (need to keep models
    # file consistent with the structures that were actually used for docking)
    if(uni == "P15151"):
        return ("3EPC", "R")
    
    if(len(tmp) == 0):
        return np.nan, np.nan
    elif(len(tmp) == 1):
        return tmp[["PDB", "Chain"]].values[0]
    
    
    tmp["Len"] = tmp["MappableResInPDBChainOnUniprotBasis"].map(lambda x: len(unzip_res_range(x)))
    tmp["Weighted_Len"] = tmp["MappableResInPDBChainOnUniprotBasis"].map(lambda x:sum([preds[int(x)] for x in unzip_res_range(x)]))
    
    
    #tmp = tmp.sort_values(["Weighted_Len", "Len"], ascending=False)
    
    return tmp[["PDB", "Chain"]].values[0]    
# FUNCTION END
tmp = interactions[["P1", "P2"]].apply(lambda x: get_best_pdb(*x), axis=1)
interactions["P2 PDB"] = [x[0] for x in tmp]
interactions["P2 Chain"] = [x[1] for x in tmp]

In [174]:
# Interactions with human + viral structure available
# (No ModBase Models)
print len(interactions[(~pd.isnull(interactions["P1 PDB"]))&(~pd.isnull(interactions["P2 PDB"]))])

120


In [175]:
# Add in Modbase Models where no PDB structure available
modbase = pd.read_csv("/home/resources/modbase/parsed_files/all_modbase_models.txt", sep="\t")
modbase[modbase["modpipe_quality_score"] >= 1.1].drop_duplicates("uniprot")

uni2modbase = modbase.set_index("uniprot")["modbase_modelID"].map(lambda x: ("/home/resources/modbase/data/hash/{0}.pdb".format(x))).to_dict()

interactions.ix[pd.isnull(interactions["P2 PDB"]), "P2 Chain"] = " "
interactions.ix[pd.isnull(interactions["P2 PDB"]), "P2 PDB"] = interactions.ix[pd.isnull(interactions["P2 PDB"]), "P2"].map(lambda x: uni2modbase[x] if x in uni2modbase else np.nan)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [176]:
print len(interactions[(~pd.isnull(interactions["P1 PDB"]))&(~pd.isnull(interactions["P2 PDB"]))])

239


# Store Local Copy of all Undocked Structures

In [177]:
# COVID protein structures were previous re-indexed so that PDB position
# corresponds with sequence position (code not shown?)

# Here we can just copy them locally
f2f = dict()
for f in tqdm_notebook(interactions["P1 PDB"].unique()):
    if(pd.isnull(f)):
        f2f[f] = f
        continue
    
    # Handle renaming for Homology Models Differently
    if("adr66" in f):
        # Reassign Chain ID to "A" (Viral protein structure will consistently be chain A)
        # Raw Homology Models are assigned no chain (" ")
        # Previously Mapped PDB structures were already reassigend chain "A" when reindexed 
        pdb_df = my.pdb2df(f)
        pdb_df["Chain"] = "A"
        
        # Save
        my.df2pdb("{0}/Data/Undocked_Structures/COVID19{1}".format(base_dir, os.path.basename(f)), pdb_df)
        f2f[f] = "{0}/Data/Undocked_Structures/COVID19{1}".format(base_dir, os.path.basename(f))
    else:
        # Copy
        os.system("cp {0} {1}/Data/Undocked_Structures/{2}".format(f, base_dir, os.path.basename(f)))
        f2f[f] = "{0}/Data/Undocked_Structures/{1}".format(base_dir, os.path.basename(f))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))




In [178]:
# Create position maps for all PDB files used for human structures
# (PDB, Chain, UniProt) --> (PDB_Pos --> UniProt_Pos)
keep_pdbs = set(interactions["P2 PDB"].unique())
pos_maps = sifts[sifts["PDB"].isin(keep_pdbs)].set_index(["PDB", "Chain", "UniProt"])[["MappableResInPDBChainOnUniprotBasis", "MappableResInPDBChainOnPDBBasis"]].apply(lambda x: dict(zip(unzip_res_range(x[1]), unzip_res_range(x[0]))), axis=1).to_dict()

In [179]:
# Reindex structures so they are numbered by Uniprot pos
for uni, pdb, chain in tqdm_notebook(interactions[["P2", "P2 PDB", "P2 Chain"]].values):
    #if(not uni == "P15151"):
    #    continue
    
    if(pd.isnull(pdb)):
        continue
    
    # Handle Modbase Structures (should already be indexed, just copy)
    if(chain == " "):
        # Reassign the Chain from no chain to chain "B" (B is consistently the human chain for this project)
        pdb_df = my.pdb2df(pdb)
        pdb_df["Chain"] = "B"
        
        # Save
        my.df2pdb("{0}/Data/Undocked_Structures/{1}_{2}".format(base_dir, uni, os.path.basename(pdb)), pdb_df)
        f2f[(pdb, chain)] = "{0}/Data/Undocked_Structures/{1}_{2}".format(base_dir, uni, os.path.basename(pdb))
        continue
    
    pos_map = pos_maps[pdb, chain, uni]
    
    pdb_df = my.pdb2df(pdb)
    pdb_df = pdb_df[pdb_df["Data Type"] == "ATOM"]
    pdb_df = pdb_df[pdb_df["Chain"] == chain]
    def do(x):
        chain, pos = x
        try:
            return pos_map[str(pos)]
        except:
            return -pos
    pdb_df["Residue ID"] = pdb_df[["Chain", "Residue ID"]].apply(do, axis=1)
    
    # Reassign the Chain
    pdb_df["Chain"] = "B"
    
    my.df2pdb("{0}/Data/Undocked_Structures/{1}_{2}_{3}.pdb".format(base_dir, uni, pdb, chain), pdb_df)
    f2f[(pdb, chain)] = "{0}/Data/Undocked_Structures/{1}_{2}_{3}.pdb".format(base_dir, uni, pdb, chain)

HBox(children=(IntProgress(value=0, max=332), HTML(value=u'')))




# Create Models Summary

In [180]:
# Map local location of undocked structure
interactions["P1 PDB File"] = interactions["P1 PDB"].map(lambda x: f2f[x])
interactions["P2 PDB File"] = interactions[["P2 PDB", "P2 Chain"]].apply(lambda x: f2f[tuple(x)] if not x[0] is np.nan else f2f[x[0]], axis=1)

In [181]:
models = []
for p1, p1_pdb, p1_f in tqdm_notebook(interactions[["P1", "P1 PDB", "P1 PDB File"]].drop_duplicates().values):
    p1_chain = None
    if(pd.isnull(p1_pdb)):
        source = None
    elif("COVID" in p1):
        if("adr66" in p1_pdb):
            source = "Modeller"
        else:
            source = "PDB"
            p1_chain = p1_pdb.split("/")[-1].split("_")[-1].split(".")[0]
    else:
        raise
    
    pdb_id = None
    pdb_chain = None
    if(source == "PDB"):
        pdb_id = p1_pdb.split("/")[-1].split("_")[-2].upper()
        pdb_chain = p1_chain
    
    modbase_ID = None
    if(source == "ModBase"):
        modbase_ID = os.path.basename(p1_pdb).split(".")[0]
    
    all_resi = None
    if(not source == None):
        pdb_df = my.pdb2df(p1_f)
        all_resi = zip_res_range([str(x) for x in sorted(pdb_df[(pdb_df["Data Type"] == "ATOM")&(pdb_df["Residue ID"] > 0)]["Residue ID"].unique())])
    
    if(not source == None):
        models.append([p1, source, p1_f, pdb_id, pdb_chain, modbase_ID, all_resi])

for p1, p1_pdb, p1_chain, p1_f in tqdm_notebook(interactions[["P2", "P2 PDB", "P2 Chain", "P2 PDB File"]].drop_duplicates().values):
    if(pd.isnull(p1_pdb)):
        source = None
    elif("COVID" in p1):
        source = "Modeller"
    elif(len(p1_pdb) == 4):
        source = "PDB"
    else:
        source = "ModBase"
    
    pdb_id = None
    pdb_chain = None
    if(source == "PDB"):
        pdb_id = p1_pdb.upper()
        pdb_chain = p1_chain
    
    modbase_ID = None
    if(source == "ModBase"):
        modbase_ID = os.path.basename(p1_pdb).split(".")[0]
    
    all_resi = None
    if(not source == None):
        pdb_df = my.pdb2df(p1_f)
        all_resi = zip_res_range([str(x) for x in sorted(pdb_df[(pdb_df["Data Type"] == "ATOM")&(pdb_df["Residue ID"] > 0)]["Residue ID"].unique())])
    
    if(not source == None):
        models.append([p1, source, p1_f, pdb_id, pdb_chain, modbase_ID, all_resi])
models = pd.DataFrame(models, columns=["ID", "Source", "PDB_File", "PDB_ID", "PDB_Chain", "ModBase_ID", "Resi_Covered"])

HBox(children=(IntProgress(value=0, max=28), HTML(value=u'')))




HBox(children=(IntProgress(value=0, max=332), HTML(value=u'')))




In [11]:
# Read in Protein Lengths
proteins = pd.read_csv("{0}/Data/Proteins.txt".format(base_dir), sep="\t")
uni2seq = proteins.set_index("ID")["Sequence"].to_dict()

# Calculate Coverage per model
models["Coverage"] = models[["ID", "Resi_Covered"]].apply(lambda x: len(unzip_res_range(x[1])) / float(len(uni2seq[x[0]])), axis=1)

In [13]:
# Save Selected Models
models.to_csv("{0}/Data/Models.txt".format(base_dir), sep="\t", index=None)

# Orient Structures for Docking

In [183]:
# NOTE: This is technically unnecessary for current HADDOCK docking setup.
#       Had previously used Rosetta and implemented ECLAIR predictions to
#       guide docking by restricting the initial chain orientation for each
#       dock with most likely interfaces pointed towards each other
#
#       It's still somewhat useful to clearly pair up the structures used for
#       each docking trial, so I still do it here anyway.

In [184]:
# Reorient Structures for guided Docking
if(not os.path.exists("{0}/Data/Oriented_Structures".format(base_dir))):
    os.mkdir("{0}/Data/Oriented_Structures".format(base_dir))

In [185]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize

In [186]:
def x_rotation_matrix(theta):
    return np.array([1,              0,              0,
                     0,              np.cos(theta),  -np.sin(theta),
                     0,              np.sin(theta),  np.cos(theta)
                    ]).reshape((3, 3))
# FUNCTION END

def y_rotation_matrix(theta):
    return np.array([np.cos(theta),  0,              np.sin(theta),
                     0,              1,              0,
                     -np.sin(theta), 0,              np.cos(theta)
                    ]).reshape((3, 3))
# FUNCTION END

def z_rotation_matrix(theta):
    return np.array([np.cos(theta),  -np.sin(theta), 0,
                     np.sin(theta),  np.cos(theta), 0,
                     0,               0,             1
                    ]).reshape((3, 3))
# FUNCTION END

def rotate(xyz, theta_x, theta_y, theta_z, origin=np.array([0, 0, 0]), return_function=True):
    rot_matx = x_rotation_matrix(theta_x)
    rot_maty = y_rotation_matrix(theta_y)
    rot_matz = z_rotation_matrix(theta_z)
    
    if(return_function):
        return lambda x: rot_matz.dot(rot_maty.dot(rot_matx.dot(x - origin))) + origin
    return rot_matz.dot(rot_maty.dot(rot_matx.dot(xyz - origin))) + origin
# FUNCTION END

def translate(df, offset):
    df[["X", "Y", "Z"]] = df[["X", "Y", "Z"]] - offset
# FUNCTION END

def rotate_pdb(df, tx=None, ty=None, tz=None, origin=np.zeros(3), rot_func=None, angle_max=180):
    if(origin is "center"):
        origin = center_of_mass(pose, chain)
    if(tx is None):
        tx, ty, tz = (np.random.random(size=3) - 0.5)*angle_max*np.pi/180.0
    
    if(rot_func is None):
        rot_func = rotate(origin, tx, ty, tz, origin, True)
    
    # Apply the rotation
    tmp = np.concatenate(df[["X", "Y", "Z"]].apply(lambda x: rot_func(np.array(x)), axis=1).to_list(), axis=0).reshape(len(df), 3)
    df["X"] = np.round(tmp[:,0], 3)
    df["Y"] = np.round(tmp[:,1], 3)
    df["Z"] = np.round(tmp[:,2], 3)
# FUNCTION END

def orient_pdb(df, refA, refB):
    # Center
    translate(df, refA)
    refB = refB - refA
    
    # Generate First Rotation
    tmp = refB
    
    d_x = (tmp[0] - 0)
    d_z = (tmp[2] - 0)
    if(d_x*d_z != 0):
        sign = 1
        theta1 = sign*np.arctan(d_z / d_x)
        rot_mat1 = y_rotation_matrix(theta1)
    else:
        rot_mat1 = np.identity(3)
    
    # Generate Second Rotation
    tmp = rot_mat1.dot(tmp)
    
    d_x = tmp[0] - 0
    d_y = tmp[1] - 0
    if(d_x*d_y != 0):
        sign = -1#*[-1, 1][d_x*d_z <= 0]
        theta2 = -sign*np.arctan(d_x / d_y)
        rot_mat2 = z_rotation_matrix(theta2)
    else:
        rot_mat2 = np.identity(3)
    
    # Generate Third Rotation (to correct something?)
    tmp = rot_mat2.dot(tmp)
    
    if(tmp[1] < 0):
        theta3 = np.pi
        rot_mat3 = z_rotation_matrix(theta3)
    else:
        rot_mat3 = np.identity(3)
    
    tmp = rot_mat3.dot(tmp)
    
    # Apply the rotation
    rot_func = lambda x: rot_mat3.dot(rot_mat2.dot(rot_mat1.dot(np.array(x))))
    rotate_pdb(df, rot_func=rot_func)
# FUNCTION END

def rot_matrix_from_plane(ref1, ref2, ref3):
    # Center by reference 1
    original_delta = ref1.copy()

    # Rotate along Z axis so that ref 2 is at Y=0
    tmp = ref2 - original_delta

    d_x = tmp[0] - 0
    d_y = tmp[1] - 0
    if(d_x*d_y != 0):
        sign = -1#*[-1, 1][d_x*d_z <= 0]
        theta1 = sign*np.arctan(d_y / d_x)
        rot_mat1 = z_rotation_matrix(theta1)
    else:
        rot_mat1 = np.identity(3)

    # Roate along X axis so that ref 3 is at Z=0
    tmp = rot_mat1.dot(ref3 - original_delta)

    d_y = tmp[1] - 0
    d_z = tmp[2] - 0
    if(d_x*d_y != 0):
        sign = -1#*[-1, 1][d_x*d_z <= 0]
        theta2 = -sign*np.arctan(d_y / d_z)
        rot_mat2 = x_rotation_matrix(theta2)
    else:
        rot_mat2 = np.identity(3)

    # Return a function for this transformation
    return lambda x: rot_mat2.dot(rot_mat1.dot(x - original_delta))
# FUNCTION END

def orient_pdb_by_linear_regression(df, fit_col="Pred", up=True):
    # Generate Linear Regression Fit to the provided Label Column
    clf = LinearRegression()
    clf.fit(df[(~pd.isnull(df[fit_col]))&(df["Atom Name"] == "CA")][["X", "Y", "Z"]], df[(~pd.isnull(df[fit_col]))&(df["Atom Name"] == "CA")][fit_col])
    
    # Generate rotation matrix to match plane
    ref1 = np.array([-clf.intercept_/clf.coef_[0], 0, 0])
    ref2 = np.array([0, -clf.intercept_/clf.coef_[1], 0])
    ref3 = np.array([0, 0, -clf.intercept_/clf.coef_[2]])
    
    rot_func = rot_matrix_from_plane(ref1, ref2, ref3)
    
    # Apply Rotation and Center
    rotate_pdb(df, rot_func=rot_func)
    translate(df, df[["X", "Y", "Z"]].mean().values)
    
    # Final rotation to make sure interface is pointing the right direction
    if(up == True):
        if(clf.coef_[1] < 0):
            rotate_pdb(df, tx=np.deg2rad(180), ty=0, tz=0)
    else:
        if(clf.coef_[1] > 0):
            rotate_pdb(df, tx=np.deg2rad(180), ty=0, tz=0)
# FUNCTION END

In [193]:
# Reorient all structures so that interface faces are facing each other
for uniA, uniB, pdbA, pdbB, chainB in tqdm_notebook(interactions[["P1", "P2", "P1 PDB File", "P2 PDB File", "P2 Chain"]].values):
    if(os.path.exists("{0}/Data/Oriented_Structures/{1}_{2}.pdb".format(base_dir, uniA, uniB))):
        continue
    
    chainA = " "
    
    if(pd.isnull(pdbA) or pd.isnull(pdbB)):
        continue
    
    # Read Structures
    A_df = my.pdb2df(pdbA)
    if(not chainA == " "):
        A_df = A_df[A_df["Chain"] == chainA].copy()
    
    B_df = my.pdb2df(pdbB)
    #if(not chainB == " "):
    #    B_df = B_df[B_df["Chain"] == chainB].copy()
    
    if(len(A_df) == 0 or len(B_df) == 0):
        print "ERROR: no data for", uniA, uniB, pdbA, pdbB, chainA, chainB, len(A_df), len(B_df)
        continue
    
    # Get Eclair Preds
    tmp = inter2preds["_".join([uniA, uniB])]
    tmp = tmp.set_index(["Prot", "Pos"])["Pred"].to_dict()
    prot = 0 #int(uniA > uniB)
    
    # Assign Prediction Scores
    if(uniA != uniB):
        A_df["Pred"] = A_df["Residue ID"].map(lambda x: tmp[(prot, x)] if x > 0 else np.nan)
        B_df["Pred"] = B_df["Residue ID"].map(lambda x: tmp[(1-prot, x)] if x > 0 else np.nan)
    else:
        A_df["Pred"] = A_df["Residue ID"].map(lambda x: tmp[(0, x)] if x > 0 else np.nan)
        B_df["Pred"] = B_df["Residue ID"].map(lambda x: tmp[(0, x)] if x > 0 else np.nan)
    
    
    # Begin Reorientation
    
    # Handle Chain A
    orient_pdb_by_linear_regression(A_df, up=True)
    
    # Handle Chain B
    orient_pdb_by_linear_regression(B_df, up=False)
    
    # Final Translation to separate chains by 5 A
    translate(A_df, np.array([0, A_df["Y"].max() + 2.5, 0]))
    translate(B_df, np.array([0, B_df["Y"].min() - 2.5, 0]))
    
    # ADD CODE TO OPTIMZIE ROTATION ALONG Y AXIS (ONLY IN CHAIN B)
    
    
    # Record Centroid Locations
    ires_centroid_A = A_df[(A_df["Atom Name"] == "CA")&(~pd.isnull(A_df["Pred"]))][["X", "Y", "Z", "Pred"]].apply(lambda x: np.array(x[:3])*x[3], axis=1).sum() / A_df[(A_df["Atom Name"] == "CA")&(~pd.isnull(A_df["Pred"]))]["Pred"].sum()
    ires_centroid_B = B_df[(B_df["Atom Name"] == "CA")&(~pd.isnull(B_df["Pred"]))][["X", "Y", "Z", "Pred"]].apply(lambda x: np.array(x[:3])*x[3], axis=1).sum() / B_df[(B_df["Atom Name"] == "CA")&(~pd.isnull(B_df["Pred"]))]["Pred"].sum()
    
    # Make sure X:Z centorids of the Interfaces Line Up
    translate(A_df, np.array([ires_centroid_A[0], 0, ires_centroid_A[2]]))
    translate(B_df, np.array([ires_centroid_B[0], 0, ires_centroid_B[2]]))
    
    # Save Reoriented Structures
    A_df["Chain"] = "A"
    B_df["Chain"] = "B"
    my.df2pdb("{0}/Data/Oriented_Structures/{1}_{2}.pdb".format(base_dir, uniA, uniB), pd.concat([A_df, B_df]))
    my.df2pdb("{0}/Data/Oriented_Structures/{1}_{2}_A.pdb".format(base_dir, uniA, uniB), A_df)
    my.df2pdb("{0}/Data/Oriented_Structures/{1}_{2}_B.pdb".format(base_dir, uniA, uniB), B_df)
    
    
    ## Create Pymol Session
    #pymolCmd.reinitialize()
    #
    ## Load Original Structure
    #name = "{0}_{1}".format(uniA, uniB)
    #
    ## Load Reoriented Structures
    #pymolCmd.load("oriented_structures/{0}_A.pdb".format(name), name + "_A")
    #pymolCmd.load("oriented_structures/{0}_B.pdb".format(name), name + "_B")
    #
    #
    ## Color By Prediction
    #cmapA = cm.get_cmap('Greens')   
    #cmapB = cm.get_cmap('Blues')
    #
    #for resi, pred in A_df[["Residue ID", "Pred"]].drop_duplicates().values:
    #    if(np.isnan(pred)):
    #        pred = 0
    #    pymolCmd.color(matplotlib.colors.rgb2hex(cmapA(float(pred))[:3]).replace("#", "0x"), "{0}_A and resi {2}".format(name, "A", int(resi)))
    #    
    #for resi, pred in B_df[["Residue ID", "Pred"]].drop_duplicates().values:
    #    if(np.isnan(pred)):
    #        pred = 0
    #    pymolCmd.color(matplotlib.colors.rgb2hex(cmapB(float(pred))[:3]).replace("#", "0x"), "{0}_B and resi {2}".format(name, "B", int(resi)))
    #
    ## Save Structure Session
    #pymolCmd.save("PyMolSessions/{0}.pse".format(name))

HBox(children=(IntProgress(value=0, max=332), HTML(value=u'')))


