# Visualisation of eigenmodes

In this notebook we visualise the direction and magnitude of a selected eigenmode. Clearly, there are some interesting patterns. All the calculated modes stored in `VisualizationRepo` that originate from the structures in `pdbavail = [ '../../DataRepo/PdbByAtomCount/*.pdb' ]` will be visualized. Note that we used explicit external deflation to remove the >=6 free modes, so you can visualize modes starting from index 0.

In [1]:

import sys
import pickle
import numpy as np
import gc
import glob
import scipy.stats
import sklearn.cluster
import subprocess
import shutil
sys.path.append('..')
sys.path.append('../../')
sys.path.append('../../InchingLiteInteger/')
import InchingLiteInteger.Burn.Orthogonalization.T3
import InchingLiteInteger.Burn.Orthogonalization.T2
import InchingLiteInteger.Burn.Orthogonalization.T1
import InchingLiteInteger.Burn.Krylov.T3
import InchingLiteInteger.util
from InchingLiteInteger.Fuel.Coordinate.T1 import HeigvecOne_BoxCoxMagnitude

  from .autonotebook import tqdm as notebook_tqdm


# Visualization parameters

The `Benchmarking_folder` should contain the calculated eigenvectors; we will look for files matching `{Benchmarking_folder}/Eigvec_*_{pdbid}*.pkl`. 

In [2]:
# Configuration cell: fixes the random seed and defines the paths and the
# User_* tuning parameters read by the arrow-printing cell further below.
# Seed the legacy global NumPy RNG so the random arrow subsampling
# (np.random.choice in the main loop) is reproducible.
np.random.seed(42)

#Benchmarking_folder = "../BenchmarkWindows6e0020/"


# NOTE The main loop iterates over the keys of `pdbavaildict` below, not over `pdbavail`.
pdbavail = [ '../../DataRepo/CifShowcase/7r5jClashRemoved.cif' ] 
# Folder searched for the eigenvector pickles; also used as the output folder
# (DIR_Propagate) for the generated frames and the .pml script.
Benchmarking_folder = "../../ShowcaseLinuxJDMHDCupy0064/"

# Maps a structure file to a list of (mode index, motion direction) pairs.
# The motion direction multiplies the unit eigenvector, so +1/-1 flips the sign of the motion.
pdbavaildict = {'../../DataRepo/CifShowcase/7r5jClashRemoved.cif':[(0,1),(2,1)],
                #'../../DataRepo/CifShowcase/AirplaneYesH.cif':[(0,1),(1,1),(2,1)],
                #'../../DataRepo/CifShowcase/7pkz.cif':[(0,-1),(3,1),(6,1)]

                }


# NOTE(review): this value is overwritten by the main loop, which reuses the name
#               `User_TheModeToShow` as its loop variable over the pairs in `pdbavaildict`.
User_TheModeToShow = 14 # NOTE For airplane it should be 0,1 as EED was applied. For 3j3q 6
User_QuantileDisplay = 0.9 # NOTE Controls the quantile of the magnitude above which atoms are eligible for arrows
User_RandomPickArrows = 3000 # Maximum number of per-atom arrows drawn (random subsample of the eligible atoms)
User_EigenvectorTwinDirection = -1 # Sign multiplied into every arrow direction written to the .pml script

# =========
# Big arrow
# ===========
User_BigClusterArrowFloatingFactor = 0.5 # Cluster arrow origin is shifted by this fraction of the arrow vector
User_DBscanMinDist = 1.5 # NOTE Passed as DBSCAN `eps`. This roughly clusters the atoms above the display quantile; the larger, the fewer arrows


# =================
# animation
# =========================
# NOTE Typical values are presumably (amplitude, step size) pairs - TODO confirm:
#      (2,1) for small-amplitude fine movement, or (4,2) for really large-amplitude
#      extrapolation where cracks start filling in.
User_ScalingAmplitudeAnimation = 8   # Multiplies max_abs_deviation of the linearised animation
User_ScalingStepSizeAnimation  = 4   # Multiplies stepsize of the linearised animation


In [3]:

import tqdm

import pandas as pd
import numpy as np
import scipy
import openmm.app as mmapp
import mdtraj

#import seaborn as sns
import matplotlib.pyplot as plt


from torch.nn import functional as F
import torch





In [4]:

# str(t).zfill(len(str(n_timestep)))

# Print pml file to show some arrows

We will restrict the total number of arrows to be shown. Only atoms whose displacement magnitude lies in the top percentile are considered, and at most a fixed number of arrows is printed; otherwise the figure would be too busy! You can open the generated PyMOL script (`.pml`) using the PyMOL GUI.

In [5]:


# Main visualisation loop.
#
# For every structure in `pdbavaildict` and every (mode index, motion direction)
# pair listed for it, this cell
#   1. loads the structure and the pickled eigenvectors,
#   2. writes a linearised animation of the mode as .cif frames,
#   3. selects which atoms get an arrow and clusters them with DBSCAN,
#   4. writes a PyMOL script (PymolSession_{pdbid}_{mode}.pml) containing the
#      per-atom arrows and one big arrow per cluster,
#   5. copies cgo_arrow.py next to the script.
#
# Reads from other cells: pdbavaildict, Benchmarking_folder and the User_* parameters.
benchmark_inching = []
#for pdbfn in pdbavail:
for pdbfn in sorted(pdbavaildict.keys()):
    #pdbid = pdbfn.split("_")[2]

    PART00_IO = True
    if PART00_IO:

        # File name without directory and extension
        pdbid = pdbfn.split("/")[-1].split(".")[0]

        # ============================
        # Where to store
        # ==========================
        DIR_Propagate = Benchmarking_folder
        #InchingLiteInteger.util.MkdirList([DIR_Propagate])

        X_df, X_top = InchingLiteInteger.util.BasicPdbCifLoading(pdbfn)
        protein_xyz = X_df[['x','y','z']].to_numpy().astype(np.float64)
        protein_xyz -= protein_xyz.mean(axis= 0) # Centre the coordinates at the origin

        # NOTE Fail early with a clear message when no eigenvector file exists; otherwise
        #      `eigvec_` would be undefined and the mode loop dies with a NameError.
        DIR_fn = sorted(glob.glob("%s/Eigvec_*_%s*.pkl" %(Benchmarking_folder, pdbid)))
        if len(DIR_fn) == 0:
            raise FileNotFoundError(
                "No eigenvector file matches %s/Eigvec_*_%s*.pkl" %(Benchmarking_folder, pdbid))
        if len(DIR_fn) > 1:
            # Only one file can be used. Take the last one in sorted order so the choice is
            # deterministic and the (potentially large) others are not unpickled for nothing.
            print("WARNING. %s eigenvector files found for %s. Using %s" %(len(DIR_fn), pdbid, DIR_fn[-1]))
        # NOTE pickle.load can execute arbitrary code. Only load eigenvector files you produced yourself.
        with open(DIR_fn[-1],"rb") as fn:
            eigvec_ = pickle.load(fn)



    # NOTE The loop variable reuses (and overwrites) the global `User_TheModeToShow`.
    for User_TheModeToShow, motiondir in pdbavaildict[pdbfn]:

        PART00_Forloopeigvec = True
        if PART00_Forloopeigvec:

            eigvec = eigvec_[User_TheModeToShow]
            gc.collect()

            # NOTE This is important
            #      presumably a length-unit conversion for .pdb inputs; confirm against BasicPdbCifLoading
            if pdbfn.split(".")[-1] == 'pdb':
                nmfactor = 0.1
            else:
                nmfactor = 1


            deltaX_magnitude = HeigvecOne_BoxCoxMagnitude( eigvec,
                                User_WinsorizingWindow = (0.025, 0.975),
                                User_LogisticParam = (0.05, 1.0),
                                )

            # Per-row direction of the mode. Rows with zero norm are divided by 1 instead of 0
            # so that they stay zero vectors rather than becoming NaN.
            eigvec_norm = np.linalg.norm(eigvec, axis=1)[:,None]
            eigvec_norm = np.where(eigvec_norm > 0, eigvec_norm, 1.0)
            eigvec_unit = eigvec / eigvec_norm * motiondir

        print("IO done")

        # =========================
        # Propagate linear 
        # =========================
        PART01_Linearize = True
        # NOTE  Pymol has a strange 'bug' which forbids sticks to be shown when the structure is sufficiently large. 
        #       A guess is the limit of int32, something beyond 2.1 billion...


        if PART01_Linearize:

            InchingLiteInteger.util.SaveOneModeLinearisedAnime(
                    deltaX_magnitude[:,None] * eigvec_unit,
                    protein_xyz*nmfactor,
                    n_timestep = 16,
                    DIR_ReferenceStructure = pdbfn,#[:-4] + "trial.cif",
                    DIR_SaveFolder = DIR_Propagate,
                    SaveFormat = 'cif',
                    outputlabel = 'BoxCox_%s_%s'%(pdbid, User_TheModeToShow),
                    max_abs_deviation = 2.0*nmfactor*User_ScalingAmplitudeAnimation,
                    stepsize = 1.0*nmfactor*User_ScalingStepSizeAnimation,
                    UnitMovement = False,
                    max_n_output = 32,
                    SaveSeparate = True,
                    RemoveOrig = False, # NOTE This flag remove the unmoved structure from the trajectory produce
                    User_Bfactor=deltaX_magnitude
                    )

        print("Linearize done")
        # =========================
        # What to plot
        # ==========================
        PART02_DecideWhatArrowsToPlot = True
        if PART02_DecideWhatArrowsToPlot:
            # Atoms named CA or P, excluding calcium (element Ca).
            # NOTE(review): the index labels of X_df are used as row positions of protein_xyz
            #               and eigvec below; this assumes X_df has a default 0..n-1 index.
            where_CaOrP = X_df.loc[X_df['name'].isin(["CA", "P"]) & ~X_df['element'].isin(["Ca"])].index.values
            # Atoms whose magnitude exceeds the display quantile
            where_larger = np.where((deltaX_magnitude > np.quantile(deltaX_magnitude, q = User_QuantileDisplay)))[0]
            # a ball with large displacement TODO Show the stacked detail
            where_larger_CaOrP = np.intersect1d(where_larger, where_CaOrP, assume_unique=False, return_indices=False)
            # At most User_RandomPickArrows of them get an individual arrow
            where_random = np.random.choice(where_larger_CaOrP, 
                                                size= min(User_RandomPickArrows, where_larger_CaOrP.shape[0]), replace = False) 

            # TODO Make  a big arrow for those large ones only! Cluster the coordinate by dbscan. 
            #      average the arrow put it in center and floating in air. 
            #      Make the arrow obvious enough to indicate the direction.
            # Alternative selection (every n-th CA/P atom); see `choice_where` below.
            where_CaOrP_subset = where_CaOrP[::max(1, int(protein_xyz.shape[0]/User_RandomPickArrows))]


            # ======================
            # Big Arrow
            # =========================

            if where_larger_CaOrP.shape[0] > 0:
                clustering = sklearn.cluster.DBSCAN(eps=User_DBscanMinDist, min_samples=10, metric='euclidean', 
                                                    metric_params=None, algorithm='kd_tree', 
                                                    leaf_size=100, p=2, n_jobs=1).fit(protein_xyz[where_larger_CaOrP,:])
                cluster_labels = clustering.labels_
            else:
                # Nothing passed the filters, hence nothing to cluster (DBSCAN rejects empty input).
                cluster_labels = np.zeros(0, dtype=np.int64)

            # NOTE DBSCAN labels noise samples as -1. Noise is not a cluster, so drop it here.
            #      Otherwise the arrays below get one extra all-zero row and a spurious
            #      zero-length arrow is printed at the origin.
            unique_clusters = np.unique(cluster_labels)
            unique_clusters = unique_clusters[unique_clusters != -1]
            DBSCAN_Coord = np.zeros((unique_clusters.shape[0],3))
            DBSCAN_UnitEigvec = np.zeros((unique_clusters.shape[0],3))
            DBSCAN_UnitEigvecmag = np.zeros((unique_clusters.shape[0],1))
            # Real cluster labels are 0..k-1, so the label doubles as the row index.
            for i_cluster in unique_clusters:
                same_cluster = where_larger_CaOrP[np.where(cluster_labels == i_cluster)[0]]
                DBSCAN_Coord[i_cluster,:] = np.mean(protein_xyz[same_cluster,:], axis=0)
                DBSCAN_UnitEigvec[i_cluster,:] = np.mean(eigvec_unit[same_cluster,:], axis=0)
                DBSCAN_UnitEigvecmag[i_cluster,:] = np.mean(deltaX_magnitude[same_cluster])

        print("dbscan done")
        # ========================
        # Print arrow
        # ==========================
        PART03_PrintCgoArrows = True
        if PART03_PrintCgoArrows:
            # NOTE Pymol... The same coordinate scaling is applied for .pdb and .cif inputs.
            nmfactor_ = 10.0


            #print(deltaX_magnitude)
            # Rank of each magnitude scaled to [0, 1). Currently only referenced by the commented-out thickness options.
            percentilescore_all =  np.argsort(np.argsort(deltaX_magnitude, axis=0), axis=0) / float(len(deltaX_magnitude)) # NOTE Assumed that each has a unique float
            print_cgoarrows = []

            # =================================
            # NOTE THe Big Clustered Arrow
            # ==================================
            for i_cluster in range(unique_clusters.shape[0]):

                # NOTE Point to point
                position_source = DBSCAN_Coord[i_cluster] * nmfactor_ 
                direction_size = 99 * DBSCAN_UnitEigvecmag[i_cluster]
                direction_= (User_EigenvectorTwinDirection * DBSCAN_UnitEigvec[i_cluster] *direction_size) #* deltaX_magnitude[atomindex_]*50)
                # Let the arrow float away from the cluster centre along its own direction
                gap = direction_* User_BigClusterArrowFloatingFactor

                position_source += gap
                #position_source += direction_*User_BigClusterArrowFloatingFactor
                position_target = position_source + direction_

                x_s, y_s, z_s = position_source[0], position_source[1], position_source[2]
                x_t, y_t, z_t = position_target[0], position_target[1], position_target[2]
                thickness_ = 5 # percentilescore_all[atomindex_]
                print_cgoarrows.append("cgo_arrow [%.3f, %.3f, %.3f], [%.3f, %.3f, %.3f] " %(
                    x_s, y_s, z_s, x_t, y_t, z_t) + ', name = \"' + "ClusterArrow%s" %(i_cluster+1)+'\",' + " radius = %s, hradius = %s, hlength = %s, " %(thickness_, thickness_*2, direction_size[0]/2 ) + ' color = hotpink')
                    # hotpink black
            # ===========================
            # NOTE every n CA
            # ==============================
            choice_where =  where_random # where_CaOrP_subset
            for atomindex_ in choice_where:
                # NOTE Point to point
                position_source = protein_xyz[atomindex_]*nmfactor_
                direction_= (eigvec_unit[atomindex_] * User_EigenvectorTwinDirection *25 * deltaX_magnitude[atomindex_]) #* deltaX_magnitude[atomindex_]*50)
                position_target = position_source + direction_

                x_s, y_s, z_s = position_source[0], position_source[1], position_source[2]
                x_t, y_t, z_t = position_target[0], position_target[1], position_target[2]
                thickness_ = 0.1 # percentilescore_all[atomindex_]
                print_cgoarrows.append("cgo_arrow [%.3f, %.3f, %.3f], [%.3f, %.3f, %.3f] " %(
                    x_s, y_s, z_s, x_t, y_t, z_t) + ', name = \"' + "Index%s" %(atomindex_+1)+'\",' + " radius = %s, hradius = %s, hlength = 5.0, " %(thickness_*2, thickness_ * 10) + ' color = black')
                    # hotpink black




            with open('./ArrowTemplate.pml', 'r') as f :
                    filedata = f.read()


            # Count the frames written for THIS mode. The frame number is expected as a
            # zero-padded suffix of fixed width (the same width used for the load lines below).
            # Matching exactly that many digits keeps e.g. mode 1 from also counting the
            # frames of modes 10-19, which a bare '*' wildcard would do.
            frame_digits = len(str(16))
            frame_stem = '%s_BoxCox_%s_%s'%(pdbid, pdbid, User_TheModeToShow)
            n_timestep = len(glob.glob(glob.escape(Benchmarking_folder + '/' + frame_stem)
                                       + '[0-9]' * frame_digits + '.cif'))

            # Load every frame as one state of the PyMOL object `obj3`
            tobeinserted  =[]
            for t in range(n_timestep):

                ffff = str(t).zfill(frame_digits)
                tobeinserted.extend(["load %s_BoxCox_%s_%s%s.cif" %(pdbid,pdbid,User_TheModeToShow,ffff),
                "create obj3, %s_BoxCox_%s_%s%s, 0, %s"%(pdbid,pdbid,User_TheModeToShow,ffff, t +1),
                "delete %s_BoxCox_%s_%s%s"%(pdbid,pdbid,User_TheModeToShow,ffff)])

            filedata = filedata.replace('load REPLACE_WITH_FILENAME', "\n".join(tobeinserted)+"\n")#'./%s.%s' %(pdbid ,pdbfn.split(".")[-1]))



            filedata = filedata.replace('REPLACE_WITH_ID', '%s' %(pdbid))
            filedata = filedata.replace('REPLACE_WITH_CGOARROWS', "\n".join(print_cgoarrows))

            with open('%s/PymolSession_%s_%s.pml'%(DIR_Propagate, pdbid, User_TheModeToShow), 'w+') as f:
                    f.write(filedata)



        PART04_TidyUpFolders = True
        if PART04_TidyUpFolders:
            # The .pml script needs cgo_arrow.py next to it
            shutil.copy('./cgo_arrow.py', "%s/cgo_arrow.py" %(DIR_Propagate))
            # NOTE you probably need a bash script for this.
            #subprocess.call("cd %s ; pymol -cq ./PymolSession_%s_%s.pml" %(Benchmarking_folder,pdbid, User_TheModeToShow) , shell = True)

    # Release the eigenvectors of this structure before loading the next one
    del eigvec_
    gc.collect()

IO done


100%|██████████| 9/9 [00:02<00:00,  3.74it/s]


Linearize done
dbscan done
IO done


100%|██████████| 9/9 [00:02<00:00,  3.65it/s]


Linearize done
dbscan done
