In [1]:
from shapeGMMTorch import torch_sgmm
from shapeGMMTorch import scripts

import MDAnalysis as md
import torch
from glob import glob
import numpy as np
import os


# Residue-number ranges written in the "A to B" form that load_trajectory()
# places after `resnum` when it builds its selection string.
apo_residues = "882 to 1061"
holo_residues = "981 to 1131"

# Apo equilibrium run: absolute path to the trajectory; the topology PDB is a
# relative path, so it is looked up in the notebook's working directory.
apo_eq_traj = "/scratch/projects/hockygroup-archive/wpc252/projects/factin_vinculin/apo/setup/eq/md_v.xtc"
# Alternative .gro topology, currently disabled:
#apo_eq_gro = "/scratch/projects/hockygroup-archive/wpc252/projects/factin_vinculin/apo/setup/eq/npt.gro"
apo_eq_top = "apo_actin_vinculin_labeled.pdb"

# Holo equilibrium run: same layout as the apo entries above.
holo_eq_traj = "/scratch/projects/hockygroup-archive/wpc252/projects/factin_vinculin/holo/setup/eq/md_v.xtc"
# Alternative .gro topology, currently disabled:
#holo_eq_gro = "/scratch/projects/hockygroup-archive/wpc252/projects/factin_vinculin/holo/setup/eq/npt.gro"
holo_eq_top = "holo_actin_vinculin_labeled.pdb"



In [2]:
def load_trajectory(trj,top,atoms="name CA", residues="", write_frame=None, write_prefix=None, segment=""):
    """
    Load the positions of a protein atom selection for every frame of a trajectory.

    The selection always starts with ``protein``; each non-empty optional
    argument adds one more ``and ...`` term.

    Parameters
    ----------
    trj : trajectory input handed to ``MDAnalysis.Universe`` as coordinates
    top : topology input handed to ``MDAnalysis.Universe``
    atoms : str
        Selection fragment such as ``"name CA"`` or ``"backbone"``.
        Pass ``""`` to keep every protein atom.
    residues : str
        Residue-number range placed after ``resnum``, e.g. ``"882 to 1061"``.
    write_frame : int or None
        Index of a single frame to write out. Only used together with
        ``write_prefix``.
    write_prefix : str or None
        Prefix of the output file; the file written is
        ``f"{write_prefix}_frame{write_frame}.pdb"``. Missing parent
        directories are created.
    segment : str
        Segment id placed after ``segid``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_frames, n_selected_atoms, 3)`` holding the selected
        positions of every frame.
    """
    coord = md.Universe(top,trj)

    print("Number of atoms in trajectory:", coord.atoms.n_atoms)
    print("Number of frames in trajectory:", coord.trajectory.n_frames)

    # make atom selection: "protein" plus one term per non-empty argument
    selection_terms = ["protein"]
    if len(atoms)>0:
        selection_terms.append(atoms)
    if len(residues)>0:
        selection_terms.append(f"resnum {residues}")
    if len(segment)>0:
        selection_terms.append(f"segid {segment}")
    atomSel = coord.select_atoms(" and ".join(selection_terms))
    print("Number of atoms in selection:", atomSel.n_atoms)
    print(atomSel)

    if write_frame is not None and write_prefix is not None:
        # The writer does not create directories, so make sure the target
        # folder exists before writing the single-frame PDB.
        out_dir = os.path.dirname(write_prefix)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        atomSel.write(f"{write_prefix}_frame{write_frame}.pdb",frames=coord.trajectory[[write_frame]])

    # create traj data of selection
    traj_data = np.empty((coord.trajectory.n_frames,atomSel.n_atoms,3),dtype=float)
    # loop traj: atomSel.positions follows the current frame of the iteration
    for ts in coord.trajectory:
        traj_data[ts.frame,:] = atomSel.positions

    return traj_data

In [3]:
def get_representative_frames(sgmm, traj_data):
    """
    Find the most representative frame of each cluster.

    A cluster's representative frame is the frame assigned to that cluster
    with the largest per-frame log likelihood. Nothing is written to disk;
    the caller decides what to do with the returned frame indices.

    Parameters
    ----------
    sgmm : fitted model exposing ``n_clusters``, ``predict(traj_data)`` and,
        after ``predict``, an indexable ``predict_frame_log_likelihood``.
    traj_data : trajectory array passed unchanged to ``sgmm.predict``.

    Returns
    -------
    list of tuple
        One ``(cluster_id, frame_index, log_likelihood)`` tuple per cluster
        that has at least one assigned frame, in increasing ``cluster_id``
        order. Clusters with no assigned frames are skipped.
    """
    cluster_ids = sgmm.predict(traj_data)
    # Read after predict(), as before; presumably predict() is what fills
    # this attribute for traj_data -- TODO confirm against shapeGMMTorch.
    frame_log_likelihood = sgmm.predict_frame_log_likelihood

    representative_list = []
    # loop through clusters
    for cluster_id in range(sgmm.n_clusters):
        indices = np.argwhere(cluster_ids==cluster_id).flatten()
        if indices.size == 0:
            # np.argmax raises ValueError on an empty array; an empty
            # cluster simply has no representative frame.
            continue
        cluster_log_likelihood = frame_log_likelihood[indices]
        frameid_max = np.argmax(cluster_log_likelihood)
        representative_frame_id = indices[frameid_max]
        representative_list.append( (cluster_id, representative_frame_id, cluster_log_likelihood[frameid_max]) )
    return representative_list

In [18]:
# Backbone atoms of segment V from the apo equilibrium run.
# (Disabled alternative: select by residue range with residues=apo_residues.)
apo_trj = load_trajectory(
    apo_eq_traj,
    apo_eq_top,
    atoms='backbone',
    segment="V",
)

Number of atoms in trajectory: 235307
Number of frames in trajectory: 5001
Number of atoms in selection: 719
<AtomGroup [<Atom 29356: N of type N of resname ASP, resid 882 and segid V and altLoc >, <Atom 29360: CA of type C of resname ASP, resid 882 and segid V and altLoc >, <Atom 29368: C of type C of resname ASP, resid 882 and segid V and altLoc >, ..., <Atom 32186: N of type N of resname LYS, resid 1061 and segid V and altLoc >, <Atom 32188: CA of type C of resname LYS, resid 1061 and segid V and altLoc >, <Atom 32206: C of type C of resname LYS, resid 1061 and segid V and altLoc >]>


In [25]:
# Backbone atoms of segment V from the holo equilibrium run.
holo_trj = load_trajectory(
    holo_eq_traj,
    holo_eq_top,
    atoms='backbone',
    segment="V",
)


Number of atoms in trajectory: 228126
Number of frames in trajectory: 5001
Number of atoms in selection: 603
<AtomGroup [<Atom 29356: N of type N of resname SER, resid 981 and segid V and altLoc >, <Atom 29360: CA of type C of resname SER, resid 981 and segid V and altLoc >, <Atom 29367: C of type C of resname SER, resid 981 and segid V and altLoc >, ..., <Atom 31731: N of type N of resname PRO, resid 1131 and segid V and altLoc >, <Atom 31735: CA of type C of resname PRO, resid 1131 and segid V and altLoc >, <Atom 31743: C of type C of resname PRO, resid 1131 and segid V and altLoc >]>


In [26]:
# use frame weights to only use last fraction of trajectory
def _fit_eq_and_write_rep(tag, eq_trj, traj_file, top_file, device):
    """
    Fit a 1-cluster model to an equilibrium trajectory and write its representative frame.

    The first quarter of the frames gets weight 0, the rest weight 1.

    Parameters
    ----------
    tag : "apo" or "holo"; used in the printed banner and the output prefix
    eq_trj : coordinates returned by load_trajectory() for the fit
    traj_file, top_file : files reloaded with atoms="" to write the full protein
    device : torch device passed to the fit

    Returns
    -------
    (fitted model, representative-frame list, frame-weight array)
    """
    print(f"DOING {tag.upper()}")
    weights = np.ones(len(eq_trj))
    weights[:len(weights)//4]=0
    sgmm = scripts.sgmm_fit_with_attempts(eq_trj, 1, 1, frame_weights=weights, device=device)
    # NOTE(review): the whole trajectory is scanned here, so zero-weighted
    # frames can still be picked as representative -- confirm this is intended.
    reps = get_representative_frames(sgmm, eq_trj)
    print("most representative frames",reps)
    #now write this frame
    _ = load_trajectory(traj_file, top_file, atoms="", write_frame=reps[0][1], write_prefix=f"representative_frames/{tag}_eq_trj_clust0rep")
    return sgmm, reps, weights


# Fall back to the CPU when no GPU is visible instead of failing inside the fit.
fit_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Make sure the folder for the PDB output exists.
os.makedirs("representative_frames", exist_ok=True)

apo_align_sgmm, rep_frames, frame_weights_selection = _fit_eq_and_write_rep("apo", apo_trj, apo_eq_traj, apo_eq_top, fit_device)

holo_align_sgmm, rep_frames, frame_weights_selection = _fit_eq_and_write_rep("holo", holo_trj, holo_eq_traj, holo_eq_top, fit_device)



DOING APO
Number of training frames: 5001
Number of clusters: 1
Number of attempts: 1
 Attempt  Log Like per Frame    CPU Time (s)
--------------------------------------------------
       1            4267.878           4.370
most representative frames [(0, 4529, 4508.943567028365)]
Number of atoms in trajectory: 235307
Number of frames in trajectory: 5001
Number of atoms in selection: 32008
<AtomGroup [<Atom 1: N of type N of resname ASP, resid 1 and segid A1 and altLoc >, <Atom 2: H1 of type H of resname ASP, resid 1 and segid A1 and altLoc >, <Atom 3: H2 of type H of resname ASP, resid 1 and segid A1 and altLoc >, ..., <Atom 32206: C of type C of resname LYS, resid 1061 and segid V and altLoc >, <Atom 32207: OT1 of type O of resname LYS, resid 1061 and segid V and altLoc >, <Atom 32208: OT2 of type O of resname LYS, resid 1061 and segid V and altLoc >]>
DOING HOLO
Number of training frames: 5001
Number of clusters: 1
Number of attempts: 1
 Attempt  Log Like per Frame    CPU Time (s

array([[[27.22000122, 20.42000008, 64.02000427],
        [26.87000084, 19.51000023, 63.67000198],
        [27.74000168, 20.43000031, 64.91999817],
        ...,
        [76.32000732, 30.79000282, 36.79000092],
        [76.9200058 , 31.22000122, 37.86000061],
        [75.07000732, 30.69000244, 36.74000168]],

       [[27.47000122, 19.72999954, 64.29000092],
        [26.65000153, 19.27000046, 63.85000229],
        [28.10000229, 19.04000092, 64.73000336],
        ...,
        [74.95000458, 31.78000259, 33.31000137],
        [73.76000214, 31.90000153, 33.66000366],
        [75.77000427, 32.67000198, 33.68000412]],

       [[27.01000214, 20.28000069, 63.80000305],
        [26.7100029 , 19.34000206, 63.4600029 ],
        [27.35000229, 20.26000023, 64.77999878],
        ...,
        [76.70000458, 31.15000153, 34.15000153],
        [76.30000305, 32.29000092, 33.75000381],
        [76.27999878, 30.63000107, 35.25      ]],

       ...,

       [[26.83000183, 20.09000015, 63.81000519],
        [27

In [5]:
## Now get data with force
# Thermal energy used to turn the OPES bias into frame weights.
# NOTE(review): 0.593 looks like kBT in kcal/mol near 298 K -- confirm it matches the bias units.
kBT=0.593

fes_apo_top = "apo_actin_vinculin_fes_labeled.pdb"
# Folder that receives the representative-frame PDBs written in the loop.
os.makedirs("representative_frames/fes/apo", exist_ok=True)
# Fall back to the CPU when no GPU is visible instead of failing inside the fit.
fit_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

force_dirs = glob("/scratch/projects/hockygroup-archive/wpc252/projects/factin_vinculin/apo/fes/actin_apoVinc_opes_pullProj_free_gpu_pn_*20.0/*")
for force_dir in force_dirs:
    # <parent directory name>_seed<run directory name>
    label = os.path.basename(os.path.dirname(force_dir))+'_seed'+os.path.basename(force_dir)
    xtc_file = glob(os.path.join(force_dir,"*full*.xtc"))
    if not xtc_file:
        print("Skipping",force_dir)
        # nothing to analyse here; without this the loop went on with an empty list
        continue
    opes_file = glob(f"{force_dir}/*opes*.out")
    if len(opes_file)>1:
        print("Error, more than one opes file in",force_dir)
        # ambiguous bias file: do not hand a list of paths to np.loadtxt
        continue
    if not opes_file:
        print("Skipping",force_dir,"(no opes file found)")
        continue
    opes_file = opes_file[0]
    opes_data = np.loadtxt(opes_file)
    #check for repeat rows: drop a row whose first column equals that of the next row
    repeat_lines = np.where( opes_data[1:,0]-opes_data[:-1,0] ==0 )[0]
    opes_data = np.delete(opes_data, repeat_lines,axis=0)

    # every 20th row of column 1, shifted so the first value is 0
    # (presumably the stride matches the trajectory output rate; the length check below enforces it)
    bias_trj = opes_data[::20,1]
    bias_trj-=bias_trj[0]

    frame_weights = np.exp(bias_trj/kBT)
    print(frame_weights.min())
    print(frame_weights.max())

    print(f"...Analyzing {xtc_file}")
    biased_trj = load_trajectory(xtc_file,fes_apo_top,segment="V",atoms='name CA')
    print(biased_trj.shape)
    print(frame_weights.shape)
    if len(frame_weights) != len(biased_trj):
        # one weight per frame is required; a mismatch means the bias file and
        # trajectory are out of step, so the fit would be meaningless
        print("Skipping",force_dir,"(number of frame weights does not match number of frames)")
        continue
    try:
        apo_align_sgmm = scripts.sgmm_fit_with_attempts(biased_trj, 1, 1, frame_weights=frame_weights, device=fit_device,dtype=torch.float64)

        rep_frames = get_representative_frames(apo_align_sgmm, biased_trj)
        print(f"most representative frames for {force_dir}:",rep_frames)
        #now write this frame
        _ = load_trajectory(xtc_file, fes_apo_top, atoms="", write_frame=rep_frames[0][1], write_prefix=f"representative_frames/fes/apo/{label}_rep")
    except RuntimeError as e:
        # keep going with the remaining runs, but report what failed
        print("Error processing")
        print(e)


1.0
8749509445648457.0
...Analyzing ['/scratch/projects/hockygroup-archive/wpc252/projects/factin_vinculin/apo/fes/actin_apoVinc_opes_pullProj_free_gpu_pn_20.0/7/7_fes_pullVt.full.xtc']
Number of atoms in trajectory: 290139
Number of frames in trajectory: 2501
Number of atoms in selection: 180
<AtomGroup [<Atom 29360: CA of type C of resname ASP, resid 882 and segid V and altLoc >, <Atom 29372: CA of type C of resname GLU, resid 883 and segid V and altLoc >, <Atom 29387: CA of type C of resname GLU, resid 884 and segid V and altLoc >, ..., <Atom 32148: CA of type C of resname VAL, resid 1059 and segid V and altLoc >, <Atom 32164: CA of type C of resname ARG, resid 1060 and segid V and altLoc >, <Atom 32188: CA of type C of resname LYS, resid 1061 and segid V and altLoc >]>
(2501, 180, 3)
(2501,)
Number of training frames: 2501
Number of clusters: 1
Number of attempts: 1
 Attempt  Log Like per Frame    CPU Time (s)
--------------------------------------------------
       1             

In [None]:
# Thermal energy used to turn the OPES bias into frame weights.
# NOTE(review): 0.593 looks like kBT in kcal/mol near 298 K -- confirm it matches the bias units.
kBT=0.593

fes_holo_top = "holo_actin_vinculin_fes_labeled.pdb"
# Folder that receives the representative-frame PDBs written in the loop.
os.makedirs("representative_frames/fes/holo", exist_ok=True)
# Fall back to the CPU when no GPU is visible instead of failing inside the fit.
fit_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

force_dirs = glob("/scratch/projects/hockygroup-archive/wpc252/projects/factin_vinculin/holo/fes/actin_holoVinc_opes_pullProj_free_gpu_pn_*20.0//*")
for force_dir in reversed(force_dirs):
    # <parent directory name>_seed<run directory name>
    label = os.path.basename(os.path.dirname(force_dir))+'_seed'+os.path.basename(force_dir)
    xtc_file = glob(os.path.join(force_dir,"*full*.xtc"))
    if not xtc_file:
        print("Skipping",force_dir)
        # nothing to analyse here; without this the loop went on with an empty list
        continue
    opes_file = glob(f"{force_dir}/*opes*.out")
    if len(opes_file)>1:
        print("Error, more than one opes file in",force_dir)
        # ambiguous bias file: do not hand a list of paths to np.loadtxt
        continue
    if not opes_file:
        print("Skipping",force_dir,"(no opes file found)")
        continue
    opes_file = opes_file[0]
    opes_data = np.loadtxt(opes_file)
    #check for repeat rows: drop a row whose first column equals that of the next row
    repeat_lines = np.where( opes_data[1:,0]-opes_data[:-1,0] ==0 )[0]
    opes_data = np.delete(opes_data, repeat_lines,axis=0)

    # every 20th row of column 1, shifted so the first value is 0
    # (presumably the stride matches the trajectory output rate; the length check below enforces it)
    bias_trj = opes_data[::20,1]
    bias_trj-=bias_trj[0]
    ##max out bias to avoid numerical errors
    ##bias_trj[bias_trj>20]=20

    frame_weights = np.exp(bias_trj/kBT)
    print(frame_weights.min())
    print(frame_weights.max())

    print(f"...Analyzing {xtc_file}")
    biased_trj = load_trajectory(xtc_file,fes_holo_top,segment="V",atoms='name CA')
    print(biased_trj.shape)
    print(frame_weights.shape)
    if len(frame_weights) != len(biased_trj):
        # one weight per frame is required; a mismatch means the bias file and
        # trajectory are out of step, so the fit would be meaningless
        print("Skipping",force_dir,"(number of frame weights does not match number of frames)")
        continue
    try:
        holo_align_sgmm = scripts.sgmm_fit_with_attempts(biased_trj, 1, 1, frame_weights=frame_weights, device=fit_device,dtype=torch.float64)

        rep_frames = get_representative_frames(holo_align_sgmm, biased_trj)
        print(f"most representative frames for {force_dir}:",rep_frames)
        #now write this frame
        _ = load_trajectory(xtc_file, fes_holo_top, atoms="", write_frame=rep_frames[0][1], write_prefix=f"representative_frames/fes/holo/{label}_rep")
    except RuntimeError as e:
        # keep going with the remaining runs, but report what failed
        print("Error processing")
        print(e)

1.0
1.4040774978473268e+16
...Analyzing ['/scratch/projects/hockygroup-archive/wpc252/projects/factin_vinculin/holo/fes/actin_holoVinc_opes_pullProj_free_gpu_pn_-20.0/18/18_fes_pullVt.full.xtc']
Number of atoms in trajectory: 293509
Number of frames in trajectory: 2501
Number of atoms in selection: 151
<AtomGroup [<Atom 29360: CA of type C of resname SER, resid 981 and segid V and altLoc >, <Atom 29371: CA of type C of resname SER, resid 982 and segid V and altLoc >, <Atom 29382: CA of type C of resname LYS, resid 983 and segid V and altLoc >, ..., <Atom 31697: CA of type C of resname LYS, resid 1129 and segid V and altLoc >, <Atom 31719: CA of type C of resname THR, resid 1130 and segid V and altLoc >, <Atom 31735: CA of type C of resname PRO, resid 1131 and segid V and altLoc >]>
(2501, 151, 3)
(2501,)
Number of training frames: 2501
Number of clusters: 1
Number of attempts: 1
 Attempt  Log Like per Frame    CPU Time (s)
--------------------------------------------------
       1    

In [26]:
#example bias trj
example_trj="/scratch/projects/hockygroup-archive/wpc252/projects/factin_vinculin/apo/fes/actin_apoVinc_FES_pullProj_highE_pn_0.0/26/26_fes_pullVt.run.50000000.xtc"
# topology: same path as the trajectory with the extension swapped to .gro
example_gro=example_trj.replace(".xtc",".gro")
# The residue range must go in by keyword: the third positional parameter of
# load_trajectory is `atoms`, so passing it positionally would replace the
# default "name CA" fragment with "882 to 1061" and break the selection.
example_biased_CA_trj = load_trajectory(example_trj, example_gro, residues=apo_residues)
# first *.opes.out file found next to the trajectory
opes_out_file=glob(os.path.join(os.path.dirname(example_trj),"*.opes.out"))[0]



Number of atoms in trajectory: 235307
Number of frames in trajectory: 2001
Number of atoms in selection: 180
<AtomGroup [<Atom 29360: CA of type C of resname ASP, resid 882 and segid SYSTEM>, <Atom 29372: CA of type C of resname GLU, resid 883 and segid SYSTEM>, <Atom 29387: CA of type C of resname GLU, resid 884 and segid SYSTEM>, ..., <Atom 32148: CA of type C of resname VAL, resid 1059 and segid SYSTEM>, <Atom 32164: CA of type C of resname ARG, resid 1060 and segid SYSTEM>, <Atom 32188: CA of type C of resname LYS, resid 1061 and segid SYSTEM>]>
[0.       0.       0.       ... 8.299064 7.39794  6.510049]
(50001,)


In [29]:

# Show the bias values and, on the next line, the shape of the array.
# bias_trj is not assigned in this cell; it must already exist in the kernel.
print(bias_trj, bias_trj.shape, sep="\n")

[ 0.       14.897801 22.935641 ...  5.981277  3.13292   6.510049]
(2001,)


In [66]:

# One-cluster, one-attempt fit of the example biased trajectory.
# frame_weights is passed by keyword, like every other fit call in this
# notebook, so it cannot bind to a different positional parameter.
# frame_weights is not assigned in this cell; it must already exist in the kernel.
example_sgmm_metaD_weights_1cluster = scripts.sgmm_fit_with_attempts(example_biased_CA_trj, 1, 1, frame_weights=frame_weights, device=torch.device("cuda"))
rep_frame_list = get_representative_frames(example_sgmm_metaD_weights_1cluster, example_biased_CA_trj)
print(rep_frame_list)

Number of training frames: 2001
Number of clusters: 1
Number of attempts: 1
 Attempt  Log Like per Frame    CPU Time (s)
--------------------------------------------------
       1            1565.988           2.573
[(0, 191, 1822.5401490274614)]


In [94]:
# Write one frame of the example trajectory as a PDB; atoms="" keeps the
# selection at the bare `protein` term.
# NOTE(review): frame 10 is hard-coded here rather than taken from
# rep_frame_list -- confirm which frame is wanted.
tmp = load_trajectory(
    example_trj,
    example_gro,
    atoms="",
    write_frame=10,
    write_prefix="example_biased_apo_snap",
)



Number of atoms in trajectory: 235307
Number of frames in trajectory: 2001
Number of atoms in selection: 32008
<AtomGroup [<Atom 1: N of type N of resname ASP, resid 1 and segid SYSTEM>, <Atom 2: H1 of type H of resname ASP, resid 1 and segid SYSTEM>, <Atom 3: H2 of type H of resname ASP, resid 1 and segid SYSTEM>, ..., <Atom 32206: C of type C of resname LYS, resid 1061 and segid SYSTEM>, <Atom 32207: OT1 of type O of resname LYS, resid 1061 and segid SYSTEM>, <Atom 32208: OT2 of type O of resname LYS, resid 1061 and segid SYSTEM>]>


In [72]:
# NOTE(review): get_frame_catdcd is not defined in any cell shown in this
# notebook, and the output recorded for this cell contains a catdcd
# shared-library error followed by an AttributeError. Confirm whether this
# cell is still needed.
get_frame_catdcd("test_out_clust0",example_trj,example_gro,rep_frame_list[0][1])

/share/apps/vmd/1.9.3/lib/vmd/plugins/LINUXAMD64/bin/catdcd5.1/catdcd: error while loading shared libraries: libexpat.so.0: cannot open shared object file: No such file or directory


AttributeError: 'numpy.ndarray' object has no attribute 'write'

In [None]:
# do cluster scan for apo eq
# uniform weights
cluster_array = np.arange(1,4,1)
train, cv = scripts.cross_validate_cluster_scan(apo_trj, 2500, covar_type="kronecker", cluster_array = cluster_array, n_training_sets=3, n_attempts = 5, dtype=torch.float32, device=torch.device("cuda"))
# shapeGMMTorch plots has a built-in plot function for this type of plot
from shapeGMMTorch import plots
# also need to load matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# create figure
fig, ax = plt.subplots(1,2,figsize=(8,4), dpi= 120, facecolor='w', edgecolor='k',sharex=True,sharey=True)
# Uniform Weights
ax[0].set_title("Uniform",fontsize=12)
plots.plot_log_likelihood_with_dd(ax[0],cluster_array,train,cv,fontsize=12,ylabel1=True,ylabel2=False,xlabel=True)
plt.show()