In [1]:
import os 

import glob 
import h5py 

import numpy as np 

In [2]:
import MDAnalysis as mda

In [3]:
from MDAnalysis.analysis.rms import RMSD

In [4]:
from utils import cm_to_cvae 

Using TensorFlow backend.


In [8]:
# Gather every OpenMM run directory of the experiment; sorting the matches
# gives the runs a deterministic order for the loops below.
omm_list = glob.glob('../vhp_exp.3rd/omm_runs_*')
omm_list.sort()

# RMSD

In [15]:
# Structure file loaded as the reference Universe for the RMSD calculation.
ref_pdb_file = '../vhp_exp.3rd/pdb/vhp1ww.pdb' 
# Structure file passed together with each run's DCD when opening a trajectory.
start_point = '../vhp_exp.3rd/pdb/vhp1ww_solv.gro'

In [17]:
# The reference structure is identical for every run, so it is loaded once
# here instead of being re-parsed from the PDB file on each loop iteration.
ref_traj = mda.Universe(ref_pdb_file)

# One array of RMSD values per run directory, in the order of omm_list.
RMSD_all = []
for omm in omm_list:
    dcd_file = os.path.join(omm, 'output.dcd')
    # Open the run's DCD trajectory together with the starting structure file.
    mda_traj = mda.Universe(start_point, dcd_file)
    # Restrict the RMSD to the protein C-alpha atoms.
    R = RMSD(mda_traj, ref_traj, select='protein and name CA')
    R.run()
    # Keep column 2 of the result table only; MDAnalysis documents the
    # columns as (frame, time, RMSD).
    RMSD_all.append(R.rmsd[:, 2])

In [19]:
# Join the per-run RMSD arrays end to end into a single array covering all runs.
# Note: this rebinds RMSD_all from a list of arrays to one ndarray.
RMSD_all = np.hstack(RMSD_all)

# Embed

In [22]:
# Weight file handed to predict_from_cvae further down in this notebook.
model_weight = '../vhp_exp.3rd/CVAE_exps/cvae_weight.h5'

In [43]:
# Load the contact maps of every run and count the total number of frames.
cm_data_lists = []
num_frame = 0
for omm in omm_list:
    cm_file = os.path.join(omm, 'output_cm.h5')
    # The context manager closes the file even if reading fails part-way.
    with h5py.File(cm_file, 'r') as cm_h5:
        # dataset[()] reads the whole dataset into memory; it replaces the
        # deprecated `.value` attribute, which h5py 3.0 removed.
        cm_data = cm_h5[u'contact_maps'][()]
    cm_data_lists.append(cm_data)
    # Axis 1 of the stored dataset is used as the per-run frame count.
    num_frame += cm_data.shape[1]

In [44]:
# Scale the total frame count by 0.01.
# NOTE(review): presumably converts frames to total simulated time, assuming
# 0.01 time units per saved frame - confirm the unit and the save interval.
num_frame * 0.01

900.0

In [25]:
# Turn the list of per-run contact-map arrays into the single array that is fed
# to the CVAE; the conversion itself is implemented in utils.cm_to_cvae.
cvae_input = cm_to_cvae(cm_data_lists)

In [26]:
# Show the shape of the CVAE input array. The call form of print produces the
# same output on Python 2 and Python 3, whereas the print statement is a
# syntax error on Python 3.
print(cvae_input.shape)

(90000, 36, 36, 1)

In [27]:
from utils import predict_from_cvae
# Run the contact maps through the CVAE using the saved weights; hyper_dim=3
# requests a 3-dimensional result per input (see the shape check that follows).
cm_predict = predict_from_cvae(model_weight, cvae_input, hyper_dim=3)

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 36, 36, 1)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 36, 36, 64)   640         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 18, 18, 64)   36928       conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_3 (Conv2D)               (None, 18, 18, 64)   36928       conv2d_2[0][0]                   
_____________________________________

In [28]:
# Sanity check on the prediction array: one row per input frame and
# one column per requested hyper_dim is expected.
cm_predict.shape

(90000, 3)

# Outliers

In [30]:
from utils import outliers_from_latent

# Tuning knobs for the search below.
max_outliers = 200   # accept the result once at most this many outliers remain
eps_step = 0.05      # amount by which eps is raised after each rejected attempt
eps = 0.2            # starting value forwarded to outliers_from_latent
# NOTE(review): eps is presumably a neighbourhood radius in latent space -
# confirm against utils.outliers_from_latent.

# Raise eps step by step until the number of reported outliers is small enough.
while True:
    # np.squeeze collapses a single-element result to a 0-d array, on which
    # len() raises TypeError; np.atleast_1d keeps the result one-dimensional
    # in that case and leaves every other shape untouched.
    outliers = np.atleast_1d(
        np.squeeze(outliers_from_latent(cm_predict, eps=eps)))
    n_outlier = len(outliers)
    # Report the latent dimension from the data instead of hard-coding it.
    print('dimension = {0}, eps = {1:.2f}, number of outlier found: {2}'.format(
        cm_predict.shape[1], eps, n_outlier))
    if n_outlier > max_outliers:
        eps = eps + eps_step
    else:
        outlier_list = outliers
        break

dimension = 3, eps = 0.20, number of outlier found: 995
dimension = 3, eps = 0.25, number of outlier found: 415
dimension = 3, eps = 0.30, number of outlier found: 182


In [31]:
# Display the outliers accepted by the search loop.
# NOTE(review): the values are presumably row indices into cm_predict - confirm
# against utils.outliers_from_latent.
outlier_list

array([ 1615,  1693,  1695,  4077,  4112,  4113,  4377,  8227,  8323,
        8353,  8355,  8448,  8449,  8451,  8452,  8466,  8919,  8924,
        8926,  8945,  9564,  9569,  9786,  9789,  9790,  9791,  9909,
        9911,  9919,  9971,  9987, 11144, 14314, 14402, 15153, 16049,
       18024, 18027, 18028, 18032, 21772, 21773, 21775, 21776, 21785,
       23060, 23068, 23090, 23094, 23095, 23096, 23103, 23123, 23135,
       23136, 23139, 23140, 23141, 23142, 23143, 23144, 23149, 23178,
       23192, 23193, 25896, 27050, 27051, 27052, 27053, 27054, 27055,
       27056, 27057, 27062, 27063, 27068, 27138, 27207, 28051, 28062,
       28068, 28069, 28071, 28073, 28074, 28088, 28092, 28127, 28129,
       28130, 28131, 28133, 28134, 28135, 28136, 28152, 28180, 28181,
       28199, 28203, 28206, 28209, 28210, 28214, 28217, 28218, 28222,
       28225, 28230, 28233, 28235, 28244, 28245, 28246, 28249, 31250,
       31264, 31349, 31384, 31387, 31389, 31403, 31488, 31489, 33049,
       33050, 34865,

In [32]:
# Open the output file for writing; mode 'w' creates it and truncates any
# existing file of the same name. The handle stays open until the later
# h5_save.close() cell runs.
h5_save = h5py.File('./latent3d_vhp.h5', 'w') 

In [33]:
# Store the three results of this notebook as separate datasets:
# the CVAE predictions, the concatenated RMSD values and the accepted outliers.
h5_save.create_dataset('cm_predict', data=cm_predict)  
h5_save.create_dataset('RMSD', data=RMSD_all) 
# Last expression of the cell, so the notebook displays this dataset's repr.
h5_save.create_dataset('outliers', data=outlier_list)  

<HDF5 dataset "outliers": shape (182,), type "<i8">

In [34]:
# Close the write handle before the file is reopened for reading.
h5_save.close() 

In [37]:
# Reopen the file read-only to inspect what was written.
# NOTE(review): this handle reuses the name h5_save and is not closed in the
# cells visible here - close it once the inspection is done.
h5_save = h5py.File('./latent3d_vhp.h5', 'r') 

In [38]:
# List the (name, dataset) pairs stored in the file to verify shapes and dtypes.
h5_save.items()

[(u'RMSD', <HDF5 dataset "RMSD": shape (90000,), type "<f8">),
 (u'cm_predict', <HDF5 dataset "cm_predict": shape (90000, 3), type "<f4">),
 (u'outliers', <HDF5 dataset "outliers": shape (182,), type "<i8">)]