In [1]:
import os 
import glob 
import h5py 
import numpy as np 

In [2]:
import MDAnalysis as mda

In [3]:
from MDAnalysis.analysis.rms import RMSD
from MDAnalysis.analysis import distances

In [4]:
from utils import cm_to_cvae 

Using TensorFlow backend.


In [5]:
# Gather the OpenMM run directories in sorted (lexical) order and keep at most
# the first 120*5 = 600 of them; the RMSD and contact-map loops iterate over it.
omm_list = sorted(glob.glob('../MD_exps/fs-pep/omm_runs_*'))[:120*5]

# RMSD

In [6]:
# Reference structure: used as the topology when loading each trajectory and
# as the RMSD reference in the loop below.
ref_pdb_file = '../MD_exps/fs-pep/pdb/fs-peptide.pdb'

In [7]:
# Per-frame C-alpha RMSD of every run against the reference PDB structure.
# The reference universe is the same for every run, so it is built once here
# instead of re-parsing the PDB file on each loop iteration.
ref_traj = mda.Universe(ref_pdb_file)

RMSD_all = []
for omm in omm_list:
    dcd_file = os.path.join(omm, 'output.dcd')
    # Topology comes from the reference PDB, coordinates from the run's DCD.
    mda_traj = mda.Universe(ref_pdb_file, dcd_file)
    R = RMSD(mda_traj, ref_traj, select='protein and name CA')
    R.run()
    # Column 2 of the result table holds the RMSD values (columns 0 and 1 are
    # the frame index and time according to the MDAnalysis RMSD documentation).
    RMSD_all.append(R.rmsd[:,2])

In [8]:
# Join the per-run RMSD arrays into a single 1-D array covering all frames.
# NOTE(review): this rebinds RMSD_all from a list of arrays to one ndarray, so
# re-run the RMSD loop above before re-running this cell.
RMSD_all = np.hstack(RMSD_all)

# Embed

In [9]:
# Weight file of the trained CVAE used for the embedding below.
# NOTE(review): the path is tied to one specific training run directory.
model_weight = '../CVAE_exps/cvae_runs_03_1570644424/cvae_weight.h5'

In [10]:
# Load the contact maps of every run and count the total number of frames.
cm_data_lists = []
num_frame = 0
for omm in omm_list:
    cm_file = os.path.join(omm, 'output_cm.h5')
    # The context manager closes the file even if reading the dataset raises.
    with h5py.File(cm_file, 'r', libver='latest', swmr=True) as cm_h5:
        # `[()]` reads the whole dataset into memory; it replaces the
        # deprecated `Dataset.value` accessor, which h5py 3.0 removed.
        cm_data = cm_h5[u'contact_maps'][()]
    cm_data_lists.append(cm_data)
    # Axis 1 of each stored dataset is what gets counted as frames.
    num_frame += cm_data.shape[1]

In [11]:
# Record the h5py version this notebook was executed with.
h5py.__version__

'2.8.0'

In [12]:
# Scale the total frame count by 0.05.
# NOTE(review): 0.05 is presumably the time per saved frame (so this would be
# the total simulated time) — confirm the unit against the simulation settings.
num_frame * 0.05

16800.0

In [13]:
# Convert the list of per-run contact-map arrays into one CVAE input array
# (its shape is displayed in the next cell).
cvae_input = cm_to_cvae(cm_data_lists)

In [14]:
# Sanity check: the first axis should equal the total frame count (num_frame).
cvae_input.shape

(336000, 22, 22, 1)

In [15]:
# Embed every contact map with the trained CVAE; hyper_dim=3 matches the
# 3-column result whose shape is displayed in the next cell.
# NOTE(review): this import sits mid-notebook; consider moving it up to the
# import cells so dependencies are visible in one place.
from utils import predict_from_cvae
cm_predict = predict_from_cvae(model_weight, cvae_input, hyper_dim=3)

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 22, 22, 1)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 22, 22, 64)   640         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 11, 11, 64)   36928       conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_3 (Conv2D)               (None, 11, 11, 64)   36928       conv2d_2[0][0]                   
_____________________________________

In [16]:
# Expect one row per input sample and one column per latent dimension.
cm_predict.shape

(336000, 3)

# Outliers

In [17]:
from utils import outliers_from_latent

# Outlier search: starting from eps = 0.2, raise eps in fixed steps until the
# detector reports at most `max_outliers` outliers, then keep that set as
# `outlier_list`.
eps = 0.2
eps_step = 0.05
max_outliers = 150
# Report the actual latent dimensionality instead of a hard-coded 3.
latent_dim = cm_predict.shape[1]

while True:
    # np.squeeze collapses a single-element result to a 0-d array, on which
    # len() raises TypeError; np.atleast_1d keeps that case 1-D and returns
    # every result that is already 1-D or higher unchanged.
    outliers = np.atleast_1d(
        np.squeeze(outliers_from_latent(cm_predict, eps=eps)))
    n_outlier = len(outliers)
    print('dimension = {0}, eps = {1:.2f}, number of outlier found: {2}'.format(
        latent_dim, eps, n_outlier))
    if n_outlier > max_outliers:
        # Too many outliers: loosen eps and try again.
        eps = eps + eps_step
    else:
        outlier_list = outliers
        break

dimension = 3, eps = 0.20, number of outlier found: 132


In [18]:
# Display the outlier indices selected by the search above.
outlier_list

array([  3583,   3605,   3606,   3633,   3635,   3637,   3710,  10303,
        13129,  24887,  25745,  36742,  42125,  42126,  53895,  53896,
        53897,  53898,  53899,  60856,  71568,  71769,  77172,  77267,
        82542,  87814,  92188,  92725,  93109, 103019, 103025, 103026,
       103034, 103038, 103044, 103046, 103047, 103049, 103051, 103063,
       103064, 103902, 103909, 115520, 115688, 121448, 121454, 123601,
       123631, 123633, 125393, 135268, 139196, 145311, 145316, 145529,
       147840, 151131, 151212, 151213, 151217, 151218, 151220, 151224,
       151225, 151226, 151227, 151228, 151229, 151363, 151402, 151404,
       151407, 151564, 151889, 151890, 151892, 151893, 151894, 151898,
       151901, 151907, 151919, 151924, 151925, 151927, 151942, 165602,
       167005, 167012, 167013, 168799, 180855, 181271, 181301, 182189,
       182196, 187801, 187836, 216310, 219645, 228891, 234848, 234849,
       234899, 235471, 236584, 236585, 272810, 272844, 272846, 272847,
      

In [19]:
# Open the output file in 'w' mode (creates it, truncating any existing file).
# The handle stays open until the explicit close() call in a later cell.
h5_save = h5py.File('./latent3d_fsp.h5', 'w') 

In [20]:
# Persist the latent embedding, the RMSD values and the outlier indices as
# three separate datasets in the output file.
h5_save.create_dataset('cm_predict', data=cm_predict)  
h5_save.create_dataset('RMSD', data=RMSD_all) 
h5_save.create_dataset('outliers', data=outlier_list)  

<HDF5 dataset "outliers": shape (132,), type "<i8">

In [21]:
# Close the write handle before the file is reopened for reading below.
h5_save.close() 

In [22]:
# Reopen the saved file read-only to check its contents.
# NOTE(review): no matching close() for this handle is visible in this part of
# the notebook.
h5_save = h5py.File('./latent3d_fsp.h5', 'r') 

In [23]:
# List the stored datasets together with their shapes and dtypes.
h5_save.items()

[(u'RMSD', <HDF5 dataset "RMSD": shape (336000,), type "<f8">),
 (u'cm_predict', <HDF5 dataset "cm_predict": shape (336000, 3), type "<f4">),
 (u'outliers', <HDF5 dataset "outliers": shape (132,), type "<i8">)]