In [1]:
from __future__ import print_function

In [2]:
from glob import glob
import numpy as np
import sys, os, h5py, time, errno
import GPUtil
import MDAnalysis as mda
from sklearn.cluster import DBSCAN

from CVAE import CVAE 
from utils import start_rabbit, start_worker, start_flower_monitor, read_h5py_file, cm_to_cvae, job_on_gpu
from utils import find_frame, write_pdb_frame, make_dir_p, outliers_from_cvae
from utils import omm_job, cvae_job 

Using TensorFlow backend.


In [3]:
GPU_ids = [gpu.id for gpu in GPUtil.getGPUs()] 
print('Available GPUs', GPU_ids) 

Available GPUs [0, 1]


In [4]:
top_file = None
pdb_file = os.path.abspath('./pdb/100-fs-peptide-400K.pdb')

In [5]:
# number of cvae jobs 
n_cvae = 1 

# logs for scheudler

In [6]:
work_dir = os.path.abspath('./')
log_dir = os.path.join(work_dir, 'scheduler_logs') 
make_dir_p(log_dir)

In [7]:
rabbitmq_log = os.path.join(log_dir, 'rabbit_server_log.txt') 
start_rabbit(rabbitmq_log)
time.sleep(5)

celery_worker_log = os.path.join(log_dir, 'celery_worker_log.txt') 
start_worker(celery_worker_log)
start_flower_monitor() 
print('Waiting 10 seconds for the server to stablize.')
time.sleep(10)

Waiting 10 seconds for the server to stablize.


## Jobs
* Assign job_id according to the available GPUs on the board. 
* Start the simulation according the job_labels containing job_id and gpu_id for individual jobs 

In [8]:
jobs = []
for gpu_id in GPU_ids: 
    job = omm_job(job_id=int(time.time()), gpu_id=gpu_id, top_file=top_file, pdb_file=pdb_file)
    job.start() 
    print('haha')
    jobs.append(job) 
    time.sleep(2)

haha
haha


In [9]:
jobs

[<utils.omm_job at 0x7f4ccb88c310>, <utils.omm_job at 0x7f4ccb827a90>]

# Read the output h5 files

In [7]:
cm_files = sorted(glob('store*/*_cm.h5')) 

cm_data_lists = [read_h5py_file(cm_file) for cm_file in cm_files] 

In [8]:
cm_files

['store_omm_run1542409149/output_cm.h5',
 'store_omm_run1542409151/output_cm.h5']

# Get updates from h5 file
Once every minute

In [9]:
frame_number = lambda lists: sum([cm.shape[1] for cm in lists])
frame_number(cm_data_lists)

25220

In [10]:
frame_marker = 0 
while frame_number(cm_data_lists) < 1000: 
    for cm in cm_data_lists: 
        cm.refresh() 
    if frame_number(cm_data_lists) > frame_marker: 
        print('Current number of frames from OpenMM:', frame_number(cm_data_lists)) 
        frame_marker = int((10000 + frame_marker) / 10000) * 10000
        print('    Next report at frame', frame_marker) 
#     time.sleep(600)

# All contact to h5

In [11]:
cvae_input = cm_to_cvae(cm_data_lists)

In [12]:
cm_data_lists

[<HDF5 dataset "contact_maps": shape (1326, 12972), type "<f4">,
 <HDF5 dataset "contact_maps": shape (1326, 12248), type "<f4">]

In [13]:
train_data_length = [ cm_data.shape[1] for cm_data in cm_data_lists]

omm_log = os.path.join(log_dir, 'openmm_log.txt') 

log = open(omm_log, 'w') 

for i, n_frame in enumerate(train_data_length): 
    log.writelines("{} {}\n".format(cm_files[i], n_frame))    
log.close()

In [14]:
cvae_input = cm_to_cvae(cm_data_lists)

cvae_input_dir = os.path.join(work_dir, 'cvae_input') 
make_dir_p(cvae_input_dir)

cvae_input_file = os.path.join(cvae_input_dir, 'cvae_input.h5')
cvae_input_save = h5py.File(cvae_input_file, 'w')
cvae_input_save.create_dataset('contact_maps', data=cvae_input)
cvae_input_save.close() 

# CVAE

In [None]:
hyper_dims = np.array(range(n_cvae)) + 3

In [None]:
cvae_jobs = []
for i in range(n_cvae): 
    cvae_j = cvae_job(time.time(), i, cvae_input_file, hyper_dim=3) 
    stop_jobs = job_on_gpu(i, jobs) 
    stop_jobs.stop()  
    cvae_j.start() 
    jobs.append(cvae_j) 
    cvae_jobs.append(cvae_j)
    time.sleep(2)

In [None]:
cvae_j.job.status

In [20]:
while [cvae_j.job.status for cvae_j in cvae_jobs] != u'SUCCESS': 
    time.sleep(10)
print('CVAE done. ') 
model_weight, model_save = cvae_j.job.result 

CVAE done. 


In [25]:
[u'SUCCESS', u'SUCCESS', u'SUCCESS', u'SCESS'] == u'SUCCESS'

False

# Identifier base on CVAE result

In [5]:
model_weight = 'cvae_model_3_1542664915/cvae_weight.h5'

In [6]:
cm_h5 = h5py.File('cvae_input/cvae_input.h5', 'r', libver='latest', swmr=True)
cvae_input = cm_h5[u'contact_maps'] 

In [7]:
outlier_list = []
for _ in range(2): 
    outliers = outliers_from_cvae(model_weight, cvae_input,hyper_dim=3)
    outlier_list.append(outliers)
    
outlier_list = np.unique(np.array(outlier_list).flatten())

# Find the frame

In [8]:
traj_info = open('./scheduler_logs/openmm_log.txt', 'r').read().split()

traj_dict = dict(zip(traj_info[::2], np.array(traj_info[1::2]).astype(int)))

traj_dict

outliers_pdb = './outliers_pdb'
make_dir_p(outliers_pdb)

for outlier in outlier_list: 
    traj_file, frame_number = find_frame(traj_dict, outlier) 
    outlier_pdb_file = os.path.join(outliers_pdb, '%d.pdb' % outlier)
    outlier_pdb = write_pdb_frame(traj_file, pdb_file, frame_number, outlier_pdb_file) 