In [1]:
from __future__ import print_function

In [2]:
from collections import OrderedDict
from glob import glob
import numpy as np
import sys, os, h5py, time, errno
import GPUtil
import MDAnalysis as mda
from sklearn.cluster import DBSCAN

from CVAE import CVAE 
from utils import start_rabbit, start_worker, start_flower_monitor, read_h5py_file, cm_to_cvae, job_on_gpu
from utils import find_frame, write_pdb_frame, make_dir_p, outliers_from_cvae, job_list
from utils import omm_job, cvae_job 

Using TensorFlow backend.


In [3]:
# Query GPUtil for the visible GPUs and keep only their ids.
detected_gpus = GPUtil.getGPUs()
GPU_ids = [device.id for device in detected_gpus]
print('Available GPUs', GPU_ids)

Available GPUs [0, 1]


In [4]:
# No topology file is supplied; None is passed as top_file to every omm_job.
top_file = None
# Absolute path of the PDB structure given to the omm jobs and reused when
# writing outlier frames at the end of the notebook.
pdb_file = os.path.abspath('./pdb/100-fs-peptide-400K.pdb')

In [5]:
# Number of CVAE jobs to launch; one cvae_job is created per index in range(n_cvae).
n_cvae = 1 

# Logs for the scheduler

In [6]:
# Everything is written relative to the directory the notebook is run from.
work_dir = os.path.abspath('./')
# Directory that collects the log files written by later cells.
log_dir = os.path.join(work_dir, 'scheduler_logs') 
make_dir_p(log_dir)

In [7]:
# Bring up the job-scheduling services through the project helpers in utils.
# Each helper that takes a path is given a log file inside log_dir.
rabbitmq_log = os.path.join(log_dir, 'rabbit_server_log.txt') 
start_rabbit(rabbitmq_log)
# Pause before starting the worker -- presumably so the server started above
# is ready to accept connections; TODO confirm against utils.start_rabbit.
time.sleep(5)

celery_worker_log = os.path.join(log_dir, 'celery_worker_log.txt') 
start_worker(celery_worker_log)
start_flower_monitor() 
# Fixed grace period before any job is submitted.
print('Waiting 10 seconds for the server to stablize.')
time.sleep(10)

Waiting 10 seconds for the server to stablize.


## Jobs
* Assign job_id according to the available GPUs on the board. 
* Start the simulation according to the job_labels containing job_id and gpu_id for individual jobs 

In [8]:
# Container (from utils) that tracks every job created in this notebook; later
# cells append to it and query it by GPU id, running state and CVAE job type.
jobs = job_list()

In [9]:
# Start one omm_job per detected GPU and register it in `jobs`.
for gpu_id in GPU_ids: 
    # The current Unix time (whole seconds) doubles as the job id.
    job = omm_job(job_id=int(time.time()), gpu_id=gpu_id, top_file=top_file, pdb_file=pdb_file)
    job.start()
    # NOTE(review): leftover debug print; it is what produces the recorded 'haha' output.
    print('haha')
    jobs.append(job) 
    # The 2 s pause also guarantees the next int(time.time()) job id differs.
    time.sleep(2)

haha
haha


In [10]:
# NOTE(review): near-copy of the previous cell with job.start() commented out.
# It creates one more omm_job per GPU and appends it to `jobs` WITHOUT starting
# it, so `jobs` ends up holding entries that were never launched. Looks like a
# leftover experiment -- confirm whether this cell should exist at all.
for gpu_id in GPU_ids: 
    job = omm_job(job_id=int(time.time()), gpu_id=gpu_id, top_file=top_file, pdb_file=pdb_file)
#     job.start()
    # NOTE(review): leftover debug print.
    print('haha')
    jobs.append(job) 
    time.sleep(2)

haha
haha


# Read the output h5 files

In [11]:
# Collect every contact-map file below the omm* run directories, in sorted order.
cm_files = glob('omm*/*_cm.h5')
cm_files.sort()

# Open each file with the project reader, keeping the same order as cm_files.
cm_data_lists = []
for cm_file in cm_files:
    cm_data_lists.append(read_h5py_file(cm_file))

In [12]:
# Display the contact-map files that were picked up by the glob above.
cm_files

['omm_run1543423738/output_cm.h5',
 'omm_run1543423740/output_cm.h5',
 'omm_run1543428029/output_cm.h5',
 'omm_run1543428031/output_cm.h5',
 'omm_run_1543432197/output_cm.h5',
 'omm_run_1543432199/output_cm.h5']

# Get updates from h5 file
Once every minute

In [13]:
def frame_number(lists):
    """Return the total size of axis 1 summed over every item in `lists`.

    Each item only needs a `.shape` attribute; an empty `lists` gives 0.
    """
    return sum(cm.shape[1] for cm in lists)
# Total number of frames currently held across all opened contact-map files.
frame_number(cm_data_lists)

3257

In [14]:
# Poll the contact-map files until enough frames have been produced.
min_frames = 500     # frames required before moving on
report_every = 100   # report progress each time another 100 frames have arrived
poll_seconds = 60    # pause between polls so the loop does not busy-wait on the files

frame_marker = 0
while frame_number(cm_data_lists) < min_frames:
    for cm in cm_data_lists:
        cm.refresh()
    n_frames = frame_number(cm_data_lists)
    if n_frames > frame_marker:
        print('Current number of frames from OpenMM:', n_frames)
        # Derive the next threshold from the current count so that it is always
        # ahead of it, even when more than `report_every` frames arrive at once.
        frame_marker = (n_frames // report_every + 1) * report_every
        print('    Next report at frame', frame_marker)
    if n_frames < min_frames:
        time.sleep(poll_seconds)

# Write all contact maps to h5

In [15]:
# Number of frames contributed by each contact-map file, in cm_files order.
train_data_length = [cm_data.shape[1] for cm_data in cm_data_lists]

omm_log = os.path.join(log_dir, 'openmm_log.txt')

# One "<file> <n_frames>" line per trajectory; this file is read back later to
# map frame indices to trajectories. The `with` block closes the file even if
# a write fails.
with open(omm_log, 'w') as log:
    for cm_file, n_frame in zip(cm_files, train_data_length):
        log.write("{} {}\n".format(cm_file, n_frame))

In [16]:
# Convert the collected contact maps into the array used as CVAE input.
cvae_input = cm_to_cvae(cm_data_lists)

cvae_input_dir = os.path.join(work_dir, 'cvae_input')
make_dir_p(cvae_input_dir)

# Store the array as dataset 'contact_maps'. The context manager closes the
# HDF5 file even if create_dataset raises.
cvae_input_file = os.path.join(cvae_input_dir, 'cvae_input.h5')
with h5py.File(cvae_input_file, 'w') as cvae_input_save:
    cvae_input_save.create_dataset('contact_maps', data=cvae_input)

In [17]:
# Shape of the array that was just written to cvae_input.h5.
cvae_input.shape

(3257, 22, 22, 1)

# CVAE

In [18]:
# One value per CVAE job: 3, 4, ..., n_cvae + 2.
hyper_dims = np.array(range(n_cvae)) + 3

In [20]:
# Launch n_cvae CVAE jobs. Before each one starts, the job that `jobs` reports
# for the same index is stopped.
for i in range(n_cvae):
    # Take the value prepared in hyper_dims instead of a fixed 3 (the two agree
    # when n_cvae == 1); int() turns the numpy scalar into a plain Python int.
    cvae_j = cvae_job(time.time(), i, cvae_input_file, hyper_dim=int(hyper_dims[i]))
    stop_jobs = jobs.get_job_from_gpu_id(i)
    stop_jobs.stop()
    time.sleep(2)
    # NOTE(review): leftover debug print, kept so the recorded output stays valid.
    print('hehre')
    cvae_j.start()
    jobs.append(cvae_j)
    time.sleep(2)

hehre


In [21]:
# Display every job registered so far.
jobs

[<utils.omm_job at 0x7ff638e8a690>,
 <utils.omm_job at 0x7ff638e21dd0>,
 <utils.omm_job at 0x7ff638d827d0>,
 <utils.omm_job at 0x7ff6978a9e10>,
 <utils.cvae_job at 0x7ff634668ad0>]

In [22]:
# Look up the job that `jobs` reports for GPU id 0.
jobs.get_job_from_gpu_id(0) 

<utils.cvae_job at 0x7ff634668ad0>

In [23]:
# Display the jobs that `jobs` currently reports as running.
jobs.get_running_jobs()

[<utils.omm_job at 0x7ff638e21dd0>, <utils.cvae_job at 0x7ff634668ad0>]

In [27]:
# NOTE(review): same lookup as an earlier cell; the execution count is out of
# order, so the recorded output comes from a different run.
jobs.get_job_from_gpu_id(0)

<utils.cvae_job at 0x7f4e3d030b90>

In [24]:
# GPUs that no running job is using. Building the list by filtering avoids the
# ValueError that list.remove() raises when two running jobs report the same
# gpu_id or a job reports an id that is not in GPU_ids.
busy_gpus = set(job.gpu_id for job in jobs.get_running_jobs())
avail_gpu = [gid for gid in GPU_ids if gid not in busy_gpus]

avail_gpu

[]

In [27]:
# Current status string of every CVAE job.
[cvae_j.job.status for cvae_j in jobs.get_cvae_jobs()]

[u'SUCCESS']

In [34]:
# Block until every CVAE job reports SUCCESS. A job that ends in FAILURE or
# REVOKED can never reach SUCCESS, so raise instead of polling forever.
while True:
    statuses = [cvae_j.job.status for cvae_j in jobs.get_cvae_jobs()]
    if all(status == u'SUCCESS' for status in statuses):
        break
    if any(status in (u'FAILURE', u'REVOKED') for status in statuses):
        raise RuntimeError('CVAE job(s) ended without success: {}'.format(statuses))
    time.sleep(.5)
print('CVAE done. ')

CVAE done. 


In [33]:
# Scratch check: a list of three 'SUCCESS' strings equals [u'SUCCESS'] * 3,
# the comparison form used by the wait loop.
[u'SUCCESS', u'SUCCESS', u'SUCCESS'] == [u'SUCCESS'] * 3

True

# Identify outliers based on the CVAE result

In [5]:
# NOTE(review): hard-coded weight file from an earlier run; no later cell in
# this view reads model_weight, and the execution count is out of order.
model_weight = 'cvae_model_3_1542664915/cvae_weight.h5'

In [6]:
# Re-open the saved CVAE input read-only (SWMR mode) and rebind `cvae_input`
# to the on-disk 'contact_maps' dataset instead of the in-memory array.
# NOTE(review): cm_h5 is never closed in this view; the dataset is read in a
# later cell, so it has to stay open until then. The relative path duplicates
# cvae_input_file from the cell that wrote the file.
cm_h5 = h5py.File('cvae_input/cvae_input.h5', 'r', libver='latest', swmr=True)
cvae_input = cm_h5[u'contact_maps'] 

In [35]:
# Gather the outliers reported for each CVAE model, one flat array per model.
outliers_per_model = []
for cvae_j in jobs.get_cvae_jobs():
    outliers = outliers_from_cvae(cvae_j.job.result[0], cvae_input, hyper_dim=cvae_j.hyper_dim, eps=0.35)
    outliers_per_model.append(np.ravel(outliers))

# Concatenate before de-duplicating: models may report different numbers of
# outliers, and np.array() on such a ragged list does not give a flat array.
if outliers_per_model:
    outlier_list = np.unique(np.concatenate(outliers_per_model))
else:
    outlier_list = np.array([], dtype=int)

In [36]:
# Display the sorted, de-duplicated outlier values.
outlier_list 

array([  50,   51,   63,  190,  277,  314,  316,  324,  327,  334,  335,
        338,  341,  355,  356,  484,  510,  530,  533,  539,  540,  541,
        555,  558,  577,  580,  609,  610,  624,  625,  659,  660,  676,
        677,  678,  679,  693,  699,  702,  703,  704,  707,  709,  710,
        711,  734,  742,  743,  746,  749,  750,  770,  771,  777,  778,
        798,  832,  840,  868,  950,  952,  953,  954,  956,  957,  958,
        959,  960,  961,  962,  963,  965,  966,  967,  968,  972,  994,
        999, 1000, 1015, 1022, 1023, 1024, 1025, 1027, 1050, 1053, 1055,
       1069, 1078, 1084, 1085, 1086, 1088, 1089, 1117, 1122, 1128, 1129,
       1130, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1159, 1160, 1161,
       1162, 1163, 1165, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174,
       1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185,
       1186, 1187, 1188, 1189, 1192, 1193, 1206, 1208, 1219, 1224, 1235,
       1237, 1239, 1240, 1242, 1272, 1273, 1276, 12

# Find the frame

In [37]:
# Read back the "<file> <n_frames>" pairs written to the OpenMM log.
with open('./scheduler_logs/openmm_log.txt', 'r') as traj_log:
    traj_info = traj_log.read().split()

# file name -> frame count. An OrderedDict keeps the files in logged order,
# which a plain dict does not guarantee on Python 2.
# NOTE(review): assumes find_frame walks the mapping in order to locate a
# global frame index -- confirm against utils.find_frame.
traj_dict = OrderedDict(zip(traj_info[::2], np.array(traj_info[1::2]).astype(int)))

outliers_pdb = './outliers_pdb'
make_dir_p(outliers_pdb)

# Write one PDB file per outlier, named after the outlier's value.
for outlier in outlier_list:
    # Named frame_index so the loop does not overwrite the notebook-level
    # frame_number function used by the polling cell.
    traj_file, frame_index = find_frame(traj_dict, outlier)
    outlier_pdb_file = os.path.join(outliers_pdb, '%d.pdb' % outlier)
    outlier_pdb = write_pdb_frame(traj_file, pdb_file, frame_index, outlier_pdb_file)