# simple_job_models.py
import pandas as pd
import numpy as np
import argparse
import glob
import time
import os

try:
    import __builtin__
except ImportError:
    # Python 3
    import builtins as __builtin__

# For debugging purposes only: tag every print from this script
def print(*args, **kwargs):
    __builtin__.print('\t simple_jobs: ', end="")
    return __builtin__.print(*args, **kwargs)

START_TIME = time.time()
parser = argparse.ArgumentParser()
parser.add_argument('-n_it','--iteration_no',required=True,help='Number of current iteration')
parser.add_argument('-mdd','--morgan_directory',required=True,help='Path to the Morgan fingerprint directory for the database')
parser.add_argument('-time','--time',required=True,help='Time limit for training')
parser.add_argument('-file_path','--file_path',required=True,help='Path to the project directory, including project directory name')
parser.add_argument('-nhp','--number_of_hyp',required=True,help='Number of hyperparameters')
parser.add_argument('-titr','--total_iterations',required=True,help='Desired total number of iterations')
parser.add_argument('-isl','--is_last',required=True,help='True/False for is this last iteration')
parser.add_argument('-gp','--gpu_part',required=True,help='name(s) of GPU partitions')
parser.add_argument('-tf_e','--tensorflow_env',required=True,help='name of tensorflow-gpu environment')
# adding parameter for where to save all the data to:
parser.add_argument('-save', '--save_path', required=False, default=None)
# allowing for variable number of molecules to test and validate from:
parser.add_argument('-n_mol', '--number_mol', required=False, default=1000000, help='Size of validation/test set to be used')
parser.add_argument('-pfm', '--percent_first_mols', required=False, default=-1, help='% of top scoring molecules to be considered as virtual hits in the first iteration (for standard DD run on 11 iterations, we recommend 1)') # these two inputs must be percentages
parser.add_argument('-plm', '--percent_last_mols', required=False, default=-1, help='% of top scoring molecules to be considered as virtual hits in the last iteration (for standard DD run on 11 iterations, we recommend 0.01)')
# Pass the threshold
parser.add_argument('-ct', '--recall', required=False, default=0.9, help='Recall, [0,1] range, default value 0.9')
funct_flags = parser.add_mutually_exclusive_group(required=False)
funct_flags.add_argument('-expdec', '--exponential_dec', required=False, default=-1) # must pass in the base number
funct_flags.add_argument('-polydec', '--polynomial_dec', required=False, default=-1) # You must also pass in to what power for this flag
io_args, extra_args = parser.parse_known_args()
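# parse_known_args() splits recognized flags into io_args; anything unrecognized
# lands in extra_args and is forwarded verbatim to progressive_docking.py below.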
n_it = int(io_args.iteration_no)
mdd = io_args.morgan_directory
time_model = io_args.time
nhp = int(io_args.number_of_hyp)
isl = io_args.is_last
titr = int(io_args.total_iterations)
rec = float(io_args.recall)
gpu_part = str(io_args.gpu_part)
env = str(io_args.tensorflow_env)
protein = str(io_args.file_path).split('/')[-1]
num_molec = int(io_args.number_mol)
percent_first_mols = float(io_args.percent_first_mols)/100
percent_last_mols = float(io_args.percent_last_mols)/100
exponential_dec = int(io_args.exponential_dec)
polynomial_dec = int(io_args.polynomial_dec)
DATA_PATH = io_args.file_path # Now == file_path/protein
SAVE_PATH = io_args.save_path
# if no save path is provided we just save it in the same location as the data
if SAVE_PATH is None: SAVE_PATH = DATA_PATH
# Sum the first column and divide by one million: this gives the total database
# size in millions of compounds (each row of Mol_ct_file is the compound count of one file)
t_mol = pd.read_csv(mdd+'/Mol_ct_file_%s.csv'%protein, header=None)[[0]].sum()[0]/1000000
cumulative = 0.25*n_it
dropout = [0.2, 0.5]
learn_rate = [0.0001]
bin_array = [2, 3]
wt = [2, 3]
if nhp < 144:
    bs = [256]
else:
    bs = [128, 256]

if nhp < 48:
    oss = [10]
elif nhp < 72:
    oss = [5, 10]
else:
    oss = [5, 10, 20]

if nhp < 24:
    num_units = [1500, 2000]
else:
    num_units = [100, 1500, 2000]
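
# The tiers above size the hyperparameter grid enumerated later: the number of
# combinations is len(oss) * len(bs) * len(num_units) * 8 (dropout 2 x
# learn_rate 1 x bin_array 2 x wt 2), i.e. 16, 24, 48, 72, or 144 depending on nhp.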
try:
    os.mkdir(SAVE_PATH+'/iteration_'+str(n_it)+'/simple_job')
except OSError:  # catching "directory already exists" error
    pass

# Clearing up space from the previous iteration
for f in glob.glob(SAVE_PATH+'/iteration_'+str(n_it)+'/simple_job/*'):
    os.remove(f)
scores_val = []
with open(DATA_PATH+'/iteration_'+str(1)+'/validation_labels.txt', 'r') as ref:
    ref.readline()  # first line is ignored (header)
    for line in ref:
        scores_val.append(float(line.rstrip().split(',')[0]))
scores_val = np.array(scores_val)
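# Defaults: with -pfm/-plm unset (-1), first_mols scales with database size
# (t_mol is in millions of compounds) and last_mols falls back to a flat 100.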
first_mols = int(100*t_mol/13) if percent_first_mols == -1.0 else int(percent_first_mols * len(scores_val))
print(first_mols)
last_mols = 100 if percent_last_mols == -1.0 else int(percent_last_mols * len(scores_val))
print(last_mols)
if n_it == 1:
    # 'good_mol' is the number of top-scoring molecules to save at the end of the iteration
    good_mol = first_mols
else:
    if exponential_dec != -1:
        good_mol = int()  # TODO: create functions for these
    elif polynomial_dec != -1:
        good_mol = int()
    else:
        good_mol = int(((last_mols-first_mols)*n_it + titr*first_mols-last_mols)/(titr-1))  # linear decrease as iterations increase

print(isl)
# If this is the last iteration, then we save only 100 molecules (or the requested percentage)
if isl == 'True':
    good_mol = 100 if percent_last_mols == -1.0 else int(percent_last_mols * len(scores_val))
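# Note: the linear schedule interpolates good_mol from first_mols at n_it == 1
# down to last_mols at n_it == titr (before the last-iteration override above).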
cf_start = np.mean(scores_val)  # start the cutoff at the mean docking score (label) of the validation set
t_good = len(scores_val)
# Decrease the threshold until the desired number of molecules is left below it
while t_good > good_mol:
    cf_start -= 0.005
    t_good = len(scores_val[scores_val < cf_start])
print('Threshold (cutoff):',cf_start)
print('Molec under threshold:', t_good)
print('Goal molec:', good_mol)
print('Total molec:', len(scores_val))
all_hyperparas = []
for o in oss:  # Over Sample Size
    for batch in bs:
        for nu in num_units:
            for do in dropout:
                for lr in learn_rate:
                    for ba in bin_array:
                        for w in wt:  # Weight
                            all_hyperparas.append([o, batch, nu, do, lr, ba, w, cf_start])
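
# Each hyperparameter combination becomes one SLURM job script below; the shared
# score cutoff cf_start rides along as the last element of every entry.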
print('Total hyp:', len(all_hyperparas))
# Creating all the jobs for each hyperparameter combination:
other_args = ' '.join(extra_args) + ' -rec {} -n_it {} -t_mol {} --data_path {} --save_path {} -n_mol {}'.format(rec, n_it, t_mol, DATA_PATH, SAVE_PATH, num_molec)  # leading space keeps -rec from fusing with the last extra arg
print(other_args)
count = 1
for i in range(len(all_hyperparas)):
    with open(SAVE_PATH+'/iteration_'+str(n_it)+'/simple_job/simple_job_'+str(count)+'.sh', 'w') as ref:
        ref.write('#!/bin/bash\n')
        ref.write('#SBATCH --ntasks=1\n')
        ref.write('#SBATCH --gres=gpu:1\n')
        ref.write('#SBATCH --cpus-per-task=1\n')
        ref.write('#SBATCH --job-name=phase_4\n')
        ref.write('#SBATCH --mem=0  # memory per node\n')
        ref.write('#SBATCH --partition=%s\n' % gpu_part)
        ref.write('#SBATCH --time='+time_model+'  # time (DD-HH:MM)\n')
        ref.write('\n')
        cwd = os.getcwd()
        ref.write('cd {}/scripts_2\n'.format(cwd))
        hyp_args = '-os {} -bs {} -num_units {} -dropout {} -learn_rate {} -bin_array {} -wt {} -cf {}'.format(*all_hyperparas[i])
        ref.write('source ~/.bashrc\n')
        ref.write('conda activate %s\n' % env)
        ref.write('python -u progressive_docking.py ' + hyp_args + ' ' + other_args)
        ref.write("\n echo complete")
    count += 1
print('Runtime:', time.time() - START_TIME)
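
# Example invocation (a sketch; all values, paths, partition and environment
# names below are hypothetical):
#   python simple_job_models.py -n_it 1 -mdd /path/to/morgan_dir -time 00-04:00 \
#       -file_path /path/to/project/protein_name -nhp 144 -titr 11 -isl False \
#       -gp gpu_partition_name -tf_e tensorflow_gpu_env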