
Commit

new weights
jacquesboitreaud committed May 7, 2020
1 parent 8b06736 commit 738e165
Showing 13 changed files with 114 additions and 91 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,8 +1,8 @@
__pycache__
*.csv
runs
model_backups
data_curation
eval/plots
data/
results/
optim/bo_results
1 change: 1 addition & 0 deletions cbas/cbas.py
@@ -22,6 +22,7 @@
from rdkit import Chem

from utils import *
from dgl_utils import *
from model import model_from_json
from oracles import qed, deterministic_cdf_oracle, normal_cdf_oracle
from gen_train import GenTrain
28 changes: 28 additions & 0 deletions dgl_utils.py
@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
"""
Created on Thu May 7 18:42:15 2020
@author: jacqu
"""

import dgl

def send_graph_to_device(g, device):
    """
    Send a DGL graph and all of its node/edge features to the given device.

    :param g: dgl.DGLGraph
    :param device: torch device (e.g. 'cpu' or 'cuda')
    :return: the graph, with its features moved to `device`
    """
    g.set_n_initializer(dgl.init.zero_initializer)
    g.set_e_initializer(dgl.init.zero_initializer)

    # node features
    labels = g.node_attr_schemes()
    for l in labels.keys():
        g.ndata[l] = g.ndata.pop(l).to(device, non_blocking=True)

    # edge features
    labels = g.edge_attr_schemes()
    for l in labels.keys():
        g.edata[l] = g.edata.pop(l).to(device, non_blocking=True)
    return g
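
A minimal usage sketch (editor's addition, not part of the commit): the toy graphs and the feature names 'h' and 'w' are made up, and only illustrate how the helper above is meant to be called after batching.

import torch
import dgl

from dgl_utils import send_graph_to_device

device = 'cuda' if torch.cuda.is_available() else 'cpu'

g1, g2 = dgl.DGLGraph(), dgl.DGLGraph()
for g in (g1, g2):
    g.add_nodes(3)
    g.add_edges([0, 1], [1, 2])
    g.ndata['h'] = torch.randn(3, 16)   # toy node features
    g.edata['w'] = torch.ones(2, 1)     # toy edge features

batched = dgl.batch([g1, g2])           # one graph for the whole batch
batched = send_graph_to_device(batched, device)
print(batched.ndata['h'].device)        # features now live on `device`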
5 changes: 3 additions & 2 deletions docking/dock1smiles.py
@@ -25,7 +25,7 @@
pass

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", default=' O=CC=C(C1C2=NC=C(C=NC3=CC=CC=C3N=C2)C=C1F)NC=C',
parser.add_argument("-i", "--input", default='O=CC=C(C1C2=NC=C(C=NC3=CC=CC=C3N=C2)C=C1F)NC=C',
help="Smiles to dock")
parser.add_argument("-s", "--server", default='mac', help="Server to run the docking on, for path and configs.")
parser.add_argument("-e", "--ex", default=16, help="exhaustiveness parameter for vina")
@@ -34,5 +34,6 @@

PYTHONSH, VINA = set_path(args.server)

dock(smile=args.input, unique_id=1, pythonsh=PYTHONSH, vina=VINA, parallel=True, exhaustiveness=args.ex)
sc = dock(smile=args.input, unique_id=1, pythonsh=PYTHONSH, vina=VINA, parallel=True, exhaustiveness=args.ex)
print('Score :', sc)
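
For context, a hypothetical invocation of this script (editor's addition): the flags match the argparse definitions above, but the SMILES string, server name, and exhaustiveness value are placeholders.

python docking/dock1smiles.py -i "CC(=O)Oc1ccccc1C(=O)O" -s mac -e 16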

1 change: 1 addition & 0 deletions eval/diagnostic_plots.py
@@ -52,6 +52,7 @@

from eval.eval_utils import *
from utils import *
from dgl_utils import *

# Should be same as for training
properties = ['QED', 'logP', 'molWt']
1 change: 1 addition & 0 deletions model.py
@@ -34,6 +34,7 @@
from dgl.nn.pytorch.conv import GATConv, RelGraphConv

from utils import *
from dgl_utils import *


class MultiGRU(nn.Module):
45 changes: 36 additions & 9 deletions optim/BO.py
@@ -49,14 +49,19 @@
from dataloaders.molDataset import Loader
from model import Model, model_from_json
from utils import *
from BO_utils import get_fitted_model
from dgl_utils import *
from bo_utils import get_fitted_model
from docking.docking import dock, set_path

parser = argparse.ArgumentParser()

parser.add_argument( '--bo_name', help="Name for BO results subdir ",
default='first_bo')

parser.add_argument( '--name', help="saved model weights fname. Located in saved_models subdir",
default='kekule')
default='inference_default')
parser.add_argument('-n', "--n_steps", help="Nbr of optim steps", type=int, default=50)
parser.add_argument('-q', "--n_queries", help="Nbr of queries per step", type=int, default=50)
parser.add_argument('-q', "--n_queries", help="Nbr of queries per step", type=int, default=100)

parser.add_argument('-o', '--objective', default='aff_pred') # 'qed', 'aff', 'aff_pred'

@@ -72,8 +77,10 @@
The VAE, decoding, and affinity prediction with the MLP run on GPU; Gaussian process operations stay on CPU
(the GP training set may grow too large to fit on GPU after some optimization steps).
"""

soft_mkdir('bo_results')
soft_mkdir(os.path.join('bo_results',args.bo_name))

device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocab = 'selfies'
# Loader for initial sample
loader = Loader(props=[],
@@ -84,11 +91,12 @@
test_only=True)

# Load model (on gpu if available)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cuda' if torch.cuda.is_available() else 'cpu' # the model device
model = model_from_json(args.name)
model.to(device)
model.eval()

# Search space
d = model.l_size
dtype = torch.float
bounds = torch.tensor([[-3.0] * d, [3.0] * d], device='cpu', dtype=dtype)
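
As an aside (editor's sketch, not code from this commit): with these box bounds, one Bayesian-optimization step in latent space can be written with plain BoTorch calls. The repository fits its GP through bo_utils.get_fitted_model, so the GP and acquisition setup below is an assumption; train_z and train_obj stand for the latent points and scores accumulated further down in the script.

from botorch.models import SingleTaskGP
from botorch.fit import fit_gpytorch_model
from botorch.acquisition import qExpectedImprovement
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood

def bo_step(train_z, train_obj, bounds, q):
    # Fit a GP to the (latent point, score) pairs seen so far
    gp = SingleTaskGP(train_z, train_obj)
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(mll)
    # Maximize q-Expected Improvement inside the [-3, 3]^d latent box
    qEI = qExpectedImprovement(model=gp, best_f=train_obj.max())
    new_z, _ = optimize_acqf(acq_function=qEI, bounds=bounds, q=q,
                             num_restarts=10, raw_samples=256)
    return new_z  # candidates to decode into SMILES and score with the oracle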
@@ -115,7 +123,9 @@
elif args.objective == 'aff' :
PYTHONSH, VINA = set_path(args.server)
scores_init = -1* torch.tensor(df.drd3).view(-1,1).cpu() # careful, maximize -aff <=> minimize binding energy (negative value)


# Track results across BO iterations
sc_dict = {}
best_value = torch.max(scores_init).item()
best_observed.append(best_value)
train_obj = scores_init
@@ -204,12 +214,14 @@ def optimize_acqf_and_get_observation(acq_func, device):
if(args.verbose):
print(' oracle outputs:')
print(new_score.numpy())
sc_dict[iteration]=new_score.numpy()

# update training points

train_smiles+= new_smiles
train_z = torch.cat((train_z, new_z.cpu()), dim=0)
train_obj = torch.cat((train_obj, new_score), dim=0)
state_dict = GP_model.state_dict()

# update progress
avg_score = torch.mean(new_score).item()
@@ -219,9 +231,24 @@ def optimize_acqf_and_get_observation(acq_func, device):
idx = idx.item()
best_smiles = train_smiles[idx]

state_dict = GP_model.state_dict()


print(f'current best mol: {best_smiles}, with oracle score {best_value.item()}')
print(f'average score of fresh samples at iter {iteration}: {avg_score}')
print("\n")

# Save per-iteration scores
with open(os.path.join('bo_results', args.bo_name, 'sample_scores.pickle'), 'wb') as f:
    pickle.dump(sc_dict, f)

# Write the 100 best-scoring samples seen during the run
train_obj = train_obj.numpy().flatten()
idces = np.argsort(train_obj)[::-1][:100]  # indices of the highest scores

with open(os.path.join('bo_results', args.bo_name, 'top_samples.txt'), 'w') as f:
    for i in idces:
        f.write(f'{train_smiles[i]}, {train_obj[i]}\n')
print('Wrote top samples and scores to top_samples.txt.')
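
A small read-back sketch (editor's addition) showing how the files written above might be inspected afterwards; the directory name assumes the default --bo_name of 'first_bo'.

import os
import pickle
import numpy as np

bo_dir = os.path.join('bo_results', 'first_bo')    # default --bo_name

with open(os.path.join(bo_dir, 'sample_scores.pickle'), 'rb') as f:
    sc_dict = pickle.load(f)                       # {iteration: array of oracle scores}

for it in sorted(sc_dict):
    scores = np.asarray(sc_dict[it]).flatten()
    print(f'iter {it}: mean {scores.mean():.3f}, best {scores.max():.3f}')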





20 changes: 0 additions & 20 deletions optim/log.txt

This file was deleted.

79 changes: 41 additions & 38 deletions results/saved_models/inference_default/params.json
@@ -6,12 +6,12 @@
"load_iter": 0,
"decode": "selfies",
"build_alphabet": false,
"latent_size": 96,
"latent_size": 56,
"lr": 0.001,
"clip_norm": 50.0,
"beta": 0.0,
"step_beta": 0.002,
"max_beta": 0.1,
"max_beta": 0.5,
"warmup": 40000,
"processes": 20,
"batch_size": 64,
@@ -30,48 +30,47 @@
"bin_affs": false,
"features_dim": 16,
"num_rels": 4,
"l_size": 96,
"voc_size": 34,
"l_size": 56,
"voc_size": 33,
"max_len": 54,
"N_properties": 3,
"N_targets": 1,
"binned_scores": false,
"device": "cuda",
"index_to_char": {
"0": "[C]",
"1": "[epsilon]",
"2": "[#C]",
"3": "[=S]",
"4": "[s]",
"5": "[O]",
"0": "[Branch1_1]",
"1": "[Branch1_2]",
"2": "[Branch1_3]",
"3": "[Ring1]",
"4": "[Branch2_1]",
"5": "[Branch2_2]",
"6": "[Branch2_3]",
"7": "[Cl]",
"8": "[Expl-Ring1]",
"9": "[S]",
"10": "[N]",
"11": "[Ring1]",
"12": "[Branch1_2]",
"13": "[=c]",
"14": "[nHexpl]",
"15": "[o]",
"16": "[-c]",
"17": "[Branch2_1]",
"18": "[Branch1_3]",
"19": "[Expl-Ring2]",
"20": "[Br]",
"21": "[Hexpl]",
"22": "[#N]",
"23": "[Branch1_1]",
"24": "[Branch2_2]",
"25": "[=O]",
"26": "[-n]",
"27": "[=C]",
"28": "[=N]",
"29": "[n]",
"30": "[F]",
"31": "[Expl=Ring1]",
"32": "[Ring2]",
"33": "[c]"
"7": "[Ring2]",
"8": "[Branch3_1]",
"9": "[Branch3_2]",
"10": "[Branch3_3]",
"11": "[Ring3]",
"12": "[O]",
"13": "[=O]",
"14": "[N]",
"15": "[=N]",
"16": "[C]",
"17": "[=C]",
"18": "[#C]",
"19": "[S]",
"20": "[=S]",
"21": "[P]",
"22": "[F]",
"23": "[C@Hexpl]",
"24": "[C@@Hexpl]",
"25": "[C@expl]",
"26": "[C@@expl]",
"27": "[H]",
"28": "[NHexpl]",
"29": "[epsilon]",
"30": "[Cl]",
"31": "[#N]",
"32": "[Br]"
},
"props": [
"QED",
@@ -85,5 +84,9 @@
"properties": true,
"target": true,
"parallel": false,
"gcn_layers": 3
"gcn_layers": 3,
"load_name": "default",
"no_props": false,
"no_aff": false,
"gcn_hdim": 32
}
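
For illustration (editor's addition): the index_to_char table above maps model output indices to SELFIES tokens from an older selfies release. A decoding sketch, assuming the selfies package is installed and treating [epsilon] as the padding/'nothing' token; the index sequence is made up, only the token strings come from params.json.

import json
import selfies as sf

with open('results/saved_models/inference_default/params.json') as f:
    params = json.load(f)
index_to_char = params['index_to_char']            # JSON keys are strings

indices = [16, 16, 12]                             # hypothetical sample: [C][C][O]
tokens = [index_to_char[str(i)] for i in indices]
selfies_str = ''.join(t for t in tokens if t != '[epsilon]')
print(sf.decoder(selfies_str))                     # expected: 'CCO' (ethanol)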
Binary file modified results/saved_models/inference_default/weights.pth
Binary file not shown.
1 change: 1 addition & 0 deletions train.py
@@ -34,6 +34,7 @@
sys.path.append(script_dir)

from utils import *
from dgl_utils import *
from model import Model
from loss_func import VAELoss, weightedPropsLoss, affsRegLoss, affsClassifLoss
from dataloaders.molDataset import molDataset, Loader
1 change: 1 addition & 0 deletions train_triplets.py
@@ -33,6 +33,7 @@
from loss_func import Loss, RecLoss, tripletLoss
from dataloaders.tripletsDataset import Loader
from utils import *
from dgl_utils import *

if __name__ == "__main__":

21 changes: 0 additions & 21 deletions utils.py
@@ -8,7 +8,6 @@
"""

import numpy as np
import dgl
import torch
import pandas as pd

@@ -174,26 +173,6 @@ def debug_memory():
print('{}\t{}'.format(*line))


def send_graph_to_device(g, device):
"""
Send dgl graph to device
:param g: :param device:
:return:
"""
g.set_n_initializer(dgl.init.zero_initializer)
g.set_e_initializer(dgl.init.zero_initializer)

# nodes
labels = g.node_attr_schemes()
for l in labels.keys():
g.ndata[l] = g.ndata.pop(l).to(device, non_blocking=True)

# edges
labels = g.edge_attr_schemes()
for i, l in enumerate(labels.keys()):
g.edata[l] = g.edata.pop(l).to(device, non_blocking=True)
return g


# ============== Smiles handling utils ===============================

