In [1]:
import logging
from datetime import datetime

import numpy as np
import torch
from rdkit import Chem,RDLogger
from torch_geometric.data import DataLoader

from catemb.data import CatDataset,get_idx_split
from catemb.encoder import EquiformerV2,DimeNetPP
from catemb.model import CL2D3DMol
from catemb.train import CLTrain
from catemb.utils import setup_logger
# Turn off RDKit's 'rdApp.*' log channels up front so later cells stay quiet.
RDLogger.DisableLog('rdApp.*')
# Periodic-table handle obtained from RDKit, kept as a notebook-wide global.
pt = Chem.GetPeriodicTable()
# Seed value kept as a notebook-wide global for reproducible experiments.
random_seed = 42

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the dataset and pull one batch to experiment with in the cells below.
dataset = CatDataset(root="../dataset/processed",name="lig_cat_dataset_xtbopt_178674",trunc=5000,save_smiles=True,read_coord=True)
# Seed the shuffle with `random_seed` so the inspected batch is the same on every
# Restart & Run All; the seed was previously defined but never used, which made
# the sampled batch (and every number printed below) change between runs.
dataloader = DataLoader(dataset,batch_size=8,shuffle=True,
                        generator=torch.Generator().manual_seed(random_seed))
# Take the first batch directly. Unlike the for/break form, this raises
# StopIteration on an empty loader instead of silently leaving `data` unbound.
data = next(iter(dataloader))

In [3]:
# Display the `E` values carried by the sampled batch (one entry per item in the batch of 8).
data.E

tensor([-13.7421,  -6.9186,  -7.0671, -14.3649, -18.4212,  -8.4835, -16.0217,
        -11.3172])

## 构造模型 (Build the model)

In [12]:
from torch_scatter import scatter

In [5]:
# Instantiate the DimeNetPP encoder with its default constructor arguments.
model1 = DimeNetPP()

In [6]:
# Instantiate the EquiformerV2 encoder with its default constructor arguments.
model2 = EquiformerV2()

In [20]:
# Run a forward pass of a 3D encoder on the sampled batch.
# The original cell called `model`, which no visible cell defines (only `model1`
# and `model2` exist), so it only worked thanks to leftover kernel state and
# fails with NameError on a fresh kernel.
# NOTE(review): binding `model` to the DimeNetPP instance is an assumption based on
# the three-tensor call (first column of x, coordinates, batch index) — confirm,
# or point `model` at `model2` if that was the intended encoder.
model = model1
v = model(data.x[:,0],data.mol_coords,data.batch)

In [27]:
from torch.nn import MSELoss

In [28]:
# Mean-squared-error criterion with default settings.
lossfunc = MSELoss()

In [29]:
# MSE between the model's `energy` attribute and the batch's `E` values.
# NOTE(review): `model.energy` is presumably populated by the forward pass above — confirm.
lossfunc(model.energy,data.E)

tensor(9454.3105, grad_fn=<MseLossBackward0>)

In [25]:
# Display the `energy` tensor currently stored on the model.
model.energy

tensor([1.8257, 1.6685, 1.1002, 2.0952, 2.3191, 1.6585, 0.7852, 1.1253],
       grad_fn=<DivBackward0>)

In [26]:
# Display the batch's `E` values again for side-by-side comparison with `model.energy`.
data.E

tensor([ -78.4892, -110.5388, -116.8915, -146.1047,  -84.7115,  -49.8759,
         -96.3153,  -31.2736])

RuntimeError: The expanded size of the tensor (128) must match the existing size (465) at non-singleton dimension 1.  Target sizes: [465, 128].  Tensor sizes: [1, 465]

## Old

In [2]:
# ---------------------------------------------------------------------------
# Run configuration: every tunable lives in one of the dictionaries below.
# ---------------------------------------------------------------------------

# Dataset location, truncation, split ratios and batch size.
param_data = dict(
    root="../dataset/processed",
    name="lig_cat_dataset",
    trunc=0,
    train_ratio=0.9,
    valid_ratio=0.1,
    batch_size=8,
)

# Settings for the 2D (graph) branch.
param_2d = dict(
    gnum_layer=4,
    emb_dim=128,
    dest_dim=64,
    gnn_aggr="add",
    bond_feat_red="mean",
    gnn_type="gcn",
    JK="last",
    drop_ratio=0.0,
    node_readout="sum",
)

# Settings for the 3D branch (model_type "equif").
param_3d = dict(
    model_type="equif",
    use_pbc=False,
    regress_forces=False,
    otf_graph=True,
    max_neighbors=64,
    max_radius=5.0,
    num_layers=6,
    sphere_channels=128,
    attn_hidden_channels=128,
    num_heads=4,
    attn_alpha_channels=32,
    attn_value_channels=16,
    ffn_hidden_channels=64,
    norm_type="rms_norm_sh",
    lmax_list=[4],
    mmax_list=[2],
    grid_resolution=None,
    num_sphere_samples=128,
    edge_channels=128,
    use_atom_edge_embedding=True,
    share_atom_edge_embedding=False,
    use_m_share_rad=False,
    distance_function="gaussian",
    num_distance_basis=64,
    attn_activation="scaled_silu",
    use_s2_act_attn=False,
    use_attn_renorm=True,
    ffn_activation="scaled_silu",
    use_gate_act=False,
    use_grid_mlp=False,
    use_sep_s2_act=True,
    alpha_drop=0.1,
    drop_path_rate=0.05,
    proj_drop=0.0,
    weight_init="normal",
    final="mean",  # xlc
)

# Alternative 3D configuration (model_type 'dimenetpp'), kept disabled for reference:
#
# param_3d = {
#         'model_type':'dimenetpp',
#         'cutoff':5.0, 'num_layers':4,
#         'hidden_channels':128, 'out_channels':128, 'int_emb_size':64, 'basis_emb_size':8, 'out_emb_channels':256,
#         'num_spherical':7, 'num_radial':6, 'envelope_exponent':5,
#         'num_before_skip':1, 'num_after_skip':2, 'num_output_layers':3,
#         'output_init':'GlorotOrthogonal',
# }

# Contrastive-learning loss settings.
param_cl = dict(
    metric="InfoNCE_dot_prod",
    T=1.0,
    cl_weight=1.0,
    kl_weight=-1.0,
    reduce="mean",
)

# Optimizer and learning-rate scheduler settings.
param_optimizer = dict(type="adamw", lr=1e-4)
param_scheduler = dict(
    type="reduceonplateau",
    lr_decay_step_size=20,
    lr_decay_factor=0.95,
    min_lr=5e-6,
    warmup_step=50000,
)

# Miscellaneous run settings (logging cadence, clipping, epochs, device, seed, tag).
param_other = dict(
    log_iter_step=10,
    clip_norm=50.0,
    epoch=200,
    save_path="./save_model",
    device="cuda:0",
    seed=42,
    tag="dev",
)

# Give each run its own output directory by appending a timestamp (yymmdd-HHMMh).
dt = datetime.now().strftime("%y%m%d-%H%Mh")
param_other["save_path"] = "/".join((param_other["save_path"], dt))

In [3]:
# Bundle every configuration dictionary so the run can be saved and logged as one record.
# `param_data` is included as well: without the dataset name, truncation, split
# ratios and batch size, the saved record is not enough to reproduce the run.
# It is appended last so the existing keys keep their original order.
full_params = {"param_2d":param_2d,
               "param_3d":param_3d,
               "param_cl":param_cl,
               "param_optimizer":param_optimizer,
               "param_scheduler":param_scheduler,
               "param_other":param_other,
               "param_data":param_data}

In [4]:
# Start logging under this run's output directory, then persist and log the configuration.
run_dir = param_other['save_path']
setup_logger(run_dir)
np.save(f"{run_dir}/full_params.npy",full_params)
logging.info(str(full_params))

# Load the dataset and request train/valid index splits sized from the configured
# ratios; the same seed is passed to both the dataset and the split helper.
run_seed = param_other['seed']
new_dataset = CatDataset(**param_data,seed=run_seed)
n_samples = len(new_dataset)
n_train = int(param_data['train_ratio']*n_samples)
n_valid = int(param_data['valid_ratio']*n_samples)
data_split_dict = get_idx_split(n_samples,n_train,n_valid,seed=run_seed)
train_dataset = new_dataset[data_split_dict['train']]
valid_dataset = new_dataset[data_split_dict['valid']]

{'param_2d': {'gnum_layer': 4, 'emb_dim': 128, 'dest_dim': 64, 'gnn_aggr': 'add', 'bond_feat_red': 'mean', 'gnn_type': 'gcn', 'JK': 'last', 'drop_ratio': 0.0, 'node_readout': 'sum'}, 'param_3d': {'model_type': 'equif', 'use_pbc': False, 'regress_forces': False, 'otf_graph': True, 'max_neighbors': 64, 'max_radius': 5.0, 'num_layers': 6, 'sphere_channels': 128, 'attn_hidden_channels': 128, 'num_heads': 4, 'attn_alpha_channels': 32, 'attn_value_channels': 16, 'ffn_hidden_channels': 64, 'norm_type': 'rms_norm_sh', 'lmax_list': [4], 'mmax_list': [2], 'grid_resolution': None, 'num_sphere_samples': 128, 'edge_channels': 128, 'use_atom_edge_embedding': True, 'share_atom_edge_embedding': False, 'use_m_share_rad': False, 'distance_function': 'gaussian', 'num_distance_basis': 64, 'attn_activation': 'scaled_silu', 'use_s2_act_attn': False, 'use_attn_renorm': True, 'ffn_activation': 'scaled_silu', 'use_gate_act': False, 'use_grid_mlp': False, 'use_sep_s2_act': True, 'alpha_drop': 0.1, 'drop_path_ra

In [5]:
# Number of samples in the loaded dataset.
len(new_dataset)

195503

In [6]:
# Wrap both splits in loaders that share the configured batch size;
# only the training loader shuffles.
batch_size = param_data['batch_size']
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=batch_size)

# Build the 2D/3D model from its two branch configs and hand it to the trainer
# along with the loss, optimizer, scheduler and miscellaneous settings.
clmodel = CL2D3DMol(param_2d, param_3d)
cltrain = CLTrain(
    clmodel,
    param_3d,
    param_cl,
    param_optimizer,
    param_scheduler,
    param_other,
)

In [7]:
# Start training with the train and validation loaders.
# The recorded output below ends in KeyboardInterrupt, i.e. this run was stopped manually.
cltrain.run(train_dataloader,valid_dataloader)

Epoch 1/200, training...
Step [10/21994], loss: 0.2590, cl loss: 0.2590, kl loss: 0.0000, g_norm: 0.4000, lr: 0.00010000
Step [20/21994], loss: 0.2540, cl loss: 0.2540, kl loss: 0.0000, g_norm: 0.5607, lr: 0.00010000
Step [30/21994], loss: 0.2463, cl loss: 0.2463, kl loss: 0.0000, g_norm: 1.8204, lr: 0.00010000
Step [40/21994], loss: 0.2514, cl loss: 0.2514, kl loss: 0.0000, g_norm: 2.5357, lr: 0.00010000
Step [50/21994], loss: 0.2058, cl loss: 0.2058, kl loss: 0.0000, g_norm: 4.4643, lr: 0.00010000
Step [60/21994], loss: 0.2054, cl loss: 0.2054, kl loss: 0.0000, g_norm: 4.4345, lr: 0.00010000
Step [70/21994], loss: 0.1933, cl loss: 0.1933, kl loss: 0.0000, g_norm: 3.0169, lr: 0.00010000
Step [80/21994], loss: 0.2016, cl loss: 0.2016, kl loss: 0.0000, g_norm: 8.9625, lr: 0.00010000
Step [90/21994], loss: 0.1916, cl loss: 0.1916, kl loss: 0.0000, g_norm: 3.1983, lr: 0.00010000
Step [100/21994], loss: 0.1870, cl loss: 0.1870, kl loss: 0.0000, g_norm: 6.4489, lr: 0.00010000
Step [110/2199

KeyboardInterrupt: 

In [10]:
# Fetch one training batch and move it to the device configured for the run.
# The device comes from `param_other['device']` rather than a second hard-coded
# 'cuda:0', so changing the config keeps this cell in sync with the trainer.
# `next(iter(...))` raises StopIteration on an empty loader instead of silently
# leaving `data` pointing at a stale batch from an earlier cell.
data = next(iter(train_dataloader)).to(param_other['device'])