In [1]:
import numpy as np
import pandas as pd
import deepchem as dc
import torch
import lightning as L

from rdkit import Chem
from torch.utils.data import DataLoader
from lightning.pytorch.callbacks import ModelCheckpoint

from model.demolta import DeMOLTaCollateFn, DeMOLTaConfig
from trainer import LitDeMOLTaForSMILESClassification, LitDeMOLTaGeneratorForSMILESClassification, LitDeMOLTaDiscriminatorForSMILESClassification

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (C:\Users\dust\AppData\Roaming\Python\Python310\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
# Global run configuration.
BATCH_SIZE = 2  # samples per training batch (the validation loader uses twice this)
SEED = 42  # shared seed for the data split and the random number generators

# Seed NumPy and PyTorch up front so that weight initialisation, dropout and any
# other random draws are repeatable after "Restart Kernel -> Run All".
np.random.seed(SEED)
torch.manual_seed(SEED);  # trailing ';' hides the returned Generator repr

In [3]:
# Read the train and test CSV files into DataFrames in one pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/data/train.csv', './dataset/data/test.csv')
)

In [4]:
# SMILES column as a plain Python list; used below as the DeepChem dataset ids.
train_smiles = train_df.loc[:, 'SMILES'].to_list()

In [5]:
# Placeholder DeepChem dataset: X, y and w are constant filler arrays, while the
# ids carry the SMILES strings that the splitter is given below.
num_molecules = len(train_smiles)
Xs = np.zeros(num_molecules)
Ys = np.ones(num_molecules)
dataset = dc.data.DiskDataset.from_numpy(
    X=Xs,
    y=Ys,
    w=np.zeros(num_molecules),
    ids=train_smiles,
)

In [6]:
# Splitter instance used in the next cell to derive the train/validation index lists.
scaffoldsplitter = dc.splits.ScaffoldSplitter()

In [7]:
# 80/20 train/validation split with an empty test partition; the third returned
# index list (test) is discarded.
split_fractions = {'frac_train': 0.8, 'frac_valid': 0.2, 'frac_test': 0.0}
train_indices, val_indices, _ = scaffoldsplitter.split(
    dataset=dataset,
    seed=SEED,
    **split_fractions,
)

In [8]:
def _build_samples(frame, indices):
    """Turn the selected rows of ``frame`` into a list of model-input samples.

    Args:
        frame: DataFrame with a 'SMILES' column (molecule strings) and an
            'MLM' column (the regression label read for each row).
        indices: Positional row indices to include, in output order.

    Returns:
        A list of dicts, one per index, each holding the parsed RDKit molecule
        under 'mol' and the row's 'MLM' value under 'labels'.

    Raises:
        ValueError: If RDKit cannot parse a SMILES string. ``Chem.MolFromSmiles``
            signals failure by returning None, which would otherwise end up
            silently inside the dataset.
    """
    # Pull the columns out once instead of doing a Series lookup per row.
    smiles_column = frame['SMILES'].to_numpy()
    label_column = frame['MLM'].to_numpy()

    samples = []
    for index in indices:
        smiles = smiles_column[index]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f'RDKit could not parse SMILES at row {index}: {smiles!r}')
        samples.append({'mol': mol, 'labels': label_column[index]})
    return samples


# Both partitions are drawn from train_df; they differ only in the index list.
train_dataset = _build_samples(train_df, train_indices)
val_dataset = _build_samples(train_df, val_indices)

In [9]:
# Collate configuration for fine-tuning: masking is switched off and
# smiles_only is disabled.
finetune_collate_fn_config = DeMOLTaConfig(smiles_only=False, do_masking=False)
finetune_collate_fn = DeMOLTaCollateFn(finetune_collate_fn_config)

In [10]:
# Shuffle the training samples every epoch. The sample list is built directly
# from the split's index order, so without shuffling the model would see the
# exact same sequence of batches in every epoch. A dedicated seeded generator
# keeps the shuffling order reproducible.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
    collate_fn=finetune_collate_fn,
)

# Validation keeps its fixed order and uses a batch twice as large as training.
val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE * 2,
    collate_fn=finetune_collate_fn,
)

In [12]:
# Hyperparameters for the discriminator used in the discriminator-only run.
discriminator_config = DeMOLTaConfig(
    # depth / attention layout
    num_layers=12,
    num_heads=6,
    # layer widths
    hidden_dim=384,
    ff_dim=1536,
    # regularisation
    dropout=0.1,
    progressive_layer_drop_prob=0.0,
)

In [13]:
# Lightning module wrapping only the discriminator, built from the config above.
lit_model = LitDeMOLTaDiscriminatorForSMILESClassification(discriminator_config=discriminator_config)

In [14]:
# Restore the pre-trained discriminator weights. map_location='cpu' makes the
# load independent of the device the checkpoint was saved from;
# load_state_dict then copies the values into the module's own parameters.
discriminator_state = torch.load(
    './checkpoint/current_step=350000-Loss_G=0.0000-Loss_D=18.3434-discriminator.pt',
    map_location='cpu',
)
lit_model.discriminator.load_state_dict(discriminator_state)

<All keys matched successfully>

In [15]:
# Keep the top three checkpoints as ranked by the monitored 'Val_Loss' metric.
checkpoint_callback = ModelCheckpoint(
    dirpath='./checkpoint/',
    filename='DeMOLTa-discriminator-{epoch:02d}-{Loss:.2f}-{Val_Loss:.2f}',
    monitor='Val_Loss',
    save_top_k=3,
)

In [16]:
# GPU training with 16-bit mixed precision for 10 epochs.
trainer = L.Trainer(
    max_epochs=10,
    accelerator='gpu',
    precision='16-mixed',
    # Gradients are accumulated over 16 batches before each optimizer step
    # (16 * BATCH_SIZE samples per step).
    accumulate_grad_batches=16,
    callbacks=[checkpoint_callback],
)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
# Fine-tune the discriminator model, validating on the held-out split.
trainer.fit(
    model=lit_model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

You are using a CUDA device ('NVIDIA GeForce RTX 3070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                 | Params
-------------------------------------------------------
0 | discriminator | DeMOLTaDiscriminator | 53.4 M
1 | classfier     | Linear               | 385   
-------------------------------------------------------
53.4 M    Trainable params
0         Non-trainable params
53.4 M    Total params
213.504   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [11]:
# Hyperparameters for the generator used in the generator-only run.
generator_config = DeMOLTaConfig(
    # depth / attention layout
    num_layers=6,
    num_heads=6,
    # layer widths
    hidden_dim=384,
    ff_dim=1536,
    # regularisation (both disabled)
    dropout=0.0,
    progressive_layer_drop_prob=0.0,
)

In [12]:
# Lightning module wrapping only the generator, built from the config above.
lit_model = LitDeMOLTaGeneratorForSMILESClassification(generator_config=generator_config)

In [14]:
# Restore the pre-trained generator weights. map_location='cpu' makes the load
# independent of the device the checkpoint was saved from; load_state_dict
# then copies the values into the module's own parameters.
generator_state = torch.load(
    './checkpoint/current_step=350000-Loss_G=0.0000-Loss_D=18.3434-generator.pt',
    map_location='cpu',
)
lit_model.generator.load_state_dict(generator_state)


<All keys matched successfully>

In [15]:
# Keep the top three checkpoints as ranked by the monitored 'Val_Loss' metric.
checkpoint_callback = ModelCheckpoint(
    dirpath='./checkpoint/',
    filename='DeMOLTa-generator-{epoch:02d}-{Loss:.2f}-{Val_Loss:.2f}',
    monitor='Val_Loss',
    save_top_k=3,
)

In [16]:
# GPU training with 16-bit mixed precision for 10 epochs.
trainer = L.Trainer(
    accelerator='gpu',
    # '16-mixed' is the explicit name for 16-bit automatic mixed precision; the
    # bare integer 16 is a legacy spelling that makes Lightning emit a warning.
    precision='16-mixed',
    max_epochs=10,
    callbacks=[checkpoint_callback],
    # Gradients are accumulated over 16 batches before each optimizer step
    # (16 * BATCH_SIZE samples per step).
    accumulate_grad_batches=16,
)


  rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
# Fine-tune the generator model, validating on the held-out split.
trainer.fit(
    model=lit_model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

You are using a CUDA device ('NVIDIA GeForce RTX 3070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | generator | DeMOLTaGenerator | 26.9 M
1 | classfier | Linear           | 385   
-----------------------------------------------
26.9 M    Trainable params
0         Non-trainable params
26.9 M    Total params
107.594   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [11]:
# Hyperparameters for the combined generator + discriminator run. The two
# configs differ only in depth (6 vs. 12 layers); dropout is off for both.
generator_config = DeMOLTaConfig(
    num_layers=6,
    num_heads=6,
    hidden_dim=384,
    ff_dim=1536,
    dropout=0.0,
    progressive_layer_drop_prob=0.0,
)
discriminator_config = DeMOLTaConfig(
    num_layers=12,
    num_heads=6,
    hidden_dim=384,
    ff_dim=1536,
    dropout=0.0,
    progressive_layer_drop_prob=0.0,
)

In [12]:
# Lightning module holding both the generator and the discriminator.
lit_model = LitDeMOLTaForSMILESClassification(
    discriminator_config=discriminator_config,
    generator_config=generator_config,
)

In [13]:
# Restore the pre-trained weights for both sub-networks. map_location='cpu'
# makes the load independent of the device the checkpoints were saved from;
# load_state_dict then copies the values into each module's own parameters.
generator_state = torch.load(
    './checkpoint/current_step=350000-Loss_G=0.0000-Loss_D=18.3434-generator.pt',
    map_location='cpu',
)
discriminator_state = torch.load(
    './checkpoint/current_step=350000-Loss_G=0.0000-Loss_D=18.3434-discriminator.pt',
    map_location='cpu',
)
lit_model.generator.load_state_dict(generator_state)
lit_model.discriminator.load_state_dict(discriminator_state)

<All keys matched successfully>

In [13]:
# Keep the top three checkpoints as ranked by the monitored 'Val_Loss' metric.
checkpoint_callback = ModelCheckpoint(
    dirpath='./checkpoint/',
    filename='DeMOLTa-scratch-{epoch:02d}-{Loss:.2f}-{Val_Loss:.2f}',
    monitor='Val_Loss',
    save_top_k=3,
)

In [14]:
# GPU training with 16-bit mixed precision for 10 epochs.
trainer = L.Trainer(
    accelerator='gpu',
    # '16-mixed' is the explicit name for 16-bit automatic mixed precision; the
    # bare integer 16 is a legacy spelling that makes Lightning emit a warning.
    precision='16-mixed',
    max_epochs=10,
    callbacks=[checkpoint_callback],
)


  rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Fine-tune the combined model, validating on the held-out split.
trainer.fit(
    model=lit_model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

You are using a CUDA device ('NVIDIA GeForce RTX 3070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                 | Params
-------------------------------------------------------
0 | generator     | DeMOLTaGenerator     | 26.9 M
1 | discriminator | DeMOLTaDiscriminator | 53.4 M
2 | classfier     | Linear               | 385   
-------------------------------------------------------
80.3 M    Trainable params
0         Non-trainable params
80.3 M    Total params
321.096   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  loss = F.mse_loss(x, labels) ** 0.5
  rank_zero_warn(


Epoch 0:   0%|          | 1/1399 [00:00<02:32,  9.15it/s, v_num=61, Loss=84.80]

  loss = F.mse_loss(x, labels) ** 0.5


Epoch 1:  20%|█▉        | 277/1399 [00:28<01:55,  9.71it/s, v_num=61, Loss=48.20, Val_Loss=34.50] 

Epoch 1:  21%|██▏       | 300/1399 [00:31<01:54,  9.62it/s, v_num=61, Loss=32.00, Val_Loss=34.50]