# Setup

## Anaconda Setup

First, we set up Anaconda environment to install all necessary packages.
Note that after running this block, the Colab Kernel will restart. After restarting the Kernel, proceed to the next block.

In [2]:
!pip install -q condacolab
import condacolab
condacolab.install()

⏬ Downloading https://github.com/conda-forge/miniforge/releases/download/23.1.0-1/Mambaforge-23.1.0-1-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:15
🔁 Restarting kernel...


Check that Anaconda is installed properly and also check the version of Anaconda.

In [1]:
!conda --version

conda 23.1.0


In [2]:
import condacolab
condacolab.check()

✨🍰✨ Everything looks OK!


Now we can install all necessary packages to run HBCVTr via Anaconda and pip.

In [3]:
!conda install -c conda-forge rdkit=2023.3.2 -y
!conda install -c conda-forge deepsmiles
!pip install transformers==4.31.0 SmilesPE==0.0.3

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | done
Solving environment: - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | /

## Import Git Repository

Clone the repository from Github

In [4]:
!git clone https://github.com/imeewan/HBCVTr

Cloning into 'HBCVTr'...
remote: Enumerating objects: 170, done.[K
remote: Counting objects: 100% (100/100), done.[K
remote: Compressing objects: 100% (94/94), done.[K
remote: Total 170 (delta 58), reused 12 (delta 4), pack-reused 70[K
Receiving objects: 100% (170/170), 182.41 KiB | 4.68 MiB/s, done.
Resolving deltas: 100% (92/92), done.


Change directory to the cloned repository.

In [5]:
%cd HBCVTr

/content/HBCVTr


## Download Models

Finally, we download the trained models from Google Drive.

In [6]:
!gdown --id 1hDDNY9kE3Y-IFJEeILDxwG5NbRWMCWA8
!gdown --id 1vAkxP3y-FD5N5BpbfXIzTn5-nORlnv4T

Downloading...
From: https://drive.google.com/uc?id=1hDDNY9kE3Y-IFJEeILDxwG5NbRWMCWA8
To: /content/HBCVTr/hbv_model.pt
100% 1.12G/1.12G [00:18<00:00, 59.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1vAkxP3y-FD5N5BpbfXIzTn5-nORlnv4T
To: /content/HBCVTr/hcv_model.pt
100% 1.12G/1.12G [00:15<00:00, 72.2MB/s]


In [7]:
!mv hbv_model.pt model/
!mv hcv_model.pt model/

# Run Demo

In this demo, we will run a prediction using our HBCVTr model.

First, let's import all necessary packages.

In [8]:
from BartDataset import BartDataset
from CustomBart_Atomic_Tokenizer import CustomBart_Atomic_Tokenizer
from CustomBart_FG_Tokenizer import CustomBart_FG_Tokenizer
from TqdmWrap import TqdmWrap
from DualInputDataset import DualInputDataset
from DualBartModel import DualBartModel, CustomBartModel
import torch
from torch import nn
from torch.utils.data import DataLoader, RandomSampler, Dataset
from torch.optim import AdamW
import pandas as pd
import numpy as np
import random
import deepsmiles
from SmilesPE.tokenizer import *
from SmilesPE.pretokenizer import atomwise_tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from rdkit import Chem
import codecs
from transformers import AdamW, BartTokenizer, BartForConditionalGeneration, BartConfig, get_linear_schedule_with_warmup, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, PreTrainedTokenizer
import re
from tqdm.auto import tqdm
from tqdm import tqdm
import itertools
import json
import os
from utils import *
from pretrained_utils import *
from rdkit import Chem
from rdkit.Chem import SaltRemover

Input the smiles and virus choice to predict here.

In [9]:
# smiles = input("Enter the SMILES of the compound: ")
smiles = 'C[C@H](Cn1cnc2c(N)ncnc21)OCP(=O)(O)OP(=O)(O)CO[C@H](C)Cn1cnc2c(N)ncnc21'
# virus_choice = input("Do you want to predict the compound's activity against HBV or HCV? (Enter HBV or HCV): ").lower()
virus_choice = 'hbv'

Finally, we run the prediction.

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Analysis in progress ...")

if virus_choice == 'hbv':
  model_path = "model/hbv_model.pt"
  max_pact = max_pact_hbv
  min_pact = min_pact_hbv
elif virus_choice == 'hcv':
  model_path = "model/hcv_model.pt"
  max_pact = max_pact_hcv
  min_pact = min_pact_hcv
else:
  raise ValueError("Invalid input. Please enter either 'HBV' or 'HCV'.")

max_length = 250

model = DualBartModel(config1, config2, reg_mod)
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)

smiles_data_no_salt = remove_salt(smiles)
smiles = smiles_data_no_salt

input_encoding1 = tokenizer1.encode_plus(smiles, truncation=True, max_length=max_length, padding='max_length', return_tensors="pt")
input_encoding2 = tokenizer2.encode_plus(smiles, truncation=True, max_length=max_length, padding='max_length', return_tensors="pt")

input_ids1 = input_encoding1['input_ids'].to(device)
attention_mask1 = input_encoding1['attention_mask'].to(device)
input_ids2 = input_encoding2['input_ids'].to(device)
attention_mask2 = input_encoding2['attention_mask'].to(device)


with torch.no_grad():
  output = model(input_ids1=input_ids1, attention_mask1=attention_mask1,
                  input_ids2=input_ids2, attention_mask2=attention_mask2)

prediction = output
prediction_value = prediction.cpu().numpy()[0]
print('SMILES: ', smiles)
print('Predicted pACT: ', prediction_value * (max_pact - min_pact) + min_pact)
predicted_EC50 = 10**-(prediction_value * (max_pact - min_pact) + min_pact) * 10**9
print('Predicted EC50 :', predicted_EC50, 'nM')

Analysis in progress ...
SMILES:  C[C@H](Cn1cnc2c(N)ncnc21)OCP(=O)(O)OP(=O)(O)CO[C@H](C)Cn1cnc2c(N)ncnc21
Predicted pACT:  8.122957168817521
Predicted EC50 : 7.534298651631602 nM


# Training Demo

## Training Parameters

Set Pytorch device.

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Function for training a model.

In [11]:

def training_model(combinations):
    for idx, (d_model1, encoder_ffn_dim1, num_attention_heads1, num_hidden_layers1, dropout1, lr1) in enumerate(combinations):

        (d_model2, encoder_ffn_dim2, num_attention_heads2, num_hidden_layers2, dropout2, lr2) = (d_model1, encoder_ffn_dim1, num_attention_heads1, num_hidden_layers1, dropout1, lr1)

        max_r2 = -100

        config1 = BartConfig(
            vocab_size=len(atomic_vocab),
            d_model=d_model1,
            encoder_ffn_dim=encoder_ffn_dim1,
            num_attention_heads=num_attention_heads1,
            num_hidden_layers=num_hidden_layers1,
            pad_token_id=tokenizer1.pad_token_id,
            max_position_embeddings=max_length,
            dropout=dropout1,
        )

        config2 = BartConfig(
            vocab_size=len(fg_vocab),
            d_model=d_model2,
            encoder_ffn_dim=encoder_ffn_dim2,
            num_attention_heads=num_attention_heads2,
            num_hidden_layers=num_hidden_layers2,
            pad_token_id=tokenizer2.pad_token_id,
            max_position_embeddings=max_length,
            dropout=dropout2,
        )


        model = DualBartModel(config1, config2, reg_mod)
        model.to(device)
        model.apply(weights_init)
        optimizer = AdamW(model.parameters(), lr=lr1, weight_decay=weight_decay)

        print(f"Model {idx+1} configurations: ")
        print(f"d_model1: {d_model1}, encoder_ffn_dim1: {encoder_ffn_dim1}, num_attention_heads1: {num_attention_heads1}, num_hidden_layers1: {num_hidden_layers1}")
        print(f"d_model2: {d_model2}, encoder_ffn_dim2: {encoder_ffn_dim2}, num_attention_heads2: {num_attention_heads2}, num_hidden_layers2: {num_hidden_layers2}")

        log_file_path = f"model/new_model2.log"

        os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

        config_dict = {
            'd_model1': d_model1,
            'encoder_ffn_dim1': encoder_ffn_dim1,
            'num_attention_heads1': num_attention_heads1,
            'num_hidden_layers1': num_hidden_layers1,
            'd_model2': d_model2,
            'encoder_ffn_dim2': encoder_ffn_dim2,
            'num_attention_heads2': num_attention_heads2,
            'num_hidden_layers2': num_hidden_layers2,
            'dropout1': dropout1,
            'dropout2': dropout2,
            'lr': lr1,
            'regression_dim': reg_mod,
            'weight_decay': weight_decay,
        }

        with open(log_file_path, 'w') as outfile:
            outfile.write(json.dumps(config_dict) + '\n')

        for epoch in range(num_epochs):
            model.train()
            total_train_loss = 0

            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", dynamic_ncols=True)):
                inputs1, inputs2 = batch['input_ids1'].to(device).long(), batch['input_ids2'].to(device).long()
                attention_mask1, attention_mask2 = batch['attention_mask1'].to(device).long(), batch['attention_mask2'].to(device).long()

                labels = batch['labels'].to(device).float()
                optimizer.zero_grad()

                outputs = model(input_ids1=inputs1, attention_mask1=attention_mask1,
                                input_ids2=inputs2, attention_mask2=attention_mask2)
                pred = outputs
                loss = torch.nn.MSELoss()(pred, labels)

                total_train_loss += loss.item()

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            avg_train_loss = total_train_loss / len(train_dataloader)

            model.eval()
            total_eval_loss = 0
            total_eval_r2 = 0

            for batch in val_dataloader:
                with torch.no_grad():
                    inputs1, inputs2 = batch['input_ids1'].to(device).long(), batch['input_ids2'].to(device).long()
                    attention_mask1, attention_mask2 = batch['attention_mask1'].to(device).long(), batch['attention_mask2'].to(device).long()

                    labels = batch['labels'].to(device).float()

                    outputs = model(input_ids1=inputs1, attention_mask1=attention_mask1,
                                    input_ids2=inputs2, attention_mask2=attention_mask2)
                    pred = outputs
                    loss = torch.nn.MSELoss()(pred, labels)
                    total_eval_loss += loss.item()
                    total_eval_r2 += r2_score(labels.cpu().numpy(), pred.cpu().detach().numpy())

            avg_val_loss = total_eval_loss / len(val_dataloader)
            avg_val_r2 = total_eval_r2 / len(val_dataloader)

            log_dict = {
                'epoch': epoch+1,
                'avg_train_loss': avg_train_loss,
                'avg_val_loss': avg_val_loss,
                'avg_val_r2': avg_val_r2,

            }

            with open(log_file_path, 'a') as outfile:
                outfile.write(json.dumps(log_dict) + '\n')

            if avg_val_r2 > max_r2:
                torch.save(model.state_dict(), f"model/new_model2.pt")
                max_r2 = avg_val_r2

Set training parameters and perform tokenization.

In [12]:
max_length = 250
batch_size = 8

data_path = "data/hbv_dataset.csv"
train_dataloader, val_dataloader = train_val_proc(data_path)

d_models = [16]
encoder_ffn_dims = [16]
num_attention_heads = [2]
num_hidden_layers = [1]
dropouts = [0.15]
learning_rates = [1e-6]
reg_mod = [32, 32]
weight_decay = 0.001
num_epochs = 50

param_combinations = list(itertools.product(d_models, encoder_ffn_dims, num_attention_heads, num_hidden_layers, dropouts, learning_rates))
combinations = param_combinations

Tokenizing: 100%|██████████| 1552/1552 [00:00<00:00, 6296.72it/s]
Tokenizing: 100%|██████████| 389/389 [00:00<00:00, 9434.39it/s]
Tokenizing: 100%|██████████| 1552/1552 [00:10<00:00, 147.10it/s]
Tokenizing: 100%|██████████| 389/389 [00:02<00:00, 172.13it/s]


Finally, train the model.

In [None]:
training_model(combinations)



Model 1 configurations: 
d_model1: 16, encoder_ffn_dim1: 16, num_attention_heads1: 2, num_hidden_layers1: 1
d_model2: 16, encoder_ffn_dim2: 16, num_attention_heads2: 2, num_hidden_layers2: 1


Iteration:   0%|          | 0/25 [00:00<?, ?it/s]