In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [3]:
gdrive_path = '/content/drive/My Drive/Colab Notebooks' 
import pandas as pd
import numpy as np
from rdkit import Chem
import sys
import os

sys.path.append(gdrive_path)
from models.WeaveNet import WeaveNet
from src.featurizer import SimpleFeaturizer

In [4]:
dataset_directory : str = '/content/drive/MyDrive/Colab Notebooks/data'
# dataset_directory : str = './data/open'
train_fname : str = 'train.csv' # TODO: Change this to the name of the dataset file
train_data : pd.DataFrame = pd.read_csv(os.path.join(dataset_directory, train_fname))
smiles : list[str] = train_data['Smiles'].tolist()
label : list[int] = train_data['pIC50'].tolist()

In [5]:
featurizer : SimpleFeaturizer = SimpleFeaturizer(max_dist=7)
datapoints=list()
for i in smiles:
  datapoints.append(Chem.MolFromSmiles(i))

In [6]:
atom_features, pair_features, atom_to_pair = featurizer.featurize(datapoints)
label = np.array(label)

print(f'shape:{atom_features.shape} max:{np.max(atom_features)} min:{np.min(atom_features)}')
print(f'shape:{pair_features.shape} max:{np.max(pair_features)} min:{np.min(pair_features)}')
print(f'shape:{atom_to_pair.shape} max:{np.max(atom_to_pair)} min:{np.min(atom_to_pair)}')


shape:(1952, 72, 10) max:1.0 min:0.0
shape:(1952, 2556, 2) max:7.0 min:0.0
shape:(1952, 2, 2556) max:71 min:0


In [7]:
train_index = 1562
train_atom_features = atom_features[:train_index]
train_pair_features = pair_features[:train_index]
train_atom_to_pair = atom_to_pair[:train_index]
train_label = label[:train_index]
test_atom_features = atom_features[train_index:]
test_pair_features = pair_features[train_index:]
test_atom_to_pair = atom_to_pair[train_index:]
test_label = label[train_index:]

In [5]:
from models import WeaveNet
from tensorflow import keras
weavenet = WeaveNet(n_tasks=1,
                    max_n_atoms=72,
                    max_n_pairs=2556,
                    n_weave=3,
                    n_atom_feat=[10, 12, 24],
                    n_pair_feat=[2, 12, 24],
                    n_graph_feat=128,
                    final_conv_kernel_size=2,
                    fully_connected_layer_sizes=[2000,100],
                    mode="regression").build()

# keras.utils.plot_model(weavenet, show_shapes=True)

In [12]:
from keras.optimizers import AdamW
from keras.losses import Huber
from keras.metrics import MeanAbsoluteError

LEARNING_RATE = 1e-2
LOSS_FUNCTION = Huber()
METRICS = [MeanAbsoluteError()]

weavenet.compile(
  optimizer=AdamW(learning_rate=LEARNING_RATE),
  loss=LOSS_FUNCTION,
  metrics=METRICS,
  run_eagerly=True
)

In [13]:
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
save_dir = os.path.join(gdrive_path, 'models', 'weavenet')
EPOCHS = 500
BATCH_SIZE = 64

checkpointer = ModelCheckpoint(
  filepath=os.path.join(save_dir, 'model_best.keras'),
  monitor='val_loss', verbose=1,
  save_best_only=True
)
reduce_lr = ReduceLROnPlateau(
  monitor='val_loss',
  factor=0.1,
  patience=15,
  verbose=1,
  min_delta=1e-8
)
earlystopper = EarlyStopping(
  monitor='val_loss',
  patience=30,
  verbose=1
)
# tensorboard_callback = tf.keras.callbacks.TensorBoard(
#   log_dir='tmp/tfdbg2_logdir',
#   histogram_freq=1,
#   profile_batch = '500,520'
# )
CALLBACKS = [checkpointer, reduce_lr, earlystopper]
# CALLBACKS = [checkpointer, reduce_lr, earlystopper, tensorboard_callback]

weavenet.fit(
  # x=[atom_features, pair_features, atom_to_pair],
  # y=label,
  x=[train_atom_features, train_pair_features, train_atom_to_pair],
  y=train_label,
  epochs=500,
  batch_size=32,
  # validation_data=(validation_data, test_dataset.y),
  validation_data=([test_atom_features, test_pair_features, test_atom_to_pair], test_label),
  callbacks=CALLBACKS
)

weavenet.save(os.path.join(save_dir, 'model_final.keras'))

Epoch 1/500




[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 551ms/step - loss: 5.2635 - mean_absolute_error: 5.7370
Epoch 1: val_loss improved from inf to 44.81601, saving model to /content/drive/My Drive/Colab Notebooks/models/weavenet/model_best.keras
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 640ms/step - loss: 5.2067 - mean_absolute_error: 5.6797 - val_loss: 44.8160 - val_mean_absolute_error: 45.3160 - learning_rate: 0.0100
Epoch 2/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 547ms/step - loss: 0.4774 - mean_absolute_error: 0.8541
Epoch 2: val_loss improved from 44.81601 to 5.25773, saving model to /content/drive/My Drive/Colab Notebooks/models/weavenet/model_best.keras
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 584ms/step - loss: 0.4773 - mean_absolute_error: 0.8540 - val_loss: 5.2577 - val_mean_absolute_error: 5.7577 - learning_rate: 0.0100
Epoch 3/500
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

In [6]:
featurizer = SimpleFeaturizer(max_dist=7, max_atoms=72, max_pairs=2556)

dataset_directory : str = '/content/drive/MyDrive/Colab Notebooks/data'
save_dir = os.path.join(gdrive_path, 'models', 'weavenet')
test_file = 'test.csv'
test_data = pd.read_csv(os.path.join(dataset_directory, test_file))
test_smiles = test_data['Smiles'].tolist()
test_datapoints = list()
for i in test_smiles:
  test_datapoints.append(Chem.MolFromSmiles(i))
test_atom_features, test_pair_features, test_atom_to_pair = featurizer.featurize(test_datapoints)

TypeError: SimpleFeaturizer.__init__() got an unexpected keyword argument 'max_atoms'

In [15]:
print(test_atom_features.shape, test_pair_features.shape, test_atom_to_pair.shape)

(113, 42, 10) (113, 861, 2) (113, 2, 861)


In [16]:
def pIC50_to_IC50(pic50_values):
        """Convert pIC50 values to IC50 (nM)."""
        return 10 ** (9 - pic50_values)

In [17]:
weavenet.load_weights(os.path.join(save_dir, 'model_best.keras'))
# model = load_model(os.path.join(save_dir, 'model_best.keras'), custom_objects={'WeaveLayer': WeaveLayer, 'WeaveGather': WeaveGather})
test_pred = weavenet.predict([test_atom_features, test_pair_features, test_atom_to_pair])
print(test_pred.shape)

ValueError: Input 0 of layer "functional_1" is incompatible with the layer: expected shape=(None, 72, 10), found shape=(32, 42, 10)

In [17]:
submit = pd.read_csv(os.path.join(dataset_directory, 'sample_submission.csv'))
submit['IC50_nM'] = pIC50_to_IC50(test_pred)
submit.to_csv(f'{save_dir}/baseline_submission.csv', index=False)