In [1]:
# THIS IS THE SETUP CODE FOR THE VIRTUALENV INSIDE OF THE DRIVE
# You only need to run this cell once, as it sets up your google drive mount
# Once that is done you can comment it out

# This code will create a venv and clone the repo into your Google Drive
# This allows your Jupyter notebook setup to persist


In [2]:
import sys

In [3]:
# Install pytorch
# Determine CUDA version (can be done with !nvcc --version)
# Find relevant command if another CUDA version is used: https://pytorch.org/get-started/locally/
# For CUDA 12.1

#!{sys.executable} -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Add here for other CUDA version

# For CPU-only:
#!{sys.executable} -m pip install torch torchvision torchaudio

In [4]:
# Installing pyg packages
import torch
# Using the pyg.org is necessary as it makes it way faster than regular pip install
!{sys.executable} -m pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-{torch.__version__}.html

# Install torch_geometric
!{sys.executable} -m pip install torch_geometric

C:\Users\Tim\PycharmProjects\MolRep\.venv\Scripts\python.exe: No module named pip3
C:\Users\Tim\PycharmProjects\MolRep\.venv\Scripts\python.exe: No module named pip3


In [5]:
# Install other requirements
!{sys.executable} -m pip install -r requirements.txt


C:\Users\Tim\PycharmProjects\MolRep\.venv\Scripts\python.exe: No module named pip3


In [6]:
#TODO test whether this is needed; these packages were added to requirements.txt
#!pip install rdkit scikit-network geomloss ogb scikit-optimize

In [1]:
import sys
import os
# Append the parent directory to the module search path before the MolRep imports below.
# NOTE(review): the path is relative, so this presumably relies on the notebook being
# run from a directory directly below the repository root — confirm.
sys.path.append('../')
import numpy as np


from MolRep import MolRep
from MolRep.Utils.logger import Logger
from MolRep.Utils.config_from_dict import Config
from MolRep.Experiments.experiments import EndToEndExperiment

In [2]:
MODEL_CONFIG_DIR = '../MolRep/Configs' # Need to set! The directory of Model Configurations files, such as config_CMPNN.yml.
DATASET_DIR = '../DataSets'     # Need to set! The directory of Datasets downloaded from Google Drive.
OUTPUT_DIR = '../Outputs'

# Output file name
_CONFIG_BASE = 'config_'
_CONFIG_FILENAME = 'config_results.json'

# Args
_FOLDS = 5
MODEL_NAME = 'CMPNN'#'MorganFP'#'MolecularFingerprint' #'CMPNN'
DATASET_NAME = 'BBBP'

In [3]:
# Build the dataset, its configuration, the candidate model configurations, the model
# selector and the experiment output path from the settings chosen in the config cell.
# NOTE(review): the saved output of this cell ends in
# "TypeError: expected str, bytes or os.PathLike object, not NoneType" — presumably one of
# the directories/paths resolves to None inside construct_dataset; verify before relying on it.
(
    dataset_config,
    dataset,
    model_configurations,
    model_selector,
    exp_path,
) = MolRep.construct_dataset(
    dataset_name=DATASET_NAME,
    model_name=MODEL_NAME,
    inner_k=_FOLDS,
    config_dir=MODEL_CONFIG_DIR,
    datasets_dir=DATASET_DIR,
    output_dir=OUTPUT_DIR,
)

CMPNN
{'GIN': <class 'MolRep.Models.graph_based.GIN.GIN'>, 'ECC': <class 'MolRep.Models.graph_based.ECC.ECC'>, 'DGCNN': <class 'MolRep.Models.graph_based.DGCNN.DGCNN'>, 'DiffPool': <class 'MolRep.Models.graph_based.DiffPool.DiffPool'>, 'GraphSAGE': <class 'MolRep.Models.graph_based.GraphSAGE.GraphSAGE'>, 'GAT': <class 'MolRep.Models.graph_based.GAT.GAT'>, 'GraphNet': <class 'MolRep.Models.graph_based.GraphNet.GraphNet'>, 'MPNN': <class 'MolRep.Models.graph_based.MPNN.MPNN'>, 'CMPNN': <class 'MolRep.Models.graph_based.CMPNN.CMPNN'>, 'DMPNN': <class 'MolRep.Models.graph_based.DMPNN.DMPNN'>, 'MAT': <class 'MolRep.Models.sequence_based.MAT.MAT'>, 'CoMPT': <class 'MolRep.Models.sequence_based.CoMPT.CoMPT'>, 'BiLSTM': <class 'MolRep.Models.sequence_based.BiLSTM.BiLSTM'>, 'SALSTM': <class 'MolRep.Models.sequence_based.SALSTM.SALSTM'>, 'Transformer': <class 'MolRep.Models.sequence_based.Transformer.Transformer'>, 'VAE': <class 'MolRep.Models.unsupervised_based.VAE.VAE'>, 'RandomForest': <class

TypeError: expected str, bytes or os.PathLike object, not NoneType

In [None]:
# Index into model_configurations of the hyper-parameter combination to run
# (there are more than 100 combinations, so a single one is picked by index).
config_id = 0

# Layout: <exp_path>/<_FOLDS>_FOLD_MS/config_<config_id + 1>/config_results.json
KFOLD_FOLDER = os.path.join(exp_path, str(_FOLDS) + '_FOLD_MS')
exp_config_name = os.path.join(KFOLD_FOLDER, _CONFIG_BASE + str(config_id + 1))
config_filename = os.path.join(exp_config_name, _CONFIG_FILENAME)

# exist_ok=True makes the cell safe to re-run without a separate existence check,
# and raises if the path exists but is not a directory instead of silently continuing.
os.makedirs(exp_config_name, exist_ok=True)

In [None]:
# Select the hyper-parameter configuration chosen by config_id.
config = model_configurations[config_id]

# The model configuration can be changed here before training,
# for example:
# config['device'] = 'cpu' or config['batch_size'] = 32

# Log to experiment.log inside this configuration's folder and record the configuration used.
logger = Logger(os.path.join(exp_config_name, 'experiment.log'), mode='w')
logger.log(f"Configuration: {config!s}")

In [None]:
# Result record for the k-fold run: the configuration used, one (initially empty) dict
# per fold, and the aggregate training (TR) / validation (VL) statistics, initialised to 0.
k_fold_dict = dict(
    config=config,
    folds=[dict() for _ in range(_FOLDS)],
    avg_TR_score=0.,
    avg_VL_score=0.,
    std_TR_score=0.,
    std_VL_score=0.,
)

In [None]:
# Run one training/validation experiment per fold, store the per-fold scores in
# k_fold_dict, then aggregate mean and standard deviation across folds and log them.
dataset_getter = MolRep.construct_dataloader(dataset)
for k in range(_FOLDS):
    logger.log(f"Training in Fold: {k+1}")
    # Select which inner fold the data getter serves for this iteration.
    dataset_getter.set_inner_k(k)

    fold_exp_folder = os.path.join(exp_config_name, 'FOLD_' + str(k + 1))
    # Create the experiment object which will be responsible for running a specific experiment
    experiment = EndToEndExperiment(config, dataset_config, fold_exp_folder)

    # NOTE(review): the checkpoint file name uses the zero-based k while the folder name
    # uses k + 1 — confirm this numbering mismatch is intended.
    model_path = os.path.join(fold_exp_folder, f"{MODEL_NAME}_{DATASET_NAME}_fold_{k}.pt")
    training_score, training_loss, validation_score, best_validation_score, validation_loss = experiment.run_valid(dataset_getter, logger, other={'model_path': model_path})

    # Report validation_score and best_validation_score under their own labels.
    print('training_score:', training_score, 'validation_score:', validation_score, 'best_validation_score:', best_validation_score)
    print('training_loss:', training_loss, 'validation_loss:', validation_loss)
    logger.log(str(k+1) + ' split, TR Score: ' + str(training_score) +
                ' VL Score: ' + str(validation_score))

    k_fold_dict['folds'][k]['TR_score'] = training_score
    k_fold_dict['folds'][k]['VL_score'] = validation_score

# Collect the per-fold scores into arrays so mean/std can be computed in one step.
tr_scores = np.array([k_fold_dict['folds'][k]['TR_score'] for k in range(_FOLDS)])
vl_scores = np.array([k_fold_dict['folds'][k]['VL_score'] for k in range(_FOLDS)])

k_fold_dict['avg_TR_score'] = tr_scores.mean()
k_fold_dict['std_TR_score'] = tr_scores.std()
k_fold_dict['avg_VL_score'] = vl_scores.mean()
k_fold_dict['std_VL_score'] = vl_scores.std()


# Plain %-formatting: the template has no f-string placeholders, so no f prefix is needed.
log_str = "TR avg is %.4f std is %.4f; VL avg is %.4f std is %.4f" % (
            k_fold_dict['avg_TR_score'], k_fold_dict['std_TR_score'], k_fold_dict['avg_VL_score'], k_fold_dict['std_VL_score']
        )
logger.log(log_str)