[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/liulinbo19/MapLight-TDC/blob/main/tt_rdkit.ipynb)

In [1]:
!curl -o maplight.py https://raw.githubusercontent.com/liulinbo19/MapLight-TDC/main/tt_rdkit.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  8658  100  8658    0     0  25434      0 --:--:-- --:--:-- --:--:-- 25464


In [2]:
# installs for Colab
!pip install rdkit PyTDC tqdm catboost

Collecting rdkit
  Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting PyTDC
  Downloading pytdc-1.1.14.tar.gz (151 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/151.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m143.4/151.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.3/151.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting accelerate==0.33.0 (from PyTDC)
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting dataclasses<1.0,>=0.6 (from PyTDC)
  Downloading dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting datasets<2.20.0 (from PyTDC)
  Downloading datasets-2.19.2-py3-none

In [None]:
from maplight import *
from tqdm import tqdm
import catboost as cb

from tdc.benchmark_group import admet_group

In [None]:
# Per-benchmark settings, keyed by benchmark name.
# Each value is a (task, log_scale) pair:
#   task      -- 'regression' or 'binary'; selects the CatBoost model type
#   log_scale -- forwarded as `log=` to the target scaler (regression only)
# Insertion order is the order in which the benchmarks are run.
benchmark_config = dict([
    ('caco2_wang', ('regression', False)),
    ('bioavailability_ma', ('binary', False)),
    ('lipophilicity_astrazeneca', ('regression', False)),
    ('solubility_aqsoldb', ('regression', False)),
    ('hia_hou', ('binary', False)),
    ('pgp_broccatelli', ('binary', False)),
    ('bbb_martins', ('binary', False)),
    ('ppbr_az', ('regression', False)),
    ('vdss_lombardo', ('regression', True)),
    ('cyp2c9_veith', ('binary', False)),
    ('cyp2d6_veith', ('binary', False)),
    ('cyp3a4_veith', ('binary', False)),
    ('cyp2c9_substrate_carbonmangels', ('binary', False)),
    ('cyp2d6_substrate_carbonmangels', ('binary', False)),
    ('cyp3a4_substrate_carbonmangels', ('binary', False)),
    ('half_life_obach', ('regression', True)),
    ('clearance_hepatocyte_az', ('regression', True)),
    ('clearance_microsome_az', ('regression', True)),
    ('ld50_zhu', ('regression', False)),
    ('herg', ('binary', False)),
    ('ames', ('binary', False)),
    ('dili', ('binary', False)),
])

In [None]:
group = admet_group(path = 'data/')

# Open a file to collect the evaluation results, one entry per benchmark.
# NOTE: this is an absolute path under /content (the notebook targets Colab);
# adjust it when running somewhere that directory does not exist.
with open('/content/tt_rdkit.txt', 'w') as f:
    # Runs every configured benchmark. To run a single one, iterate over a
    # one-element list instead, e.g. [list(benchmark_config.keys())[7]].
    for admet_benchmark in benchmark_config.keys():
        # One {benchmark name: test predictions} dict per seed.
        predictions_list = []
        for seed in tqdm([1, 2, 3, 4, 5]):
            benchmark = group.get(admet_benchmark)
            name = benchmark['name']
            train, test = benchmark['train_val'], benchmark['test']

            # NOTE(review): neither the benchmark fetch above nor the
            # featurization below uses `seed`; if both are deterministic they
            # could be hoisted out of the seed loop -- confirm before changing.
            X_train = get_fingerprints(train['Drug'])
            X_test = get_fingerprints(test['Drug'])

            task, log_scale = benchmark_config[name]
            # Only the random seed varies between the repeated runs.
            params = {
                'random_strength': 2,
                'random_seed': seed,
                'verbose': 0,
            }

            if task == 'regression':
                # Fit the target scaler on the training labels only, train on
                # the scaled target, then map predictions back to the original
                # scale before evaluation.
                Y_scaler = scaler(log=log_scale)
                Y_scaler.fit(train['Y'].values)
                train['Y_scale'] = Y_scaler.transform(train['Y'].values)

                params['loss_function'] = 'MAE'
                model = cb.CatBoostRegressor(**params)
                model.fit(X_train, train['Y_scale'].values)

                y_pred_test = Y_scaler.inverse_transform(model.predict(X_test)).reshape(-1)
            elif task == 'binary':
                params['loss_function'] = 'Logloss'
                model = cb.CatBoostClassifier(**params)
                model.fit(X_train, train['Y'].values)

                # Keep column 1 of the class-probability matrix
                # (presumably the positive class -- confirm against CatBoost docs).
                y_pred_test = model.predict_proba(X_test)[:, 1]
            else:
                # Without this branch an unknown task would silently reuse the
                # previous iteration's predictions (or hit a NameError).
                raise ValueError(
                    'Unsupported task type {!r} for benchmark {!r}'.format(task, name)
                )

            predictions_list.append({name: y_pred_test})

        results = group.evaluate_many(predictions_list)

        # Write the results to the file and flush immediately so that finished
        # benchmarks survive an interruption of this long-running cell.
        f.write('\n\n{}'.format(results))
        f.flush()
        print('\n\n{}'.format(results))