### Full cross-validation pipeline for downstream evaluation

This example notebook shows how to run cross-validation on the embedding logprobs for any dataset.

#### Imports and loading the model

In [2]:
import os
from pathlib import Path

# Switch the working directory to the parent of the current one.
# NOTE(review): presumably this makes the project-local `src` package (imported
# further down) resolvable when the notebook lives one level below the repo
# root — confirm. `Path` must be imported before this line; the call is not
# idempotent: re-running the cell moves one more level up.
os.chdir(str(Path.cwd().resolve().parents[0]))

from typing import Tuple

import numpy as np
import pandas as pd
import deepchem as dc

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

from src.utils import scaffold_kfold_split

In [3]:
from src import PretrainedACEMol

# Instantiate the pretrained model from a checkpoint file, requesting device 'cuda:0'.
# NOTE(review): the checkpoint path is absolute and machine-specific — point it
# at your own checkpoint before running this notebook elsewhere.
acemol = PretrainedACEMol('/data/prastalog/models/scaling/250k/checkpoints/last.ckpt', device='cuda:0')

#### Loading the dataset and embedding

In [4]:
# Natural-language task description handed to the model together with the molecules.
task = 'inhibition of the human beta-secretase 1 (BACE-1)'

# Load the BACE classification benchmark with raw (unfeaturized) molecules.
# The keyword is `splitter`; a misspelled keyword is not applied by the loader,
# so the split would silently depend on the loader's default.
tasks, datasets, _ = dc.molnet.load_bace_classification(
    featurizer="Raw",
    splitter="scaffold",
)

train_ds, valid_ds, test_ds = datasets

# Get all smiles and targets together for cross validation
# (the dataset ids are used as the SMILES strings).
smiles = (
    list(train_ds.ids)
    + list(valid_ds.ids)
    + list(test_ds.ids)
)

# Concatenate the labels in the same train/valid/test order as `smiles`,
# drop singleton axes and convert to a plain Python list.
targets = np.concatenate([
    train_ds.y,
    valid_ds.y,
    test_ds.y,
]).squeeze().tolist()

# Every molecule must have exactly one label.
assert len(smiles) == len(targets), "smiles and targets are misaligned"

In [5]:
# Embed every molecule for the given task; the result is later indexed by its
# 'smiles', 'target' and 'embeddings' columns.
# NOTE(review): 32 is presumably the batch size (1513 examples in 48 steps in
# the progress output is consistent with that) — confirm against `embed`.
embedded = acemol.embed(smiles, task, targets, 32)

Map:   0%|          | 0/1513 [00:00<?, ? examples/s]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 14.21it/s]


#### Cross-validation

In [6]:
def cross_val_aucroc(embeded: pd.DataFrame, folds: int=4, seed: int=42) -> Tuple[float, float]:
    """Compute the cross validation results.

    Fits an L1-penalised logistic regression on min-max scaled embeddings and
    scores it with ROC-AUC on each fold produced by ``scaffold_kfold_split``.

    Args:
        embeded (pd.DataFrame): return from acemol.embed; the 'smiles',
            'target' and 'embeddings' columns are read.
        folds (int, optional): number of kfolds. defaults to 4.
        seed (int, optional): seed for kfolds. defaults to 42.

    Returns:
        Tuple[float, float]: mean and std %AUCROC.
    """
    # Read from the argument (not from a notebook-level variable) so the
    # function scores exactly the frame it was given.
    y_select = embeded['target'].values.tolist()
    x_select = embeded['embeddings'].values.tolist()

    # NOTE(review): assumes the helper returns one collection of row positions
    # per fold — confirm against src.utils.scaffold_kfold_split.
    fold_indices = scaffold_kfold_split(embeded['smiles'], k=folds, seed=seed)

    # Turn the folds into explicit (train, test) index pairs: each fold is held
    # out once while all remaining folds form the training set.
    custom_folds = []
    for i, held_out in enumerate(fold_indices):
        test_idx = np.array(held_out)
        train_idx = np.array([j for k, f in enumerate(fold_indices) if k != i for j in f])
        custom_folds.append((train_idx, test_idx))

    # Regularisation strength sqrt(2 * log(d) / n), with d the embedding length
    # and n the number of samples; it is passed to the model as C = 1 / lambda.
    lambda_theory = np.sqrt(2 * np.log(len(x_select[0])) / len(x_select))

    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('regressor', LogisticRegression(
            penalty="l1",
            solver="liblinear",
            C=1 / lambda_theory,
            class_weight='balanced'
        ))
    ])

    scores = cross_val_score(pipeline, x_select, y_select, cv=custom_folds, scoring='roc_auc')
    # Report as percentages.
    mean = scores.mean() * 100
    std = scores.std() * 100

    return mean, std

In [7]:
# Run with the default settings (4 folds, seed 42) and print mean +- std of %AUCROC.
m, s = cross_val_aucroc(embedded)
print(f'%AUCROC for {task}: {m:.2f} +- {s:.2f}')

%AUCROC for inhibition of the human beta-secretase 1 (BACE-1): 83.86 +- 2.76


#### Some variation is expected depending on the seed and the number of folds used

In [8]:
# Same evaluation with 5 folds instead of the default 4.
m, s = cross_val_aucroc(embedded, folds=5)
print(f'%AUCROC for {task}: {m:.2f} +- {s:.2f}')

%AUCROC for inhibition of the human beta-secretase 1 (BACE-1): 84.94 +- 1.46


In [9]:
# Same evaluation with fold seed 13 instead of the default 42.
m, s = cross_val_aucroc(embedded, seed=13)
print(f'%AUCROC for {task}: {m:.2f} +- {s:.2f}')

%AUCROC for inhibition of the human beta-secretase 1 (BACE-1): 83.36 +- 3.40
