In [None]:
%load_ext lab_black

## Setup

### Setup Intel Extensions

In [None]:
from sklearnex import patch_sklearn

In [None]:
# Apply the sklearnex patch to scikit-learn. This cell runs before any of the
# sklearn imports further down in this notebook.
patch_sklearn()

### Predictable randomness

In [None]:
import numpy as np

# Single seed shared by every stochastic component of this notebook.
seed = 0


def rng():
    """Return a brand-new ``RandomState`` initialised with the notebook ``seed``.

    Every call builds an independent generator, so two callers never share
    (and never advance) each other's random stream.
    """
    generator = np.random.RandomState(seed)
    return generator

### Shared parameters

In [None]:
# Candidate values for the "scaler" step that every pipeline's parameter search
# draws from. The only candidate is None, i.e. no scaler object is tried.
param_scalers = [None]

### Preprocessing and parameter search

In [None]:
from sklearn.model_selection import KFold, RandomizedSearchCV


def make_parameter_search(model, cv_params):
    """Wrap ``model`` in a reproducible randomized hyper-parameter search.

    Parameters
    ----------
    model : estimator
        The estimator (here: a ``Pipeline``) whose parameters are searched.
    cv_params : dict
        Mapping from parameter name to the candidate values to sample from.

    Returns
    -------
    RandomizedSearchCV
        Unfitted search that scores candidates by balanced accuracy on a
        shuffled 10-fold split, refits the best candidate on all data and
        raises on any failing fit.
    """
    return RandomizedSearchCV(
        model,
        cv_params,
        scoring="balanced_accuracy",
        refit=True,
        cv=KFold(n_splits=10, shuffle=True, random_state=seed),
        # Seed the candidate sampling too; without it the sampled parameter
        # combinations differ between runs even though the folds are fixed.
        random_state=seed,
        verbose=3,
        error_score="raise",
        n_jobs=-1,
    )


def parameter_search_name(ps):
    """Return the name of the last step of the pipeline wrapped by ``ps``.

    ``ps`` is a parameter search whose ``estimator`` exposes ``steps`` as a
    sequence of ``(name, step)`` pairs.
    """
    final_step = ps.estimator.steps[-1]
    step_name = final_step[0]
    return step_name

### Cross validation

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, recall_score, precision_score


def cross_validation(model, X, y):
    """Score ``model`` on ``X``/``y`` with a shuffled, seeded 10-fold split.

    Returns whatever ``cross_validate`` returns for the metrics listed below.
    Specificity is computed as the recall of label ``0``.
    NOTE(review): the label-0 recall and the default "recall"/"f1" scorers
    presumably assume binary 0/1 labels -- confirm against the data.
    """
    folds = KFold(n_splits=10, shuffle=True, random_state=seed)

    metrics = {
        "accuracy": "accuracy",
        "sensitivity": "recall",
        "specificity": make_scorer(recall_score, pos_label=0),
        "balanced_accuracy": "balanced_accuracy",
        "f1": "f1",
        "roc_auc": "roc_auc",
        # zero_division=0 reports 0 when a fold has no positive predictions.
        "precision": make_scorer(precision_score, zero_division=0),
        "matthews_corrcoef": "matthews_corrcoef",
    }

    return cross_validate(model, X, y, scoring=metrics, cv=folds, n_jobs=-1)

## Define Models

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Logistic regression on 8 PCA components; the search varies the elastic-net
# regularisation strength (C) and the L1/L2 mix (l1_ratio).
clf_logr = make_parameter_search(
    Pipeline(
        [
            ("scaler", None),
            ("pca", PCA(n_components=8, random_state=rng())),
            (
                "logr",
                LogisticRegression(solver="saga", max_iter=10000, random_state=rng()),
            ),
        ]
    ),
    dict(
        scaler=param_scalers,
        logr__penalty=["elasticnet"],
        logr__C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        logr__l1_ratio=[0, 0.25, 0.5, 0.75, 1],
    ),
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with balanced class weights; the search varies the number of
# trees and the maximum tree depth. Larger candidates (100/250 trees, depth
# 32/64) are currently left out of the grid.
clf_rf = make_parameter_search(
    Pipeline(
        [
            ("scaler", None),
            (
                "rf",
                RandomForestClassifier(max_features=1.0, random_state=rng(), n_jobs=-1),
            ),
        ]
    ),
    dict(
        scaler=param_scalers,
        rf__class_weight=["balanced"],
        rf__n_estimators=[5, 10, 25, 50],
        rf__max_depth=[2, 4, 8, 16],
    ),
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest neighbours; the search varies the neighbourhood size, the vote
# weighting and the distance metric.
clf_knn = make_parameter_search(
    Pipeline([("scaler", None), ("knn", KNeighborsClassifier())]),
    dict(
        scaler=param_scalers,
        knn__n_neighbors=[3, 5, 9, 11, 13, 17, 19],
        knn__weights=["uniform", "distance"],
        knn__metric=["euclidean", "manhattan"],
    ),
)

In [None]:
from sklearn.svm import SVC

In [None]:
# RBF support vector classifier with balanced class weights and probability
# estimates enabled; the search varies C and gamma.
clf_svc = make_parameter_search(
    Pipeline(
        [
            ("scaler", None),
            ("svc", SVC(class_weight="balanced", probability=True, random_state=rng())),
        ]
    ),
    dict(
        scaler=param_scalers,
        svc__kernel=["rbf"],
        svc__C=[0.1, 1.0, 10.0, 100.0, 1000.0],
        svc__gamma=[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
    ),
)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees; the search varies the positive-class weight, the
# objective, the number of boosting rounds and the tree depth.
# NOTE(review): objective=None presumably falls back to XGBoost's default for
# the classifier -- confirm whether it differs from "binary:logistic" at all.
clf_xgb = make_parameter_search(
    Pipeline(
        [
            ("scaler", None),
            ("xgb", XGBClassifier(random_state=rng(), n_jobs=-1)),
        ]
    ),
    dict(
        scaler=param_scalers,
        xgb__scale_pos_weight=[0.1, 0.5, 1, 5, 10],
        xgb__objective=[None, "binary:logistic"],
        xgb__n_estimators=[5, 10, 25, 50, 100, 250],
        xgb__max_depth=[2, 4, 8, 16, 32, 64],
    ),
)

In [None]:
# Every parameter search defined above; each one is trained once per target.
models = [clf_logr, clf_rf, clf_knn, clf_svc, clf_xgb]
# Target names. load_data() uses each name as the stem of the SDF file to read.
targets = [
    "BCRP",
    "BCRP-S",
    "BSEP",
    "MATE1",
    "MDR1",
    "MDR1-S",
    "MRP2-S",
    "MRP3",
    "MRP3-S",
    "OATP1B1",
    "OATP1B3",
    "OCT1",
    "OCT2",
]

## Training

### Training routine

In [None]:
from rdkit.Chem.PandasTools import LoadSDF
import numpy as np
from molvs import Standardizer, MolVSError
from lib.descriptors.cdk import ECFPCalc, ECFPID
import joblib

In [None]:
standardizer = Standardizer()


def standardize(mol):
    """Reduce ``mol`` to its fragment, charge and stereo parent, in that order.

    Returns the resulting molecule, or ``None`` if any of the steps raises
    ``MolVSError``.
    """
    parent_steps = (
        standardizer.fragment_parent,
        standardizer.charge_parent,
        standardizer.stereo_parent,
    )
    try:
        for to_parent in parent_steps:
            mol = to_parent(mol)
    except MolVSError:
        return None
    return mol

In [None]:
# Shared ECFP4 calculator used for every molecule.
ecfp_calc = ECFPCalc(ECFPID.ECFP4)


def descriptor_calc(mol):
    """Compute the descriptor of ``mol`` with the shared ``ecfp_calc``."""
    descriptor = ecfp_calc(mol)
    return descriptor

In [None]:
def load_data(
    target,
    data_dir="ba_assets/data_for_models/data_threshold_all_filled_0.5_all_masters/training_chembl+manual",
):
    """Load one target's training set and turn it into model-ready arrays.

    Reads ``{data_dir}/{target}.sdf``, standardizes every molecule, drops the
    molecules whose standardization failed and computes one descriptor per
    remaining molecule.

    Parameters
    ----------
    target : str
        Target name; also the stem of the SDF file to read.
    data_dir : str, optional
        Directory holding the per-target SDF files. Defaults to the training
        set location used by this notebook.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the stacked descriptors and the integer labels taken from
        the ``Classification`` column.

    Raises
    ------
    ValueError
        If no molecule is left after standardization.
    """
    sdf_path = f"{data_dir}/{target}.sdf"
    data = LoadSDF(sdf_path)

    # A Dask dataframe would be the natural choice here, but it seemed to have
    # problems converting object columns to strings, so plain pandas is used.
    data = data.assign(ROMol=data.ROMol.map(standardize))
    data = data.assign(Classification=data.Classification.astype(int))
    # standardize() returns None for molecules it could not process.
    data = data.dropna(subset=["ROMol"])

    if data.empty:
        # np.stack() on an empty sequence fails with an unhelpful message.
        raise ValueError(
            f"No molecules left for target {target!r} after standardizing {sdf_path}"
        )

    data = data.assign(Descriptors=data.ROMol.map(descriptor_calc))

    return np.stack(data.Descriptors), data.Classification.to_numpy()

In [None]:
def train_model(model, data):
    """Fit ``model`` on ``data`` and cross-validate it on the same data.

    ``data`` is an ``(X, y)`` pair. Returns ``(model, cross)`` where ``model``
    is the fitted object that was passed in and ``cross`` is the result of
    ``cross_validation``.

    NOTE(review): only the fit runs under the dask joblib configuration; the
    cross-validation is called outside of it -- confirm that this is intended.
    """
    features, labels = data

    # Fit with joblib configured to use the "dask" backend.
    with joblib.parallel_config("dask", n_jobs=-1):
        model.fit(features, labels)

    cv_scores = cross_validation(model, features, labels)
    return model, cv_scores

### Configure task runner

In [None]:
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import atexit
import shutil

In [None]:
# When this cell is re-run, close the cluster left over from the previous run
# before creating a new one.
if "cluster" in globals():
    cluster.close()
# Start with an empty log directory; a missing directory is not an error.
shutil.rmtree("logs", ignore_errors=True)

# NOTE(review): cores=1 with job_cpu=16 presumably gives each worker a single
# task slot while reserving 16 CPUs for the n_jobs=-1 estimators -- confirm.
# NOTE(review): host "0.0.0.0" makes the scheduler listen on all interfaces;
# confirm that this is acceptable on the cluster network.
cluster = SLURMCluster(
    cores=1,
    job_cpu=16,
    memory="128 GB",
    scheduler_options={"host": "0.0.0.0", "dashboard_address": ":8787"},
    log_directory="logs",
    # worker_extra_args=["--resources", "singleton=1"],
)

# Close the cluster when the interpreter exits. The lambda looks up `cluster`
# at exit time, so it closes whichever cluster the name is bound to then.
atexit.register(lambda: cluster.close())

In [None]:
# Scale to one unit per (model, target) combination, matching the number of
# training jobs submitted below.
cluster.scale(len(models) * len(targets))

In [None]:
# Client connected to the cluster; all tasks below are submitted through it.
client = Client(cluster)

### Start jobs

In [None]:
# One load_data() future per target, keyed by target name.
data = dict((name, client.submit(load_data, name)) for name in targets)

In [None]:
# One train_model() future per (model, target) pair, keyed as
# "<target>-<name of the model's final pipeline step>". The target data is
# passed as a future, so each training task depends on its load_data() task.
jobs = dict(
    (
        f"{target_name}-{parameter_search_name(model)}",
        client.submit(
            train_model,
            model,
            target_data,
            # resources={"singleton": 1},
        ),
    )
    for model in models
    for (target_name, target_data) in data.items()
)

### Write out models

In [None]:
from pathlib import Path
import pandas as pd
import joblib

In [None]:
# Output directory (relative to the working directory) for the fitted models
# and their cross-validation tables; an existing directory is reused.
model_dir = Path("models")
model_dir.mkdir(exist_ok=True)

In [None]:
# Persist every job that has finished successfully so far; re-run this cell
# later to pick up jobs that are still pending.
for name, job in jobs.items():
    # done() is also true for failed or cancelled futures, whose result()
    # would raise and stop the loop before the remaining models are written,
    # so select on the status instead.
    if job.status != "finished":
        if job.status in ("error", "cancelled"):
            print(f"Skipping {name}: job status is {job.status!r}")
        continue

    model, cv = job.result()
    # Fitted parameter search as a pickle, cross-validation scores as a CSV.
    joblib.dump(model, model_dir.joinpath(f"{name}.pkl"))
    pd.DataFrame(cv).to_csv(model_dir.joinpath(f"{name}.csv"), index=False)