# Transporter classification training code

## Setup

In [None]:
%load_ext lab_black

### Predictable randomness

In [None]:
import numpy as np

In [None]:
# Single seed shared by `rng()` and by the KFold splitters defined below.
seed = 0

In [None]:
def rng():
    """Build a new NumPy ``RandomState`` initialised from the global ``seed``.

    A separate generator object is created on every call, so the estimators
    that receive one do not share random state with each other.
    """
    generator = np.random.RandomState(seed)
    return generator

### Shared parameters

In [None]:
# Candidate values for the "scaler" step of every pipeline's parameter grid.
# The only candidate is None, i.e. no scaler object is plugged in.
param_scalers = [None]

### Preprocessing

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from common.lib.descriptors.cdk import ECFPID, ECFPTransformer

In [None]:
from sklearn.preprocessing import FunctionTransformer
from molvs import Standardizer
from rdkit.Chem.rdchem import Mol
import numpy as np

In [None]:
# Two-step preprocessing applied to the molecule column in `load_data`:
# standardise each molecule, then compute its descriptors.
preprocessing_pipeline = Pipeline(
    steps=[
        (
            "standardizer",
            # np.vectorize applies MolVS `fragment_parent` to one molecule at a
            # time; otypes=[Mol] declares the element type of the output.
            FunctionTransformer(
                np.vectorize(Standardizer().fragment_parent, otypes=[Mol])
            ),
        ),
        # Project-local descriptor transformer, configured for ECFP4.
        ("descriptors", ECFPTransformer(ECFPID.ECFP4)),
    ]
)

### Parameter search

In [None]:
from sklearn.model_selection import KFold, GridSearchCV

In [None]:
def make_parameter_search(model, cv_params):
    """Wrap ``model`` in a grid search over ``cv_params``.

    The search ranks candidates by balanced accuracy on a seeded, shuffled
    10-fold split, refits the best candidate on all data, runs on every
    available core, and re-raises any error that occurs while fitting.

    Args:
        model: Estimator (here: a pipeline) to tune.
        cv_params: Parameter grid passed to ``GridSearchCV``.

    Returns:
        The unfitted ``GridSearchCV`` instance.
    """
    folds = KFold(n_splits=10, shuffle=True, random_state=seed)
    search_options = dict(
        scoring="balanced_accuracy",
        refit=True,
        cv=folds,
        verbose=3,
        error_score="raise",
        n_jobs=-1,
    )
    return GridSearchCV(model, cv_params, **search_options)

### Cross validation helpers

In [None]:
from sklearn.metrics import get_scorer, make_scorer, precision_score, recall_score
from sklearn.model_selection import cross_validate

In [None]:
def external_validation(model, X, y):
    """Score an already fitted ``model`` on held-out data.

    Args:
        model: Fitted classifier.
        X: Feature matrix of the external set.
        y: Binary labels of the external set.

    Returns:
        A one-row ``pandas.DataFrame`` with one column per metric.
    """
    # Imported locally because the module-level pandas import only appears
    # further down the file, after this definition.
    import pandas as pd

    scoring = {
        "accuracy": "accuracy",
        "sensitivity": "recall",
        # Recall of the negative class (label 0) is the specificity.
        "specificity": make_scorer(recall_score, pos_label=0),
        "balanced_accuracy": "balanced_accuracy",
        "f1": "f1",
        "roc_auc": "roc_auc",
        # zero_division=0 scores 0 when no positives are predicted.
        "precision": make_scorer(precision_score, zero_division=0),
        "matthews_corrcoef": "matthews_corrcoef",
    }

    # Each value is wrapped in a list so that the frame has exactly one row.
    return pd.DataFrame(
        {
            key: [get_scorer(definition)(model, X, y)]
            for key, definition in scoring.items()
        }
    )

In [None]:
def cross_validation(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` with a fixed set of metrics.

    Uses a seeded, shuffled 10-fold split and all available cores.

    Returns:
        The result of ``sklearn.model_selection.cross_validate``.
    """
    metrics = {
        "accuracy": "accuracy",
        "sensitivity": "recall",
        # Recall of the negative class (label 0) is the specificity.
        "specificity": make_scorer(recall_score, pos_label=0),
        "balanced_accuracy": "balanced_accuracy",
        "f1": "f1",
        "roc_auc": "roc_auc",
        "precision": make_scorer(precision_score, zero_division=0),
        "matthews_corrcoef": "matthews_corrcoef",
    }
    splitter = KFold(n_splits=10, shuffle=True, random_state=seed)
    return cross_validate(model, X, y, scoring=metrics, cv=splitter, n_jobs=-1)

## Define Models

In [None]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [None]:
# Grid-searched logistic regression fitted on 8 principal components.
clf_logr = make_parameter_search(
    Pipeline(
        steps=[
            # Placeholder step; the grid below assigns it from `param_scalers`.
            ("scaler", None),
            ("pca", PCA(n_components=8, random_state=rng())),
            (
                "logr",
                LogisticRegression(solver="saga", max_iter=10000, random_state=rng()),
            ),
        ]
    ),
    {
        "scaler": param_scalers,
        "logr__penalty": ["elasticnet"],
        # Regularisation strength, swept over seven orders of magnitude.
        "logr__C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        # Elastic-net mixing ratio between the L2 (0) and L1 (1) penalties.
        "logr__l1_ratio": [0, 0.25, 0.5, 0.75, 1],
    },
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Grid-searched random forest with balanced class weights.
clf_rf = make_parameter_search(
    Pipeline(
        steps=[
            # Placeholder step; the grid below assigns it from `param_scalers`.
            ("scaler", None),
            (
                "rf",
                # NOTE(review): n_jobs=-1 here is nested inside the grid search,
                # which also uses n_jobs=-1 — confirm this does not
                # oversubscribe the worker's CPUs.
                RandomForestClassifier(max_features=1.0, random_state=rng(), n_jobs=-1),
            ),
        ]
    ),
    {
        "scaler": param_scalers,
        "rf__class_weight": ["balanced"],
        "rf__n_estimators": [5, 10, 25, 50],
        "rf__max_depth": [2, 4, 8, 16],
    },
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Grid-searched k-nearest-neighbours classifier.
clf_knn = make_parameter_search(
    # The "scaler" step is a placeholder assigned from `param_scalers` below.
    Pipeline(steps=[("scaler", None), ("knn", KNeighborsClassifier())]),
    {
        "scaler": param_scalers,
        # Only odd neighbour counts are tried.
        "knn__n_neighbors": [3, 5, 9, 11, 13, 17, 19],
        "knn__weights": ["uniform", "distance"],
        "knn__metric": ["euclidean", "manhattan"],
    },
)

In [None]:
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [None]:
# Grid-searched support vector classifier fitted on a Nystroem feature map.
clf_svc = make_parameter_search(
    Pipeline(
        steps=[
            # Placeholder step; the grid below assigns it from `param_scalers`.
            ("scaler", None),
            # Nystroem samples its components at random, so it is seeded like
            # every other stochastic estimator in this file to keep results
            # reproducible between runs.
            ("nystroem", Nystroem(random_state=rng())),
            ("svc", SVC(class_weight="balanced", random_state=rng(), probability=True)),
        ]
    ),
    {
        "scaler": param_scalers,
        "nystroem__gamma": [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
        "svc__C": [0.1, 1.0, 10.0, 100.0, 1000.0],
    },
)

In [None]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Grid-searched gradient-boosted tree classifier (XGBoost).
clf_xgb = make_parameter_search(
    Pipeline(
        steps=[
            # Placeholder step; the grid below assigns it from `param_scalers`.
            ("scaler", None),
            ("xgb", XGBClassifier(random_state=rng(), n_jobs=-1)),
        ]
    ),
    {
        "scaler": param_scalers,
        # Relative weight of the positive class; values span both directions.
        "xgb__scale_pos_weight": [0.1, 0.5, 1, 5, 10],
        # NOTE(review): presumably None falls back to XGBoost's default
        # objective — if that default is "binary:logistic", these two entries
        # fit the same model twice; confirm.
        "xgb__objective": [None, "binary:logistic"],
        "xgb__n_estimators": [5, 10, 25, 50, 100, 250],
        "xgb__max_depth": [2, 4, 8, 16, 32, 64],
    },
)

In [None]:
# All grid searches to train, keyed by the id that later becomes the file
# stem of the saved model and score files.
models = {
    "logr_clf": clf_logr,
    "rf_clf": clf_rf,
    "knn_clf": clf_knn,
    "svc_clf": clf_svc,
    "xgb_clf": clf_xgb,
}

## Define input data

In [None]:
import numpy as np
from rdkit.Chem.PandasTools import LoadSDF

In [None]:
# Maps a (purpose, transporter) pair to the stem of its SDF file. The pair
# later becomes the output sub-directory; the stem is passed to `load_data`.
targets = {
    ("Inhibition", "BCRP"): "BCRP",
    ("Inhibition", "BSEP"): "BSEP",
    ("Inhibition", "MATE1"): "MATE1",
    ("Inhibition", "MDR1"): "MDR1",
    ("Inhibition", "MRP3"): "MRP3",
    ("Inhibition", "OATP1B1"): "OATP1B1",
    ("Inhibition", "OATP1B3"): "OATP1B3",
    ("Inhibition", "OCT1"): "OCT1",
    ("Inhibition", "OCT2"): "OCT2",
    # Substrate datasets are currently disabled:
    # ("Substrate", "BCRP"): "BCRP-S",
    # ("Substrate", "MDR1"): "MDR1-S",
    # ("Substrate", "MRP2"): "MRP2-S",
    # ("Substrate", "MRP3"): "MRP3-S",
}

In [None]:
def load_data(filename, purpose):
    """Load one SDF dataset and return its features and labels.

    Args:
        filename: File stem of the SDF file (without the ``.sdf`` suffix).
        purpose: Prefix of the ``<purpose>_chembl`` directory to read from;
            the callers in this file pass "training" or "testing".

    Returns:
        A ``(X, y)`` tuple: the stacked output of the preprocessing pipeline
        for the ``ROMol`` column, and the ``Classification`` column cast to
        ``int``.
    """
    sdf_path = f"ba_assets/data_for_models/data_threshold_all_filled_0.5_all_masters/{purpose}_chembl/{filename}.sdf"
    frame = LoadSDF(sdf_path)

    features = np.stack(preprocessing_pipeline.transform(frame.ROMol))
    labels = np.stack(frame.Classification.astype(int))
    return features, labels

## Training

### Training routine

In [None]:
import pandas as pd
import joblib
from rdkit.Chem.PandasTools import LoadSDF

In [None]:
def train_model(model, training, test):
    """Fit ``model`` and evaluate it by cross-validation and on external data.

    Args:
        model: Estimator to fit (fitted in place).
        training: ``(X, y)`` tuple used for fitting and cross-validation.
        test: ``(X, y)`` tuple used for the external validation.

    Returns:
        A ``(model, cv, external)`` tuple: the fitted model, the
        cross-validation result, and the external validation scores.
    """
    X_train, y_train = training
    X_holdout, y_holdout = test

    # Fit using the loky backend on all available cores.
    with joblib.parallel_config("loky", n_jobs=-1):
        model.fit(X_train, y_train)

    # NOTE(review): the models submitted in this file are grid searches, so
    # this presumably re-runs the whole search inside every fold — confirm
    # that nested cross-validation is intended.
    cv_scores = cross_validation(model, X_train, y_train)

    # Score the fitted model on the held-out external set.
    external_scores = external_validation(model, X_holdout, y_holdout)

    return model, cv_scores, external_scores

### Configure task runner

In [None]:
import atexit
import shutil
import os

from dask.distributed import Client
from dask_jobqueue import SLURMCluster

In [None]:
# When this cell is re-run, shut down the cluster left over from the previous
# run before creating a new one.
if cluster := globals().get("cluster"):
    cluster.close()
# Start from an empty log directory; a missing directory is not an error.
shutil.rmtree("logs", ignore_errors=True)

# NOTE(review): each SLURM job requests 32 CPUs while Dask is told about one
# core — presumably to leave the CPUs to the estimators' own n_jobs=-1
# parallelism; confirm.
cluster = SLURMCluster(
    cores=1,
    job_cpu=32,
    memory="8 GB",
    scheduler_options={"interface": "ens9f0", "dashboard_address": ":8787"},
    log_directory="logs",
)

# Close the cluster when the interpreter exits. The lambda looks up the global
# `cluster` at exit time, so it closes whichever cluster is current then.
atexit.register(lambda: cluster.close())

In [None]:
# Scale adaptively between 4 workers and one worker per (model, target) job.
cluster.adapt(minimum=4, maximum=len(models) * len(targets))

In [None]:
# Client used below to submit the data loading and training tasks.
client = Client(cluster)

### Start jobs

In [None]:
# One loading task per target for the training split; the futures are passed
# straight to the training jobs.
data_training = {
    target_key: client.submit(load_data, sdf_name, "training", priority=1)
    for target_key, sdf_name in targets.items()
}

In [None]:
# One loading task per target for the testing (external validation) split.
data_testing = {
    target_key: client.submit(load_data, sdf_name, "testing", priority=1)
    for target_key, sdf_name in targets.items()
}

In [None]:
# One training task per (target, model) combination. The data arguments are
# the loading futures submitted above rather than concrete arrays.
jobs = {
    (target_id, model_id): client.submit(
        train_model,
        model,
        data_training[target_id],
        data_testing[target_id],
        # Explicit task key that names both the model and the target.
        key=f"train_model_{model_id}-{target_id}",
    )
    for model_id, model in models.items()
    for target_id in targets.keys()
}

### Write out models

In [None]:
from pathlib import Path
import pandas as pd

In [None]:
# Root directory under which models and score tables are written.
model_dir = Path("models_")

In [None]:
# Collect only the jobs that finished successfully. `done()` is also true for
# errored or cancelled futures, and calling `result()` on those raises, which
# would abort this cell before anything is written to disk.
results = [
    (job_id, job.result()) for job_id, job in jobs.items() if job.status == "finished"
]
failed_ids = [job_id for job_id, job in jobs.items() if job.status == "error"]

# Job ids have the shape ((purpose, target), model); results are
# (fitted model, cross-validation scores, external validation scores).
for ((purpose_id, target_id), model_id), (model, cv, ext) in results:
    target_dir = model_dir / purpose_id / target_id

    target_dir.mkdir(exist_ok=True, parents=True)

    joblib.dump(model, target_dir / f"{model_id}.pkl")
    pd.DataFrame(cv).to_csv(target_dir / f"{model_id}.cross.csv", index=False)
    pd.DataFrame(ext).to_csv(target_dir / f"{model_id}.external.csv", index=False)

# Kept as module-level names in case later cells read them.
jobs_count, done_count = len(jobs), len([job for job in jobs.values() if job.done()])

print(f"Saved {len(results)}/{len(jobs)} models")
if failed_ids:
    # Report failures instead of letting one of them block the whole export.
    print(f"{len(failed_ids)} job(s) failed: {failed_ids}")