In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
from molvs import Standardizer
from lib.descriptors.cdk import ECFPTransformer, ECFPID
from rdkit.Chem.PandasTools import LoadSDF
import numpy as np
import pandas as pd

### Load data

In [None]:
# Path template for the per-target training SDF files; "%s" is filled with a
# name from `targets` when the files are loaded.
data_path = "ba_assets/data_for_models/data_threshold_all_filled_0.5_all_masters/training_chembl+manual/%s.sdf"
# Data set names. A trailing "-S" marks a substrate data set; every other name
# is treated as an inhibition data set (see name_to_path / name_to_canonical).
targets = [
    "BCRP",
    "BCRP-S",
    "BSEP",
    "MATE1",
    "MDR1",
    "MDR1-S",
    "MRP2-S",
    "MRP3",
    "MRP3-S",
    "OATP1B1",
    "OATP1B3",
    "OCT1",
    "OCT2",
]

In [None]:
# Read one SDF file per data set and keep only its molecule column, keyed by
# the data set name.
grouped_mols = {
    name: LoadSDF(data_path % name)["ROMol"]
    for name in targets
}

In [None]:
# Pool the molecules of every data set into one pandas object; it is used to
# fit the shared pipeline steps.
per_target_mols = list(grouped_mols.values())
all_mols = pd.concat(per_target_mols)

### Train

In [None]:
# Applicability-domain pipeline: standardize molecules, compute descriptors,
# project them onto a 2-D scaled embedding, and score points with a
# Local Outlier Factor model.
pipeline = Pipeline(
    steps=[
        (
            "standardizer",
            # `otypes=[object]` tells np.vectorize the output type up front.
            # Without it numpy calls the standardizer an extra time on the
            # first molecule to infer the type, and raises on empty input.
            FunctionTransformer(
                np.vectorize(Standardizer().standardize, otypes=[object])
            ),
        ),
        # NOTE(review): presumably ECFP4 fingerprints from the project's CDK
        # wrapper -- confirm against lib.descriptors.cdk.
        ("descriptors", ECFPTransformer(ECFPID.ECFP4)),
        ("scaler", MinMaxScaler()),
        # Two components, matching the 2-D grid the LOF is evaluated on.
        ("pca", PCA(n_components=2)),
        ("pca_scaler", MinMaxScaler()),
        (
            "lof",
            # novelty=True is required to score points that were not part of
            # the fitting data (decision_function on new samples).
            LocalOutlierFactor(
                n_neighbors=5, novelty=True, contamination=0.1, metric="euclidean"
            ),
        ),
    ]
)

In [None]:
# Fit every pipeline step on the pooled molecules of all data sets. The final
# step is refitted per data set later (make_grid and the export loop), so this
# call mainly fixes the shared preprocessing and embedding steps.
pipeline.fit(all_mols)

In [None]:
# Regular 100 x 100 evaluation grid over [-0.2, 1.2] on both axes.
_grid_axis = np.linspace(-0.2, 1.2, 100)
xx, yy = np.meshgrid(_grid_axis, _grid_axis)

In [None]:
def make_grid(mols):
    """Refit the final pipeline step on `mols` and score the evaluation grid.

    The molecules are passed through every pipeline step except the last one,
    the last step is refitted on that result, and its decision function is
    evaluated at each (xx, yy) grid point.

    Side effect: the shared pipeline's final step is left fitted on `mols`.

    Returns:
        Array of decision-function values with the same shape as `xx`.
    """
    outlier_model = pipeline[-1]
    embedded = pipeline[:-1].transform(mols)
    outlier_model.fit(embedded)

    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    scores = outlier_model.decision_function(grid_points)
    return scores.reshape(xx.shape)

## Export

In [None]:
import joblib
import json
from pathlib import Path

In [None]:
def name_to_path(name):
    """Map a data set name to the relative path of its exported checker.

    A name ending in "-S" is a substrate data set and maps to
    ``Substrate/<target>/ad.pkl``; any other name maps to
    ``Inhibition/<name>/ad.pkl``.

    Only the trailing "-S" is stripped, so hyphens inside the target name are
    preserved (e.g. "P-gp-S" -> ``Substrate/P-gp/ad.pkl``). Splitting on every
    hyphen would truncate such names and misclassify them as inhibition.

    Args:
        name: Data set name such as "BCRP" or "BCRP-S".

    Returns:
        pathlib.Path relative to the model directory.
    """
    target, separator, suffix = name.rpartition("-")

    if separator and suffix == "S":
        return Path(f"Substrate/{target}/ad.pkl")
    return Path(f"Inhibition/{name}/ad.pkl")

In [None]:
def name_to_canonical(name):
    """Convert a data set name to its canonical "<kind>-<target>" label.

    A name ending in "-S" is a substrate data set and becomes "S-<target>";
    any other name becomes "I-<name>" (inhibition).

    Only the trailing "-S" is stripped, so hyphens inside the target name are
    preserved (e.g. "P-gp-S" -> "S-P-gp"). Splitting on every hyphen would
    truncate such names and label them as inhibition.

    Args:
        name: Data set name such as "BCRP" or "BCRP-S".

    Returns:
        The canonical label as a string.
    """
    target, separator, suffix = name.rpartition("-")

    if separator and suffix == "S":
        return f"S-{target}"
    return f"I-{name}"

In [None]:
# Root directory under which the per-data-set checker pickles are written.
model_dir = Path("models")

### Embedder

In [None]:
# Export the fitted embedding steps (everything between the standardizer and
# the LOF). The path is derived from `model_dir` so all exports share one
# root, and the directory is created first so a fresh checkout does not fail.
embedder_path = model_dir / "embedder.pkl"
embedder_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline[1:-1], embedder_path)

### Density map data

In [None]:
# Build one record per data set: the decision-function grid (flipped along its
# first axis and flattened to a plain list) plus the canonical data set label.
grid_records = [
    {
        "grid": np.flip(make_grid(target_mols), axis=0).flatten().tolist(),
        "name": name_to_canonical(target_name),
    }
    for target_name, target_mols in grouped_mols.items()
]
serialized_grids = json.dumps(grid_records)
Path("ad-grids.json").write_text(serialized_grids)

### LOF checkers

In [None]:
# Export one applicability-domain checker per data set.
for target, mols in grouped_mols.items():
    # Refit only the final (LOF) step on this data set's embedded molecules.
    # Pipeline slices share their estimators with the parent pipeline, so the
    # slice dumped below contains the freshly fitted LOF.
    pipeline[-1].fit(pipeline[:-1].transform(mols))
    path = name_to_path(target)

    # The nested "<kind>/<target>/" directory must exist before dumping,
    # otherwise joblib raises FileNotFoundError.
    destination = model_dir.joinpath(path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # NOTE(review): this slice starts at the scaler step while the embedder
    # export starts at the descriptor step -- presumably the consumer computes
    # descriptors itself; confirm.
    joblib.dump(pipeline[2:], destination)