In [None]:
import os
import shlex
import subprocess

import numpy as np
import pandas as pd

%matplotlib inline
from tqdm.auto import tqdm

from utils import (check_pains_brenk, filter_for_mw_bounds,
                   filter_for_nitrofuran, filter_for_quinolone,
                   filter_for_sulfonamide, keep_valid_molecules)

from rdkit.Chem import MCS, Descriptors, PandasTools, AllChem


In [2]:
def _chemprop_predict_command(test_csv, features_npy, preds_csv, model_dir, smiles_col):
    """Build the shell command line that runs ``chemprop_predict`` on one chunk."""
    # Every interpolated value is shell-quoted so that a path or column name
    # containing spaces or shell metacharacters cannot split or alter the command.
    # Values made only of safe characters are passed through unchanged.
    args = [
        "chemprop_predict",
        "--test_path",
        shlex.quote(test_csv),
        "--checkpoint_dir",
        shlex.quote(model_dir),
        "--preds_path",
        shlex.quote(preds_csv),
        "--features_path",
        shlex.quote(features_npy),
        "--no_features_scaling",
        "--smiles_column",
        shlex.quote(smiles_col),
        "--ensemble_variance",
        "--gpu",
        "0",
    ]
    return "conda activate chemprop; " + " ".join(args)


def chunk_up_and_run_predictions(
    data_path, data_file, features_file, model_dir, chunksize=1000, smiles_col="SMILES"
):
    """Run ``chemprop_predict`` over a dataset in fixed-size row chunks.

    The CSV at ``data_path + data_file`` and the ``"features"`` array stored in
    the ``.npz`` archive at ``data_path + features_file`` are sliced with the
    same row offsets. Each slice is written to
    ``../out/predictions_from_models/chunks/`` as ``<offset>.csv`` and
    ``<offset>.npy``, ``chemprop_predict`` is invoked on it through the shell,
    and the resulting ``<offset>_preds.csv`` is read back.

    Parameters
    ----------
    data_path : str
        Directory prefix (concatenated as-is, so it must end with a path
        separator) holding ``data_file`` and ``features_file``.
    data_file : str
        CSV file name; its rows are the inputs to predict on.
    features_file : str
        ``.npz`` file name containing a 2-D ``"features"`` array with one row
        per row of ``data_file``.
    model_dir : str
        Passed to ``chemprop_predict --checkpoint_dir``.
    chunksize : int, default 1000
        Number of rows per chunk; must be positive.
    smiles_col : str, default "SMILES"
        Passed to ``chemprop_predict --smiles_column``.

    Returns
    -------
    pandas.DataFrame
        The per-chunk prediction files concatenated in chunk order. The index
        is not reset, so it restarts at 0 for every chunk. For an input with
        no rows, an empty frame with the input columns is returned and nothing
        is run.

    Raises
    ------
    ValueError
        If ``chunksize`` is not positive or the feature row count differs from
        the number of data rows.
    RuntimeError
        If the shell command for a chunk exits with a non-zero status.
    """
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer, got %r" % (chunksize,))

    data = pd.read_csv(data_path + data_file)
    with np.load(data_path + features_file) as archive:
        features = archive["features"]

    # Both inputs are sliced with the same offsets below, so a length mismatch
    # would silently pair molecules with the wrong feature rows.
    if len(features) != len(data):
        raise ValueError(
            "features has %d rows but data has %d rows" % (len(features), len(data))
        )

    temp_dir_chunks = "../out/predictions_from_models/chunks/"
    os.makedirs(temp_dir_chunks, exist_ok=True)

    # could do this command line - convenient to keep within notebook for now
    chunk_predictions = []
    for start in range(0, len(data), chunksize):
        stem = temp_dir_chunks + str(start)
        chunk_csv = stem + ".csv"
        chunk_npy = stem + ".npy"
        preds_csv = stem + "_preds.csv"

        # chunk up data + features
        data.iloc[start : start + chunksize, :].to_csv(chunk_csv, index=False)
        np.save(chunk_npy, features[start : start + chunksize, :])

        # Drop any predictions left over from a previous run so that stale
        # output can never be mistaken for the result of this invocation.
        if os.path.exists(preds_csv):
            os.remove(preds_csv)

        # actually run predictions
        # NOTE(review): `conda activate` inside a non-interactive shell only
        # works if conda's shell hook is available there - confirm on the
        # machine running this notebook.
        full_command = _chemprop_predict_command(
            chunk_csv, chunk_npy, preds_csv, model_dir, smiles_col
        )
        result = subprocess.run(full_command, shell=True, capture_output=True)
        if result.returncode != 0:
            stderr_tail = result.stderr.decode(errors="replace")[-2000:]
            raise RuntimeError(
                "chemprop_predict failed for chunk starting at row %d "
                "(exit code %d):\n%s" % (start, result.returncode, stderr_tail)
            )

        chunk_predictions.append(pd.read_csv(preds_csv))

    if not chunk_predictions:
        return data.iloc[0:0].copy()

    # now smush chunks back together (single concat instead of one per chunk)
    return pd.concat(chunk_predictions)

# PK GNN on 37K screen

In [None]:
# Inputs: 37Kclean.csv / 37Kclean.npz; checkpoints: FINAL151.
data_path = "../data/library_info/"
preds_csv = (
    "../out/predictions_from_models/pk_model/37k_screen/37K_chunks_with_151_model.csv"
)

# Make sure the destination folder exists before the prediction run starts,
# so a missing directory cannot make the final write fail afterwards.
os.makedirs(os.path.dirname(preds_csv), exist_ok=True)

df = chunk_up_and_run_predictions(
    data_path=data_path,
    data_file="37Kclean.csv",
    features_file="37Kclean.npz",
    model_dir="../models/pk_screen_models_11152021/FINAL151/",
)
df.to_csv(preds_csv, index=False)

# PK+37K GNN on 800K

In [None]:
# Inputs: broad800k.csv / broad800k.npz (SMILES column is lower-case "smiles");
# checkpoints: FINALbayHO04052022.
data_path = "../data/library_info/"
preds_csv = "../out/predictions_from_models/pk_37k_model/800k/broad800K_melis_predictions_04_25_2022.csv"

# Make sure the destination folder exists before the prediction run starts,
# so a missing directory cannot make the final write fail afterwards.
os.makedirs(os.path.dirname(preds_csv), exist_ok=True)

df = chunk_up_and_run_predictions(
    data_path=data_path,
    data_file="broad800k.csv",
    features_file="broad800k.npz",
    model_dir="../models/pk_37k_screen_models_03192022/FINALbayHO04052022/",
    smiles_col="smiles",
)
df.to_csv(preds_csv, index=False)

# PK+37K GNN on 5M 'easy-to-order' set

In [None]:
# Inputs: cleaned_full_all_dbs_04_19_2022.csv / .npz;
# checkpoints: FINALbayHO04052022.
data_path = "../data/library_info/"
preds_csv = "../out/predictions_from_models/pk_37k_model/5m/extended_screen_set_melis_predictions_05_01_2022.csv"

# Make sure the destination folder exists before the prediction run starts,
# so a missing directory cannot make the final write fail afterwards.
os.makedirs(os.path.dirname(preds_csv), exist_ok=True)

df = chunk_up_and_run_predictions(
    data_path=data_path,
    data_file="cleaned_full_all_dbs_04_19_2022.csv",
    features_file="cleaned_full_all_dbs_04_19_2022.npz",
    model_dir="../models/pk_37k_screen_models_03192022/FINALbayHO04052022/",
)
df.to_csv(preds_csv, index=False)

# PK+37K+1st round screen GNN on 800K

In [None]:
# Inputs: broad800k.csv / broad800k.npz (SMILES column is lower-case "smiles");
# checkpoints: FINALbayHO11152022.
data_path = "../data/library_info/"
preds_csv = "../out/predictions_from_models/pk_37k_1round_model/800k/broad800K_melis_predictions_with_FINALbayHO11152022_11_16_2022.csv"

# Make sure the destination folder exists before the prediction run starts,
# so a missing directory cannot make the final write fail afterwards.
os.makedirs(os.path.dirname(preds_csv), exist_ok=True)

df = chunk_up_and_run_predictions(
    data_path=data_path,
    data_file="broad800k.csv",
    features_file="broad800k.npz",
    model_dir="../models/pk_37k_first_round_val_screen_models_10262022/FINALbayHO11152022/",
    smiles_col="smiles",
)
df.to_csv(preds_csv, index=False)

# PK+37K+1st round screen GNN on 5M 'easy-to-order' set

In [None]:
# Inputs: cleaned_full_all_dbs_04_19_2022.csv / .npz;
# checkpoints: FINALbayHO11152022.
data_path = "../data/library_info/"
preds_csv = "../out/predictions_from_models/pk_37k_1round_model/5m/extended_screen_set_with_FINALbayHO11152022_melis_predictions_11_22_2022.csv"

# Make sure the destination folder exists before the prediction run starts,
# so a missing directory cannot make the final write fail afterwards.
os.makedirs(os.path.dirname(preds_csv), exist_ok=True)

df = chunk_up_and_run_predictions(
    data_path=data_path,
    data_file="cleaned_full_all_dbs_04_19_2022.csv",
    features_file="cleaned_full_all_dbs_04_19_2022.npz",
    model_dir="../models/pk_37k_first_round_val_screen_models_10262022/FINALbayHO11152022/",
)
df.to_csv(preds_csv, index=False)

### REAL Diversity Set: 
"Virtual screening of the ultra-large databases can be performed iteratively, starting with a small subset. Such a diverse subset can provide essential data to teach AI-based algorithms or already result in promising hits. REAL Diversity Set has 43.8 million compounds identified using the MaxMin algorithm in the entire REAL Database. The compounds have no analogs having a Tanimoto similarity of more than 0.65 (Morgan 2 fingerprint, 512 bit) within the set and within the entire Enamine stock screening compound collection. REAL Diversity Set compounds comply with the Ro5 and Veber criteria: MW≤500, SlogP≤5, HBA≤10, HBD≤5, rotatable bonds≤10, and TPSA≤140 and lack PAINS and toxicophores."

Source file: REAL Diversity Set (43.8M compounds, CXSMILES format), available at:

https://enamine.net/compound-collections/real-compounds/real-database-subsets

In [None]:
# Load the tab-separated Enamine REAL Diversity Set file and write a filtered
# copy of it to `temp_dir`, one CSV per `chunksize` rows, named by chunk number.
enamine = pd.read_csv(
    "../data/library_info/Enamine_Diverse_REAL_drug-like_43.8M_cxsmiles.cxsmiles",
    sep="\t",
)
chunksize = 10000
temp_dir = "../data/enamine_REAL_diversity/temp_dir_enamine_cleaned_chunks/"
os.makedirs(temp_dir, exist_ok=True)

# tqdm wraps the range (which has a known length) rather than the enumerate
# generator, so the progress bar can show a total and an ETA.
chunk_starts = range(0, len(enamine), chunksize)
for index, i in enumerate(tqdm(chunk_starts)):
    chunk_csv = temp_dir + str(index) + ".csv"

    # Resume support: a chunk that already has a non-empty output file is
    # skipped. Checking the file directly avoids parsing every finished chunk
    # and avoids a bare `except` that would also swallow unrelated errors.
    if os.path.isfile(chunk_csv) and os.path.getsize(chunk_csv) > 0:
        continue

    # not processed yet - clean this chunk
    df = enamine.iloc[i : i + chunksize, :]

    # keep only valid cpds
    df, mols = keep_valid_molecules(df, "smiles")

    # keep only cpds without pains or brenk
    # (the set is already described as PAINS-free; this also checks Brenk)
    df, mols = check_pains_brenk(df, mols)

    # remove common abx
    df, mols = filter_for_nitrofuran(df, mols)
    df, mols = filter_for_sulfonamide(df, mols)
    df, mols = filter_for_quinolone(df, mols)

    # no need to filter on logP since already slogP < 5

    # MW bounds 100 to 500; the upper bound is 500 because the set is
    # already limited to MW <= 500
    df, mols = filter_for_mw_bounds(df, mols, lower_bound=100, upper_bound=500)

    df.to_csv(chunk_csv, index=False)

Use `06_predict_with_model_on_preprocessed_chunks.sh` to run predictions on the cleaned chunk files written by the cell above.