In [None]:
# Built-in example compounds, keyed by display name, with their SMILES strings as values.
# Used as the fallback input when no SMILES is supplied.
example_smiles_d = {"Gozetotide": "C1=CC(=C(C=C1CCC(=O)NCCCCCC(=O)NCCCC[C@@H](C(=O)O)NC(=O)N[C@@H](CCC(=O)O)C(=O)O)CN(CCN(CC2=C(C=CC(=C2)CCC(=O)O)O)CC(=O)O)CC(=O)O)O",
                    "Micafungin": "CCCCCOC1=CC=C(C=C1)C2=CC(=NO2)C3=CC=C(C=C3)C(=O)N[C@H]4C[C@H]([C@H](NC(=O)[C@@H]5[C@H]([C@H](CN5C(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H]6C[C@H](CN6C(=O)[C@@H](NC4=O)[C@@H](C)O)O)[C@@H]([C@H](C7=CC(=C(C=C7)O)OS(=O)(=O)O)O)O)[C@@H](CC(=O)N)O)C)O)O)O",
                    # Raw string: this SMILES contains a literal backslash, and "\C" is not a
                    # valid Python escape sequence (it triggers a warning in a normal string).
                    "Candicidin": r"C[C@H]1/C=C/C=C/C=C/C=C\C=C/C=C/C=C/[C@@H](C[C@@H]([C@@H]([C@H](CC(=O)C[C@H](C[C@H](C[C@H](CC(=O)CCCC(=O)CC(=O)O[C@@H]1[C@@H](C)C[C@H](C)[C@H](CC(=O)C2=CC=C(C=C2)N)O)O)O)O)O)C(=O)O)O)O[C@H]3[C@H]([C@H]([C@@H]([C@H](O3)C)O)N)O",
                    "Control": "C=CC(=O)N1CCC[C@H](C1)N2C3=NC=NC(=C3C(=N2)C4=CC=C(C=C4)OC5=CC=CC=C5)N",
                   }
# Use the user-supplied SMILES when one was given; otherwise fall back to the examples.
if SMILES != "":
    # A falsy SMILES_name (e.g. an empty string) is replaced by the generic label "compound".
    smiles_d = {SMILES_name or "compound": SMILES}
else:
    smiles_d = example_smiles_d

In [None]:
# Fetch the helper repository once: the clone is skipped when a smiles_to_properties
# directory already exists. After a fresh clone, the repository's (non-hidden) contents
# are moved into the current working directory; the directory itself stays behind, which
# is what makes the `test -d` guard skip the clone on re-runs.
!test -d smiles_to_properties || (git clone https://github.com/hgbrian/smiles_to_properties.git && mv smiles_to_properties/* .)

In [None]:
# Install RDKit (no version pin) and mols2grid pinned to 1.0.0 (imported later for the
# molecule grid). --quiet suppresses most of pip's output.
!pip install rdkit --quiet
!pip install mols2grid==1.0.0 --quiet

In [None]:
# Install chemprop pinned to 1.5.2. The chemprop_train / chemprop_predict commands invoked
# in later cells presumably come from this package.
!pip install chemprop==1.5.2 --quiet

In [None]:
# These packages are installed only when run_slow_smiles2caption is truthy (the flag is
# defined outside this cell); the later captioning cell is guarded by the same flag.
if run_slow_smiles2caption:
    !pip install transformers==4.22.1 --quiet
    !pip install sentencepiece==0.1.97 --quiet
    !pip install t5x==0.0.0 --quiet

In [None]:
from tqdm.auto import tqdm
import pandas as pd

# One row per compound: dictionary keys fill the "compound" column and the matching
# SMILES strings fill the "smiles" column, in dictionary order.
df_smiles = pd.DataFrame({"compound": list(smiles_d.keys()),
                          "smiles": list(smiles_d.values())})
display(df_smiles)

In [None]:
if run_slow_smiles2caption:
    from transformers import T5Tokenizer, T5ForConditionalGeneration

    # Tokenizer and model are loaded from the same pretrained checkpoint name.
    tokenizer = T5Tokenizer.from_pretrained("laituan245/molt5-large-smiles2caption", model_max_length=512)
    model = T5ForConditionalGeneration.from_pretrained('laituan245/molt5-large-smiles2caption')

    # Generate one caption per SMILES (beam search, 5 beams, at most 512 tokens) and
    # store it in the "caption" column of the matching row.
    for ix, smiles in tqdm(df_smiles["smiles"].items(), total=len(df_smiles)):
        encoded = tokenizer(smiles, return_tensors="pt")
        generated = model.generate(encoded.input_ids, num_beams=5, max_length=512)
        df_smiles.at[ix, "caption"] = tokenizer.decode(generated[0], skip_special_tokens=True)

    display(df_smiles)

In [None]:
# Names of the moleculenet entries to (re)train. Empty by default, so the training loop
# below does nothing unless names are added here.
TO_RETRAIN = []

# Dataset registry: name -> (download URL, chemprop dataset type, (target column, display name)).
# - The target column is passed to chemprop_train via --target_columns; when it is the empty
#   string, the training loop omits that flag.
# - The (target column, display name) pair is also used later to rename prediction columns.
moleculenet = {"BBBP": ("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv", "classification", ("p_np", "BBB Penetration")),
               "delaney-processed": ("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv", "regression", ("ESOL predicted log solubility in mols per litre", "Water solubility (log)")),
               "tox21": ("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz", "classification", ("", "")),
               "toxcast_data": ("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/toxcast_data.csv.gz", "classification", ("", "")),
               "SAMPL": ("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/SAMPL.csv", "regression", ("expt", "Hydration free energy")),
               "Lipophilicity": ("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv", "regression", ("exp", "Lipophilicity")),
               "clintox": ("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz", "classification", ("FDA_APPROVED", "FDA Approved")),
               "sider": ("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/sider.csv.gz", "classification", ("","")),
               "muv": ("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz", "classification", ("", "")),
               }

for dataset in TO_RETRAIN:
    url, dataset_type, (target_columns, rename_target_columns) = moleculenet[dataset]

    !(test -f {dataset}.csv || wget {url}) && (test -f {dataset}.csv.gz && gunzip {dataset}.csv.gz)

    print(f"training {dataset} {target_columns}")
    arg_target_columns = f'--target_columns "{target_columns}"' if target_columns != "" else ""
    !chemprop_train --data_path {dataset}.csv --dataset_type {dataset_type} --save_dir {dataset}_checkpoints --smiles_columns smiles --ignore_columns mol_id {arg_target_columns}

In [None]:
TO_RETEST = []

for dataset in TO_RETEST:
    url, dataset_type, target_columns = moleculenet[dataset]
    print(f"predicting {dataset} {target_columns}")
    !chemprop_predict --test_path {dataset}.csv --checkpoint_dir {dataset}_checkpoints --smiles_column smiles --preds_path {dataset}_preds

In [None]:
df_smiles.to_csv("df_smiles.csv", index=None)

for dataset, (url, dataset_type, target_columns) in moleculenet.items():
    if dataset == "muv" and not run_muv_tasks: continue
    !chemprop_predict --test_path df_smiles.csv --checkpoint_dir {dataset}_checkpoints --smiles_column smiles --preds_path df_{dataset}_preds.csv
    !mv df_{dataset}_preds.csv df_smiles.csv

df_smiles = pd.read_csv("df_smiles.csv").rename(columns={old:new for old, new in [v[2] for v in moleculenet.values()]})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# Identifier columns that are actually present, kept in DataFrame column order.
# (Passing a set to melt works by accident but is not a documented id_vars type and makes
# the resulting column order depend on string hashing, which varies between sessions.)
id_columns = [col for col in df_smiles.columns if col in {"compound", "smiles", "caption", "logS_warn"}]

# Long format: one row per (compound, property), with the property name in "variable"
# and its value in "value".
df_smiles_melt = df_smiles.melt(id_vars=id_columns)

# Plot title -> list of "variable" names drawn together in one bar chart.
graphs = {"Toxicity": ["NR-AR","NR-AR-LBD","NR-AhR","NR-Aromatase","NR-ER","NR-ER-LBD","NR-PPAR-gamma","SR-ARE","SR-ATAD5","SR-HSE","SR-MMP","SR-p53"]}

In [None]:
# Draw one grouped bar chart per entry in `graphs`: properties on the x axis, one bar per
# compound. Figure size and title font grow with the number of properties shown.
for title, cols in graphs.items():
    df_plot = df_smiles_melt[df_smiles_melt.variable.isin(cols)]
    n_variables = df_plot.variable.nunique()

    f, ax = plt.subplots(figsize=(6 + 2 * n_variables, 2 + n_variables))
    sns.barplot(data=df_plot, x="variable", y="value", hue="compound", ax=ax)

    ax.set_title(title, pad=20)
    ax.title.set_fontsize(24 + n_variables)
    ax.tick_params(axis='x', labelsize=16)
    ax.tick_params(axis='y', labelsize=18)
    ax.set_ylabel('')
    ax.set_xlabel('')

    # Anchor the legend just outside the top-right corner of the axes.
    ax.legend(loc='upper left', fontsize=14, bbox_to_anchor=(1, 1))



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Same charts as drawn inline, but each figure is written to "<title>.png" and then
# closed so figures do not pile up in memory.
for title, cols in graphs.items():
    df_plot = df_smiles_melt[df_smiles_melt.variable.isin(cols)]
    n_variables = df_plot.variable.nunique()

    f, ax = plt.subplots(figsize=(6 + 2 * n_variables, 2 + n_variables))
    sns.barplot(data=df_plot, x="variable", y="value", hue="compound", ax=ax)

    ax.set_title(title, pad=20)
    ax.title.set_fontsize(24 + n_variables)
    ax.tick_params(axis='x', labelsize=16)
    ax.tick_params(axis='y', labelsize=18)
    ax.set_ylabel('')
    ax.set_xlabel('')

    # The legend sits outside the axes; bbox_inches='tight' keeps it inside the saved image.
    ax.legend(loc='upper left', fontsize=14, bbox_to_anchor=(1, 1))
    f.savefig(f"{title}.png", bbox_inches='tight')
    plt.close(f)

In [None]:
import mols2grid

def _round_float_3(value):
    """Round float values to 3 decimals; return every other value unchanged."""
    if isinstance(value, float):
        return round(value, 3)
    return value


# Show the compounds as an interactive grid, with floats rounded for readability.
mols2grid.display(df_smiles.applymap(_round_float_3),
                  smiles_col="smiles",
                  tooltip=None,
                  tooltip_trigger="hover",
                  tooltip_placement="right")

In [None]:
import mols2grid
import pandas as pd
from IPython.display import display, HTML

# Round floats to 3 decimals for display; non-float cells pass through unchanged.
df_display = df_smiles.applymap(lambda x: round(x, 3) if isinstance(x, float) else x)

# Render the grid once and keep its HTML so the same markup can be both saved and shown.
html_output = mols2grid.display(df_display,
                                smiles_col="smiles",
                                tooltip=None,
                                tooltip_trigger="hover",
                                tooltip_placement="right")._repr_html_()

# Write with an explicit UTF-8 encoding: the platform default encoding is not guaranteed
# to be able to represent every character in the generated HTML.
with open("mols2grid_display.html", "w", encoding="utf-8") as html_file:
    html_file.write(html_output)
display(HTML(html_output))

In [None]:
if download_results:
    # google.colab is imported lazily so it is only needed when a download is requested.
    from google.colab import files

    # Export a tab-separated copy with floats rounded to 4 decimals and no index column,
    # then hand the file to the browser for download.
    df_rounded_for_export = df_smiles.applymap(lambda x: round(x, 4) if isinstance(x, float) else x)
    df_rounded_for_export.to_csv("df_smiles.tsv", index=None, sep='\t')
    files.download('df_smiles.tsv')