# GROVER
Generate GROVER fingerprints for SMILES-drugs coming from LINCS + SciPlex3.

Steps:
1. Load LINCS + SciPlex3, extract SMILES
2. Generate fingerprints using GROVER
3. Save SMILES -> fingerprint mapping as a pandas df.

## Step 1: Get all relevant SMILES from datasets

In [6]:
import scanpy as sc
from rdkit import Chem
import pandas as pd
import numpy as np
from pathlib import Path

In [7]:
def canonicalize(smiles): 
    if smiles:
        return Chem.MolToSmiles(Chem.MolFromSmiles(smiles), canonical=True)
    else:
        return None

In [8]:
# SET
datasets_fpath = Path("/home/icb/simon.boehm/Masters_thesis/MT_code/datasets")

In [9]:
for key, dataset in [("SMILES", "trapnell_cpa.h5ad"),
                     ("SMILES", "trapnell_cpa_subset.h5ad"),
                     ("canonical_smiles", "lincs_full_smiles.h5ad")]:
    outpath = Path("data/embeddings/") / dataset.replace(".h5ad", ".csv")
    if outpath.exists():
        print(outpath, "found, skipping")
    else:
        print("Generating", outpath)
        df = sc.read(datasets_fpath / dataset)
        # load all unique SMILES in the dataset and canonicalize them
        canonical_smiles = pd.Series(df.obs[key].unique()).apply(canonicalize).dropna()
        # dump to csv
        canonical_smiles.to_csv(outpath, index=False, header=["smiles"])

data/embeddings/trapnell_cpa.csv found, skipping
data/embeddings/trapnell_cpa_subset.csv found, skipping
data/embeddings/lincs_full_smiles.csv found, skipping


## Step 2: Generate fingerprints

- TODO: Right now we generate `rdkit_2d_normalized` features. Are these the correct ones?
- TODO: There are pretrained & finetuned models also available, maybe that's useful for us:
    - SIDER: Drug side effect prediction task
    - ClinTox: Drug toxicity prediction task
    - ChEMBL log P prediction task

In [10]:
%%bash
set -euox pipefail
for file in data/embeddings/*.csv; do
    # First we generate the feature embedding for the SMILES, which is an extra input
    # into GROVER
    echo "FILE: $file"
    features=$(echo $file | sed 's:.csv:.npz:')
    if [[ ! -f $features ]]; then
        echo "Generating features: $features"
        python scripts/save_features.py --data_path "$file" \
                                --save_path "$features" \
                                --features_generator rdkit_2d_normalized \
                                --restart
    fi;
    
    # Second we input SMILES + Features into grover and get the fingerprint out
    # 'both' means we get a concatenated fingerprint of combined atoms + bonds features
    outfile=$(echo $file | sed 's:.csv:_grover_base_both.npz:')
    echo "EMB: $outfile"
    if [[ ! -f $outfile ]]; then
        echo "Generating embedding: $outfile"
        python main.py fingerprint --data_path "$file" \
                           --features_path "$features" \
                           --checkpoint_path data/model/grover_base.pt \
                           --fingerprint_source both \
                           --output "$outfile"
    fi;
done;

FILE: data/embeddings/lincs_full_smiles.csv
EMB: data/embeddings/lincs_full_smiles_grover_base_both.npz
FILE: data/embeddings/trapnell_cpa.csv
EMB: data/embeddings/trapnell_cpa_grover_base_both.npz
FILE: data/embeddings/trapnell_cpa_subset.csv
EMB: data/embeddings/trapnell_cpa_subset_grover_base_both.npz


+ for file in 'data/embeddings/*.csv'
+ echo 'FILE: data/embeddings/lincs_full_smiles.csv'
++ echo data/embeddings/lincs_full_smiles.csv
++ sed s:.csv:.npz:
+ features=data/embeddings/lincs_full_smiles.npz
+ [[ ! -f data/embeddings/lincs_full_smiles.npz ]]
++ echo data/embeddings/lincs_full_smiles.csv
++ sed s:.csv:_grover_base_both.npz:
+ outfile=data/embeddings/lincs_full_smiles_grover_base_both.npz
+ echo 'EMB: data/embeddings/lincs_full_smiles_grover_base_both.npz'
+ [[ ! -f data/embeddings/lincs_full_smiles_grover_base_both.npz ]]
+ for file in 'data/embeddings/*.csv'
+ echo 'FILE: data/embeddings/trapnell_cpa.csv'
++ echo data/embeddings/trapnell_cpa.csv
++ sed s:.csv:.npz:
+ features=data/embeddings/trapnell_cpa.npz
+ [[ ! -f data/embeddings/trapnell_cpa.npz ]]
++ echo data/embeddings/trapnell_cpa.csv
++ sed s:.csv:_grover_base_both.npz:
+ outfile=data/embeddings/trapnell_cpa_grover_base_both.npz
+ echo 'EMB: data/embeddings/trapnell_cpa_grover_base_both.npz'
+ [[ ! -f data/embe

In [11]:
trapnell_base = np.load("data/embeddings/trapnell_cpa_grover_base_both.npz")
trapnell_large = np.load("data/embeddings/trapnell_cpa_grover_large_both.npz")

In [12]:
print("Shape of GROVER_base embedding:", trapnell_base["fps"].shape)
print("Shape of GROVER_large embedding:", trapnell_large["fps"].shape)

Shape of GROVER_base embedding: (188, 3400)
Shape of GROVER_large embedding: (188, 5000)


## Step 3: Generate DataFrame with SMILES -> Embedding mapping

In [44]:
def flatten(x: np.ndarray):
    assert len(x.shape) == 2 and x.shape[0] == 1
    return x[0]

embeddings_fpath = Path("data/embeddings")
for file in embeddings_fpath.iterdir():
    final_dfs = []
    if str(file).endswith("csv"):
        # read original SMILES list
        df = pd.read_csv(file)
        # read generated embedding (.npz has only one key, 'fps')
        emb = np.load(str(file).replace(".csv", "_grover_base_both.npz"))["fps"]
        assert len(df) == emb.shape[0]
        # generate a DataFrame with SMILES and Embedding in each row
        final_df = pd.DataFrame(emb, index=df["smiles"].values, columns=[f"latent_{i+1}" for i in range(emb.shape[1])])
        final_dfs.append(final_df)
    # join into one dataframe
    final_df = pd.concat(final_dfs)
    # remove duplicates indices (=SMILES)
    final_df = final_df[~final_df.index.duplicated(keep="first")]
    final_df.to_parquet(embeddings_fpath / "grover_base.parquet")

In [45]:
df = pd.read_parquet("data/embeddings/grover_base.parquet")

In [46]:
df

Unnamed: 0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_3391,latent_3392,latent_3393,latent_3394,latent_3395,latent_3396,latent_3397,latent_3398,latent_3399,latent_3400
COc1cc2c(cc1O)CC[C@@H]1[C@@H]2CC[C@]2(C)[C@@H](O)CC[C@@H]12,0.195959,-0.278208,0.153157,0.105822,-0.083688,0.428338,-0.032398,-0.844538,0.089965,1.319402,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.894112
COc1cc2c(cc1OCCCN1CCCC1)N=C(N)C21CCC1,0.139694,0.113071,0.175902,0.140174,-0.099123,0.344958,0.253862,-1.119520,-0.246315,0.652924,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,9.607067e-01,0.166633,0.867332
CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2ccc(N3CCN(Cc4ccccc4-c4ccc(Cl)cc4)CC3)cc2)cc1[N+](=O)[O-],0.211542,-0.666226,0.157627,0.162469,-0.182653,0.151127,0.100305,0.488502,-0.243197,-0.471426,...,9.999565e-01,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.006762
Cc1c(NC(=O)OCC2COCCN2)cn2ncnc(Nc3ccc4c(cnn4Cc4cccc(F)c4)c3)c12,0.102223,-1.600725,0.061120,0.095598,-0.132348,0.116600,0.056430,-0.221550,-0.291598,-0.232704,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.122994
CN(C)Cc1ccc(-c2nc3cccc4c3n2CCNC4=O)cc1,0.244959,-0.644244,0.087852,0.138099,-0.131626,0.028204,0.026444,0.449965,-0.200088,-0.053318,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.853671
N#C/C(=C\c1ccc(O)c(O)c1)C(=O)NCc1ccccc1,0.287355,0.229655,0.084712,0.098294,-0.047807,0.109526,0.057974,1.094841,-0.192098,-0.275683,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.300882
NC(=O)c1ncn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c1N,0.191298,-0.848057,0.003274,-0.051926,0.026726,0.185980,-0.152911,4.109511,-0.187004,0.572909,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.220589
Cc1csc(-c2nnc(Nc3ccc(Oc4ncccc4-c4ccnc(N)n4)cc3)c3ccccc23)c1,0.086284,-1.125728,0.019980,0.038208,-0.117713,0.120502,0.092392,0.283795,-0.230029,-0.543394,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,1.000000e+00,4.703598e-08,0.166633,0.099186
CC(C)[C@H](C(=O)Nc1ccc(C(=O)NO)cc1)c1ccccc1,0.199641,-0.516763,0.117617,0.105079,-0.110813,0.007691,0.069716,0.848334,-0.292238,0.016338,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.483650
O=C(Nc1c[nH]nc1-c1nc2cc(CN3CCOCC3)ccc2[nH]1)NC1CC1,0.129860,-0.756894,0.021848,0.076669,-0.140153,0.176244,0.068027,-0.192751,-0.173946,0.069980,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,1.000000,0.414833
