# Description
This is a basic notebook with a solution that uses the SentenceTransformers library.
For now I assume that the library is already installed. Later I'll make the notebook runnable in Google Colab.

In [1]:
# TODO: add the install commands here (needed to make the notebook runnable on Google Colab)

In [59]:
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer, CrossEncoder
import torch
import random

import torch.nn as nn
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator, CECorrelationEvaluator
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.cross_encoder.evaluation import CEBinaryAccuracyEvaluator
from sentence_transformers import util as st_util
from torch.utils.data import DataLoader

from collections import defaultdict

In [27]:
# constants

def seed_everything(seed=42):
    """Seed the NumPy, Python ``random`` and PyTorch global generators.

    Args:
        seed: Integer seed handed to each of the three generators.
    """
    # Same order as before: NumPy first, then the stdlib, then torch.
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed)

# Cell with constants: data location, random seed, scratch directory,
# the train/test tables and the run settings used by the cells below.

# Directory expected to contain train.csv and test.csv.
DATADIR = Path("../data")
if not DATADIR.exists():
  # Data is missing: fetch the archive with gdown and unpack it with tar.
  # NOTE(review): tar extracts into the current working directory; confirm that
  # the archive layout really produces DATADIR ("../data") before the reads below.
  # DATADIR.mkdir(DATADIR)
  !gdown --id 1qnvNxd6SvhwHPxD0huTpmODB270ENs7j
  !tar -xzvf inhibitors_data.tar.gz

# Seed numpy / random / torch once, before any sampling happens.
RANDOM_SEED = 2407
seed_everything(RANDOM_SEED)

# Scratch directory holding the local model checkpoints and training outputs.
TMP_DIR = Path("../tmp")
TMP_DIR.mkdir(exist_ok=True)

# Both tables use their first CSV column as the DataFrame index.
train_df = pd.read_csv(DATADIR / "train.csv", index_col=0)
test_df = pd.read_csv(DATADIR / "test.csv", index_col=0)

# Disabled: canonical-SMILES columns (smiles2canonical is not defined in the visible cells).
# train_df['canonical'] = train_df.Smiles.apply(smiles2canonical)
# test_df['canonical'] = test_df.Smiles.apply(smiles2canonical)

# Path of the SentenceTransformer checkpoint loaded in a later cell
# (original note: the baseline BERT model which is getting fine-tuned).
MODELNAME = TMP_DIR / "embeddings"
# Path of the CrossEncoder checkpoint that the training loop starts from.
CROSS_ENCODER_PATH = TMP_DIR / "cross-encoder"
# Column holding the SMILES strings; "canonical" is the alternative once the
# disabled columns above are computed.
SMILES_COL = "Smiles"
# Presumably the intended number of cross-validation folds; not referenced in the visible cells.
NFOLDS = 5

# Value passed as `size` to make_shuffled_dataset_for_fold in the training loop.
TRAIN_SIZE = 10



In [28]:
# Load the sentence-embedding model stored under MODELNAME.
model = SentenceTransformer(MODELNAME.as_posix())

# A few plain-English sentences used to smoke-test the encoder.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# Encode all sentences in one call.
embeddings = model.encode(sentences)

# Show every sentence next to the shape of its embedding.
for position, sentence in enumerate(sentences):
    embedding = embeddings[position]
    print("Sentence:", sentence)
    print("Embedding:", embedding.shape)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: (768,)

Sentence: Sentences are passed as a list of string.
Embedding: (768,)

Sentence: The quick brown fox jumps over the lazy dog.
Embedding: (768,)



## Notes on train dataset processing
1. At first we will use a simple scheme which guarantees that our train data is balanced. First strategy: each time we pick a pair of active molecules A, B and a pair of inactive molecules I, J, then add to the dataset (A, B, 1), (I, J, 1), and 2 random of [(A, I, 0), (A, J, 0), (B, I, 0), (B, J, 0)]. Or maybe triplet loss? Will check in the documentation later what the dataset looks like in that case.
2. We can also take into account the Murcko scaffold or some other form of scaffolds, and pick (some of) the molecules so that we compare molecules with the same scaffold from different parts of the dataset.
3. For validation we might want to specifically use scaffolds


In [55]:
def make_shuffled_dataset_for_fold(train_df, index, size=100, smiles_col="Smiles"):
    """Build a shuffled, label-balanced list of molecule pairs for one fold.

    Each of the ``size // 4`` rounds draws two active molecules (A, B) and two
    inactive molecules (I, J) and emits four examples: the same-class pairs
    ``(A, B, 1)`` and ``(I, J, 1)`` plus two *distinct* mixed pairs chosen from
    ``(A, I, 0), (A, J, 0), (B, I, 0), (B, J, 0)``.

    Args:
        train_df: DataFrame with an ``Active`` column (bool or 0/1) and a
            column of SMILES strings.
        index: Row labels of the fold, as accepted by ``train_df.loc``.
        size: Requested number of examples; rounded down to a multiple of 4.
        smiles_col: Name of the column holding the SMILES strings.

    Returns:
        A shuffled list of ``(smiles_1, smiles_2, label)`` tuples, where label
        is 1 for same-class pairs and 0 for active/inactive pairs.

    Raises:
        AssertionError: If the fold does not contain both classes.
    """
    df = train_df.loc[index]
    # Cast to bool so a 0/1 integer column works as a mask; on integers
    # ``.loc[df.Active]`` would be a label lookup and ``~`` a bitwise NOT.
    is_active = df.Active.astype(bool)
    m = is_active.mean()
    assert 0 < m < 1, "fold must contain both active and inactive molecules"
    active_molecules = df.loc[is_active, smiles_col].to_numpy()
    inactive_molecules = df.loc[~is_active, smiles_col].to_numpy()

    new_data = []
    for _ in range(size // 4):
        # Draw without replacement so a molecule is not paired with itself;
        # fall back to replacement only when a class has a single molecule.
        A, B = np.random.choice(
            active_molecules, 2, replace=len(active_molecules) < 2
        )
        I, J = np.random.choice(
            inactive_molecules, 2, replace=len(inactive_molecules) < 2
        )
        new_data.extend([(A, B, 1), (I, J, 1)])
        neg = [(A, I, 0), (A, J, 0), (B, I, 0), (B, J, 0)]
        # Two different negatives per round (no duplicated training example).
        i, j = np.random.choice(len(neg), 2, replace=False)
        new_data.append(neg[i])
        new_data.append(neg[j])
    np.random.shuffle(new_data)
    return new_data

# for testing we are specifically interested in comparing molecules from the same scaffold
# since the errors will probably be higher in this case

def _iter_all_val_pairs(active_molecules, inactive_molecules):
    """Yield every labelled pair of the fold, ignoring scaffolds."""
    for k, molecule_a in enumerate(active_molecules):
        # active vs. inactive -> dissimilar
        for molecule_i in inactive_molecules:
            yield (molecule_a, molecule_i, 0)
        # active vs. the remaining *active* molecules -> similar
        for molecule_b in active_molecules[k + 1:]:
            if molecule_a == molecule_b:
                continue
            yield (molecule_a, molecule_b, 1)
    # inactive vs. inactive -> similar
    for k, molecule_i in enumerate(inactive_molecules):
        for molecule_j in inactive_molecules[k + 1:]:
            if molecule_i == molecule_j:
                continue
            yield (molecule_i, molecule_j, 1)


def _iter_scaffold_val_pairs(df, is_active, smiles_col, scaffold_col):
    """Yield labelled pairs restricted to molecules sharing a scaffold."""
    scaffold_active = defaultdict(set)
    for s, m in zip(df.loc[is_active, scaffold_col], df.loc[is_active, smiles_col]):
        scaffold_active[s].add(m)

    scaffold_inactive = defaultdict(set)
    for s, m in zip(df.loc[~is_active, scaffold_col], df.loc[~is_active, smiles_col]):
        scaffold_inactive[s].add(m)

    for scaffold in df.loc[:, scaffold_col].unique():
        # Sets cannot be sliced; sorted lists also make the order deterministic.
        active_molecules = sorted(scaffold_active.get(scaffold, ()))
        inactive_molecules = sorted(scaffold_inactive.get(scaffold, ()))
        # same scaffold, same activity -> similar
        for ds in [active_molecules, inactive_molecules]:
            for k, mol1 in enumerate(ds):
                for mol2 in ds[k + 1:]:
                    yield (mol1, mol2, 1)
        # same scaffold, different activity -> dissimilar
        for mol_a in active_molecules:
            for mol_i in inactive_molecules:
                yield (mol_a, mol_i, 0)


def make_val_dataset_for_fold(val_df, val_index, subsample=True, smiles_col="Smiles",
        scaffold_col=None, max_size=None):
    """Build the list of labelled validation pairs for one fold.

    Without ``scaffold_col`` every pair of molecules in the fold is produced:
    active/inactive pairs get label 0, active/active and inactive/inactive
    pairs get label 1 (pairs of identical SMILES are skipped). With
    ``scaffold_col`` only molecules that share a scaffold are paired, using the
    same labelling rule.

    Args:
        val_df: DataFrame with an ``Active`` column (bool or 0/1), the SMILES
            column and, optionally, the scaffold column.
        val_index: Row labels of the fold, as accepted by ``val_df.loc``.
        subsample: Currently unused; kept for backward compatibility.
        smiles_col: Name of the column holding the SMILES strings.
        scaffold_col: Name of the scaffold column, or ``None`` to pair all
            molecules regardless of scaffold.
        max_size: If given, generation stops as soon as this many pairs have
            been collected.

    Returns:
        A list of ``(smiles_1, smiles_2, label)`` tuples in generation order.
    """
    df = val_df.loc[val_index]
    # Cast to bool so a 0/1 integer column works as a mask.
    is_active = df.Active.astype(bool)

    if scaffold_col is None:
        pairs = _iter_all_val_pairs(
            df.loc[is_active, smiles_col].tolist(),
            df.loc[~is_active, smiles_col].tolist(),
        )
    else:
        pairs = _iter_scaffold_val_pairs(df, is_active, smiles_col, scaffold_col)

    val_data = []
    for pair in pairs:
        val_data.append(pair)
        if max_size is not None and max_size <= len(val_data):
            break
    return val_data

In [56]:


# Positional row numbers of the active / inactive training molecules.
active_ids = np.where(train_df.Active == 1)[0]
inactive_ids = np.where(train_df.Active == 0)[0]

# The dataset builders select rows with the label-based ``train_df.loc[...]``,
# so translate the positions into index labels before building the folds.
active_labels = train_df.index[active_ids]
inactive_labels = train_df.index[inactive_ids]

# One tiny debugging fold: the first three molecules of each class for
# training and the last three of each class for validation.
# NOTE: the two halves overlap when a class has fewer than six molecules.
all_folds = [
    (
        [*active_labels[:3], *inactive_labels[:3]],
        [*active_labels[-3:], *inactive_labels[-3:]],
    )
]

In [57]:
# Fine-tune one cross-encoder per fold on molecule pairs and evaluate it on
# the validation pairs of the same fold.
for fold, (train_index, val_index) in enumerate(all_folds):
    print("Fold", fold)

    # Labelled (smiles_1, smiles_2, label) pairs for training and validation.
    train_samples = make_shuffled_dataset_for_fold(
        train_df, train_index, size=TRAIN_SIZE, smiles_col=SMILES_COL
    )
    print(len(train_samples))
    val_samples = make_val_dataset_for_fold(
        train_df, val_index, subsample=True, smiles_col=SMILES_COL,
        max_size=10
    )
    print(len(val_samples))
    # TODO: compute Murcko scaffolds and validate on scaffold-aware pairs, e.g.
    # make_val_dataset_for_fold(train_df, val_index, subsample=True, scaffold_col="murcko")

    train_dataset = [
        InputExample(texts=[i, j], label=w)
        for (i, j, w) in train_samples
    ]
    # No separate hold-out set is used because there is too little data.
    val_dataset = [([i, j], w) for (i, j, w) in val_samples]

    # Split the validation pairs into the parallel sequences the evaluator takes.
    (sentences, scores) = list(zip(*val_dataset))
    evaluator = CECorrelationEvaluator(sentences, scores)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)

    # Start every fold from the same local checkpoint; a single output score
    # (num_labels=1) is trained as a regression target with MSE loss.
    # The path is passed as a string, like the other model paths in this notebook.
    model = CrossEncoder(CROSS_ENCODER_PATH.as_posix(), num_labels=1)
    model.fit(
        train_dataloader,
        evaluator=evaluator,
        epochs=1,
        loss_fct=nn.MSELoss(),
        evaluation_steps=400,
        warmup_steps=20,
        output_path=(TMP_DIR / f"cross_encoder_{fold}").as_posix(),
    )

Fold 0
100
10


Iteration: 100%|██████████| 7/7 [00:44<00:00,  6.39s/it]
Epoch: 100%|██████████| 1/1 [00:46<00:00, 46.94s/it]


## Notes on inference
Next we need to use those pretrained cross-encoders to extract embeddings, or use them to pretrain a SentenceTransformer model (which produces the objects' embeddings).

Ok, let's suppose we have molecular smiles and can precompute the embeddings.
We can use them as a set of features for some complex model OR (and this is the simplest way) we can compute, e.g., the cosine distance between the train set's active molecules and a target molecule, somehow aggregate (mean, max, median, ...) and decide whether it is more likely active than inactive (we can also pick the same number of molecules from the train set - maybe by training an additional model to detect molecule similarity based on the graph? - and use them for computations).

Let's start with the simplest case - comparing with all active and N = len(active) randomly picked from inactive. (For reproducibility during inference, we'll set seed to RANDOM_SEED before selecting them).