# Importing the required libraries

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdChemReactions

# Loading the Dataset

In [None]:
# Read the Dreher and Doyle input spreadsheet into a DataFrame and display it.
df = pd.read_excel(io='Dreher_and_Doyle_input_data.xlsx')
df

Unnamed: 0,Ligand,Additive,Base,Aryl halide,Output
0,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,ClC1=NC=CC=C1,70.410458
1,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,BrC1=NC=CC=C1,11.064457
2,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C3CCCCC3)...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,IC1=CC=C(CC)C=C1,10.223550
3,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)...,CCOC(C1=CON=C1)=O,CN1CCCN2C1=NCCC2,ClC1=CC=C(C(F)(F)F)C=C1,20.083383
4,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN1CCCN2C1=NCCC2,ClC1=CC=C(OC)C=C1,0.492663
...,...,...,...,...,...
3950,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C3CCCCC3)...,C1(C2=CC=CC=C2)=CON=C1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,BrC1=CC=C(OC)C=C1,4.344677
3951,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)...,CC1=CC(N2C=CC=C2)=NO1,CN1CCCN2C1=NCCC2,BrC1=CC=C(OC)C=C1,47.156275
3952,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)...,CCOC(C1=CON=C1)=O,CC(C)(C)/N=C(N(C)C)/N(C)C,ClC1=CC=C(C(F)(F)F)C=C1,0.701552
3953,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,C1(N(CC2=CC=CC=C2)CC3=CC=CC=C3)=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,ClC1=CC=C(C(F)(F)F)C=C1,15.561565


# Preparing the Dataset

We use the `MolToSmiles` function from the RDKit library to canonicalize the molecules in the relevant columns and combine them into reaction SMILES. SMILES (Simplified Molecular-Input Line-Entry System) is a text-based representation of molecules; reaction SMILES extend it to chemical reactions.


In [None]:
def canonicalize_with_dict(smi, can_smi_dict=None):
    """Return the canonical SMILES for ``smi``, reusing a cached value when available.

    Args:
        smi: SMILES string to canonicalize.
        can_smi_dict: optional mapping from raw SMILES to an already computed
            canonical SMILES. It is only read here, never modified.

    Returns:
        ``can_smi_dict[smi]`` if ``smi`` is a key of the mapping, otherwise the
        RDKit canonical SMILES of ``smi``.

    Raises:
        ValueError: if RDKit cannot parse ``smi``.
    """
    # ``None`` replaces the mutable ``{}`` default so no dict object is shared between calls.
    if can_smi_dict is not None and smi in can_smi_dict:
        return can_smi_dict[smi]

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail with a clear message
        # instead of letting MolToSmiles raise on the None argument.
        raise ValueError(f"Could not parse SMILES: {smi!r}")
    return Chem.MolToSmiles(mol)
    
def generate_buchwald_hartwig_rxns(df):
    """Build one reaction SMILES string per row of ``df``.

    For every row, the aryl halide is reacted with 4-methylaniline through a
    reaction template to obtain the product. The reaction is then written as
    ``aryl_halide.methylaniline.pd_catalyst.ligand.base.additive>>product``,
    with the four row-specific components canonicalized.

    Args:
        df: DataFrame with the columns 'Aryl halide', 'Ligand', 'Base' and
            'Additive' holding SMILES strings. It is not modified.

    Returns:
        List of reaction SMILES strings, one per row, in row order.

    Raises:
        ValueError: if an aryl halide cannot be parsed, or if the template
            does not yield exactly one distinct product for it.
    """
    fwd_template = '[F,Cl,Br,I]-[c;H0;D3;+0:1](:[c,n:2]):[c,n:3].[NH2;D1;+0:4]-[c:5]>>[c,n:2]:[c;H0;D3;+0:1](:[c,n:3])-[NH;D2;+0:4]-[c:5]'
    methylaniline = 'Cc1ccc(N)cc1'
    pd_catalyst = Chem.MolToSmiles(Chem.MolFromSmiles('O=S(=O)(O[Pd]1~[NH2]C2C=CC=CC=2C2C=CC=CC1=2)C(F)(F)F'))
    methylaniline_mol = Chem.MolFromSmiles(methylaniline)
    rxn = rdChemReactions.ReactionFromSmarts(fwd_template)

    product_by_halide = {}  # raw aryl halide SMILES -> product SMILES
    can_smiles_dict = {}    # raw SMILES -> canonical SMILES, shared by all four columns
    rxns = []

    component_columns = ('Aryl halide', 'Ligand', 'Base', 'Additive')
    for raw_components in zip(*(df[col] for col in component_columns)):
        halide_raw = raw_components[0]

        # Aryl halide SMILES repeat across rows, so the template is applied only
        # once per distinct string and the product is reused afterwards.
        if halide_raw not in product_by_halide:
            halide_mol = Chem.MolFromSmiles(halide_raw)
            if halide_mol is None:
                raise ValueError(f"Could not parse aryl halide SMILES: {halide_raw!r}")
            outcomes = rxn.RunReactants((halide_mol, methylaniline_mol))
            # Several outcomes may describe the same molecule; compare them as SMILES.
            distinct_products = {Chem.MolToSmiles(mols[0]) for mols in outcomes}
            if len(distinct_products) != 1:
                raise ValueError(
                    f"Expected exactly one product for aryl halide {halide_raw!r}, "
                    f"got {len(distinct_products)}: {sorted(distinct_products)}"
                )
            product_by_halide[halide_raw] = distinct_products.pop()

        canonical = []
        for raw in raw_components:
            can_smi = canonicalize_with_dict(raw, can_smiles_dict)
            can_smiles_dict[raw] = can_smi  # remember it for later rows
            canonical.append(can_smi)
        aryl_halide, ligand, base, additive = canonical

        reactants = f"{aryl_halide}.{methylaniline}.{pd_catalyst}.{ligand}.{base}.{additive}"
        rxns.append(f"{reactants}>>{product_by_halide[halide_raw]}")
    return rxns

In [None]:
# Generate one reaction SMILES per row and store it next to the raw components.
df['rxn'] = generate_buchwald_hartwig_rxns(df)

# Keep only the reaction SMILES column and the 'Output' column.
reactions_df = df[['rxn', 'Output']]
reactions_df

Unnamed: 0,rxn,Output
0,Clc1ccccn1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2...,70.410458
1,Brc1ccccn1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2...,11.064457
2,CCc1ccc(I)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccc...,10.223550
3,FC(F)(F)c1ccc(Cl)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd...,20.083383
4,COc1ccc(Cl)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2cc...,0.492663
...,...,...
3950,COc1ccc(Br)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2cc...,4.344677
3951,COc1ccc(Br)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2cc...,47.156275
3952,FC(F)(F)c1ccc(Cl)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd...,0.701552
3953,FC(F)(F)c1ccc(Cl)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd...,15.561565


# Splitting the dataset into train and test dataset

In [None]:


# Positional split: the first N_TRAIN rows are used for training, the rest for testing.
N_TRAIN = 2767

# .copy() gives each split its own data, so the column assignments below no longer
# trigger pandas' SettingWithCopyWarning.
train_df = reactions_df.iloc[:N_TRAIN].copy()
test_df = reactions_df.iloc[N_TRAIN:].copy()
train_df.columns = ['text', 'labels']
test_df.columns = ['text', 'labels']

# Standardize the labels using the mean and standard deviation of the training split
# only; the same values are applied to the test split.
mean = train_df.labels.mean()
std = train_df.labels.std()
train_df['labels'] = (train_df['labels'] - mean) / std
test_df['labels'] = (test_df['labels'] - mean) / std
train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,text,labels
0,Clc1ccccn1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2...,1.387974
1,Brc1ccccn1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2...,-0.796876
2,CCc1ccc(I)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccc...,-0.827835
3,FC(F)(F)c1ccc(Cl)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd...,-0.464841
4,COc1ccc(Cl)cc1.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2cc...,-1.186082


# Pretrained reaction BERT models

There are currently two reaction BERT models in the rxnfp library: `pretrained` (trained on a reaction MLM task) and `ft` (additionally trained on a reaction classification task). For this example, we will use the pretrained model as the starting point for training our Yield-BERT. On the Buchwald-Hartwig reactions, both base models performed similarly.

In [None]:
import pkg_resources
import torch
from rxnfp.models import SmilesClassificationModel
# Resolve the path of the checkpoint shipped inside the rxnfp package.
# Change "pretrained" to "ft" to start from the other base model.
model_path = pkg_resources.resource_filename("rxnfp", "models/transformers/bert_pretrained")

yield_bert = SmilesClassificationModel(
    'bert',
    model_path,
    use_cuda=torch.cuda.is_available(),
)

Some weights of the model checkpoint at /usr/local/lib/python3.7/site-packages/rxnfp/models/transformers/bert_pretrained were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceC

# Hyperparameter tuning

Most of the hyperparameters are already fixed by the base model. Here we decided to tune only the dropout probability and the learning rate. SimpleTransformers has wandb nicely integrated. An example of how to set up a hyperparameter sweep can be found in the training scripts. The wandb parameters are read using dotenv.

# Training

As can also be seen from the training scripts, once the data is in the right shape, a training run can be started within a few lines of code.

For this example we will go with the following parameters,

{dropout=0.7987, learning_rate=0.00009659},

and launch a training run. We have to reinitialize the BERT model with the correct parameters.

In [None]:
# Training configuration handed to the model through ``args``.
model_args = {
    "num_train_epochs": 15,
    "overwrite_output_dir": True,
    "learning_rate": 0.00009659,
    "gradient_accumulation_steps": 1,
    "regression": True,
    "num_labels": 1,
    "fp16": False,
    "evaluate_during_training": False,
    "manual_seed": 42,
    "max_seq_length": 300,
    "train_batch_size": 16,
    "warmup_ratio": 0.00,
    "config": {"hidden_dropout_prob": 0.7987},
}

# Change "pretrained" to "ft" to start from the other base model.
model_path = pkg_resources.resource_filename(
    "rxnfp", "models/transformers/bert_pretrained"
)

# Build a fresh model so that the arguments above take effect.
yield_bert = SmilesClassificationModel(
    "bert",
    model_path,
    num_labels=1,
    args=model_args,
    use_cuda=torch.cuda.is_available(),
)

yield_bert.train_model(
    train_df,
    output_dir="outputs_buchwald_hartwig_test_project",
    eval_df=test_df,
)

Some weights of the model checkpoint at /usr/local/lib/python3.7/site-packages/rxnfp/models/transformers/bert_pretrained were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceC

  0%|          | 0/2767 [00:00<?, ?it/s]



Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Running Epoch 0 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 1 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 2 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 3 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 4 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 5 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 6 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 7 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 8 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 9 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 10 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 11 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 12 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 13 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

Running Epoch 14 of 15:   0%|          | 0/173 [00:00<?, ?it/s]

(2595, 1.0165795686617063)