In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.model_selection import train_test_split

# Seed passed as `random_state` to the ExtraTreesRegressor models built in the cells below.
random_seed = 42


## RXNFP

In [2]:
from rxnfp.transformer_fingerprints import (
    RXNBERTFingerprintGenerator, get_default_model_and_tokenizer, generate_fingerprints
)


# Fetch rxnfp's default model/tokenizer pair and wrap them in a fingerprint generator.
# NOTE(review): the name `model` is rebound to a scikit-learn regressor in later cells;
# `rxnfp_generator` was constructed from this object, so it is presumably unaffected.
model, tokenizer = get_default_model_and_tokenizer()
# Keep the model on the CPU; `force_no_cuda=True` below asks the generator for the same.
# (The captured output of this cell shows a CUDA-capability mismatch warning for the GPU.)
model.to("cpu")
rxnfp_generator = RXNBERTFingerprintGenerator(model, tokenizer, force_no_cuda=True)

# Smoke test: fingerprint a single reaction SMILES written as "reactants>>product".
example_rxn = "Nc1cccc2cnccc12.O=C(O)c1cc([N+](=O)[O-])c(Sc2c(Cl)cncc2Cl)s1>>O=C(Nc1cccc2cnccc12)c1cc([N+](=O)[O-])c(Sc2c(Cl)cncc2Cl)s1"

fp = rxnfp_generator.convert(example_rxn)

  from .autonotebook import tqdm as notebook_tqdm
NVIDIA GeForce RTX 4090 with CUDA capability sm_89 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA GeForce RTX 4090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



### Aryl-Scope

In [24]:
# Read the aryl-scope ligand dataset.
rxn_data = pd.read_csv("../dataset/rxn_data/aryl_scope_ligand/aryl-scope-ligand.csv")

# SMILES columns as plain Python lists: ligand, electrophile, nucleophile, product.
lig_smi_lst = rxn_data['ligand_smiles'].to_list()
rct1_smi_lst = rxn_data['electrophile_smiles'].to_list()
rct2_smi_lst = rxn_data['nucleophile_smiles'].to_list()
pdt_smi_lst = rxn_data['product_smiles'].to_list()

# Regression target: the `yield` column as a NumPy array.
label = rxn_data['yield'].to_numpy()

In [15]:
# Reaction SMILES with the ligand written on the reactant side:
#   electrophile.nucleophile.ligand>>product
rxn_smi_lst = [
    f"{rct1}.{rct2}.{lig}>>{pdt}"
    for lig, rct1, rct2, pdt in zip(lig_smi_lst, rct1_smi_lst, rct2_smi_lst, pdt_smi_lst)
]

# Batch-convert every reaction SMILES to its RXNFP fingerprint and stack into one array.
rxn_fp_arr = np.array(
    rxnfp_generator.convert_batch(rxn_smi_lst)
)

In [18]:
# Repeated hold-out evaluation on the RXNFP features: ten random splits
# (test_size=0.2, split seed 0..9), the same regressor re-fitted on every split.
model = ExtraTreesRegressor(n_estimators=500, random_state=random_seed, n_jobs=-1)

all_test_y, all_test_p = [], []      # per-split targets / predictions
r2_per_seed, mae_per_seed = [], []   # per-split scores

for seed in range(10):
    train_x, test_x, train_y, test_y = train_test_split(
        rxn_fp_arr, label, test_size=0.2, random_state=seed
    )
    model.fit(train_x, train_y)
    test_p = model.predict(test_x)
    all_test_y.append(test_y)
    all_test_p.append(test_p)

    r2 = r2_score(test_y, test_p)
    mae = mean_absolute_error(test_y, test_p)
    print(f"seed: {seed}, r2: {r2:.4f}, mae: {mae:.4f}")
    r2_per_seed.append(r2)
    mae_per_seed.append(mae)

# Averages over the ten splits.
r2_ave = np.mean(r2_per_seed)
mae_ave = np.mean(mae_per_seed)
print(f"r2_ave: {r2_ave:.4f}, mae_ave: {mae_ave:.4f}")

seed: 0, r2: 0.4481, mae: 14.5618
seed: 1, r2: 0.4962, mae: 15.0616
seed: 2, r2: 0.5144, mae: 15.0098
seed: 3, r2: 0.5623, mae: 14.4801
seed: 4, r2: 0.6203, mae: 13.7814
seed: 5, r2: 0.5833, mae: 13.3453
seed: 6, r2: 0.5335, mae: 14.5894
seed: 7, r2: 0.5180, mae: 15.4483
seed: 8, r2: 0.4796, mae: 15.0775
seed: 9, r2: 0.5402, mae: 15.2989
r2_ave: 0.5296, mae_ave: 14.6654


#### 把CatEmb拼接进去 (Concatenating the CatEmb embedding)

In [36]:
# Load the precomputed CatEmb ligand embeddings together with the SMILES they belong to,
# then verify that both files line up row-by-row with the ligands of the loaded dataset.
# NOTE(review): np.loadtxt treats '#' as a comment marker by default, so a SMILES that
# contains '#' would be truncated on load and trip the equality check below.
lig_cat_emb = np.loadtxt("./gen_desc/aryl_scope_cat_emb.txt")
lig_smi4chk = np.loadtxt("./gen_desc/aryl_scope_cat_emb_smi.txt",dtype=str)

# zip() silently stops at the shorter input, so a truncated or over-long file would
# pass the element-wise comparison; check the row counts explicitly first.
assert len(lig_smi4chk) == len(lig_smi_lst), (
    f"embedding SMILES file has {len(lig_smi4chk)} rows, dataset has {len(lig_smi_lst)}"
)
assert len(lig_cat_emb) == len(lig_smi_lst), (
    f"embedding file has {len(lig_cat_emb)} rows, dataset has {len(lig_smi_lst)}"
)
for row, (lig, ligc) in enumerate(zip(lig_smi_lst, lig_smi4chk)):
    assert lig == ligc, f"row {row}: dataset ligand {lig!r} != embedding ligand {ligc!r}"

In [37]:
# Ligand-free reaction SMILES (electrophile.nucleophile>>product); the ligand is
# represented by the CatEmb columns appended below instead.
rxn_smi_lst = [
    f"{rct1}.{rct2}>>{pdt}"
    for pdt, rct1, rct2 in zip(pdt_smi_lst, rct1_smi_lst, rct2_smi_lst)
]
rxn_fp_arr = np.array(
    rxnfp_generator.convert_batch(rxn_smi_lst)
)

# Feature matrix = [RXNFP fingerprint | ligand CatEmb], joined column-wise.
rxn_merge_fp = np.concatenate([rxn_fp_arr, lig_cat_emb], axis=1)

In [38]:
# Repeated hold-out evaluation on the merged (RXNFP + CatEmb) features: ten random
# splits (test_size=0.2, split seed 0..9), the same regressor re-fitted on every split.
model = ExtraTreesRegressor(n_estimators=500, random_state=random_seed, n_jobs=-1)

all_test_y, all_test_p = [], []      # per-split targets / predictions
r2_per_seed, mae_per_seed = [], []   # per-split scores

for seed in range(10):
    train_x, test_x, train_y, test_y = train_test_split(
        rxn_merge_fp, label, test_size=0.2, random_state=seed
    )
    model.fit(train_x, train_y)
    test_p = model.predict(test_x)
    all_test_y.append(test_y)
    all_test_p.append(test_p)

    r2 = r2_score(test_y, test_p)
    mae = mean_absolute_error(test_y, test_p)
    print(f"seed: {seed}, r2: {r2:.4f}, mae: {mae:.4f}")
    r2_per_seed.append(r2)
    mae_per_seed.append(mae)

# Averages over the ten splits.
r2_ave = np.mean(r2_per_seed)
mae_ave = np.mean(mae_per_seed)
print(f"r2_ave: {r2_ave:.4f}, mae_ave: {mae_ave:.4f}")

seed: 0, r2: 0.7452, mae: 8.8274
seed: 1, r2: 0.7441, mae: 9.4511
seed: 2, r2: 0.7741, mae: 9.3127
seed: 3, r2: 0.7459, mae: 9.8653
seed: 4, r2: 0.8188, mae: 8.2050
seed: 5, r2: 0.7558, mae: 9.2348
seed: 6, r2: 0.7810, mae: 9.2121
seed: 7, r2: 0.7486, mae: 9.6306
seed: 8, r2: 0.6978, mae: 9.7623
seed: 9, r2: 0.7823, mae: 8.8638
r2_ave: 0.7594, mae_ave: 9.2365


## Denmark

In [3]:
# Load the Denmark dataset; this rebinds `rxn_data`, which held the aryl-scope frame before.
rxn_data = pd.read_csv("../dataset/rxn_data/denmark/NS_acetal_dataset_with_pdt.csv")
# Bare expression: the notebook displays the frame as the cell output.
rxn_data

Unnamed: 0.1,Unnamed: 0,Imine,Thiol,Catalyst,ΔΔG,Product
0,0,O=C(/N=C/c1ccccc1)c1ccccc1,Sc1ccccc1,O=P1(O)Oc2c(-c3ccccc3)cc3ccccc3c2-c2c(c(-c3ccc...,1.179891,O=C(NC(Sc1ccccc1)c1ccccc1)c1ccccc1
1,1,O=C(/N=C/c1ccccc1)c1ccccc1,CCS,O=P1(O)Oc2c(-c3ccccc3)cc3ccccc3c2-c2c(c(-c3ccc...,0.501759,CCSC(NC(=O)c1ccccc1)c1ccccc1
2,2,O=C(/N=C/c1ccccc1)c1ccccc1,SC1CCCCC1,O=P1(O)Oc2c(-c3ccccc3)cc3ccccc3c2-c2c(c(-c3ccc...,0.650584,O=C(NC(SC1CCCCC1)c1ccccc1)c1ccccc1
3,3,O=C(/N=C/c1ccccc1)c1ccccc1,COc1ccc(S)cc1,O=P1(O)Oc2c(-c3ccccc3)cc3ccccc3c2-c2c(c(-c3ccc...,1.238109,COc1ccc(SC(NC(=O)c2ccccc2)c2ccccc2)cc1
4,4,O=C(/N=C/c1ccc(C(F)(F)F)cc1)c1ccccc1,Sc1ccccc1,O=P1(O)Oc2c(-c3ccccc3)cc3ccccc3c2-c2c(c(-c3ccc...,1.179891,O=C(NC(Sc1ccccc1)c1ccc(C(F)(F)F)cc1)c1ccccc1
...,...,...,...,...,...,...
1070,1070,O=C(/N=C/c1ccccc1)c1ccccc1,Sc1ccccc1,O=P1(O)Oc2c(-c3cc(C(F)(F)F)cc(C(F)(F)F)c3)cc3c...,1.531803,O=C(NC(Sc1ccccc1)c1ccccc1)c1ccccc1
1071,1071,O=C(/N=C/c1ccccc1)c1ccccc1,Cc1ccccc1S,O=P1(O)Oc2c(-c3cc(C(F)(F)F)cc(C(F)(F)F)c3)cc3c...,1.531803,Cc1ccccc1SC(NC(=O)c1ccccc1)c1ccccc1
1072,1072,O=C(/N=C/c1ccc(C(F)(F)F)cc1)c1ccccc1,Cc1ccccc1S,O=P1(O)Oc2c(-c3cc(C(F)(F)F)cc(C(F)(F)F)c3)cc3c...,1.370104,Cc1ccccc1SC(NC(=O)c1ccccc1)c1ccc(C(F)(F)F)cc1
1073,1073,O=C(/N=C/c1cccc2ccccc12)c1ccccc1,Sc1ccccc1,O=P1(O)Oc2c(-c3cc(C(F)(F)F)cc(C(F)(F)F)c3)cc3c...,1.301167,O=C(NC(Sc1ccccc1)c1cccc2ccccc12)c1ccccc1


In [4]:
# Pull the four SMILES columns out as Python lists in a single unpacking.
imine_lst, thiol_lst, cat_lst, pdt_lst = (
    rxn_data[column].to_list()
    for column in ('Imine', 'Thiol', 'Catalyst', 'Product')
)

# Regression target: the ΔΔG column as a NumPy array (rebinds `label`).
label = rxn_data['ΔΔG'].to_numpy()

In [46]:
# Group the target values by catalyst SMILES.
cat_label_map = {}
for cat, l in zip(cat_lst, label):
    cat_label_map.setdefault(cat, []).append(l)

# [catalyst, mean target] pairs, ordered by ascending mean.
cat_label_lst = sorted(
    ([smi, np.mean(values)] for smi, values in cat_label_map.items()),
    key=lambda pair: pair[1],
)

In [47]:
# Reaction SMILES with the catalyst written on the reactant side:
#   imine.thiol.catalyst>>product
rxn_smi_lst = [
    f"{imine}.{thiol}.{cat}>>{pdt}"
    for cat, imine, thiol, pdt in zip(cat_lst, imine_lst, thiol_lst, pdt_lst)
]

# Batch-convert every reaction SMILES to its RXNFP fingerprint and stack into one array.
rxn_fp_arr = np.array(
    rxnfp_generator.convert_batch(rxn_smi_lst)
)

In [48]:
# Repeated hold-out evaluation on the RXNFP features: ten random splits with a
# 475/1075 test fraction (split seed 0..9), the same regressor re-fitted on every split.
model = ExtraTreesRegressor(n_estimators=500, random_state=random_seed, n_jobs=-1)

all_test_y, all_test_p = [], []      # per-split targets / predictions
r2_per_seed, mae_per_seed = [], []   # per-split scores

for seed in range(10):
    train_x, test_x, train_y, test_y = train_test_split(
        rxn_fp_arr, label, test_size=475/1075, random_state=seed
    )
    model.fit(train_x, train_y)
    test_p = model.predict(test_x)
    all_test_y.append(test_y)
    all_test_p.append(test_p)

    r2 = r2_score(test_y, test_p)
    mae = mean_absolute_error(test_y, test_p)
    print(f"seed: {seed}, r2: {r2:.4f}, mae: {mae:.4f}")
    r2_per_seed.append(r2)
    mae_per_seed.append(mae)

# Averages over the ten splits.
r2_ave = np.mean(r2_per_seed)
mae_ave = np.mean(mae_per_seed)
print(f"r2_ave: {r2_ave:.4f}, mae_ave: {mae_ave:.4f}")

seed: 0, r2: 0.4445, mae: 0.3861
seed: 1, r2: 0.4371, mae: 0.3799
seed: 2, r2: 0.4555, mae: 0.3885
seed: 3, r2: 0.4116, mae: 0.4028
seed: 4, r2: 0.4049, mae: 0.4056
seed: 5, r2: 0.4366, mae: 0.3884
seed: 6, r2: 0.4234, mae: 0.4008
seed: 7, r2: 0.4599, mae: 0.3885
seed: 8, r2: 0.4607, mae: 0.3850
seed: 9, r2: 0.4132, mae: 0.4066
r2_ave: 0.4347, mae_ave: 0.3932


### 把CatEmb拼接进去 (Concatenating the CatEmb embedding)

In [5]:
# Load the precomputed CatEmb catalyst embeddings together with the SMILES they belong to,
# then verify that both files line up row-by-row with the catalysts of the loaded dataset.
# NOTE(review): np.loadtxt treats '#' as a comment marker by default, so a SMILES that
# contains '#' would be truncated on load and trip the equality check below.
cat_cat_emb = np.loadtxt("./gen_desc/denmark_cat_emb.txt")
cat_smi4chk = np.loadtxt("./gen_desc/denmark_cat_emb_smi.txt",dtype=str)

# zip() silently stops at the shorter input, so a truncated or over-long file would
# pass the element-wise comparison; check the row counts explicitly first.
assert len(cat_smi4chk) == len(cat_lst), (
    f"embedding SMILES file has {len(cat_smi4chk)} rows, dataset has {len(cat_lst)}"
)
assert len(cat_cat_emb) == len(cat_lst), (
    f"embedding file has {len(cat_cat_emb)} rows, dataset has {len(cat_lst)}"
)
for row, (cat, catc) in enumerate(zip(cat_lst, cat_smi4chk)):
    assert cat == catc, f"row {row}: dataset catalyst {cat!r} != embedding catalyst {catc!r}"

In [6]:
# Catalyst-free reaction SMILES (imine.thiol>>product); the catalyst is represented
# by the CatEmb columns appended below instead.
rxn_smi_lst = [
    f"{imine}.{thiol}>>{pdt}"
    for pdt, imine, thiol in zip(pdt_lst, imine_lst, thiol_lst)
]
rxn_fp_arr = np.array(
    rxnfp_generator.convert_batch(rxn_smi_lst)
)

# Feature matrix = [RXNFP fingerprint | catalyst CatEmb], joined column-wise.
rxn_merge_fp = np.concatenate((rxn_fp_arr, cat_cat_emb), axis=1)

In [7]:
# Repeated hold-out evaluation on the merged (RXNFP + CatEmb) features: ten random splits
# with a 475/1075 test fraction (split seed 0..9), the same regressor re-fitted each time.
model = ExtraTreesRegressor(n_estimators=500, random_state=random_seed, n_jobs=-1)

all_test_y, all_test_p = [], []      # per-split targets / predictions
r2_per_seed, mae_per_seed = [], []   # per-split scores

for seed in range(10):
    train_x, test_x, train_y, test_y = train_test_split(
        rxn_merge_fp, label, test_size=475/1075, random_state=seed
    )
    model.fit(train_x, train_y)
    test_p = model.predict(test_x)
    all_test_y.append(test_y)
    all_test_p.append(test_p)

    r2 = r2_score(test_y, test_p)
    mae = mean_absolute_error(test_y, test_p)
    print(f"seed: {seed}, r2: {r2:.4f}, mae: {mae:.4f}")
    r2_per_seed.append(r2)
    mae_per_seed.append(mae)

# Averages over the ten splits.
r2_ave = np.mean(r2_per_seed)
mae_ave = np.mean(mae_per_seed)
print(f"r2_ave: {r2_ave:.4f}, mae_ave: {mae_ave:.4f}")

seed: 0, r2: 0.8938, mae: 0.1487
seed: 1, r2: 0.9014, mae: 0.1446
seed: 2, r2: 0.8925, mae: 0.1476
seed: 3, r2: 0.9021, mae: 0.1457
seed: 4, r2: 0.9114, mae: 0.1421
seed: 5, r2: 0.9102, mae: 0.1467
seed: 6, r2: 0.8959, mae: 0.1487
seed: 7, r2: 0.9126, mae: 0.1461
seed: 8, r2: 0.9112, mae: 0.1428
seed: 9, r2: 0.8933, mae: 0.1607
r2_ave: 0.9024, mae_ave: 0.1474


### 样本外 (Out-of-sample evaluation)

In [8]:
# Held-out ("out-of-sample") components, matched by exact SMILES string against the
# dataset columns: one imine, one thiol and 19 catalysts.
oos_imine = ['O=C(/N=C/c1ccc(Cl)cc1Cl)c1ccccc1']
oos_thiol = ['Cc1ccccc1S']
oos_cat = ['O=P1(O)Oc2c(-c3c(C4CCCCC4)cc(C4CCCCC4)cc3C3CCCCC3)cc3ccccc3c2-c2c(c(-c3c(C4CCCCC4)cc(C4CCCCC4)cc3C3CCCCC3)cc3ccccc23)O1',
 'CC(C)c1cc(C(C)C)c(-c2cc3ccccc3c3c2OP(=O)(O)Oc2c(-c4c(C(C)C)cc(C(C)C)cc4C(C)C)cc4ccccc4c2-3)c(C(C)C)c1',
 'COc1cccc(OC)c1-c1cc2ccccc2c2c1OP(=O)(O)Oc1c(-c3c(OC)cccc3OC)cc3ccccc3c1-2',
 'Cc1cc(C)c(-c2cc3ccccc3c3c2OP(=O)(O)Oc2c(-c4c(C)cc(C)cc4C)cc4ccccc4c2-3)c(C)c1',
 'O=P1(O)Oc2c(-c3c4ccccc4cc4ccccc34)cc3ccccc3c2-c2c(c(-c3c4ccccc4cc4ccccc34)cc3ccccc23)O1',
 'O=P1(O)Oc2c(-c3ccc4ccc5cccc6ccc3c4c56)cc3ccccc3c2-c2c(c(-c3ccc4ccc5cccc6ccc3c4c56)cc3ccccc23)O1', 
 'O=P1(O)Oc2c(-c3ccccc3OC(F)(F)F)cc3ccccc3c2-c2c(c(-c3ccccc3OC(F)(F)F)cc3ccccc23)O1',
 'CC(C)(C)c1cc(-c2cc3ccccc3c3c2OP(=O)(O)Oc2c(-c4cc(C(C)(C)C)cc(C(C)(C)C)c4)cc4ccccc4c2-3)cc(C(C)(C)C)c1',
 'CC(C)(C)c1cc(-c2cc3c(c4c2OP(=O)(O)Oc2c(-c5cc(C(C)(C)C)cc(C(C)(C)C)c5)cc5c(c2-4)CCCC5)CCCC3)cc(C(C)(C)C)c1',
 'Cc1ccc(-c2cc3ccccc3c3c2OP(=O)(O)Oc2c(-c4ccc(C)cc4)cc4ccccc4c2-3)cc1',
 'CC(C)(C)c1ccc(-c2cc3ccccc3c3c2OP(=O)(O)Oc2c(-c4ccc(C(C)(C)C)cc4)cc4ccccc4c2-3)cc1',
 'O=P1(O)Oc2c(-c3ccc(-c4ccc5ccccc5c4)cc3)cc3ccccc3c2-c2c(c(-c3ccc(-c4ccc5ccccc5c4)cc3)cc3ccccc23)O1',
 'COc1ccc(-c2cc3ccccc3c3c2OP(=O)(O)Oc2c(-c4ccc(OC)cc4)cc4ccccc4c2-3)cc1',
 'COCc1cccc(-c2cc3c(c4c2OP(=O)(O)Oc2c(-c5cccc(COC)c5)cc5c(c2-4)CCCC5)CCCC3)c1',
 'O=P1(O)Oc2c(-c3ccccc3)cc3ccccc3c2-c2c(c(-c3ccccc3)cc3ccccc23)O1',
 'C[Si](c1ccccc1)(c1ccccc1)c1cc2ccccc2c2c1OP(=O)(O)Oc1c([Si](C)(c3ccccc3)c3ccccc3)cc3ccccc3c1-2',
 'O=P1(O)Oc2c(Br)cc3c(c2-c2c4c(cc(Br)c2O1)CCCC4)CCCC3',
 'O=P1(O)Oc2c([Si](c3ccccc3)(c3ccccc3)c3ccccc3)cc3ccccc3c2-c2c(c([Si](c3ccccc3)(c3ccccc3)c3ccccc3)cc3ccccc23)O1',
 'O=P1(O)Oc2c(Cc3ccc(C(F)(F)F)cc3C(F)(F)F)cc3ccccc3c2-c2c(c(Cc3ccc(C(F)(F)F)cc3C(F)(F)F)cc3ccccc23)O1']
# Partition row indices into four disjoint groups:
#   train        - neither substrate nor catalyst is held out
#   sub_test     - held-out imine or thiol, catalyst not held out
#   cat_test     - held-out catalyst, substrates not held out
#   sub_cat_test - held-out substrate AND held-out catalyst
train_data_idx_lst, sub_test_data_idx_lst = [], []
cat_test_data_idx_lst, sub_cat_test_data_idx_lst = [], []

for i, (imine_smi, thiol_smi, cat_smi) in enumerate(zip(imine_lst, thiol_lst, cat_lst)):
    # Evaluate each membership test once, then dispatch on the two flags.
    substrate_held_out = imine_smi in oos_imine or thiol_smi in oos_thiol
    catalyst_held_out = cat_smi in oos_cat

    if substrate_held_out and catalyst_held_out:
        sub_cat_test_data_idx_lst.append(i)
    elif substrate_held_out:
        sub_test_data_idx_lst.append(i)
    elif catalyst_held_out:
        cat_test_data_idx_lst.append(i)
    else:
        train_data_idx_lst.append(i)

# Group sizes, displayed as the cell output.
len(train_data_idx_lst),len(sub_test_data_idx_lst),len(cat_test_data_idx_lst),len(sub_cat_test_data_idx_lst)

(384, 216, 304, 171)

In [9]:
# Out-of-sample evaluation: fit on the training pool and score the three held-out groups
# (unseen substrates, unseen catalysts, both unseen) on the merged RXNFP + CatEmb features.

# Slice features and targets for each group using the index lists built by the split cell.
train_x = rxn_merge_fp[train_data_idx_lst]
train_y = label[train_data_idx_lst]
sub_test_x = rxn_merge_fp[sub_test_data_idx_lst]
sub_test_y = label[sub_test_data_idx_lst]
cat_test_x = rxn_merge_fp[cat_test_data_idx_lst]
cat_test_y = label[cat_test_data_idx_lst]
sub_cat_test_x = rxn_merge_fp[sub_cat_test_data_idx_lst]
sub_cat_test_y = label[sub_cat_test_data_idx_lst]

# Build the regressor here instead of reusing whatever `model` currently points to.
# The name is shared with the RXNFP transformer from the setup cell and with the
# regressors of the cross-validation cells, so relying on the leftover object makes
# this cell depend on execution order. The hyper-parameters match the
# cross-validation cells in this notebook.
model = ExtraTreesRegressor(n_estimators=500,random_state=random_seed,n_jobs=-1)
model.fit(train_x,train_y)

sub_test_p = model.predict(sub_test_x)
cat_test_p = model.predict(cat_test_x)
sub_cat_test_p = model.predict(sub_cat_test_x)

r2_sub_test = r2_score(sub_test_y,sub_test_p)
r2_cat_test = r2_score(cat_test_y,cat_test_p)
r2_sub_cat_test = r2_score(sub_cat_test_y,sub_cat_test_p)
mae_sub_test = mean_absolute_error(sub_test_y,sub_test_p)
mae_cat_test = mean_absolute_error(cat_test_y,cat_test_p)
mae_sub_cat_test = mean_absolute_error(sub_cat_test_y,sub_cat_test_p)

# The R² values are computed above, but their printout is disabled; only MAE is reported.
#print(f"r2_sub_test: {r2_sub_test:.4f}, r2_cat_test: {r2_cat_test:.4f}, r2_sub_cat_test: {r2_sub_cat_test:.4f}")
print(f"mae_sub_test: {mae_sub_test:.4f}, mae_cat_test: {mae_cat_test:.4f}, mae_sub_cat_test: {mae_sub_cat_test:.4f}")

mae_sub_test: 0.1400, mae_cat_test: 0.4129, mae_sub_cat_test: 0.4528
