In [2]:
import pandas as pd
import numpy as np
import os
import rdkit.Chem as Chem
from tqdm import tqdm
import math
import time
from rdkit.Chem import AllChem
from rdkit import DataStructs

In [3]:
filePath = '../../../xtb_ml_data' # 提取全部的data，以及sensitizer和emitter
xtb_data = os.listdir(filePath)
dt_list = []

j = 0
for i in xtb_data:
    path = filePath + "/" + i
    dt = pd.read_csv(path).iloc[:, :3]
    dt.columns = ['SMILES', 'xTB_S1', 'xTB_T1']
    dt = dt[~dt['xTB_S1'].isin(['Invalid SMILES'])]
    dt['T1S1ratio'] = pd.to_numeric(dt['xTB_T1']) / pd.to_numeric(dt['xTB_S1'])
    dt_list.append(dt)

  dt = pd.read_csv(path).iloc[:, :3]


In [4]:
# 全部数据合并
dt_tot = pd.concat(dt_list)  
dt_tot = dt_tot.drop_duplicates(subset = 'SMILES', keep=False)
dt_tot.dropna(inplace = True)
dt_tot.reset_index(drop = True, inplace = True)

In [5]:
# 去掉全部带电的分子
index = []
for i in tqdm(range(len(dt_tot))):
    mol = Chem.MolFromSmiles(dt_tot.iloc[i, 0])
    if mol == None: 
        continue
    Chem.Kekulize(mol)
    if abs(Chem.GetFormalCharge(mol)) == 0:
        index.append(i)

dt_tot = dt_tot.iloc[index, :]
dt_tot.reset_index(drop = True, inplace = True)
dt_emit = dt_tot[(dt_tot['T1S1ratio'] > (1/2.2)) & (dt_tot['T1S1ratio'] < (1/1.8))]
dt_sens = dt_tot[(dt_tot['T1S1ratio'] > 0.8) & (dt_tot['T1S1ratio'] < 1)]

100%|██████████| 684444/684444 [02:14<00:00, 5083.33it/s]


In [6]:
# 划分测试集
np.random.seed(2022)
def is_large(smi): # split the target data according to atom_num > 20
    mol = Chem.MolFromSmiles(smi)
    atoms_num = mol.GetNumAtoms()
    if atoms_num > 20:
        return True
    if atoms_num <= 20:
        return False

test_rand = dt_tot.sample(n = 3000, replace = False)
test_emit = dt_emit.sample(n = 3000, replace = False)
test_sens = dt_sens.sample(n = 3000, replace = False)
test_tot = pd.concat([test_rand, test_emit, test_sens])  
test_tot = test_tot.drop_duplicates(subset = 'SMILES', keep=False)
test_tot['is_large'] = test_tot['SMILES'].apply(lambda x: is_large(x))
test_large = test_tot[test_tot['is_large'] == True]
test_small = test_tot[test_tot['is_large'] == False]

In [27]:
test_rand.to_csv('../test_set/test_rand.csv')
test_emit.to_csv('../test_set/test_emit.csv')
test_sens.to_csv('../test_set/test_sens.csv')
test_large.to_csv('../test_set/test_large.csv')
test_small.to_csv('../test_set/test_small.csv')
test_tot.to_csv('../test_set/test_tot.csv')

In [9]:
dt_tot_dup = pd.concat([test_tot.iloc[:, :-1], dt_tot])
dt_tot = dt_tot_dup.drop_duplicates(subset = 'SMILES', keep=False)
dt_tot.reset_index(drop = True, inplace = True)
dt_train = dt_tot.sample(n = 5000, replace = False, random_state=2022)
dt_train.to_csv('init_train.csv') # 第一轮训练集

In [10]:
dt_tot_round1 = pd.concat([dt_train, dt_tot])
dt_tot_round1 = dt_tot_round1.drop_duplicates(subset = 'SMILES', keep = False)
dt_tot_round1.reset_index(drop = True, inplace = True)

In [11]:
dt_tot_round1.to_csv('../data_tot/dt_tot_round1.csv')

In [2]:
def cut_and_pred(dt_tot_csv, cut_len, round, check_point_path):
    import math
    import os
    import pandas as pd
    import time

    os.mkdir(f'../cut_data/origin_cut/round_{round}')
    os.mkdir(f'../cut_data/pred_cut/round_{round}')

    dt_tot = pd.read_csv(dt_tot_csv)
    dt_tot.reset_index(drop = True, inplace = True)
    tot_len = len(dt_tot)
    file_num = math.ceil(tot_len / cut_len)
    for i in range(file_num):
        if i != (file_num - 1):
            dt_tot.iloc[i*cut_len:(i+1)*cut_len, :].to_csv(f'../cut_data/origin_cut/round_{round}/dt_tot_{i}.csv')
        if i == (file_num - 1):
            dt_tot.iloc[i*cut_len:, :].to_csv(f'../cut_data/origin_cut/round_{round}/dt_tot_{i}.csv')
    for i in range(file_num):
        print(i)
        test_path = f'../cut_data/origin_cut/round_{round}/dt_tot_{i}.csv'
        pred_path = f'../cut_data/pred_cut/round_{round}/dt_tot_{i}.csv'
        pred = f'dt_tot_{i}.csv'
        k = 0
        while pred not in os.listdir(f'../cut_data/pred_cut/round_{round}'):
            os.system(f'chemprop_predict --test_path {test_path} --checkpoint_dir {check_point_path} --preds_path {pred_path} --smiles_column SMILES --ensemble_variance --num_workers 0')
            time.sleep(10)
            if k >= 1:
                print(f'data{i} fail {k} time(s)')
            k = k + 1
            
    dt_list = []
    for i in range(file_num):
        dt_path = f'../cut_data/pred_cut/round_{round}/dt_tot_{i}.csv'
        dt = pd.read_csv(dt_path)
        dt_list.append(dt)
    pred_tot = pd.concat(dt_list).dropna()
    pred_tot.reset_index(drop = True, inplace = True)
    pred_tot = pred_tot[~pred_tot['xTB_S1'].isin(['Invalid SMILES'])]
    pred_tot['uncertainty_tot'] = pred_tot['xTB_S1_epi_unc'].apply(lambda x: float(x)) + pred_tot['xTB_T1_epi_unc'].apply(lambda x: float(x))
    pred_tot.sort_values(by = 'uncertainty_tot', ascending = False, inplace = True)
    pred_index = pred_tot.index[:20000]
    
    newtrain = dt_tot.iloc[pred_index, :]
    train_set = pd.read_csv(f'../train_set/train_round{round}.csv')
    train_set = pd.concat([newtrain, train_set])
    train_set.to_csv(f'../train_set/train_round{round+1}.csv')

    dt_tot_new = dt_tot.iloc[list(set(pred_tot.index)-set(pred_index)), :]
    dt_tot_new.reset_index(drop = True, inplace = True)
    dt_tot_new.to_csv(f'../data_tot/dt_tot_round{round+1}.csv')

In [26]:
def cut_and_pred_diversity(dt_tot_csv, cut_len, round, check_point_path):
    import math
    import os
    import pandas as pd
    import time
    import rdkit.Chem as Chem
    from rdkit.Chem import AllChem
    from rdkit import DataStructs
    from tqdm import tqdm

    os.mkdir(f'../batch_version/cut_data/origin_cut/round_{round}')
    os.mkdir(f'../batch_version/cut_data/pred_cut/round_{round}')

    dt_tot = pd.read_csv(dt_tot_csv)
    dt_tot.reset_index(drop = True, inplace = True)
    tot_len = len(dt_tot)
    file_num = math.ceil(tot_len / cut_len)
    for i in range(file_num):
        if i != (file_num - 1):
            dt_tot.iloc[i*cut_len:(i+1)*cut_len, :].to_csv(f'../batch_version/cut_data/origin_cut/round_{round}/dt_tot_{i}.csv')
        if i == (file_num - 1):
            dt_tot.iloc[i*cut_len:, :].to_csv(f'../batch_version/cut_data/origin_cut/round_{round}/dt_tot_{i}.csv')
    for i in range(file_num):
        print(i)
        test_path = f'../batch_version/cut_data/origin_cut/round_{round}/dt_tot_{i}.csv'
        pred_path = f'../batch_version/cut_data/pred_cut/round_{round}/dt_tot_{i}.csv'
        pred = f'dt_tot_{i}.csv'
        k = 0
        while pred not in os.listdir(f'../batch_version/cut_data/pred_cut/round_{round}'):
            os.system(f'chemprop_predict --test_path {test_path} --checkpoint_dir {check_point_path} --preds_path {pred_path} --smiles_column SMILES --ensemble_variance --num_workers 0')
            time.sleep(10)
            if k >= 1:
                print(f'data{i} fail {k} time(s)')
            k = k + 1
            
    dt_list = []
    for i in range(file_num):
        dt_path = f'../batch_version/cut_data/pred_cut/round_{round}/dt_tot_{i}.csv'
        dt = pd.read_csv(dt_path)
        dt_list.append(dt)
    pred_tot = pd.concat(dt_list).dropna()
    pred_tot.reset_index(drop = True, inplace = True)
    pred_tot = pred_tot[~pred_tot['xTB_S1'].isin(['Invalid SMILES'])]
    pred_tot['uncertainty_tot'] = pred_tot['xTB_S1_epi_unc'].apply(lambda x: float(x)) + pred_tot['xTB_T1_epi_unc'].apply(lambda x: float(x))
    pred_tot.sort_values(by = 'uncertainty_tot', ascending = False, inplace = True)

    suggest_list = []
    total_fingerprint = []
    pred_len = len(pred_tot)
    loop = 0
    similar_value_threshold = 0.4
    similar_num_threshold = 5

    total_smiles = pred_tot['SMILES'][0:20000]
    for smile in tqdm(total_smiles):
        mol = Chem.MolFromSmiles(smile)
        fingerprint = AllChem.GetMorganFingerprint(mol)
        total_fingerprint.append(fingerprint)

    for i in tqdm(range(20000)):
        suggest_list.append(i)
        k = 0
        loop += 1
        query_fingerprint = total_fingerprint[i]
        if i >= 10:
            target_fingerprints = total_fingerprint[0:i]
            scores = DataStructs.BulkTanimotoSimilarity(query_fingerprint, target_fingerprints)
            total_similar_num = len(list(filter(lambda x: x > similar_value_threshold, scores)))
            if total_similar_num > similar_num_threshold:
                suggest_list.pop()
    
    if len(suggest_list) < 20000:
        for i in tqdm(range(20000, pred_len)):
            new_smile = pred_tot['SMILES'][i]
            new_mol = Chem.MolFromSmiles(new_smile)
            new_fingerprint = AllChem.GetMorganFingerprint(new_mol,2)
            total_fingerprint.append(new_fingerprint)
            suggest_list.append(i)
            query_fingerprint = total_fingerprint[i]
            target_fingerprints = total_fingerprint[0:i]
            scores = DataStructs.BulkTanimotoSimilarity(query_fingerprint, target_fingerprints)
            total_similar_num = len(list(filter(lambda x: x > similar_value_threshold, scores)))
            if total_similar_num > similar_num_threshold:
                suggest_list.pop()
            if len(suggest_list) >= 20000:
                break

    pred_index = pred_tot.index[suggest_list]
    newtrain = dt_tot.iloc[pred_index, :]
    train_set = pd.read_csv(f'../batch_version/train_set/train_round{round}.csv')
    train_set = pd.concat([newtrain, train_set])
    train_set.to_csv(f'../batch_version/train_set/train_round{round+1}.csv')

    dt_tot_new = dt_tot.iloc[list(set(pred_tot.index)-set(pred_index)), :]
    dt_tot_new.reset_index(drop = True, inplace = True)
    dt_tot_new.to_csv(f'../batch_version/data_tot/dt_tot_round{round+1}.csv')

In [25]:
check_point_path = '../batch_version/model/round_2'
round = 2
for i in range(19):
    print(i)
    test_path = f'../batch_version/cut_data/origin_cut/round_2/dt_tot_{i}.csv'
    pred_path = f'../batch_version/cut_data/pred_cut/round_2/dt_tot_{i}.csv'
    k = 0
    pred = f'dt_tot_{i}.csv'
    while pred not in os.listdir(f'../batch_version/cut_data/pred_cut/round_{round}'):
        os.system(f'chemprop_predict --test_path {test_path} --checkpoint_dir {check_point_path} --preds_path {pred_path} --smiles_column SMILES --ensemble_variance --num_workers 0')
        time.sleep(10)
        if k >= 1:
            print(f'data{i} fail {k} time(s)')
        k = k + 1

0
data0 fail 1 time(s)


KeyboardInterrupt: 

In [30]:
dt_list = []
round = 2
dt_tot = pd.read_csv('../batch_version/data_tot/dt_tot_round2.csv')
dt_tot.reset_index(drop = True, inplace = True)
for i in range(19):
    dt_path = f'../batch_version/cut_data/pred_cut/round_{round}/dt_tot_{i}.csv'
    dt = pd.read_csv(dt_path)
    dt_list.append(dt)
pred_tot = pd.concat(dt_list).dropna()
pred_tot.reset_index(drop = True, inplace = True)
pred_tot = pred_tot[~pred_tot['xTB_S1'].isin(['Invalid SMILES'])]
pred_tot['uncertainty_tot'] = pred_tot['xTB_S1_epi_unc'].apply(lambda x: float(x)) + pred_tot['xTB_T1_epi_unc'].apply(lambda x: float(x))
pred_tot.sort_values(by = 'uncertainty_tot', ascending = False, inplace = True)

suggest_list = []
total_fingerprint = []
pred_len = len(pred_tot)
loop = 0
similar_value_threshold = 0.4
similar_num_threshold = 5

total_smiles = pred_tot['SMILES'][0:20000]
for smile in tqdm(total_smiles):
    mol = Chem.MolFromSmiles(smile)
    fingerprint = AllChem.GetMorganFingerprint(mol, 2)
    total_fingerprint.append(fingerprint)

for i in tqdm(range(20000)):
    suggest_list.append(i)
    k = 0
    loop += 1
    query_fingerprint = total_fingerprint[i]
    if i >= 10:
        target_fingerprints = total_fingerprint[0:i]
        scores = DataStructs.BulkTanimotoSimilarity(query_fingerprint, target_fingerprints)
        total_similar_num = len(list(filter(lambda x: x > similar_value_threshold, scores)))
        if total_similar_num > similar_num_threshold:
            suggest_list.pop()

if len(suggest_list) < 20000:
    for i in tqdm(range(20000, pred_len)):
        new_smile = pred_tot['SMILES'][i]
        new_mol = Chem.MolFromSmiles(new_smile)
        new_fingerprint = AllChem.GetMorganFingerprint(new_mol,2)
        total_fingerprint.append(new_fingerprint)
        suggest_list.append(i)
        query_fingerprint = total_fingerprint[i]
        target_fingerprints = total_fingerprint[0:i]
        scores = DataStructs.BulkTanimotoSimilarity(query_fingerprint, target_fingerprints)
        total_similar_num = len(list(filter(lambda x: x > similar_value_threshold, scores)))
        if total_similar_num > similar_num_threshold:
            suggest_list.pop()
        if len(suggest_list) >= 20000:
            break

pred_index = pred_tot.index[suggest_list]
newtrain = dt_tot.iloc[pred_index, :]
train_set = pd.read_csv(f'../batch_version/train_set/train_round{round}.csv')
train_set = pd.concat([newtrain, train_set])
train_set.to_csv(f'../batch_version/train_set/train_round{round+1}.csv')

dt_tot_new = dt_tot.iloc[list(set(pred_tot.index)-set(pred_index)), :]
dt_tot_new.reset_index(drop = True, inplace = True)
dt_tot_new.to_csv(f'../batch_version/data_tot/dt_tot_round{round+1}.csv')

100%|██████████| 20000/20000 [00:03<00:00, 5308.44it/s]
100%|██████████| 20000/20000 [02:30<00:00, 133.19it/s]
  1%|          | 6376/527917 [01:54<2:36:26, 55.56it/s]


In [3]:
tot_csv = '../data_tot/dt_tot_round7.csv'
check_path = '../model/round_7'
cut_and_pred(tot_csv, 30000, 7, check_path)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


to do:
- ~~把测试集输出成csv文件整理好~~
- ~~修改cut_and_pred函数，把输入参数设定为csv，以及简单格式~~
- ~~把后续的pred和uncertainty排序整合进原有的cut_and_pred中，使其返回csv格式文件~~
- ~~编写好test脚本，计算针对不同test set的四个评价指标：MAE, RMSE, R^2, spearman correlation~~
- 整理train代码，并封装至脚本中
- 把train和cut_and_pred整合在一起
- 把全流程整合在一起

In [5]:
def test_index(round, check_point_path):
    import os
    import numpy as np
    import pandas as pd

    filePath = '../test_set' 
    os.mkdir(f'../pred_set/round_{round}')
    test_csv = os.listdir(filePath)
    test_result_list = []

    for i in test_csv:
        test_path = filePath + '/' + i
        test_set = pd.read_csv(test_path)
        pred_path = f'../pred_set/round_{round}/{i}'
        os.system(f'chemprop_predict --test_path {test_path} --checkpoint_dir {check_point_path} --preds_path {pred_path} --smiles_column SMILES --ensemble_variance --num_workers 0')
        
        pred_set = pd.read_csv(pred_path).dropna()
        pred_set.reset_index(drop = True, inplace = True)
        pred_set = pred_set[~pred_set['xTB_S1'].isin(['Invalid SMILES'])]
        S1_uncertainty = pred_set['xTB_S1_epi_unc'].apply(lambda x: float(x))
        T1_uncertainty = pred_set['xTB_T1_epi_unc'].apply(lambda x: float(x))

        # calculate
        S1_error = abs(test_set['xTB_S1'] - pred_set['xTB_S1'])
        T1_error = abs(test_set['xTB_T1'] - pred_set['xTB_T1'])
        S1_spearman_correlation = S1_uncertainty.corr(S1_error,'spearman')
        T1_spearman_correlation = T1_uncertainty.corr(T1_error,'spearman')
        S1_pearson_correlation = S1_uncertainty.corr(S1_error,'pearson')
        T1_pearson_correlation = T1_uncertainty.corr(T1_error,'pearson')
        S1_mae = S1_error.mean()
        T1_mae = T1_error.mean()
        S1_rmse = ((S1_error*S1_error).mean()) ** 0.5
        T1_rmse = ((T1_error*T1_error).mean()) ** 0.5

        # store the result in dictionary
        test_dict = {'File_name' : i, 'S1_spearman_correlation' : S1_spearman_correlation, 
        'T1_spearman_correlation' : T1_spearman_correlation, 'S1_pearson_correlation' : S1_pearson_correlation,
        'T1_pearson_correlation' : T1_pearson_correlation, 'S1_mae' : S1_mae, 'T1_mae' : T1_mae, 'S1_rmse' : S1_rmse, 'T1_rmse' : T1_rmse}
        test_result_list.append(test_dict)

    store_path = f'../test_performance/test_round{round}.csv'
    pd.DataFrame(test_result_list).to_csv(store_path)

In [6]:
test_index(8, '../model/round_8')

In [36]:
test_index(2, '../model/round_2')
test_index(3, '../batch_version/model/round_3')

In [18]:
a = pd.read_csv('../data_tot/dt_tot_round5.csv')
print(len(a))
b = pd.read_csv('../train_set/train_round5.csv')
print(len(b))
c = pd.concat([a, b]).drop_duplicates(subset = 'SMILES', keep = False)
print(len(c))

487906
100000
587906


In [18]:
dt_list = []
for i in range(10):
    dt_path = f'../cut_data/pred_cut/round_1/round1pred_{i}.csv'
    dt = pd.read_csv(dt_path)
    dt_list.append(dt)
pred_tot = pd.concat(dt_list).dropna()
pred_tot.reset_index(drop = True, inplace = True)
pred_tot = pred_tot[~pred_tot['xTB_S1'].isin(['Invalid SMILES'])]
pred_tot['uncertainty_tot'] = pred_tot['xTB_S1_epi_unc'].apply(lambda x: float(x)) + pred_tot['xTB_T1_epi_unc'].apply(lambda x: float(x))
pred_tot.sort_values(by = 'uncertainty_tot', ascending = False, inplace = True)
#pred_tot.reset_index(drop = True, inplace = True)

In [21]:
dt_list = []
for i in range(10):
    dt_path = f'../cut_data/origin_cut/round_1/round1dt_tot_{i}.csv'
    dt = pd.read_csv(dt_path)
    dt_list.append(dt)
origin_tot = pd.concat(dt_list).dropna()
origin_tot.reset_index(drop = True, inplace = True)
#pred_tot.reset_index(drop = True, inplace = True)

In [8]:
suggest_list = []
pred_len = len(pred_tot)
loop = 0

for i in range(pred_len):
    suggest_list.append(i)
    k = 0
    loop = loop + 1
    smile_target = pred_tot.loc[i, 'SMILES']
    m_target = Chem.MolFromSmiles(smile_target)
    fp_target = AllChem.GetMorganFingerprint(m_target, 2)
    if i >= 10:
        for j in range(i):
            smile_move = pred_tot.loc[j, 'SMILES']
            m_move = Chem.MolFromSmiles(smile_move)
            fp_move = AllChem.GetMorganFingerprint(m_move, 2)
            similarity = DataStructs.DiceSimilarity(fp_target, fp_move)
            if similarity > 0.5:
                k = k + 1
            if k >= 10:
                suggest_list.pop()
                break
    if len(suggest_list) >= 20000:
        break

In [15]:
suggest_list = []
total_fingerprint = []
pred_len = len(pred_tot)
#loop = 0
similar_value_threshold = 0.5
similar_num_threshold = 10

total_smiles = pred_tot.loc[list(range(20000)), 'SMILES']
for smile in tqdm(total_smiles):
    mol = Chem.MolFromSmiles(smile)
    fingerprint = AllChem.GetMorganFingerprint(mol,2)
    total_fingerprint.append(fingerprint)


for i in tqdm(range(20000)):
    suggest_list.append(i)
    #loop += 1
    query_fingerprint = total_fingerprint[i]
    if i >= 10:
        target_fingerprints = total_fingerprint[0:i]
        # need smiles to fingerprints code 
        scores = DataStructs.BulkTanimotoSimilarity(query_fingerprint, target_fingerprints)
        total_similar_num = len(list(filter(lambda x: x > similar_value_threshold, scores)))
        if total_similar_num > similar_num_threshold:
            suggest_list.pop()

100%|██████████| 20000/20000 [00:03<00:00, 6191.90it/s]
100%|██████████| 20000/20000 [01:53<00:00, 176.37it/s]


In [17]:
if len(suggest_list) < 20000:
    for i in tqdm(range(20000, pred_len)):
        new_smile = pred_tot.loc[i, 'SMILES']
        new_mol = Chem.MolFromSmiles(new_smile)
        new_fingerprint = AllChem.GetMorganFingerprint(new_mol,2)
        total_fingerprint.append(new_fingerprint)
        suggest_list.append(i)
        query_fingerprint = total_fingerprint[i]
        target_fingerprints = total_fingerprint[0:i]
        scores = DataStructs.BulkTanimotoSimilarity(query_fingerprint, target_fingerprints)
        total_similar_num = len(list(filter(lambda x: x > similar_value_threshold, scores)))
        if total_similar_num > similar_num_threshold:
            suggest_list.pop()
        if len(suggest_list) >= 20000:
            break

  0%|          | 75/547906 [00:01<2:01:54, 74.90it/s]


In [4]:
a = [1, 3, 5, 7]
b = a[0:2]