In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings 
# Silence every warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical
# warnings raised by the libraries used below; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
from DFTStructureGenerator import DFThandle, Tool
import glob, os, shutil, itertools, copy
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Geometry import Point3D
import numpy as np
from tqdm import tqdm
import pandas as pd
from hyperopt import hp
from morfeus import BuriedVolume
from sklearn.base import clone
import seaborn as sns

In [4]:
from matplotlib import pyplot as plt
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold, cross_val_score
import pickle
# Seed NumPy's global random state so anything drawing from np.random is
# repeatable across runs.
np.random.seed(0)

In [None]:
# Input locations, all relative to the notebook's working directory.
data_dir = "Data"
# Directory that is globbed for the BINOL_result_sum_*.csv result files.
csv_dir = os.path.join(data_dir, *("Iteration_2", "Result"))
# CSV file names (not referenced by the cells visible in this part of the notebook).
row_csv, target_csv = "Data_clear.csv", "Data_clear_with_sites.csv"

In [None]:
# Groups of 'Binol' column values; the per-group R2 curves are computed and
# plotted for these groups in the order listed in all_binol_split.
# NOTE(review): the names suggest grouping by substitution position -- confirm.
binol_3_3 = [
    1, 2, 3, 13, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 96, 98, 104,
    *range(122, 131),  # 122 .. 130 inclusive
]
binol_4_4 = [62, 63, 66]
binol_6_6 = [31, 32, 35, 36, *range(44, 48)]  # ... 44 .. 47 inclusive
binol_7_7 = [51, 52, 56, 57, 60]
binol_other = [0, 70, 85, 91, 108, 117, 121]
all_binol_split = [
    binol_3_3,
    binol_4_4,
    binol_6_6,
    binol_7_7,
    binol_other,
]

# 1. Data Collection

In [None]:
# Load the two precomputed lookups that are later handed to
# DFThandle.descriptor_to_array as [qm_dict, area_dict].
# SECURITY: pickle.load can execute arbitrary code while unpickling -- only
# open pickle files you created yourself or otherwise trust.
# The file must hold exactly two objects. A previous, disabled version of this
# line unpacked ten (rd_mf_map, rd_des_map, morgan_map, modred_map,
# acsf_3D_map, soap_3D_map, mbtr_3D_map, lmbtr_3D_map, qm_dict, area_dict),
# presumably from an older file layout.
with open(os.path.join(data_dir, "all_fp_map2.pkl"), 'rb') as f:
    qm_dict, area_dict = pickle.load(f)

# 2. Performance in Each Active-Learning Round

In [None]:
# Cross-validate a CatBoost regressor on every result CSV found in csv_dir.
#
# Collected per CSV file ("round"):
#   final_all_r2s, final_mae -- mean R2 / MAE over the 5 folds
#   final_pred               -- out-of-fold prediction for every row
#   all_r2_split[k]          -- R2 over the rows whose 'Binol' value is listed
#                               in all_binol_split[k] (NaN when undefined)
final_all_r2s, final_mae, final_pred = [], [], []
all_r2_split = [[] for _ in range(len(all_binol_split))]

# glob.glob() returns paths in arbitrary order; sort them so the printed round
# number and the order of the collected metrics are reproducible.
# NOTE(review): assumes the numeric suffix is zero-padded (e.g. ..._0015.csv),
# so lexicographic order equals round order -- confirm.
result_csv_paths = sorted(glob.glob(f"{csv_dir}/BINOL_result_sum_*.csv"))

for idx, csv_path in enumerate(result_csv_paths):
    data_csv = DFThandle.read_reaction_csv(csv_path)
    # Regression target: difference between the 'R' and 'S' columns.
    y = data_csv['R'].to_numpy() - data_csv["S"].to_numpy()
    all_X = DFThandle.descriptor_to_array(data_csv, None, [qm_dict, area_dict])
    target = y
    all_r2s = []
    all_mae = []
    # Float buffer for the out-of-fold predictions. An integer buffer would
    # silently truncate every prediction on assignment and distort the
    # per-group R2 values, final_pred and any plot built from all_pred.
    all_pred = np.zeros(len(all_X), dtype=float)
    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    kf = list(kf.split(all_X))
    for train_ids, test_ids in kf:
        model = CatBoostRegressor(iterations=10000, learning_rate=0.01, depth=6, verbose=0, random_state=0)
        train_X, train_Y = all_X[train_ids], target[train_ids]
        # Augment the training fold only: append the features returned by
        # DFThandle.get_reverse_result together with the negated targets.
        # The test fold is left untouched.
        temp_train_X = DFThandle.get_reverse_result(train_X)
        temp_train_Y = train_Y * -1
        train_X_ = np.concatenate([train_X, temp_train_X])
        train_Y_ = np.append(train_Y, temp_train_Y)
        model.fit(train_X_, train_Y_)
        y_pred = model.predict(all_X[test_ids])
        r2s = r2_score(target[test_ids], y_pred)
        all_r2s.append(r2s)
        all_mae.append(mean_absolute_error(target[test_ids], y_pred))
        all_pred[test_ids] = y_pred
    for binol_split_id, binol_split in enumerate(all_binol_split):
        # Use row positions rather than DataFrame index labels: target and
        # all_pred are plain NumPy arrays and must be indexed positionally.
        row_in_split = [pos for pos, binol in enumerate(data_csv['Binol']) if binol in binol_split]
        if len(row_in_split) < 2:
            # R2 is undefined for fewer than two samples, and r2_score raises
            # on an empty selection; record NaN so the curve shows a gap.
            all_r2_split[binol_split_id].append(np.nan)
        else:
            all_r2_split[binol_split_id].append(r2_score(target[row_in_split], all_pred[row_in_split]))
    print(f"Round: {idx + 1}, all_R2: {np.mean(all_r2s)}, all_MAE: {np.mean(all_mae)}")
    final_all_r2s.append(np.mean(all_r2s))
    final_mae.append(np.mean(all_mae))
    final_pred.append(all_pred)

In [None]:
# Scatter plot of target vs. out-of-fold prediction. `y` and `all_pred` are the
# variables left over from the last iteration of the cross-validation loop,
# i.e. this shows the last processed result file only.
scatter_limit = 120  # symmetric value passed as min_/max_
Tool.plot_scatter_with_metrics(
    y,
    all_pred,
    min_=-scatter_limit,
    max_=scatter_limit,
    figsize=(4, 4),
)


In [None]:
# R2 per processed result file: one coloured line per BINOL group plus the
# fold-averaged overall R2 in black. The figure is written to 'test.png'.
fig, ax = plt.subplots(figsize=(5, 5))
# Set the tick label size through tick_params: plt.xticks(fontsize=...) only
# styles the tick objects that exist at call time, which is not guaranteed to
# survive the re-ticking that happens once data is plotted.
ax.tick_params(axis='both', labelsize=24)

# zip() stops at the shortest input, so the sixth colour is a spare.
split_colors = ['#719fff', '#bbd9fe', '#feefd2', '#489b9c', '#74d8ff', '#e6e6e6']
split_names = ["3,3", '4,4', '6,6', '7,7', 'other']
for color, r2_split, name in zip(split_colors, all_r2_split, split_names):
    ax.plot(np.arange(len(r2_split)), r2_split, c=color, label=name, alpha=0.8)
ax.plot(np.arange(len(final_all_r2s)), final_all_r2s, c='black', label="all")
# Fixed view: R2 values below -1 fall outside the visible range.
ax.set_ylim(-1, 1)

ax.legend(fontsize=12)
fig.savefig('test.png', dpi=300, bbox_inches='tight')