In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from scipy.stats import pearsonr, spearmanr

In [2]:
# Load the ELN reaction table from the notebook's working directory.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): added because the recorded output of this cell looks like a
# warning raised by read_csv (presumably DtypeWarning for mixed-type columns) --
# confirm against a fresh run.
df_cat = pd.read_csv('./ELN_reaction_representation.csv', low_memory=False)

  df_cat = pd.read_csv('./ELN_reaction_representation.csv')


In [3]:
# Regression target: the 'yield' column of the loaded table.
y = df_cat['yield']
y  # echo the series as the cell output

0      65.39
1      57.47
2      65.43
3      75.07
4      46.32
       ...  
744    42.82
745    42.75
746    46.13
747    24.34
748    29.00
Name: yield, Length: 749, dtype: float64

# ELN descriptors

In [4]:
# ELN-descriptor run: take every column from position 12 onward, leaving out the
# 'mapped_rxn' column and any column whose name contains one of the
# reaction-representation markers below.
excluded_markers = (
    'ReactSeq_MEO',
    'ReactSeq_all',
    'AP3_256',
    'Drfp',
    'rxnfp_pretrained',
    'rxnfp_ft_10k',
)
feature_col = [
    col
    for col in df_cat.columns[12:]
    if col != 'mapped_rxn'
    and not any(marker in col for marker in excluded_markers)
]

In [5]:
# Echo the selected feature names as the cell output.
feature_col

['temperature',
 'metal_amount',
 'amine_amount',
 'halide_amount',
 'base_amount',
 'ligand_amount',
 'reaction_volume',
 'solvent_1',
 'solvent_2',
 'amine_MaxEStateIndex',
 'amine_MinEStateIndex',
 'amine_MaxAbsEStateIndex',
 'amine_MinAbsEStateIndex',
 'amine_qed',
 'amine_MolWt',
 'amine_HeavyAtomMolWt',
 'amine_ExactMolWt',
 'amine_NumValenceElectrons',
 'amine_MaxPartialCharge',
 'amine_MinPartialCharge',
 'amine_MaxAbsPartialCharge',
 'amine_MinAbsPartialCharge',
 'amine_FpDensityMorgan1',
 'amine_FpDensityMorgan2',
 'amine_FpDensityMorgan3',
 'amine_BCUT2D_MWHI',
 'amine_BCUT2D_MWLOW',
 'amine_BCUT2D_CHGHI',
 'amine_BCUT2D_CHGLO',
 'amine_BCUT2D_LOGPHI',
 'amine_BCUT2D_LOGPLOW',
 'amine_BCUT2D_MRHI',
 'amine_BCUT2D_MRLOW',
 'amine_BalabanJ',
 'amine_BertzCT',
 'amine_Chi0',
 'amine_Chi0n',
 'amine_Chi0v',
 'amine_Chi1',
 'amine_Chi1n',
 'amine_Chi1v',
 'amine_Chi2n',
 'amine_Chi2v',
 'amine_Chi3n',
 'amine_Chi3v',
 'amine_Chi4n',
 'amine_Chi4v',
 'amine_HallKierAlpha',
 'amine

In [6]:
# Number of selected feature columns.
len(feature_col)

907

In [7]:
# Feature matrix: the selected columns of df_cat.
X = df_cat[feature_col]

In [8]:
# Repeated hold-out benchmark on the current X / y: 30 seeded 70/30 splits,
# one 100-tree random forest per split, scored on the held-out 30%.
rmse_list, r2_list, pearson_corr_list, spearman_corr_list = [], [], [], []

for seed in range(30):
    # The seed controls both the split and the forest, so every round is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    model = RandomForestRegressor(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE, R² and linear / rank correlation on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    pearson_corr, _ = pearsonr(y_test, y_pred)
    spearman_corr, _ = spearmanr(y_test, y_pred)

    # Record this round's scores in the per-metric histories.
    for scores, score in (
        (rmse_list, rmse),
        (r2_list, r2),
        (pearson_corr_list, pearson_corr),
        (spearman_corr_list, spearman_corr),
    ):
        scores.append(score)

    print(f'Seed {seed} -> RMSE: {rmse:.4f}, R²: {r2:.4f}, Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}')

# Aggregate over the 30 rounds (np.std with NumPy's default ddof=0).
average_r2, std_r2 = np.mean(r2_list), np.std(r2_list)
average_rmse, std_rmse = np.mean(rmse_list), np.std(rmse_list)
average_pearson, std_pearson = np.mean(pearson_corr_list), np.std(pearson_corr_list)
average_spearman, std_spearman = np.mean(spearman_corr_list), np.std(spearman_corr_list)

# Report mean and standard deviation for each metric.
print(f'\nR²_mean: {average_r2:.4f}')
print(f'R²_sd: {std_r2:.4f}')
print(f'RMSE_mean: {average_rmse:.4f}')
print(f'RMSE_sd: {std_rmse:.4f}')
print(f'Pearson_corr_mean: {average_pearson:.4f}')
print(f'Pearson_corr_sd: {std_pearson:.4f}')
print(f'Spearman_corr_mean: {average_spearman:.4f}')
print(f'Spearman_corr_sd: {std_spearman:.4f}')


Seed 0 -> RMSE: 24.1364, R²: 0.3254, Pearson: 0.5894, Spearman: 0.5811
Seed 1 -> RMSE: 23.7420, R²: 0.3074, Pearson: 0.5581, Spearman: 0.5527
Seed 2 -> RMSE: 26.3217, R²: 0.2200, Pearson: 0.4864, Spearman: 0.4696
Seed 3 -> RMSE: 23.3958, R²: 0.3355, Pearson: 0.5808, Spearman: 0.5664
Seed 4 -> RMSE: 27.6426, R²: 0.2229, Pearson: 0.4851, Spearman: 0.4959
Seed 5 -> RMSE: 24.3982, R²: 0.2738, Pearson: 0.5631, Spearman: 0.5541
Seed 6 -> RMSE: 24.3596, R²: 0.2623, Pearson: 0.5248, Spearman: 0.5027
Seed 7 -> RMSE: 24.1183, R²: 0.2736, Pearson: 0.5306, Spearman: 0.5311
Seed 8 -> RMSE: 24.3173, R²: 0.2512, Pearson: 0.5153, Spearman: 0.5078
Seed 9 -> RMSE: 24.2397, R²: 0.2511, Pearson: 0.5253, Spearman: 0.5181
Seed 10 -> RMSE: 25.5533, R²: 0.2165, Pearson: 0.5025, Spearman: 0.4954
Seed 11 -> RMSE: 26.7604, R²: 0.2208, Pearson: 0.4822, Spearman: 0.4680
Seed 12 -> RMSE: 25.5443, R²: 0.2336, Pearson: 0.4923, Spearman: 0.4990
Seed 13 -> RMSE: 25.9207, R²: 0.2718, Pearson: 0.5272, Spearman: 0.5433
Se

# AP3_256

In [9]:
# AP3_256 run: columns from position 12 onward, keeping the 'AP3_256' columns
# (they are not in the exclusion list) and dropping 'mapped_rxn', the other
# representation markers, and the amine_/halide_/product_ columns.
# NOTE(review): 'halide_' also matches 'halide_amount', so that column is
# dropped here while 'amine_amount' is explicitly kept -- confirm this
# asymmetry is intended.
excluded_markers = (
    'ReactSeq_MEO',
    'ReactSeq_all',
    'Drfp',
    'rxnfp_pretrained',
    'rxnfp_ft_10k',
)
feature_col = [
    col
    for col in df_cat.columns[12:]
    if col != 'mapped_rxn'
    and not any(marker in col for marker in excluded_markers)
    and not ('amine_' in col and col != 'amine_amount')
    and 'halide_' not in col
    and 'product_' not in col
]

In [10]:
# Echo the selected feature names as the cell output.
feature_col

['temperature',
 'metal_amount',
 'amine_amount',
 'base_amount',
 'ligand_amount',
 'reaction_volume',
 'solvent_1',
 'solvent_2',
 'solvent_MaxEStateIndex',
 'solvent_MinEStateIndex',
 'solvent_MaxAbsEStateIndex',
 'solvent_MinAbsEStateIndex',
 'solvent_qed',
 'solvent_MolWt',
 'solvent_HeavyAtomMolWt',
 'solvent_ExactMolWt',
 'solvent_NumValenceElectrons',
 'solvent_MaxPartialCharge',
 'solvent_MinPartialCharge',
 'solvent_MaxAbsPartialCharge',
 'solvent_MinAbsPartialCharge',
 'solvent_FpDensityMorgan1',
 'solvent_FpDensityMorgan2',
 'solvent_FpDensityMorgan3',
 'solvent_BCUT2D_MWHI',
 'solvent_BCUT2D_MWLOW',
 'solvent_BCUT2D_CHGHI',
 'solvent_BCUT2D_CHGLO',
 'solvent_BCUT2D_LOGPHI',
 'solvent_BCUT2D_LOGPLOW',
 'solvent_BCUT2D_MRHI',
 'solvent_BCUT2D_MRLOW',
 'solvent_BalabanJ',
 'solvent_BertzCT',
 'solvent_Chi0',
 'solvent_Chi0n',
 'solvent_Chi0v',
 'solvent_Chi1',
 'solvent_Chi1n',
 'solvent_Chi1v',
 'solvent_Chi2n',
 'solvent_Chi2v',
 'solvent_Chi3n',
 'solvent_Chi3v',
 'solvent

In [11]:
# Number of selected feature columns.
len(feature_col)

617

In [12]:
# Feature matrix: the selected columns of df_cat.
X = df_cat[feature_col]

In [13]:
# Repeated hold-out benchmark on the current X / y: 30 seeded 70/30 splits,
# one 100-tree random forest per split, scored on the held-out 30%.
rmse_list, r2_list, pearson_corr_list, spearman_corr_list = [], [], [], []

for seed in range(30):
    # The seed controls both the split and the forest, so every round is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    model = RandomForestRegressor(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE, R² and linear / rank correlation on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    pearson_corr, _ = pearsonr(y_test, y_pred)
    spearman_corr, _ = spearmanr(y_test, y_pred)

    # Record this round's scores in the per-metric histories.
    for scores, score in (
        (rmse_list, rmse),
        (r2_list, r2),
        (pearson_corr_list, pearson_corr),
        (spearman_corr_list, spearman_corr),
    ):
        scores.append(score)

    print(f'Seed {seed} -> RMSE: {rmse:.4f}, R²: {r2:.4f}, Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}')

# Aggregate over the 30 rounds (np.std with NumPy's default ddof=0).
average_r2, std_r2 = np.mean(r2_list), np.std(r2_list)
average_rmse, std_rmse = np.mean(rmse_list), np.std(rmse_list)
average_pearson, std_pearson = np.mean(pearson_corr_list), np.std(pearson_corr_list)
average_spearman, std_spearman = np.mean(spearman_corr_list), np.std(spearman_corr_list)

# Report mean and standard deviation for each metric.
print(f'\nR²_mean: {average_r2:.4f}')
print(f'R²_sd: {std_r2:.4f}')
print(f'RMSE_mean: {average_rmse:.4f}')
print(f'RMSE_sd: {std_rmse:.4f}')
print(f'Pearson_corr_mean: {average_pearson:.4f}')
print(f'Pearson_corr_sd: {std_pearson:.4f}')
print(f'Spearman_corr_mean: {average_spearman:.4f}')
print(f'Spearman_corr_sd: {std_spearman:.4f}')


Seed 0 -> RMSE: 23.3204, R²: 0.3702, Pearson: 0.6226, Spearman: 0.6278
Seed 1 -> RMSE: 23.6275, R²: 0.3140, Pearson: 0.5618, Spearman: 0.5585
Seed 2 -> RMSE: 26.1536, R²: 0.2299, Pearson: 0.4973, Spearman: 0.4786
Seed 3 -> RMSE: 23.3840, R²: 0.3362, Pearson: 0.5807, Spearman: 0.5621
Seed 4 -> RMSE: 27.0832, R²: 0.2540, Pearson: 0.5136, Spearman: 0.5342
Seed 5 -> RMSE: 24.5837, R²: 0.2627, Pearson: 0.5458, Spearman: 0.5369
Seed 6 -> RMSE: 24.3004, R²: 0.2658, Pearson: 0.5206, Spearman: 0.5055
Seed 7 -> RMSE: 23.4425, R²: 0.3137, Pearson: 0.5621, Spearman: 0.5634
Seed 8 -> RMSE: 23.8898, R²: 0.2773, Pearson: 0.5361, Spearman: 0.5236
Seed 9 -> RMSE: 24.1403, R²: 0.2572, Pearson: 0.5317, Spearman: 0.5442
Seed 10 -> RMSE: 26.5453, R²: 0.1544, Pearson: 0.4523, Spearman: 0.4349
Seed 11 -> RMSE: 26.9638, R²: 0.2090, Pearson: 0.4743, Spearman: 0.4620
Seed 12 -> RMSE: 25.8385, R²: 0.2159, Pearson: 0.4769, Spearman: 0.4849
Seed 13 -> RMSE: 25.3819, R²: 0.3018, Pearson: 0.5553, Spearman: 0.5777
Se

# DRFP

In [14]:
# DRFP run: columns from position 12 onward, keeping the 'Drfp' columns
# (they are not in the exclusion list) and dropping 'mapped_rxn', the other
# representation markers, and the amine_/halide_/product_ columns.
# NOTE(review): 'halide_' also matches 'halide_amount', so that column is
# dropped here while 'amine_amount' is explicitly kept -- confirm this
# asymmetry is intended.
excluded_markers = (
    'ReactSeq_MEO',
    'ReactSeq_all',
    'AP3_256',
    'rxnfp_pretrained',
    'rxnfp_ft_10k',
)
feature_col = [
    col
    for col in df_cat.columns[12:]
    if col != 'mapped_rxn'
    and not any(marker in col for marker in excluded_markers)
    and not ('amine_' in col and col != 'amine_amount')
    and 'halide_' not in col
    and 'product_' not in col
]

In [15]:
# Echo the selected feature names as the cell output.
feature_col

['temperature',
 'metal_amount',
 'amine_amount',
 'base_amount',
 'ligand_amount',
 'reaction_volume',
 'solvent_1',
 'solvent_2',
 'solvent_MaxEStateIndex',
 'solvent_MinEStateIndex',
 'solvent_MaxAbsEStateIndex',
 'solvent_MinAbsEStateIndex',
 'solvent_qed',
 'solvent_MolWt',
 'solvent_HeavyAtomMolWt',
 'solvent_ExactMolWt',
 'solvent_NumValenceElectrons',
 'solvent_MaxPartialCharge',
 'solvent_MinPartialCharge',
 'solvent_MaxAbsPartialCharge',
 'solvent_MinAbsPartialCharge',
 'solvent_FpDensityMorgan1',
 'solvent_FpDensityMorgan2',
 'solvent_FpDensityMorgan3',
 'solvent_BCUT2D_MWHI',
 'solvent_BCUT2D_MWLOW',
 'solvent_BCUT2D_CHGHI',
 'solvent_BCUT2D_CHGLO',
 'solvent_BCUT2D_LOGPHI',
 'solvent_BCUT2D_LOGPLOW',
 'solvent_BCUT2D_MRHI',
 'solvent_BCUT2D_MRLOW',
 'solvent_BalabanJ',
 'solvent_BertzCT',
 'solvent_Chi0',
 'solvent_Chi0n',
 'solvent_Chi0v',
 'solvent_Chi1',
 'solvent_Chi1n',
 'solvent_Chi1v',
 'solvent_Chi2n',
 'solvent_Chi2v',
 'solvent_Chi3n',
 'solvent_Chi3v',
 'solvent

In [16]:
# Number of selected feature columns.
len(feature_col)

2409

In [17]:
# Feature matrix: the selected columns of df_cat.
X = df_cat[feature_col]

In [18]:
# Repeated hold-out benchmark on the current X / y: 30 seeded 70/30 splits,
# one 100-tree random forest per split, scored on the held-out 30%.
rmse_list, r2_list, pearson_corr_list, spearman_corr_list = [], [], [], []

for seed in range(30):
    # The seed controls both the split and the forest, so every round is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    model = RandomForestRegressor(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE, R² and linear / rank correlation on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    pearson_corr, _ = pearsonr(y_test, y_pred)
    spearman_corr, _ = spearmanr(y_test, y_pred)

    # Record this round's scores in the per-metric histories.
    for scores, score in (
        (rmse_list, rmse),
        (r2_list, r2),
        (pearson_corr_list, pearson_corr),
        (spearman_corr_list, spearman_corr),
    ):
        scores.append(score)

    print(f'Seed {seed} -> RMSE: {rmse:.4f}, R²: {r2:.4f}, Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}')

# Aggregate over the 30 rounds (np.std with NumPy's default ddof=0).
average_r2, std_r2 = np.mean(r2_list), np.std(r2_list)
average_rmse, std_rmse = np.mean(rmse_list), np.std(rmse_list)
average_pearson, std_pearson = np.mean(pearson_corr_list), np.std(pearson_corr_list)
average_spearman, std_spearman = np.mean(spearman_corr_list), np.std(spearman_corr_list)

# Report mean and standard deviation for each metric.
print(f'\nR²_mean: {average_r2:.4f}')
print(f'R²_sd: {std_r2:.4f}')
print(f'RMSE_mean: {average_rmse:.4f}')
print(f'RMSE_sd: {std_rmse:.4f}')
print(f'Pearson_corr_mean: {average_pearson:.4f}')
print(f'Pearson_corr_sd: {std_pearson:.4f}')
print(f'Spearman_corr_mean: {average_spearman:.4f}')
print(f'Spearman_corr_sd: {std_spearman:.4f}')


Seed 0 -> RMSE: 24.0543, R²: 0.3300, Pearson: 0.5988, Spearman: 0.6084
Seed 1 -> RMSE: 24.8349, R²: 0.2422, Pearson: 0.5050, Spearman: 0.5098
Seed 2 -> RMSE: 26.5574, R²: 0.2060, Pearson: 0.4788, Spearman: 0.4736
Seed 3 -> RMSE: 23.2614, R²: 0.3431, Pearson: 0.5880, Spearman: 0.5741
Seed 4 -> RMSE: 27.4178, R²: 0.2355, Pearson: 0.4957, Spearman: 0.5141
Seed 5 -> RMSE: 25.6164, R²: 0.1994, Pearson: 0.5011, Spearman: 0.4914
Seed 6 -> RMSE: 25.3297, R²: 0.2023, Pearson: 0.4747, Spearman: 0.4515
Seed 7 -> RMSE: 24.4075, R²: 0.2561, Pearson: 0.5140, Spearman: 0.5143
Seed 8 -> RMSE: 23.8250, R²: 0.2812, Pearson: 0.5403, Spearman: 0.5372
Seed 9 -> RMSE: 25.3503, R²: 0.1809, Pearson: 0.4812, Spearman: 0.4830
Seed 10 -> RMSE: 26.9463, R²: 0.1287, Pearson: 0.4440, Spearman: 0.4346
Seed 11 -> RMSE: 27.7088, R²: 0.1646, Pearson: 0.4349, Spearman: 0.4346
Seed 12 -> RMSE: 26.0220, R²: 0.2047, Pearson: 0.4702, Spearman: 0.4793
Seed 13 -> RMSE: 26.7759, R²: 0.2230, Pearson: 0.4931, Spearman: 0.5140
Se

# rxnfp_pretrained

In [19]:
# rxnfp_pretrained run: columns from position 12 onward, keeping the
# 'rxnfp_pretrained' columns (they are not in the exclusion list) and dropping
# 'mapped_rxn', the other representation markers, and the
# amine_/halide_/product_ columns.
# NOTE(review): 'halide_' also matches 'halide_amount', so that column is
# dropped here while 'amine_amount' is explicitly kept -- confirm this
# asymmetry is intended.
excluded_markers = (
    'ReactSeq_MEO',
    'ReactSeq_all',
    'AP3_256',
    'Drfp',
    'rxnfp_ft_10k',
)
feature_col = [
    col
    for col in df_cat.columns[12:]
    if col != 'mapped_rxn'
    and not any(marker in col for marker in excluded_markers)
    and not ('amine_' in col and col != 'amine_amount')
    and 'halide_' not in col
    and 'product_' not in col
]

In [20]:
# Echo the selected feature names as the cell output.
feature_col

['temperature',
 'metal_amount',
 'amine_amount',
 'base_amount',
 'ligand_amount',
 'reaction_volume',
 'solvent_1',
 'solvent_2',
 'solvent_MaxEStateIndex',
 'solvent_MinEStateIndex',
 'solvent_MaxAbsEStateIndex',
 'solvent_MinAbsEStateIndex',
 'solvent_qed',
 'solvent_MolWt',
 'solvent_HeavyAtomMolWt',
 'solvent_ExactMolWt',
 'solvent_NumValenceElectrons',
 'solvent_MaxPartialCharge',
 'solvent_MinPartialCharge',
 'solvent_MaxAbsPartialCharge',
 'solvent_MinAbsPartialCharge',
 'solvent_FpDensityMorgan1',
 'solvent_FpDensityMorgan2',
 'solvent_FpDensityMorgan3',
 'solvent_BCUT2D_MWHI',
 'solvent_BCUT2D_MWLOW',
 'solvent_BCUT2D_CHGHI',
 'solvent_BCUT2D_CHGLO',
 'solvent_BCUT2D_LOGPHI',
 'solvent_BCUT2D_LOGPLOW',
 'solvent_BCUT2D_MRHI',
 'solvent_BCUT2D_MRLOW',
 'solvent_BalabanJ',
 'solvent_BertzCT',
 'solvent_Chi0',
 'solvent_Chi0n',
 'solvent_Chi0v',
 'solvent_Chi1',
 'solvent_Chi1n',
 'solvent_Chi1v',
 'solvent_Chi2n',
 'solvent_Chi2v',
 'solvent_Chi3n',
 'solvent_Chi3v',
 'solvent

In [21]:
# Number of selected feature columns.
len(feature_col)

617

In [22]:
# Feature matrix: the selected columns of df_cat.
X = df_cat[feature_col]

In [23]:
# Repeated hold-out benchmark on the current X / y: 30 seeded 70/30 splits,
# one 100-tree random forest per split, scored on the held-out 30%.
rmse_list, r2_list, pearson_corr_list, spearman_corr_list = [], [], [], []

for seed in range(30):
    # The seed controls both the split and the forest, so every round is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    model = RandomForestRegressor(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE, R² and linear / rank correlation on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    pearson_corr, _ = pearsonr(y_test, y_pred)
    spearman_corr, _ = spearmanr(y_test, y_pred)

    # Record this round's scores in the per-metric histories.
    for scores, score in (
        (rmse_list, rmse),
        (r2_list, r2),
        (pearson_corr_list, pearson_corr),
        (spearman_corr_list, spearman_corr),
    ):
        scores.append(score)

    print(f'Seed {seed} -> RMSE: {rmse:.4f}, R²: {r2:.4f}, Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}')

# Aggregate over the 30 rounds (np.std with NumPy's default ddof=0).
average_r2, std_r2 = np.mean(r2_list), np.std(r2_list)
average_rmse, std_rmse = np.mean(rmse_list), np.std(rmse_list)
average_pearson, std_pearson = np.mean(pearson_corr_list), np.std(pearson_corr_list)
average_spearman, std_spearman = np.mean(spearman_corr_list), np.std(spearman_corr_list)

# Report mean and standard deviation for each metric.
print(f'\nR²_mean: {average_r2:.4f}')
print(f'R²_sd: {std_r2:.4f}')
print(f'RMSE_mean: {average_rmse:.4f}')
print(f'RMSE_sd: {std_rmse:.4f}')
print(f'Pearson_corr_mean: {average_pearson:.4f}')
print(f'Pearson_corr_sd: {std_pearson:.4f}')
print(f'Spearman_corr_mean: {average_spearman:.4f}')
print(f'Spearman_corr_sd: {std_spearman:.4f}')


Seed 0 -> RMSE: 23.9158, R²: 0.3377, Pearson: 0.5981, Spearman: 0.5843
Seed 1 -> RMSE: 23.4927, R²: 0.3219, Pearson: 0.5681, Spearman: 0.5513
Seed 2 -> RMSE: 25.5808, R²: 0.2633, Pearson: 0.5202, Spearman: 0.4996
Seed 3 -> RMSE: 23.3289, R²: 0.3393, Pearson: 0.5832, Spearman: 0.5576
Seed 4 -> RMSE: 28.0328, R²: 0.2008, Pearson: 0.4621, Spearman: 0.4704
Seed 5 -> RMSE: 24.4389, R²: 0.2713, Pearson: 0.5509, Spearman: 0.5419
Seed 6 -> RMSE: 23.6454, R²: 0.3049, Pearson: 0.5563, Spearman: 0.5450
Seed 7 -> RMSE: 23.3630, R²: 0.3184, Pearson: 0.5690, Spearman: 0.5698
Seed 8 -> RMSE: 24.3661, R²: 0.2482, Pearson: 0.5107, Spearman: 0.5028
Seed 9 -> RMSE: 23.9292, R²: 0.2701, Pearson: 0.5373, Spearman: 0.5423
Seed 10 -> RMSE: 25.5891, R²: 0.2143, Pearson: 0.5079, Spearman: 0.5017
Seed 11 -> RMSE: 27.0702, R²: 0.2027, Pearson: 0.4625, Spearman: 0.4640
Seed 12 -> RMSE: 24.8625, R²: 0.2740, Pearson: 0.5249, Spearman: 0.5408
Seed 13 -> RMSE: 25.3369, R²: 0.3042, Pearson: 0.5542, Spearman: 0.5655
Se

# rxnfp_finetuned

In [24]:
# rxnfp_finetuned run: columns from position 12 onward, keeping the
# 'rxnfp_ft_10k' columns (they are not in the exclusion list) and dropping
# 'mapped_rxn', the other representation markers, and the
# amine_/halide_/product_ columns.
# NOTE(review): 'halide_' also matches 'halide_amount', so that column is
# dropped here while 'amine_amount' is explicitly kept -- confirm this
# asymmetry is intended.
excluded_markers = (
    'ReactSeq_MEO',
    'ReactSeq_all',
    'AP3_256',
    'Drfp',
    'rxnfp_pretrained',
)
feature_col = [
    col
    for col in df_cat.columns[12:]
    if col != 'mapped_rxn'
    and not any(marker in col for marker in excluded_markers)
    and not ('amine_' in col and col != 'amine_amount')
    and 'halide_' not in col
    and 'product_' not in col
]

In [25]:
# Echo the selected feature names as the cell output.
feature_col

['temperature',
 'metal_amount',
 'amine_amount',
 'base_amount',
 'ligand_amount',
 'reaction_volume',
 'solvent_1',
 'solvent_2',
 'solvent_MaxEStateIndex',
 'solvent_MinEStateIndex',
 'solvent_MaxAbsEStateIndex',
 'solvent_MinAbsEStateIndex',
 'solvent_qed',
 'solvent_MolWt',
 'solvent_HeavyAtomMolWt',
 'solvent_ExactMolWt',
 'solvent_NumValenceElectrons',
 'solvent_MaxPartialCharge',
 'solvent_MinPartialCharge',
 'solvent_MaxAbsPartialCharge',
 'solvent_MinAbsPartialCharge',
 'solvent_FpDensityMorgan1',
 'solvent_FpDensityMorgan2',
 'solvent_FpDensityMorgan3',
 'solvent_BCUT2D_MWHI',
 'solvent_BCUT2D_MWLOW',
 'solvent_BCUT2D_CHGHI',
 'solvent_BCUT2D_CHGLO',
 'solvent_BCUT2D_LOGPHI',
 'solvent_BCUT2D_LOGPLOW',
 'solvent_BCUT2D_MRHI',
 'solvent_BCUT2D_MRLOW',
 'solvent_BalabanJ',
 'solvent_BertzCT',
 'solvent_Chi0',
 'solvent_Chi0n',
 'solvent_Chi0v',
 'solvent_Chi1',
 'solvent_Chi1n',
 'solvent_Chi1v',
 'solvent_Chi2n',
 'solvent_Chi2v',
 'solvent_Chi3n',
 'solvent_Chi3v',
 'solvent

In [26]:
# Number of selected feature columns.
len(feature_col)

617

In [27]:
# Feature matrix: the selected columns of df_cat.
X = df_cat[feature_col]

In [28]:
# Repeated hold-out benchmark on the current X / y: 30 seeded 70/30 splits,
# one 100-tree random forest per split, scored on the held-out 30%.
rmse_list, r2_list, pearson_corr_list, spearman_corr_list = [], [], [], []

for seed in range(30):
    # The seed controls both the split and the forest, so every round is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    model = RandomForestRegressor(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE, R² and linear / rank correlation on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    pearson_corr, _ = pearsonr(y_test, y_pred)
    spearman_corr, _ = spearmanr(y_test, y_pred)

    # Record this round's scores in the per-metric histories.
    for scores, score in (
        (rmse_list, rmse),
        (r2_list, r2),
        (pearson_corr_list, pearson_corr),
        (spearman_corr_list, spearman_corr),
    ):
        scores.append(score)

    print(f'Seed {seed} -> RMSE: {rmse:.4f}, R²: {r2:.4f}, Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}')

# Aggregate over the 30 rounds (np.std with NumPy's default ddof=0).
average_r2, std_r2 = np.mean(r2_list), np.std(r2_list)
average_rmse, std_rmse = np.mean(rmse_list), np.std(rmse_list)
average_pearson, std_pearson = np.mean(pearson_corr_list), np.std(pearson_corr_list)
average_spearman, std_spearman = np.mean(spearman_corr_list), np.std(spearman_corr_list)

# Report mean and standard deviation for each metric.
print(f'\nR²_mean: {average_r2:.4f}')
print(f'R²_sd: {std_r2:.4f}')
print(f'RMSE_mean: {average_rmse:.4f}')
print(f'RMSE_sd: {std_rmse:.4f}')
print(f'Pearson_corr_mean: {average_pearson:.4f}')
print(f'Pearson_corr_sd: {std_pearson:.4f}')
print(f'Spearman_corr_mean: {average_spearman:.4f}')
print(f'Spearman_corr_sd: {std_spearman:.4f}')


Seed 0 -> RMSE: 23.8877, R²: 0.3392, Pearson: 0.5995, Spearman: 0.5969
Seed 1 -> RMSE: 24.3814, R²: 0.2696, Pearson: 0.5231, Spearman: 0.5097
Seed 2 -> RMSE: 27.2679, R²: 0.1629, Pearson: 0.4343, Spearman: 0.4093
Seed 3 -> RMSE: 23.5630, R²: 0.3259, Pearson: 0.5724, Spearman: 0.5498
Seed 4 -> RMSE: 27.6522, R²: 0.2223, Pearson: 0.4773, Spearman: 0.4918
Seed 5 -> RMSE: 24.5448, R²: 0.2650, Pearson: 0.5496, Spearman: 0.5433
Seed 6 -> RMSE: 24.2286, R²: 0.2702, Pearson: 0.5280, Spearman: 0.5141
Seed 7 -> RMSE: 23.9719, R²: 0.2824, Pearson: 0.5390, Spearman: 0.5324
Seed 8 -> RMSE: 24.0512, R²: 0.2675, Pearson: 0.5237, Spearman: 0.5273
Seed 9 -> RMSE: 24.4851, R²: 0.2358, Pearson: 0.5116, Spearman: 0.5122
Seed 10 -> RMSE: 26.0086, R²: 0.1883, Pearson: 0.4712, Spearman: 0.4500
Seed 11 -> RMSE: 26.8782, R²: 0.2140, Pearson: 0.4698, Spearman: 0.4697
Seed 12 -> RMSE: 25.9942, R²: 0.2064, Pearson: 0.4609, Spearman: 0.4662
Seed 13 -> RMSE: 26.1060, R²: 0.2614, Pearson: 0.5166, Spearman: 0.5266
Se

# ReactSeq_all

In [29]:
# ReactSeq_all run: columns from position 12 onward, keeping the
# 'ReactSeq_all' columns (they are not in the exclusion list) and dropping
# 'mapped_rxn', the other representation markers, and the
# amine_/halide_/product_ columns.
# NOTE(review): 'halide_' also matches 'halide_amount', so that column is
# dropped here while 'amine_amount' is explicitly kept -- confirm this
# asymmetry is intended.
excluded_markers = (
    'ReactSeq_MEO',
    'AP3_256',
    'Drfp',
    'rxnfp_pretrained',
    'rxnfp_ft_10k',
)
feature_col = [
    col
    for col in df_cat.columns[12:]
    if col != 'mapped_rxn'
    and not any(marker in col for marker in excluded_markers)
    and not ('amine_' in col and col != 'amine_amount')
    and 'halide_' not in col
    and 'product_' not in col
]

In [30]:
# Echo the selected feature names as the cell output.
feature_col

['temperature',
 'metal_amount',
 'amine_amount',
 'base_amount',
 'ligand_amount',
 'reaction_volume',
 'solvent_1',
 'solvent_2',
 'solvent_MaxEStateIndex',
 'solvent_MinEStateIndex',
 'solvent_MaxAbsEStateIndex',
 'solvent_MinAbsEStateIndex',
 'solvent_qed',
 'solvent_MolWt',
 'solvent_HeavyAtomMolWt',
 'solvent_ExactMolWt',
 'solvent_NumValenceElectrons',
 'solvent_MaxPartialCharge',
 'solvent_MinPartialCharge',
 'solvent_MaxAbsPartialCharge',
 'solvent_MinAbsPartialCharge',
 'solvent_FpDensityMorgan1',
 'solvent_FpDensityMorgan2',
 'solvent_FpDensityMorgan3',
 'solvent_BCUT2D_MWHI',
 'solvent_BCUT2D_MWLOW',
 'solvent_BCUT2D_CHGHI',
 'solvent_BCUT2D_CHGLO',
 'solvent_BCUT2D_LOGPHI',
 'solvent_BCUT2D_LOGPLOW',
 'solvent_BCUT2D_MRHI',
 'solvent_BCUT2D_MRLOW',
 'solvent_BalabanJ',
 'solvent_BertzCT',
 'solvent_Chi0',
 'solvent_Chi0n',
 'solvent_Chi0v',
 'solvent_Chi1',
 'solvent_Chi1n',
 'solvent_Chi1v',
 'solvent_Chi2n',
 'solvent_Chi2v',
 'solvent_Chi3n',
 'solvent_Chi3v',
 'solvent

In [31]:
# Number of selected feature columns.
len(feature_col)

617

In [32]:
# Feature matrix: the selected columns of df_cat.
X = df_cat[feature_col]

In [34]:
# Repeated hold-out benchmark on the current X / y: 30 seeded 70/30 splits,
# one 100-tree random forest per split, scored on the held-out 30%.
rmse_list, r2_list, pearson_corr_list, spearman_corr_list = [], [], [], []

for seed in range(30):
    # The seed controls both the split and the forest, so every round is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    model = RandomForestRegressor(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE, R² and linear / rank correlation on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    pearson_corr, _ = pearsonr(y_test, y_pred)
    spearman_corr, _ = spearmanr(y_test, y_pred)

    # Record this round's scores in the per-metric histories.
    for scores, score in (
        (rmse_list, rmse),
        (r2_list, r2),
        (pearson_corr_list, pearson_corr),
        (spearman_corr_list, spearman_corr),
    ):
        scores.append(score)

    print(f'Seed {seed} -> RMSE: {rmse:.4f}, R²: {r2:.4f}, Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}')

# Aggregate over the 30 rounds (np.std with NumPy's default ddof=0).
average_r2, std_r2 = np.mean(r2_list), np.std(r2_list)
average_rmse, std_rmse = np.mean(rmse_list), np.std(rmse_list)
average_pearson, std_pearson = np.mean(pearson_corr_list), np.std(pearson_corr_list)
average_spearman, std_spearman = np.mean(spearman_corr_list), np.std(spearman_corr_list)

# Report mean and standard deviation for each metric.
print(f'\nR²_mean: {average_r2:.4f}')
print(f'R²_sd: {std_r2:.4f}')
print(f'RMSE_mean: {average_rmse:.4f}')
print(f'RMSE_sd: {std_rmse:.4f}')
print(f'Pearson_corr_mean: {average_pearson:.4f}')
print(f'Pearson_corr_sd: {std_pearson:.4f}')
print(f'Spearman_corr_mean: {average_spearman:.4f}')
print(f'Spearman_corr_sd: {std_spearman:.4f}')


Seed 0 -> RMSE: 24.1145, R²: 0.3266, Pearson: 0.5824, Spearman: 0.5789
Seed 1 -> RMSE: 24.3551, R²: 0.2712, Pearson: 0.5211, Spearman: 0.5160
Seed 2 -> RMSE: 25.8552, R²: 0.2474, Pearson: 0.5077, Spearman: 0.4909
Seed 3 -> RMSE: 23.9499, R²: 0.3036, Pearson: 0.5513, Spearman: 0.5359
Seed 4 -> RMSE: 27.5557, R²: 0.2277, Pearson: 0.4854, Spearman: 0.4910
Seed 5 -> RMSE: 24.1581, R²: 0.2880, Pearson: 0.5583, Spearman: 0.5503
Seed 6 -> RMSE: 23.0559, R²: 0.3391, Pearson: 0.5827, Spearman: 0.5795
Seed 7 -> RMSE: 23.6540, R²: 0.3013, Pearson: 0.5571, Spearman: 0.5649
Seed 8 -> RMSE: 23.9453, R²: 0.2739, Pearson: 0.5325, Spearman: 0.5307
Seed 9 -> RMSE: 23.7537, R²: 0.2808, Pearson: 0.5375, Spearman: 0.5404
Seed 10 -> RMSE: 25.6942, R²: 0.2078, Pearson: 0.4788, Spearman: 0.4539
Seed 11 -> RMSE: 26.4857, R²: 0.2368, Pearson: 0.4919, Spearman: 0.4874
Seed 12 -> RMSE: 24.5893, R²: 0.2899, Pearson: 0.5420, Spearman: 0.5429
Seed 13 -> RMSE: 25.8505, R²: 0.2758, Pearson: 0.5302, Spearman: 0.5248
Se

# ReactSeq_MEO

In [35]:
# ReactSeq_MEO run: columns from position 12 onward, keeping the
# 'ReactSeq_MEO' columns (they are not in the exclusion list) and dropping
# 'mapped_rxn', the other representation markers, and the
# amine_/halide_/product_ columns.
# NOTE(review): 'halide_' also matches 'halide_amount', so that column is
# dropped here while 'amine_amount' is explicitly kept -- confirm this
# asymmetry is intended.
excluded_markers = (
    'ReactSeq_all',
    'AP3_256',
    'Drfp',
    'rxnfp_pretrained',
    'rxnfp_ft_10k',
)
feature_col = [
    col
    for col in df_cat.columns[12:]
    if col != 'mapped_rxn'
    and not any(marker in col for marker in excluded_markers)
    and not ('amine_' in col and col != 'amine_amount')
    and 'halide_' not in col
    and 'product_' not in col
]

In [36]:
# Echo the selected feature names as the cell output.
feature_col

['temperature',
 'metal_amount',
 'amine_amount',
 'base_amount',
 'ligand_amount',
 'reaction_volume',
 'solvent_1',
 'solvent_2',
 'solvent_MaxEStateIndex',
 'solvent_MinEStateIndex',
 'solvent_MaxAbsEStateIndex',
 'solvent_MinAbsEStateIndex',
 'solvent_qed',
 'solvent_MolWt',
 'solvent_HeavyAtomMolWt',
 'solvent_ExactMolWt',
 'solvent_NumValenceElectrons',
 'solvent_MaxPartialCharge',
 'solvent_MinPartialCharge',
 'solvent_MaxAbsPartialCharge',
 'solvent_MinAbsPartialCharge',
 'solvent_FpDensityMorgan1',
 'solvent_FpDensityMorgan2',
 'solvent_FpDensityMorgan3',
 'solvent_BCUT2D_MWHI',
 'solvent_BCUT2D_MWLOW',
 'solvent_BCUT2D_CHGHI',
 'solvent_BCUT2D_CHGLO',
 'solvent_BCUT2D_LOGPHI',
 'solvent_BCUT2D_LOGPLOW',
 'solvent_BCUT2D_MRHI',
 'solvent_BCUT2D_MRLOW',
 'solvent_BalabanJ',
 'solvent_BertzCT',
 'solvent_Chi0',
 'solvent_Chi0n',
 'solvent_Chi0v',
 'solvent_Chi1',
 'solvent_Chi1n',
 'solvent_Chi1v',
 'solvent_Chi2n',
 'solvent_Chi2v',
 'solvent_Chi3n',
 'solvent_Chi3v',
 'solvent

In [37]:
# Number of selected feature columns.
len(feature_col)

617

In [38]:
# Feature matrix: the selected columns of df_cat.
X = df_cat[feature_col]

In [39]:
# Repeated hold-out benchmark on the current X / y: 30 seeded 70/30 splits,
# one 100-tree random forest per split, scored on the held-out 30%.
rmse_list, r2_list, pearson_corr_list, spearman_corr_list = [], [], [], []

for seed in range(30):
    # The seed controls both the split and the forest, so every round is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    model = RandomForestRegressor(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE, R² and linear / rank correlation on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    pearson_corr, _ = pearsonr(y_test, y_pred)
    spearman_corr, _ = spearmanr(y_test, y_pred)

    # Record this round's scores in the per-metric histories.
    for scores, score in (
        (rmse_list, rmse),
        (r2_list, r2),
        (pearson_corr_list, pearson_corr),
        (spearman_corr_list, spearman_corr),
    ):
        scores.append(score)

    print(f'Seed {seed} -> RMSE: {rmse:.4f}, R²: {r2:.4f}, Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}')

# Aggregate over the 30 rounds (np.std with NumPy's default ddof=0).
average_r2, std_r2 = np.mean(r2_list), np.std(r2_list)
average_rmse, std_rmse = np.mean(rmse_list), np.std(rmse_list)
average_pearson, std_pearson = np.mean(pearson_corr_list), np.std(pearson_corr_list)
average_spearman, std_spearman = np.mean(spearman_corr_list), np.std(spearman_corr_list)

# Report mean and standard deviation for each metric.
print(f'\nR²_mean: {average_r2:.4f}')
print(f'R²_sd: {std_r2:.4f}')
print(f'RMSE_mean: {average_rmse:.4f}')
print(f'RMSE_sd: {std_rmse:.4f}')
print(f'Pearson_corr_mean: {average_pearson:.4f}')
print(f'Pearson_corr_sd: {std_pearson:.4f}')
print(f'Spearman_corr_mean: {average_spearman:.4f}')
print(f'Spearman_corr_sd: {std_spearman:.4f}')


Seed 0 -> RMSE: 23.8202, R²: 0.3430, Pearson: 0.6048, Spearman: 0.6062
Seed 1 -> RMSE: 24.4841, R²: 0.2634, Pearson: 0.5146, Spearman: 0.5060
Seed 2 -> RMSE: 25.8846, R²: 0.2457, Pearson: 0.5045, Spearman: 0.4874
Seed 3 -> RMSE: 23.4466, R²: 0.3326, Pearson: 0.5768, Spearman: 0.5659
Seed 4 -> RMSE: 26.7642, R²: 0.2715, Pearson: 0.5243, Spearman: 0.5348
Seed 5 -> RMSE: 24.0665, R²: 0.2934, Pearson: 0.5672, Spearman: 0.5648
Seed 6 -> RMSE: 23.9127, R²: 0.2891, Pearson: 0.5380, Spearman: 0.5280
Seed 7 -> RMSE: 23.4473, R²: 0.3135, Pearson: 0.5697, Spearman: 0.5696
Seed 8 -> RMSE: 23.4127, R²: 0.3059, Pearson: 0.5635, Spearman: 0.5600
Seed 9 -> RMSE: 23.8259, R²: 0.2764, Pearson: 0.5369, Spearman: 0.5414
Seed 10 -> RMSE: 25.3947, R²: 0.2261, Pearson: 0.5035, Spearman: 0.4918
Seed 11 -> RMSE: 26.3647, R²: 0.2437, Pearson: 0.4979, Spearman: 0.4913
Seed 12 -> RMSE: 24.8167, R²: 0.2767, Pearson: 0.5294, Spearman: 0.5344
Seed 13 -> RMSE: 26.1394, R²: 0.2595, Pearson: 0.5143, Spearman: 0.5120
Se

# ReactSeq_all_no_condition

In [40]:
# Representation-only run: from position 12 onward, keep just the columns whose
# name contains 'ReactSeq_all', then echo the selection.
feature_col = [col for col in df_cat.columns[12:] if 'ReactSeq_all' in col]
feature_col

['ReactSeq_all_0',
 'ReactSeq_all_1',
 'ReactSeq_all_2',
 'ReactSeq_all_3',
 'ReactSeq_all_4',
 'ReactSeq_all_5',
 'ReactSeq_all_6',
 'ReactSeq_all_7',
 'ReactSeq_all_8',
 'ReactSeq_all_9',
 'ReactSeq_all_10',
 'ReactSeq_all_11',
 'ReactSeq_all_12',
 'ReactSeq_all_13',
 'ReactSeq_all_14',
 'ReactSeq_all_15',
 'ReactSeq_all_16',
 'ReactSeq_all_17',
 'ReactSeq_all_18',
 'ReactSeq_all_19',
 'ReactSeq_all_20',
 'ReactSeq_all_21',
 'ReactSeq_all_22',
 'ReactSeq_all_23',
 'ReactSeq_all_24',
 'ReactSeq_all_25',
 'ReactSeq_all_26',
 'ReactSeq_all_27',
 'ReactSeq_all_28',
 'ReactSeq_all_29',
 'ReactSeq_all_30',
 'ReactSeq_all_31',
 'ReactSeq_all_32',
 'ReactSeq_all_33',
 'ReactSeq_all_34',
 'ReactSeq_all_35',
 'ReactSeq_all_36',
 'ReactSeq_all_37',
 'ReactSeq_all_38',
 'ReactSeq_all_39',
 'ReactSeq_all_40',
 'ReactSeq_all_41',
 'ReactSeq_all_42',
 'ReactSeq_all_43',
 'ReactSeq_all_44',
 'ReactSeq_all_45',
 'ReactSeq_all_46',
 'ReactSeq_all_47',
 'ReactSeq_all_48',
 'ReactSeq_all_49',
 'ReactSeq

In [42]:
# Number of selected feature columns.
len(feature_col)

256

In [43]:
# Feature matrix: the selected columns of df_cat.
X = df_cat[feature_col]

In [44]:
# Repeated hold-out benchmark on the current X / y: 30 seeded 70/30 splits,
# one 100-tree random forest per split, scored on the held-out 30%.
rmse_list, r2_list, pearson_corr_list, spearman_corr_list = [], [], [], []

for seed in range(30):
    # The seed controls both the split and the forest, so every round is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    model = RandomForestRegressor(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE, R² and linear / rank correlation on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    pearson_corr, _ = pearsonr(y_test, y_pred)
    spearman_corr, _ = spearmanr(y_test, y_pred)

    # Record this round's scores in the per-metric histories.
    for scores, score in (
        (rmse_list, rmse),
        (r2_list, r2),
        (pearson_corr_list, pearson_corr),
        (spearman_corr_list, spearman_corr),
    ):
        scores.append(score)

    print(f'Seed {seed} -> RMSE: {rmse:.4f}, R²: {r2:.4f}, Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}')

# Aggregate over the 30 rounds (np.std with NumPy's default ddof=0).
average_r2, std_r2 = np.mean(r2_list), np.std(r2_list)
average_rmse, std_rmse = np.mean(rmse_list), np.std(rmse_list)
average_pearson, std_pearson = np.mean(pearson_corr_list), np.std(pearson_corr_list)
average_spearman, std_spearman = np.mean(spearman_corr_list), np.std(spearman_corr_list)

# Report mean and standard deviation for each metric.
print(f'\nR²_mean: {average_r2:.4f}')
print(f'R²_sd: {std_r2:.4f}')
print(f'RMSE_mean: {average_rmse:.4f}')
print(f'RMSE_sd: {std_rmse:.4f}')
print(f'Pearson_corr_mean: {average_pearson:.4f}')
print(f'Pearson_corr_sd: {std_pearson:.4f}')
print(f'Spearman_corr_mean: {average_spearman:.4f}')
print(f'Spearman_corr_sd: {std_spearman:.4f}')


Seed 0 -> RMSE: 25.6554, R²: 0.2378, Pearson: 0.4943, Spearman: 0.4994
Seed 1 -> RMSE: 23.8926, R²: 0.2986, Pearson: 0.5494, Spearman: 0.5451
Seed 2 -> RMSE: 25.2240, R²: 0.2837, Pearson: 0.5380, Spearman: 0.5464
Seed 3 -> RMSE: 25.2083, R²: 0.2285, Pearson: 0.4820, Spearman: 0.4777
Seed 4 -> RMSE: 28.6408, R²: 0.1657, Pearson: 0.4256, Spearman: 0.4392
Seed 5 -> RMSE: 25.8547, R²: 0.1845, Pearson: 0.4569, Spearman: 0.4499
Seed 6 -> RMSE: 23.9666, R²: 0.2859, Pearson: 0.5349, Spearman: 0.5407
Seed 7 -> RMSE: 25.0514, R²: 0.2163, Pearson: 0.4853, Spearman: 0.4936
Seed 8 -> RMSE: 24.5106, R²: 0.2392, Pearson: 0.5006, Spearman: 0.5123
Seed 9 -> RMSE: 24.2069, R²: 0.2531, Pearson: 0.5173, Spearman: 0.5175
Seed 10 -> RMSE: 25.7013, R²: 0.2074, Pearson: 0.4708, Spearman: 0.4626
Seed 11 -> RMSE: 27.0373, R²: 0.2046, Pearson: 0.4533, Spearman: 0.4683
Seed 12 -> RMSE: 26.2337, R²: 0.1917, Pearson: 0.4477, Spearman: 0.4414
Seed 13 -> RMSE: 26.7526, R²: 0.2243, Pearson: 0.4782, Spearman: 0.4659
Se

# ReactSeq_MEO_no_condition

In [45]:
# Select every column whose name contains 'ReactSeq_MEO', scanning from
# positional column 12 onward.
# NOTE(review): the first 12 columns are presumably non-feature metadata — confirm.
feature_col = [col for col in df_cat.columns[12:] if 'ReactSeq_MEO' in col]
feature_col

['ReactSeq_MEO_0',
 'ReactSeq_MEO_1',
 'ReactSeq_MEO_2',
 'ReactSeq_MEO_3',
 'ReactSeq_MEO_4',
 'ReactSeq_MEO_5',
 'ReactSeq_MEO_6',
 'ReactSeq_MEO_7',
 'ReactSeq_MEO_8',
 'ReactSeq_MEO_9',
 'ReactSeq_MEO_10',
 'ReactSeq_MEO_11',
 'ReactSeq_MEO_12',
 'ReactSeq_MEO_13',
 'ReactSeq_MEO_14',
 'ReactSeq_MEO_15',
 'ReactSeq_MEO_16',
 'ReactSeq_MEO_17',
 'ReactSeq_MEO_18',
 'ReactSeq_MEO_19',
 'ReactSeq_MEO_20',
 'ReactSeq_MEO_21',
 'ReactSeq_MEO_22',
 'ReactSeq_MEO_23',
 'ReactSeq_MEO_24',
 'ReactSeq_MEO_25',
 'ReactSeq_MEO_26',
 'ReactSeq_MEO_27',
 'ReactSeq_MEO_28',
 'ReactSeq_MEO_29',
 'ReactSeq_MEO_30',
 'ReactSeq_MEO_31',
 'ReactSeq_MEO_32',
 'ReactSeq_MEO_33',
 'ReactSeq_MEO_34',
 'ReactSeq_MEO_35',
 'ReactSeq_MEO_36',
 'ReactSeq_MEO_37',
 'ReactSeq_MEO_38',
 'ReactSeq_MEO_39',
 'ReactSeq_MEO_40',
 'ReactSeq_MEO_41',
 'ReactSeq_MEO_42',
 'ReactSeq_MEO_43',
 'ReactSeq_MEO_44',
 'ReactSeq_MEO_45',
 'ReactSeq_MEO_46',
 'ReactSeq_MEO_47',
 'ReactSeq_MEO_48',
 'ReactSeq_MEO_49',
 'ReactSeq

In [47]:
# Number of selected feature columns (sanity check on the column filter).
len(feature_col)

256

In [48]:
# Feature matrix: all rows of df_cat, restricted to the selected columns.
X = df_cat.loc[:, feature_col]

In [49]:
# Repeated hold-out evaluation of a random forest on the current X / y:
# 30 random 70/30 splits (one per seed), per-seed metrics, then mean and
# standard deviation of each metric across seeds.
rmse_list, r2_list = [], []
pearson_corr_list, spearman_corr_list = [], []

for seed in range(30):
    # The same seed drives both the split and the forest.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    # fit() returns the estimator itself, so construction and training chain.
    model = RandomForestRegressor(n_estimators=100, random_state=seed).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Held-out error, goodness of fit, and linear / rank correlation
    # (the p-values returned by pearsonr / spearmanr are discarded).
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    pearson_corr, _ = pearsonr(y_test, y_pred)
    spearman_corr, _ = spearmanr(y_test, y_pred)

    rmse_list.append(rmse)
    r2_list.append(r2)
    pearson_corr_list.append(pearson_corr)
    spearman_corr_list.append(spearman_corr)

    print(f'Seed {seed} -> RMSE: {rmse:.4f}, R²: {r2:.4f}, Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}')

# Aggregate across seeds; np.std is called with its default arguments.
average_r2, std_r2 = np.mean(r2_list), np.std(r2_list)
average_rmse, std_rmse = np.mean(rmse_list), np.std(rmse_list)
average_pearson, std_pearson = np.mean(pearson_corr_list), np.std(pearson_corr_list)
average_spearman, std_spearman = np.mean(spearman_corr_list), np.std(spearman_corr_list)

# Summary report
print(f'\nR²_mean: {average_r2:.4f}')
print(f'R²_sd: {std_r2:.4f}')
print(f'RMSE_mean: {average_rmse:.4f}')
print(f'RMSE_sd: {std_rmse:.4f}')
print(f'Pearson_corr_mean: {average_pearson:.4f}')
print(f'Pearson_corr_sd: {std_pearson:.4f}')
print(f'Spearman_corr_mean: {average_spearman:.4f}')
print(f'Spearman_corr_sd: {std_spearman:.4f}')


Seed 0 -> RMSE: 25.3210, R²: 0.2576, Pearson: 0.5159, Spearman: 0.5252
Seed 1 -> RMSE: 23.3457, R²: 0.3303, Pearson: 0.5799, Spearman: 0.5834
Seed 2 -> RMSE: 25.5529, R²: 0.2649, Pearson: 0.5170, Spearman: 0.5172
Seed 3 -> RMSE: 25.7054, R²: 0.1978, Pearson: 0.4542, Spearman: 0.4504
Seed 4 -> RMSE: 27.9617, R²: 0.2048, Pearson: 0.4579, Spearman: 0.4554
Seed 5 -> RMSE: 25.5339, R²: 0.2046, Pearson: 0.4828, Spearman: 0.4845
Seed 6 -> RMSE: 23.5990, R²: 0.3076, Pearson: 0.5549, Spearman: 0.5468
Seed 7 -> RMSE: 24.8489, R²: 0.2289, Pearson: 0.4935, Spearman: 0.4844
Seed 8 -> RMSE: 24.8353, R²: 0.2189, Pearson: 0.4798, Spearman: 0.4865
Seed 9 -> RMSE: 24.7526, R²: 0.2191, Pearson: 0.4870, Spearman: 0.4983
Seed 10 -> RMSE: 25.6535, R²: 0.2103, Pearson: 0.4796, Spearman: 0.4602
Seed 11 -> RMSE: 27.4522, R²: 0.1800, Pearson: 0.4253, Spearman: 0.4258
Seed 12 -> RMSE: 25.6472, R²: 0.2275, Pearson: 0.4793, Spearman: 0.4815
Seed 13 -> RMSE: 27.2704, R²: 0.1940, Pearson: 0.4480, Spearman: 0.4261
Se