In [43]:
import pandas as pd
import numpy as np
from gplearn.genetic import SymbolicRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt
import re

# Load the dataset from the CSV file into a DataFrame.
file_path = '20240806_correlation_copy.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [44]:
# Min-max scale every column of `df`; the fitted scaler is kept so the
# per-column ranges remain available afterwards.
scaler = MinMaxScaler().fit(df)
normalized_data = scaler.transform(df)

# Re-attach the original column names to the scaled array.
normalized_df = pd.DataFrame(data=normalized_data, columns=df.columns)

In [45]:
# Symbolic regression of the oxidation target on the min-max-scaled
# descriptors, followed by a parity plot in the original units.

# The target column is named once so that data selection, inverse scaling and
# plot labels cannot drift apart.
TARGET_COL = "oxidation_S0 (kcal/mol)"
FEATURE_COLS = [
    "lumo (eV)", "vertical_excitation_energy (eV)", "dipole_moment_norm_S0 (D)",
    "dipole_moment_norm_S1 (D)", "dipole_moment_norm_T1 (D)", "0-0_S1 (eV)", "0-0_T1 (eV)",
    "adiabatic_S1-S0 (kcal/mol)", "adiabatic_T1-S0 (kcal/mol)", "reduction_S0 (kcal/mol)",
    "lumo-homo (eV)",
]

# Range of the target in the unscaled data; used below to undo the scaling.
y_min, y_max = df[TARGET_COL].min(), df[TARGET_COL].max()

X = normalized_df[FEATURE_COLS]
y = normalized_df[TARGET_COL]

est = SymbolicRegressor(population_size=1000,
                        generations=100,
                        stopping_criteria=0.01,
                        function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv'],
                        metric='mean absolute error',
                        parsimony_coefficient=0.0001,
                        verbose=1,
                        random_state=0)

est.fit(X, y)

# Print the evolved expression. No hasattr() guard is needed: this line is
# only reached after fit() has returned successfully.
print(est._program)

# Map the scaled predictions and targets back to kcal/mol.
y_pred_normalized = est.predict(X)
y_pred = y_pred_normalized * (y_max - y_min) + y_min
y_actual = y * (y_max - y_min) + y_min

# NOTE: these are in-sample metrics -- predictions are made on the same rows
# the model was fitted on.
r2 = r2_score(y_actual, y_pred)
mae = MAE(y_actual, y_pred)
rmse = np.sqrt(MSE(y_actual, y_pred))

fig, ax = plt.subplots()
# Diagonal = perfect prediction.
ax.plot([y_min, y_max], [y_min, y_max], 'k-', lw=2)
ax.scatter(y_actual, y_pred, color='red', alpha=0.5)
ax.set_xlabel(f'Actual values of {TARGET_COL}')
ax.set_ylabel(f'Predicted values of {TARGET_COL}')
ax.set_title(f'Parity Plot of {TARGET_COL}')

# Annotate the plot with R^2, MAE and RMSE, stacked in the top-left corner
# (positions are in axes coordinates).
metric_lines = [
    f'$R^2 = {r2:.4f}$',
    f'MAE = {mae:.4f} (kcal/mol)',
    f'RMSE = {rmse:.4f} (kcal/mol)',
]
for text_y, text_line in zip((0.95, 0.90, 0.85), metric_lines):
    ax.text(x=0.05, y=text_y, s=text_line, fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure to an image file and release it.
fig.savefig('GPlearn Oxidation1.png')
plt.close(fig)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.62          4.88613        5        0.0691603              N/A      1.37m
   1     4.54         0.435342        8        0.0640677              N/A      1.22m
   2     3.16         0.241238        8        0.0640677              N/A      1.18m
   3     3.94         0.146635        8        0.0640677              N/A      1.27m
   4     4.98         0.236262        8        0.0640677              N/A      1.19m
   5     6.81         0.462192       16        0.0547186              N/A      1.25m
   6     8.32         0.638486       23        0.0502454              N/A      1.25m
   7    10.77         0.436538       29        0.0488244              N/A      1.28m
   8    16.68         0.404505       39        0.0420372              N/A  

In [46]:
# Second symbolic-regression run on the same descriptors and target, this time
# with explicit genetic-operator probabilities and max_samples=0.9, followed by
# a parity plot in the original units.

# The target column is named once so that data selection, inverse scaling and
# plot labels cannot drift apart.
TARGET_COL = "oxidation_S0 (kcal/mol)"
FEATURE_COLS = [
    "lumo (eV)", "vertical_excitation_energy (eV)", "dipole_moment_norm_S0 (D)",
    "dipole_moment_norm_S1 (D)", "dipole_moment_norm_T1 (D)", "0-0_S1 (eV)", "0-0_T1 (eV)",
    "adiabatic_S1-S0 (kcal/mol)", "adiabatic_T1-S0 (kcal/mol)", "reduction_S0 (kcal/mol)",
    "lumo-homo (eV)",
]

# Range of the target in the unscaled data; used below to undo the scaling.
y_min, y_max = df[TARGET_COL].min(), df[TARGET_COL].max()

X = normalized_df[FEATURE_COLS]
y = normalized_df[TARGET_COL]

est = SymbolicRegressor(population_size=1000,
                        generations=100, stopping_criteria=0.01,
                        function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv'],
                        p_crossover=0.7, p_subtree_mutation=0.1,
                        p_hoist_mutation=0.05, p_point_mutation=0.1,
                        max_samples=0.9, verbose=1,
                        parsimony_coefficient=0.0001, random_state=0)

est.fit(X, y)

# Print the evolved expression.
print(est._program)

# Map the scaled predictions and targets back to kcal/mol.
y_pred_normalized = est.predict(X)
y_pred = y_pred_normalized * (y_max - y_min) + y_min
y_actual = y * (y_max - y_min) + y_min

# NOTE: metrics are computed on the full X that was passed to fit(), not on a
# separate hold-out set.
r2 = r2_score(y_actual, y_pred)
mae = MAE(y_actual, y_pred)
rmse = np.sqrt(MSE(y_actual, y_pred))

fig, ax = plt.subplots()
# Diagonal = perfect prediction.
ax.plot([y_min, y_max], [y_min, y_max], 'k-', lw=2)
ax.scatter(y_actual, y_pred, color='red', alpha=0.5)
ax.set_xlabel(f'Actual values of {TARGET_COL}')
ax.set_ylabel(f'Predicted values of {TARGET_COL}')
ax.set_title(f'Parity Plot of {TARGET_COL}')

# Annotate the plot with R^2, MAE and RMSE, stacked in the top-left corner
# (positions are in axes coordinates).
metric_lines = [
    f'$R^2 = {r2:.4f}$',
    f'MAE = {mae:.4f} (kcal/mol)',
    f'RMSE = {rmse:.4f} (kcal/mol)',
]
for text_y, text_line in zip((0.95, 0.90, 0.85), metric_lines):
    ax.text(x=0.05, y=text_y, s=text_line, fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure to an image file and release it.
fig.savefig('GPlearn Oxidation2.png')
plt.close(fig)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.62          4.86289        5        0.0687361        0.0729628      1.87m
   1     4.70         0.482565        4        0.0646815        0.0652773      1.47m
   2     3.50         0.289439        4        0.0643656        0.0681084      1.47m
   3     3.90          0.25895        5        0.0626351        0.0667593      1.39m
   4     4.17         0.233173        5        0.0628719         0.064636      1.32m
   5     4.40         0.171495        6         0.062105        0.0715107      1.36m
   6     5.00         0.295747        5        0.0621476        0.0711292      1.53m
   7     5.00         0.176365        6        0.0620817        0.0583975      1.31m
   8     5.01         0.169715        6         0.061347        0.0649836  