In [57]:
import pandas as pd
import numpy as np
from gplearn.genetic import SymbolicRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt
import re

# Load the dataset from the CSV file into a DataFrame.
file_path = '20240806_correlation_copy.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [58]:
# Min-max scale every column of the raw frame (MinMaxScaler's default output
# range is [0, 1]). The fitted scaler stays available as `scaler`.
scaler = MinMaxScaler()
scaler.fit(df)
normalized_data = scaler.transform(df)

# The scaler returns a bare array, so re-attach the original column labels.
normalized_df = pd.DataFrame(data=normalized_data, columns=df.columns)

In [59]:
# First symbolic-regression run: fit a gplearn SymbolicRegressor that predicts
# the min-max scaled oxidation_S0 column from the other scaled descriptors,
# then report the fit in the original units (kcal/mol) with a parity plot.

# Range of the unscaled target; used below to undo the min-max scaling.
y_min, y_max = df["oxidation_S0 (kcal/mol)"].min(), df["oxidation_S0 (kcal/mol)"].max()

X = normalized_df[["lumo (eV)", "vertical_excitation_energy (eV)", "dipole_moment_norm_S0 (D)",
    "dipole_moment_norm_S1 (D)", "dipole_moment_norm_T1 (D)","0-0_S1 (eV)", "0-0_T1 (eV)",
    "adiabatic_S1-S0 (kcal/mol)", "adiabatic_T1-S0 (kcal/mol)", "reduction_S0 (kcal/mol)",
    "lumo-homo (eV)"]]
y = normalized_df["oxidation_S0 (kcal/mol)"]

est = SymbolicRegressor(population_size=5000,
                        generations=500,
                        stopping_criteria=0.01,
                        function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv'],
                        metric='mean absolute error',
                        parsimony_coefficient=0.01,
                        verbose=1,
                        random_state=0)

est.fit(X, y)

# Print the evolved expression. A fit() call that returns has stored the best
# program in `_program`, so it is printed directly without a hasattr guard.
print(est._program)

# Map predictions and targets from the scaled space back to kcal/mol.
y_pred_normalized = est.predict(X)
y_pred = y_pred_normalized * (y_max - y_min) + y_min
y_actual = y * (y_max - y_min) + y_min

# NOTE: these scores are in-sample; predict() is called on the same X that
# was passed to fit().
r2 = r2_score(y_actual, y_pred)
mae = MAE(y_actual, y_pred)
rmse = np.sqrt(MSE(y_actual, y_pred))

# Parity plot: the black diagonal marks perfect agreement.
fig, ax = plt.subplots()
ax.plot([y_min, y_max], [y_min, y_max], 'k-', lw=2)
ax.scatter(y_actual, y_pred, color='red', alpha=0.5)
ax.set_xlabel('Actual values of oxidation_S0 (kcal/mol)')
ax.set_ylabel('Predicted values of oxidation_S0 (kcal/mol)')
ax.set_title('Parity Plot of oxidation_S0 (kcal/mol)')

# Add R^2, MAE and RMSE to the plot, positioned in axes-fraction coordinates.
ax.text(x=0.05, y=0.95, s=f'$R^2 = {r2:.4f}$', fontsize=12, ha='left', va='top', transform=ax.transAxes)
ax.text(x=0.05, y=0.90, s=f'MAE = {mae:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=ax.transAxes)
ax.text(x=0.05, y=0.85, s=f'RMSE = {rmse:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the plot as an image file, then close the figure to release it.
fig.savefig('GPlearn Oxidation1.png')
plt.close(fig)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.17          5.03878       10        0.0691503              N/A     23.22m
   1     3.41         0.390556        5        0.0645547              N/A     23.69m
   2     1.94          0.25331        5        0.0645547              N/A     23.19m
   3     1.06        0.0848126        1        0.0728007              N/A     22.62m
   4     1.03         0.082966        1        0.0728007              N/A     22.39m
   5     1.02        0.0823882        1        0.0728007              N/A     22.31m
   6     1.04        0.0800887        1        0.0728007              N/A     22.33m
   7     1.02          0.08874        1        0.0728007              N/A     22.32m
   8     1.03        0.0785291        1        0.0728007              N/A  

In [60]:
# Second symbolic-regression run on the same descriptors and target. The
# genetic-operator probabilities are set explicitly and max_samples=0.9 is
# passed, so the training log also reports an out-of-bag fitness.

# Unscaled target column; its range maps predictions back to kcal/mol.
target_raw = df["oxidation_S0 (kcal/mol)"]
y_min, y_max = target_raw.min(), target_raw.max()

# Scaled descriptor columns used as model inputs.
feature_cols = [
    "lumo (eV)",
    "vertical_excitation_energy (eV)",
    "dipole_moment_norm_S0 (D)",
    "dipole_moment_norm_S1 (D)",
    "dipole_moment_norm_T1 (D)",
    "0-0_S1 (eV)",
    "0-0_T1 (eV)",
    "adiabatic_S1-S0 (kcal/mol)",
    "adiabatic_T1-S0 (kcal/mol)",
    "reduction_S0 (kcal/mol)",
    "lumo-homo (eV)",
]
X = normalized_df[feature_cols]
y = normalized_df["oxidation_S0 (kcal/mol)"]

# Hyper-parameters for this run, gathered in one place.
gp_params = dict(
    population_size=5000,
    generations=500,
    stopping_criteria=0.01,
    function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv'],
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.05,
    p_point_mutation=0.1,
    max_samples=0.9,
    verbose=1,
    parsimony_coefficient=0.01,
    random_state=0,
)
est = SymbolicRegressor(**gp_params)

est.fit(X, y)

# Print the evolved expression.
print(est._program)

# Undo the min-max scaling on both the predictions and the targets.
y_pred_normalized = est.predict(X)
y_pred = y_pred_normalized * (y_max - y_min) + y_min
y_actual = y * (y_max - y_min) + y_min

# Scores in original units, computed on the same rows used for fitting.
r2 = r2_score(y_actual, y_pred)
mae = MAE(y_actual, y_pred)
rmse = np.sqrt(MSE(y_actual, y_pred))

# Parity plot: diagonal reference line first, then the predicted-vs-actual points.
fig, ax = plt.subplots()
ax.plot([y_min, y_max], [y_min, y_max], 'k-', lw=2)
ax.scatter(y_actual, y_pred, color='red', alpha=0.5)
ax.set_xlabel('Actual values of oxidation_S0 (kcal/mol)')
ax.set_ylabel('Predicted values of oxidation_S0 (kcal/mol)')
ax.set_title('Parity Plot of oxidation_S0 (kcal/mol)')

# Add R^2, MAE and RMSE to the plot, stacked downward from the top-left corner
# in axes-fraction coordinates.
annotations = [
    (0.95, f'$R^2 = {r2:.4f}$'),
    (0.90, f'MAE = {mae:.4f} (kcal/mol)'),
    (0.85, f'RMSE = {rmse:.4f} (kcal/mol)'),
]
for y_pos, label in annotations:
    ax.text(x=0.05, y=y_pos, s=label, fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the plot as an image file, then close the figure.
fig.savefig('GPlearn Oxidation2.png')
plt.close(fig)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.17          5.12147        5        0.0687361        0.0729628     32.14m
   1     3.52         0.461642        5        0.0649615        0.0609079     29.76m
   2     2.21         0.404715        5        0.0617434        0.0659495     28.37m
   3     1.35         0.420042        1        0.0714397        0.0850009     27.59m
   4     1.28         0.199411        1        0.0713773        0.0855606     27.68m
   5     1.35          0.21983        6        0.0658254         0.075133     27.60m
   6     1.29         0.261393        1        0.0714574        0.0848418     27.33m
   7     1.29         0.208137        1        0.0712259        0.0869173     27.28m
   8     1.30          0.20673        4        0.0713202        0.0621942  