In [57]:
import pandas as pd
import numpy as np
from gplearn.genetic import SymbolicRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt
import re

# Load the dataset from the CSV file into a DataFrame.
file_path = "20240806_correlation_copy.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [58]:
# Min-max scale every column of `df`; the scaler is fitted on the full table.
scaler = MinMaxScaler()
scaler.fit(df)
normalized_data = scaler.transform(df)

# Wrap the scaled values in a DataFrame that carries the original column labels.
normalized_df = pd.DataFrame(data=normalized_data, columns=df.columns)

In [59]:
# Symbolic-regression run 1: fit reduction_S0 from the normalized descriptors,
# then report the fit quality on the original kcal/mol scale.

# Raw-scale min/max of the target, used below to undo the min-max normalization.
y_min, y_max = df["reduction_S0 (kcal/mol)"].min(), df["reduction_S0 (kcal/mol)"].max()

# Normalized input descriptors (features) and normalized target.
X = normalized_df[["homo (eV)", "vertical_excitation_energy (eV)", "dipole_moment_norm_S0 (D)",
    "dipole_moment_norm_S1 (D)", "dipole_moment_norm_T1 (D)","0-0_S1 (eV)", "0-0_T1 (eV)",
    "adiabatic_S1-S0 (kcal/mol)", "adiabatic_T1-S0 (kcal/mol)", "oxidation_S0 (kcal/mol)",
    "lumo-homo (eV)"]]
y = normalized_df["reduction_S0 (kcal/mol)"]

est = SymbolicRegressor(population_size=4000,
                        generations=100,
                        stopping_criteria=0.01,
                        function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv'],
                        metric='mean absolute error',
                        parsimony_coefficient=0.0001,
                        verbose=1,
                        random_state=0)

est.fit(X, y)

# Print the evolved expression. A successful fit() always sets `_program`,
# so no hasattr() guard is needed here.
print(est._program)

# Predict on the same rows used for fitting and map both prediction and target
# back from the normalized scale to kcal/mol.
y_pred_normalized = est.predict(X)
y_pred = y_pred_normalized * (y_max - y_min) + y_min
y_actual = y * (y_max - y_min) + y_min

# NOTE: these are in-sample metrics (computed on the training rows).
r2 = r2_score(y_actual, y_pred)
mae = MAE(y_actual, y_pred)
rmse = np.sqrt(MSE(y_actual, y_pred))

# Parity plot: the black diagonal marks perfect agreement.
fig, ax = plt.subplots()
ax.plot([y_min, y_max], [y_min, y_max], 'k-', lw=2)
ax.scatter(y_actual, y_pred, color='red', alpha=0.5)
ax.set_xlabel('Actual values of reduction_S0 (kcal/mol)')
ax.set_ylabel('Predicted values of reduction_S0 (kcal/mol)')
ax.set_title('Parity Plot of reduction_S0 (kcal/mol)')

# Annotate the plot with R^2, MAE and RMSE in axes-relative coordinates.
ax.text(x=0.05, y=0.95, s=f'$R^2 = {r2:.4f}$', fontsize=12, ha='left', va='top', transform=ax.transAxes)
ax.text(x=0.05, y=0.90, s=f'MAE = {mae:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=ax.transAxes)
ax.text(x=0.05, y=0.85, s=f'RMSE = {rmse:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure to an image file and release it.
fig.savefig('GPlearn Reduction1.png')
plt.close(fig)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.11          6.12692        5        0.0847453              N/A      5.90m
   1     4.71         0.434184       10        0.0743309              N/A      6.49m
   2     4.04          0.28495       10        0.0743309              N/A      5.30m
   3     4.46          0.30454       13        0.0666559              N/A      5.46m
   4     6.47         0.353771       13        0.0666559              N/A      5.44m
   5     7.94         0.372909       13        0.0666559              N/A      5.84m
   6     8.44         0.382492       20        0.0628279              N/A      6.02m
   7     9.51         0.432883       22        0.0628279              N/A      5.76m
   8    11.13         0.513108       24        0.0600506              N/A  

In [61]:
# Symbolic-regression run 2: same features and target as before, but with
# explicit genetic-operator probabilities and max_samples=0.9.

# Raw-scale bounds of the target, needed to undo the min-max normalization.
y_min = df["reduction_S0 (kcal/mol)"].min()
y_max = df["reduction_S0 (kcal/mol)"].max()

# Normalized input descriptors.
X = normalized_df[[
    "homo (eV)",
    "vertical_excitation_energy (eV)",
    "dipole_moment_norm_S0 (D)",
    "dipole_moment_norm_S1 (D)",
    "dipole_moment_norm_T1 (D)",
    "0-0_S1 (eV)",
    "0-0_T1 (eV)",
    "adiabatic_S1-S0 (kcal/mol)",
    "adiabatic_T1-S0 (kcal/mol)",
    "oxidation_S0 (kcal/mol)",
    "lumo-homo (eV)",
]]
# Normalized target.
y = normalized_df["reduction_S0 (kcal/mol)"]

est = SymbolicRegressor(
    population_size=3000,
    generations=100,
    stopping_criteria=0.01,
    function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv'],
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.05,
    p_point_mutation=0.1,
    max_samples=0.9,
    verbose=1,
    parsimony_coefficient=0.0001,
    random_state=0,
)

est.fit(X, y)

# Print the evolved expression.
print(est._program)

# Predict on the fitted rows, then rescale prediction and target to kcal/mol.
y_pred_normalized = est.predict(X)
y_pred = y_min + y_pred_normalized * (y_max - y_min)
y_actual = y_min + y * (y_max - y_min)

r2 = r2_score(y_actual, y_pred)
mae = MAE(y_actual, y_pred)
rmse = np.sqrt(MSE(y_actual, y_pred))

# Parity plot: black diagonal = perfect agreement, red points = predictions.
fig, ax = plt.subplots()
ax.plot([y_min, y_max], [y_min, y_max], 'k-', lw=2)
ax.scatter(y_actual, y_pred, color='red', alpha=0.5)
ax.set_xlabel('Actual values of reduction_S0 (kcal/mol)')
ax.set_ylabel('Predicted values of reduction_S0 (kcal/mol)')
ax.set_title('Parity Plot of reduction_S0 (kcal/mol)')

# Add R^2, MAE and RMSE to the plot, stacked from the top-left corner.
for y_pos, label in (
    (0.95, f'$R^2 = {r2:.4f}$'),
    (0.90, f'MAE = {mae:.4f} (kcal/mol)'),
    (0.85, f'RMSE = {rmse:.4f} (kcal/mol)'),
):
    ax.text(x=0.05, y=y_pos, s=label, fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure to an image file and release it.
fig.savefig('GPlearn Reduction2.png')
plt.close(fig)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.13          3.85766        5        0.0847938        0.0843108      4.87m
   1     4.60         0.485361        7        0.0767345        0.0825015      4.84m
   2     3.82         0.452366        7        0.0768097        0.0818269      4.43m
   3     4.19         0.337785        6        0.0700556        0.0706212      4.37m
   4     5.70         0.406396        6        0.0697182        0.0736453      4.32m
   5     6.57         0.660627        6        0.0695054        0.0755532      4.46m
   6     6.87         0.325383        6        0.0692938        0.0774495      4.49m
   7     6.49          0.47867       18        0.0612806        0.0545289      4.48m
   8     6.11          0.28754       18        0.0600684        0.0653956  