In [76]:
import pandas as pd
import numpy as np
from gplearn.genetic import SymbolicRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt
import re

# Load the dataset from the CSV file into a DataFrame.
file_path = '20240806_correlation_copy.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [77]:
# Learn per-column minima/maxima from df, then rescale those same columns.
# MinMaxScaler is used with its default settings here.
scaler = MinMaxScaler().fit(df)
normalized_data = scaler.transform(df)

# Wrap the scaled array in a DataFrame that reuses df's column labels.
normalized_df = pd.DataFrame(data=normalized_data, columns=df.columns)

In [78]:
# Experiment 1: fit a gplearn SymbolicRegressor on the normalised descriptors,
# print the evolved expression, and save a parity plot with R^2 / MAE / RMSE
# expressed in the target's original units.
target_col = "reduction_S0 (kcal/mol)"
feature_cols = [
    "homo (eV)", "vertical_excitation_energy (eV)", "dipole_moment_norm_S0 (D)",
    "dipole_moment_norm_S1 (D)", "dipole_moment_norm_T1 (D)", "0-0_S1 (eV)", "0-0_T1 (eV)",
    "adiabatic_S1-S0 (kcal/mol)", "adiabatic_T1-S0 (kcal/mol)", "oxidation_S0 (kcal/mol)",
    "lumo-homo (eV)",
]

# Range of the un-scaled target; used below to map values back from the
# normalised scale (assumes normalized_df is the default 0-1 min-max scaling of df).
y_min, y_max = df[target_col].min(), df[target_col].max()

X = normalized_df[feature_cols]
y = normalized_df[target_col]

est = SymbolicRegressor(population_size=1000,
                        generations=100,
                        stopping_criteria=0.01,
                        function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv'],
                        metric='mean absolute error',
                        parsimony_coefficient=0.0001,
                        verbose=1,
                        random_state=0)

est.fit(X, y)

# Print the evolved expression. A successful fit() always leaves the best
# program on the estimator, so no hasattr() guard is needed here.
print(est._program)

# Undo the min-max scaling so the metrics are reported in kcal/mol.
# NOTE: predictions are made on the same X the model was fitted on, so the
# scores below are in-sample, not a held-out estimate.
y_pred_normalized = est.predict(X)
y_pred = y_pred_normalized * (y_max - y_min) + y_min
y_actual = y * (y_max - y_min) + y_min

r2 = r2_score(y_actual, y_pred)
mae = MAE(y_actual, y_pred)
rmse = np.sqrt(MSE(y_actual, y_pred))

# Parity plot: the black diagonal marks perfect agreement.
fig, ax = plt.subplots()
ax.plot([y_min, y_max], [y_min, y_max], 'k-', lw=2)
ax.scatter(y_actual, y_pred, color='red', alpha=0.5)
ax.set_xlabel(f'Actual values of {target_col}')
ax.set_ylabel(f'Predicted values of {target_col}')
ax.set_title(f'Parity Plot of {target_col}')

# Annotate the scores in axes-fraction coordinates (top-left corner).
ax.text(x=0.05, y=0.95, s=f'$R^2 = {r2:.4f}$', fontsize=12, ha='left', va='top', transform=ax.transAxes)
ax.text(x=0.05, y=0.90, s=f'MAE = {mae:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=ax.transAxes)
ax.text(x=0.05, y=0.85, s=f'RMSE = {rmse:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure to disk, then close it to release the figure's memory.
fig.savefig('GPlearn Reduction1.png')
plt.close(fig)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.62          5.13189        3         0.109358              N/A      2.01m
   1     4.38         0.432124        8         0.102453              N/A      1.51m
   2     3.18          0.22198        8         0.102453              N/A      1.32m
   3     3.42         0.184312        5         0.100677              N/A      1.44m
   4     6.24         0.237448       15          0.08306              N/A      1.26m
   5     8.11         0.230324       15          0.08306              N/A      1.40m
   6     9.36         0.204871       15        0.0777261              N/A      1.13m
   7    13.27         0.176691       17        0.0722877              N/A      1.80m
   8    15.67         0.170696       17        0.0712299              N/A  

In [81]:
# Experiment 2: same features and target as the previous run, but with a larger
# population, explicit genetic-operator probabilities, and max_samples=0.9.
target_series = df["reduction_S0 (kcal/mol)"]
y_min = target_series.min()
y_max = target_series.max()

descriptor_names = [
    "homo (eV)",
    "vertical_excitation_energy (eV)",
    "dipole_moment_norm_S0 (D)",
    "dipole_moment_norm_S1 (D)",
    "dipole_moment_norm_T1 (D)",
    "0-0_S1 (eV)",
    "0-0_T1 (eV)",
    "adiabatic_S1-S0 (kcal/mol)",
    "adiabatic_T1-S0 (kcal/mol)",
    "oxidation_S0 (kcal/mol)",
    "lumo-homo (eV)",
]
X = normalized_df[descriptor_names]
y = normalized_df["reduction_S0 (kcal/mol)"]

est = SymbolicRegressor(
    population_size=2000,
    generations=100,
    stopping_criteria=0.01,
    function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv'],
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.05,
    p_point_mutation=0.1,
    max_samples=0.9,
    verbose=1,
    parsimony_coefficient=0.0001,
    random_state=0,
)
est.fit(X, y)

# Print the evolved expression.
print(est._program)

# Map predictions and targets from the normalised scale back to kcal/mol.
y_span = y_max - y_min
y_pred_normalized = est.predict(X)
y_pred = y_pred_normalized * y_span + y_min
y_actual = y * y_span + y_min

# Scores are computed on the same rows that were passed to fit().
r2 = r2_score(y_actual, y_pred)
mae = MAE(y_actual, y_pred)
rmse = np.sqrt(MSE(y_actual, y_pred))

# Parity plot: diagonal reference line plus actual-vs-predicted scatter.
fig, ax = plt.subplots()
ax.plot([y_min, y_max], [y_min, y_max], 'k-', lw=2)
ax.scatter(y_actual, y_pred, color='red', alpha=0.5)
ax.set_xlabel('Actual values of reduction_S0 (kcal/mol)')
ax.set_ylabel('Predicted values of reduction_S0 (kcal/mol)')
ax.set_title('Parity Plot of reduction_S0 (kcal/mol)')

# Add R^2, MAE and RMSE to the plot, stacked from the top-left corner downwards.
score_labels = [
    f'$R^2 = {r2:.4f}$',
    f'MAE = {mae:.4f} (kcal/mol)',
    f'RMSE = {rmse:.4f} (kcal/mol)',
]
for y_pos, label in zip((0.95, 0.90, 0.85), score_labels):
    ax.text(x=0.05, y=y_pos, s=label, fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure as an image file, then close it.
fig.savefig('GPlearn Reduction2.png')
plt.close(fig)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.15          3.73027        3         0.105092         0.112811      3.71m
   1     4.58          0.45382        8        0.0837923        0.0892862      3.03m
   2     4.00         0.337527        7        0.0836444        0.0906112      3.52m
   3     4.50         0.343613        9        0.0777149         0.077287      3.34m
   4     6.82         0.232589       10        0.0583384        0.0558641      2.70m
   5     7.75         0.197281       10        0.0578475        0.0602647      3.22m
   6     8.29         0.224602       10        0.0572159        0.0659265      3.25m
   7     8.90         0.356308       11        0.0572149        0.0659356      3.50m
   8     9.22         0.449017        9        0.0568182        0.0694912  