In [17]:
import pandas as pd
import numpy as np
from gplearn.genetic import SymbolicRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt
import re

# Load the dataset from the CSV file into a DataFrame.
# The path is relative, so the file is resolved against the kernel's working directory.
file_path = '20240806_correlation_copy.csv'
df = pd.read_csv(file_path)

In [18]:
# Min-max scale every column of `df` (features and target alike) with the
# scaler's default settings, then wrap the resulting array back into a
# DataFrame that carries the original column labels.
scaler = MinMaxScaler()
normalized_data = scaler.fit(df).transform(df)

normalized_df = pd.DataFrame(data=normalized_data, columns=df.columns)

In [19]:
# Symbolic regression of the adiabatic S1-S0 energy, run 1: gplearn with its
# default genetic-operator probabilities. Fits on the min-max scaled data,
# maps predictions back to kcal/mol and saves a parity plot.

# Column names used throughout this cell: the regression target and the inputs.
target_col = "adiabatic_S1-S0 (kcal/mol)"
feature_cols = [
    "homo (eV)", "lumo (eV)", "vertical_excitation_energy (eV)",
    "dipole_moment_norm_S0 (D)", "dipole_moment_norm_S1 (D)", "dipole_moment_norm_T1 (D)",
    "0-0_T1 (eV)", "reduction_S0 (kcal/mol)", "adiabatic_T1-S0 (kcal/mol)",
    "oxidation_S0 (kcal/mol)", "lumo-homo (eV)",
]

# Target range in original units; needed to undo the min-max scaling below.
y_min, y_max = df[target_col].min(), df[target_col].max()

X = normalized_df[feature_cols]
y = normalized_df[target_col]

est = SymbolicRegressor(population_size=1000,
                        generations=100,
                        stopping_criteria=0.01,
                        function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv'],
                        metric='mean absolute error',
                        parsimony_coefficient=0.0001,
                        verbose=1,
                        random_state=0)

est.fit(X, y)

# Print the evolved expression. `_program` is set by a successful fit, so no
# hasattr() guard is needed here.
print(est._program)

# Predict on the same rows used for fitting (the metrics below are therefore
# in-sample) and convert both predictions and targets back to kcal/mol.
y_pred_normalized = est.predict(X)
y_pred = y_pred_normalized * (y_max - y_min) + y_min
y_actual = y * (y_max - y_min) + y_min

r2 = r2_score(y_actual, y_pred)
mae = MAE(y_actual, y_pred)
rmse = np.sqrt(MSE(y_actual, y_pred))

# Parity plot: the black diagonal marks perfect agreement.
fig, ax = plt.subplots()
ax.plot([y_min, y_max], [y_min, y_max], 'k-', lw=2)
ax.scatter(y_actual, y_pred, color='red', alpha=0.5)
ax.set_xlabel(f'Actual values of {target_col}')
ax.set_ylabel(f'Predicted values of {target_col}')
ax.set_title(f'Parity Plot of {target_col}')

# Annotate the figure with R^2, MAE and RMSE (axes-relative coordinates).
ax.text(x=0.05, y=0.95, s=f'$R^2 = {r2:.4f}$', fontsize=12, ha='left', va='top', transform=ax.transAxes)
ax.text(x=0.05, y=0.90, s=f'MAE = {mae:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=ax.transAxes)
ax.text(x=0.05, y=0.85, s=f'RMSE = {rmse:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure to an image file and release it.
fig.savefig('GPlearn Adiabatic1.png')
plt.close(fig)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.62          6.17786        6        0.0975508              N/A      1.57m
   1     4.63         0.446922        7        0.0593323              N/A      1.22m
   2     4.33         0.442957        8        0.0496848              N/A      1.19m
   3     9.18         0.372101       11        0.0458897              N/A      1.38m
   4     8.29          0.24386       11        0.0458897              N/A      1.19m
   5     8.17         0.127144       13        0.0444145              N/A      1.36m
   6     9.23          0.14215       21        0.0442412              N/A      1.18m
   7    10.32         0.160895       20        0.0440575              N/A      1.29m
   8    12.26         0.154724       18        0.0440575              N/A  

In [20]:
# Symbolic regression of the adiabatic S1-S0 energy, run 2: same data and
# function set as the previous run, but with explicit genetic-operator
# probabilities and max_samples=0.9. Fits on the min-max scaled data, maps
# predictions back to kcal/mol and saves a parity plot.

# Column names used throughout this cell: the regression target and the inputs.
target_col = "adiabatic_S1-S0 (kcal/mol)"
feature_cols = [
    "homo (eV)", "lumo (eV)", "vertical_excitation_energy (eV)",
    "dipole_moment_norm_S0 (D)", "dipole_moment_norm_S1 (D)", "dipole_moment_norm_T1 (D)",
    "0-0_T1 (eV)", "reduction_S0 (kcal/mol)", "adiabatic_T1-S0 (kcal/mol)",
    "oxidation_S0 (kcal/mol)", "lumo-homo (eV)",
]

# Target range in original units; needed to undo the min-max scaling below.
y_min, y_max = df[target_col].min(), df[target_col].max()

X = normalized_df[feature_cols]
y = normalized_df[target_col]

est = SymbolicRegressor(population_size=1000,
                        generations=100, stopping_criteria=0.01,
                        function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv'],
                        p_crossover=0.7, p_subtree_mutation=0.1,
                        p_hoist_mutation=0.05, p_point_mutation=0.1,
                        max_samples=0.9, verbose=1,
                        parsimony_coefficient=0.0001, random_state=0)

est.fit(X, y)

# Print the evolved expression.
print(est._program)

# Predict on the same rows used for fitting (the metrics below are therefore
# in-sample) and convert both predictions and targets back to kcal/mol.
y_pred_normalized = est.predict(X)
y_pred = y_pred_normalized * (y_max - y_min) + y_min
y_actual = y * (y_max - y_min) + y_min

r2 = r2_score(y_actual, y_pred)
mae = MAE(y_actual, y_pred)
rmse = np.sqrt(MSE(y_actual, y_pred))

# Parity plot: the black diagonal marks perfect agreement.
fig, ax = plt.subplots()
ax.plot([y_min, y_max], [y_min, y_max], 'k-', lw=2)
ax.scatter(y_actual, y_pred, color='red', alpha=0.5)
ax.set_xlabel(f'Actual values of {target_col}')
ax.set_ylabel(f'Predicted values of {target_col}')
ax.set_title(f'Parity Plot of {target_col}')

# Annotate the figure with R^2, MAE and RMSE (axes-relative coordinates).
ax.text(x=0.05, y=0.95, s=f'$R^2 = {r2:.4f}$', fontsize=12, ha='left', va='top', transform=ax.transAxes)
ax.text(x=0.05, y=0.90, s=f'MAE = {mae:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=ax.transAxes)
ax.text(x=0.05, y=0.85, s=f'RMSE = {rmse:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure to an image file and release it.
fig.savefig('GPlearn Adiabatic2.png')
plt.close(fig)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.62          6.00665        3        0.0972795         0.110149      1.58m
   1     4.73         0.494179        7        0.0585956        0.0659361      1.65m
   2     4.56         0.572199        6         0.056741        0.0569442      1.44m
   3     8.60         0.513912       10        0.0456634        0.0479178      1.57m
   4     7.69         0.313713       17        0.0405968        0.0406937      1.40m
   5     8.57         0.184794       17        0.0400368        0.0457127      1.49m
   6    10.51         0.304793       25        0.0391324        0.0458407      1.46m
   7    13.52         0.227028       17        0.0378206        0.0424851      1.56m
   8    16.35         0.271018       17        0.0377292        0.0433048  