In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as MAE
import matplotlib.pyplot as plt
import re

# load data
file_path = '20240703_correlation_copy.csv'
df = pd.read_csv(file_path)
df['homo (eV)'] = -df['homo (eV)']
df['lumo (eV)'] = -df['lumo (eV)']
df['reduction_S0 (kcal/mol)'] = -df['reduction_S0 (kcal/mol)']

columns = df.columns

r2_matrix = pd.DataFrame(index=columns, columns=columns)

def log_func(x, A, B, C, D):
    return A * np.log(B * (x+2) + C) + D

for dependent_var in columns:
    independent_vars = df.columns.difference([dependent_var])

    for var in independent_vars:
        valid_data = df[[var, dependent_var]].dropna()

        X = valid_data[[var]].values.flatten()
        y = valid_data[dependent_var].values

        if len(valid_data) > 0:
            try:
                popt, _ = curve_fit(log_func, X, y, maxfev=10000000)
                A, B, C, D = popt

                y_pred = log_func(X, *popt)

                r2 = r2_score(y, y_pred)
                r2_matrix.loc[var, dependent_var] = r2
            except Exception as e:
                print(f'Error fitting model for independent variable: {var}, dependent variable: {dependent_var}, Error: {e}')
                r2_matrix.loc[var, dependent_var] = np.nan
        else:
            r2_matrix.loc[var, dependent_var] = np.nan


output_file_path = '20240716_logarithmic2_r2.csv'
r2_matrix.to_csv(output_file_path)

high_r2_models = r2_matrix[r2_matrix >= 0.90].stack().reset_index()
high_r2_models.columns = ['Independent_Var', 'Dependent_Var', 'R2']

  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (x+2) + C) + D
  return A * np.log(B * (

In [2]:
def log_func(x, A, B, C, D):
    return A * np.log(B * x + C) + D

# 첫 번째 모델에 대해 CSV 파일과 그래프 저장
row = high_r2_models.iloc[12]
independent_var = row['Independent_Var']
dependent_var = row['Dependent_Var']
r2 = row['R2']

# 결측치 제거
valid_data = df[[independent_var, dependent_var]].dropna()

X = valid_data[[independent_var]].values.flatten()
y = valid_data[dependent_var].values

try:
    popt, _ = curve_fit(log_func, X, y, maxfev=10000000)
    A, B, C, D = popt
    
except Exception as e:
    print(f'Error fitting model for independent variable: {var}, dependent variable: {dependent_var}, Error: {e}')

  return A * np.log(B * x + C) + D


In [3]:
# 모델 예측
X_fit = np.arange(1.5, 4.5, 0.01)[:, np.newaxis]
y_pred = log_func(X_fit, *popt)

# 그래프 생성
plt.figure()
plt.plot(X_fit, y_pred, 'b-', lw=2)
plt.scatter(X, y, color='red', alpha=0.5)
plt.xlabel(f'{independent_var}')
plt.ylabel(f'{dependent_var}')
plt.title(f'Logarithmic Correlation: {independent_var}__{dependent_var}')

equation = f'$y = {A:.2f}*ln({B:.2f}x {C:.2f}) + {D:.2f}$'

# R^2, MAE 값 그래프에 추가
plt.text(x=0.05, y=0.95, s=f'$R^2 = {r2:.4f}$', fontsize=12, ha='left', va='top', transform=plt.gca().transAxes)
plt.text(x=0.05, y=0.90, s=equation, fontsize=12, ha='left', va='top', transform=plt.gca().transAxes)

# 파일 이름에서 유효하지 않은 문자 제거
def sanitize_filename(filename):
    return re.sub(r'[\\/*?:"<>|]', "_", filename)

# 그래프 이미지 파일로 저장
img_filename = sanitize_filename(f'Logarithmic Correlation between {independent_var}__{dependent_var}.png')
plt.savefig(img_filename)
plt.close()


In [14]:
# 모델 예측
y_pred = log_func(X, *popt)

# Convert it to kcal/mole
mae = MAE(y, y_pred)*23.06
print(f'MAE: {mae} (kcal/mol)')

# 실제 값과 예측치 저장
result_df = pd.DataFrame({
    'Real': y,
    'Predict': y_pred
})

csv_filename = sanitize_filename(f'{independent_var}__{dependent_var}.csv')
result_df.to_csv(csv_filename, index=False)

# 그래프 생성
plt.figure()
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k-', lw=2)
plt.scatter(y, y_pred, color='red', alpha=0.5)
plt.xlabel(f'Actual values of {dependent_var}')
plt.ylabel(f'Predicted values of {dependent_var}')
plt.title(f'Model: {independent_var}__{dependent_var}')

# R^2 값 그래프에 추가
plt.text(x=0.05, y=0.95, s=f'$R^2 = {r2:.4f}$', fontsize=12, ha='left', va='top', transform=plt.gca().transAxes)
plt.text(x=0.05, y=0.90, s=f'MAE = {mae:.4f} (kcal/mol)', fontsize=12, ha='left', va='top', transform=plt.gca().transAxes)

# 그래프 이미지 파일로 저장
img_filename = sanitize_filename(f'{independent_var}__{dependent_var}.png')
plt.savefig(img_filename)
plt.close()

MAE: 2.2286062007841867 (kcal/mol)
