In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt
import re

# Load the data from the CSV file
file_path = '20240703_correlation_copy.csv'
df = pd.read_csv(file_path)

In [25]:
# Regression target and predictors used in this cell
target_col = 'adiabatic_S1-S0 (kcal/mol)'
feature_cols = [
    'vertical_excitation_energy (eV)',
    'dipole_moment_norm_S1 (D)',
    'dipole_moment_norm_T1 (D)',
    '0-0_T1 (eV)',
    'reduction_S0 (kcal/mol)',
]

# 1 eV = 23.06 kcal/mol, so a kcal/mol quantity is DIVIDED by this factor to express it in eV.
KCAL_PER_MOL_PER_EV = 23.06

# Drop every row that has a missing value in the target or in any predictor
valid_data = df[[target_col] + feature_cols].dropna()

# Predictors and target, both taken from the rows that survived dropna()
X = valid_data[feature_cols]
y = valid_data[target_col]

# Population standard deviation (ddof=0) of the target: kcal/mol, then its eV equivalent
std_dev = y.std(ddof=0)
std_dev2 = std_dev / KCAL_PER_MOL_PER_EV
print(std_dev)

# Fit the linear model; predictions are made on the same rows used for fitting,
# so the metrics below are in-sample.
model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = MAE(y, y_pred)
rmse = np.sqrt(MSE(y, y_pred))
# eV equivalents of the kcal/mol error metrics (previously multiplied by 23.06,
# which applied the conversion in the wrong direction)
mae2 = mae / KCAL_PER_MOL_PER_EV
rmse2 = rmse / KCAL_PER_MOL_PER_EV

# Table of actual vs. predicted values
result_df = pd.DataFrame({
    'Real': y,
    'Predict': y_pred
})

# Replace characters that are not allowed in file names
def sanitize_filename(filename):
    """Return ``filename`` with every backslash, slash, ``*``, ``?``, ``:``,
    double quote, ``<``, ``>`` and ``|`` replaced by an underscore."""
    invalid_chars = re.compile(r'[\\/*?:"<>|]')
    return invalid_chars.sub("_", filename)

# Write the actual/predicted table to CSV without the row index
csv_filename = sanitize_filename('Multiple Regression Adiabatic.csv')
result_df.to_csv(csv_filename, index=False)


# Parity plot: black diagonal spanning the range of y, red points for (actual, predicted)
fig, ax = plt.subplots()
diag_ends = [y.min(), y.max()]
ax.plot(diag_ends, diag_ends, 'k-', lw=2)
ax.scatter(y, y_pred, color='red', alpha=0.5)
ax.set_xlabel('Actual values of adiabatic_S1-S0 (kcal/mol)')
ax.set_ylabel('Predicted values of adiabatic_S1-S0 (kcal/mol)')
ax.set_title('Parity Plot of adiabatic_S1-S0 (kcal/mol)')

# Annotate R^2, MAE and RMSE in the upper-left corner (positions are axes fractions)
metric_labels = [
    (0.95, f'$R^2 = {r2:.4f}$'),
    (0.90, f'MAE = {mae:.4f} (kcal/mol)'),
    (0.85, f'RMSE = {rmse:.4f} (kcal/mol)'),
]
for label_y, label_text in metric_labels:
    ax.text(x=0.05, y=label_y, s=label_text, fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure as an image file, then close it
img_filename = sanitize_filename('Multiple Regression Adiabatic.png')
fig.savefig(img_filename)
plt.close(fig)


9.152405944652418


In [23]:
# Regression target and predictors used in this cell
target_col = 'reduction_S0 (kcal/mol)'
feature_cols = [
    'homo (eV)',
    '0-0_S1 (eV)',
    'adiabatic_T1-S0 (kcal/mol)',
]

# 1 eV = 23.06 kcal/mol, so a kcal/mol quantity is DIVIDED by this factor to express it in eV.
KCAL_PER_MOL_PER_EV = 23.06

# Drop every row that has a missing value in the target or in any predictor
valid_data = df[[target_col] + feature_cols].dropna()

# Target and predictors, both taken from the rows that survived dropna()
y = valid_data[target_col]
X = valid_data[feature_cols]

# Population standard deviation (ddof=0) of the target: kcal/mol, then its eV equivalent
std_dev = y.std(ddof=0)
std_dev2 = std_dev / KCAL_PER_MOL_PER_EV
print(std_dev)

# Fit the linear model; predictions are made on the same rows used for fitting,
# so the metrics below are in-sample.
model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = MAE(y, y_pred)
rmse = np.sqrt(MSE(y, y_pred))
# eV equivalents of the kcal/mol error metrics (previously multiplied by 23.06,
# which applied the conversion in the wrong direction)
mae2 = mae / KCAL_PER_MOL_PER_EV
rmse2 = rmse / KCAL_PER_MOL_PER_EV

# Table of actual vs. predicted values
result_df = pd.DataFrame({
    'Real': y,
    'Predict': y_pred
})

# Replace characters that are not allowed in file names
def sanitize_filename(filename):
    """Swap each of ``\\ / * ? : " < > |`` in ``filename`` for an underscore and return the result."""
    pattern = r'[\\/*?:"<>|]'
    cleaned = re.sub(pattern, "_", filename)
    return cleaned

# Write the actual/predicted table to CSV without the row index
csv_filename = sanitize_filename('Multiple Regression Reduction.csv')
result_df.to_csv(csv_filename, index=False)


# Parity plot: black diagonal spanning the range of y, red points for (actual, predicted)
fig, ax = plt.subplots()
diag_ends = [y.min(), y.max()]
ax.plot(diag_ends, diag_ends, 'k-', lw=2)
ax.scatter(y, y_pred, color='red', alpha=0.5)
ax.set_xlabel('Actual values of reduction_S0 (kcal/mol)')
ax.set_ylabel('Predicted values of reduction_S0 (kcal/mol)')
ax.set_title('Parity Plot of reduction_S0 (kcal/mol)')

# Annotate R^2, MAE and RMSE in the upper-left corner (positions are axes fractions)
metric_labels = [
    (0.95, f'$R^2 = {r2:.4f}$'),
    (0.90, f'MAE = {mae:.4f} (kcal/mol)'),
    (0.85, f'RMSE = {rmse:.4f} (kcal/mol)'),
]
for label_y, label_text in metric_labels:
    ax.text(x=0.05, y=label_y, s=label_text, fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure as an image file, then close it
img_filename = sanitize_filename('Multiple Regression Reduction.png')
fig.savefig(img_filename)
plt.close(fig)


6.325547264995044


In [24]:
# Regression target and predictors used in this cell
target_col = 'oxidation_S0 (kcal/mol)'
feature_cols = [
    'lumo (eV)',
    'dipole_moment_norm_S1 (D)',
    'dipole_moment_norm_T1 (D)',
    '0-0_S1 (eV)',
    'adiabatic_T1-S0 (kcal/mol)',
]

# 1 eV = 23.06 kcal/mol, so a kcal/mol quantity is DIVIDED by this factor to express it in eV.
KCAL_PER_MOL_PER_EV = 23.06

# Drop every row that has a missing value in the target or in any predictor
valid_data = df[[target_col] + feature_cols].dropna()

# Target and predictors, both taken from the rows that survived dropna()
y = valid_data[target_col]
X = valid_data[feature_cols]

# Population standard deviation (ddof=0) of the target: kcal/mol, then its eV equivalent
std_dev = y.std(ddof=0)
std_dev2 = std_dev / KCAL_PER_MOL_PER_EV
print(std_dev)

# Fit the linear model; predictions are made on the same rows used for fitting,
# so the metrics below are in-sample.
model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = MAE(y, y_pred)
rmse = np.sqrt(MSE(y, y_pred))
# eV equivalents of the kcal/mol error metrics (previously multiplied by 23.06,
# which applied the conversion in the wrong direction)
mae2 = mae / KCAL_PER_MOL_PER_EV
rmse2 = rmse / KCAL_PER_MOL_PER_EV

# Table of actual vs. predicted values
result_df = pd.DataFrame({
    'Real': y,
    'Predict': y_pred
})

# Replace characters that are not allowed in file names
def sanitize_filename(filename):
    """Return a copy of ``filename`` in which backslash, slash, ``*``, ``?``, ``:``,
    double quote, ``<``, ``>`` and ``|`` are each replaced by an underscore."""
    replacement = "_"
    return re.sub(r'[\\/*?:"<>|]', replacement, filename)

# Write the actual/predicted table to CSV without the row index
csv_filename = sanitize_filename('Multiple Regression Oxidation.csv')
result_df.to_csv(csv_filename, index=False)


# Parity plot: black diagonal spanning the range of y, red points for (actual, predicted)
fig, ax = plt.subplots()
diag_ends = [y.min(), y.max()]
ax.plot(diag_ends, diag_ends, 'k-', lw=2)
ax.scatter(y, y_pred, color='red', alpha=0.5)
ax.set_xlabel('Actual values of oxidation_S0 (kcal/mol)')
ax.set_ylabel('Predicted values of oxidation_S0 (kcal/mol)')
ax.set_title('Parity Plot of oxidation_S0 (kcal/mol)')

# Annotate R^2, MAE and RMSE in the upper-left corner (positions are axes fractions)
metric_labels = [
    (0.95, f'$R^2 = {r2:.4f}$'),
    (0.90, f'MAE = {mae:.4f} (kcal/mol)'),
    (0.85, f'RMSE = {rmse:.4f} (kcal/mol)'),
]
for label_y, label_text in metric_labels:
    ax.text(x=0.05, y=label_y, s=label_text, fontsize=12, ha='left', va='top', transform=ax.transAxes)

# Save the figure as an image file, then close it
img_filename = sanitize_filename('Multiple Regression Oxidation.png')
fig.savefig(img_filename)
plt.close(fig)


15.161931325309032


In [21]:
# Keep only the rows where the S0 dipole-moment column is present
dipole_col = 'dipole_moment_norm_S0 (D)'
valid_data = df[[dipole_col]].dropna()

# Column-wise population standard deviation (ddof=0); valid_data is a one-column
# DataFrame, so the result is a one-entry Series rather than a scalar.
std_dev = valid_data.std(ddof=0)
# NOTE(review): 23.06 looks like the eV -> kcal/mol conversion factor; scaling a
# dipole moment in Debye by it seems unintended — confirm before using std_dev2.
std_dev2 = std_dev * 23.06
print(std_dev)

dipole_moment_norm_S0 (D)    3.89232
dtype: float64
