In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from math import sqrt

# 读取数据
file_path_train = 'C:/Users/k/Desktop/硕士论文/ORP/ORP4/ORP_function_train.xlsx'
sheet_name_train = 'Sheet1'
data_train = pd.read_excel(file_path_train, sheet_name=sheet_name_train)
y_train = data_train.iloc[:, -1]
X_train = data_train.iloc[:, 1:-1]

file_path_test = 'C:/Users/k/Desktop/硕士论文/ORP/ORP4/ORP_function_test.xlsx'
sheet_name_test = 'Sheet1'
data_test = pd.read_excel(file_path_test, sheet_name=sheet_name_test)
y_test = data_test.iloc[:, -1]
X_test = data_test.iloc[:, 1:-1]

# 添加常数项以便拟合截距
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

# 逐步回归
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

result = stepwise_selection(X_train, y_train)

print('resulting features:')
print(result)

# 使用选定的特征训练模型
X_train_s = sm.add_constant(X_train[result])
X_test_s = sm.add_constant(X_test[result])

model = sm.OLS(y_train, X_train_s).fit()

# 在训练集和测试集上的预测
y_train_pred = model.predict(X_train_s)
y_test_pred = model.predict(X_test_s)

# 计算校正后R²和RMSE
def adjusted_r_squared(r_squared, n, k):
    return 1 - (1 - r_squared) * (n - 1) / (n - k - 1)

def calc_r_squared(y, y_pred):
    ss_res = np.sum((y - y_pred) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    r_squared = 1 - (ss_res / ss_tot)
    return r_squared

n_train = X_train.shape[0]
k_train = len(result)-1
r_squared_train = model.rsquared
adj_r_squared_train = adjusted_r_squared(r_squared_train, n_train, k_train)

r_squared_test = calc_r_squared(y_test, y_test_pred)
adj_r_squared_test = adjusted_r_squared(r_squared_test, n_test, k_test)

rmse_train = sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = sqrt(mean_squared_error(y_test, y_test_pred))

print(f'Adjusted R² (Training): {adj_r_squared_train}')
print(f'Adjusted R² (Testing): {adj_r_squared_test}')
print(f'RMSE (Training): {rmse_train}')
print(f'RMSE (Testing): {rmse_test}')

Add  const                          with p-value 4.96006e-83
Add  F                              with p-value 2.81073e-18
Add  ar                             with p-value 4.32976e-13
Add  -CH3                           with p-value 3.01114e-09
Add  -NH2                           with p-value 0.00453891
Add  C-ar                           with p-value 0.00590145
Add  ring N                         with p-value 0.00712312
Add  -C=C-                          with p-value 0.00612651
resulting features:
['const', 'F', 'ar', '-CH3', '-NH2', 'C-ar', 'ring N', '-C=C-']
Adjusted R² (Training): 0.6755548751258742
Adjusted R² (Testing): 0.5380171606808368
RMSE (Training): 0.45178224582131266
RMSE (Testing): 0.5084107743970273


In [11]:
# 打印回归结果摘要
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      E   R-squared:                       0.689
Model:                            OLS   Adj. R-squared:                  0.676
Method:                 Least Squares   F-statistic:                     51.27
Date:                Fri, 23 Feb 2024   Prob (F-statistic):           6.15e-38
Time:                        12:49:36   Log-Likelihood:                -106.15
No. Observations:                 170   AIC:                             228.3
Df Residuals:                     162   BIC:                             253.4
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9614      0.074     39.820      0.0

In [12]:
r_squared_test

0.6183620023015608