In [7]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
from skopt import BayesSearchCV
import math

def feature_selection_corr(x, y, threshold=0.5):
    """Keep features correlated with the target and prune mutually correlated ones.

    A feature is kept when the absolute value of its correlation with ``y``
    exceeds ``threshold``. Among the kept features, any pair whose absolute
    mutual correlation also exceeds the same ``threshold`` is resolved by
    dropping the member that is more weakly (in absolute value) correlated
    with ``y``; on a tie the first member of the pair is dropped. This is
    repeated until no such pair remains.

    Args:
        x: DataFrame of candidate features (one column per feature).
        y: Series holding the target variable.
        threshold: Cut-off applied both to |corr(feature, y)| and to
            |corr(feature, feature)|.

    Returns:
        The sub-frame of ``x`` restricted to the surviving columns.
    """
    if x.shape[1] == 0:
        # Nothing to select from; avoid correlating an empty frame.
        return x[[]]

    # Absolute correlation with the target. Selection and tie-breaking must
    # both use the magnitude, otherwise a strong negative predictor loses to
    # a weaker positive one.
    target_strength = x.apply(lambda col: col.corr(y)).abs()

    selected_features = target_strength[target_strength > threshold].index.tolist()

    while True:
        corr_matrix = x[selected_features].corr().abs()
        columns = list(corr_matrix.columns)

        # Only the first offending pair is needed: the matrix is recomputed
        # after every removal.
        pair = next(
            (
                (first, second)
                for first in columns
                for second in columns
                if first != second and corr_matrix.at[first, second] > threshold
            ),
            None,
        )
        if pair is None:
            break

        first, second = pair
        if target_strength[first] > target_strength[second]:
            selected_features.remove(second)
        else:
            selected_features.remove(first)

    return x[selected_features]


def _design_matrix(X, columns):
    """Return ``X[columns]`` with an intercept column always prepended."""
    # has_constant='add' forces the intercept even when a feature happens to
    # be constant in this subset; the default ('skip') would silently omit it
    # and misalign the columns with the fitted parameters.
    return sm.add_constant(pd.DataFrame(X[columns]), has_constant='add')


def _fit_ols(X, y, columns):
    """Fit an OLS model of ``y`` on an intercept plus ``X[columns]``."""
    return sm.OLS(y, _design_matrix(X, columns)).fit()


def stepwise_selection_cv(X, y, cv, threshold_in=0.05, threshold_out=0.1, verbose=True):
    """Run p-value based stepwise OLS feature selection over K-fold splits.

    For every fold produced by ``KFold(n_splits=cv, shuffle=True,
    random_state=42)`` the function alternates a forward step (add the
    excluded feature with the smallest p-value if it is below
    ``threshold_in``) and a backward step (drop the included feature with the
    largest p-value if it is above ``threshold_out``) on the training part
    until neither step changes the selection. The resulting model is then
    scored with R^2 on the validation part.

    NOTE: the included/excluded lists are deliberately not reset between
    folds, so each fold continues from the selection reached by the previous
    one and the returned list is the selection after the last fold.

    Args:
        X: DataFrame of candidate features.
        y: Series holding the target, positionally aligned with ``X``.
        cv: Number of K-fold splits.
        threshold_in: A feature is added when its p-value is below this value.
        threshold_out: A feature is dropped when its p-value is above this value.
        verbose: When true, print every add/drop decision.

    Returns:
        A tuple ``(included, threshold_in, threshold_out, r2_scores)`` where
        ``included`` is the final list of selected column names, the two
        thresholds are echoed back unchanged, and ``r2_scores`` holds one
        validation R^2 per fold.

    Raises:
        ValueError: If ``threshold_in`` is greater than ``threshold_out``;
            a feature with a p-value between the two would be added and
            dropped forever.
    """
    if threshold_in > threshold_out:
        raise ValueError('threshold_in must not be greater than threshold_out')

    included = []
    excluded = list(X.columns)
    r2_scores = []  # validation R^2 of each fold

    kf = KFold(n_splits=cv, shuffle=True, random_state=42)

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        while True:
            changed = False

            # Forward step: p-value of each candidate when added to the
            # current selection.
            new_pval = pd.Series(index=excluded, dtype=float)
            for new_column in excluded:
                candidate = _fit_ols(X_train, y_train, included + [new_column])
                new_pval[new_column] = candidate.pvalues[new_column]

            best_pval = new_pval.min()
            # NaN (no candidates left) compares false, so nothing is added.
            if best_pval < threshold_in:
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                excluded.remove(best_feature)
                changed = True
                if verbose:
                    print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

            # Backward step: p-values of the current selection, intercept
            # (always the first entry) left out.
            pvalues = _fit_ols(X_train, y_train, included).pvalues.iloc[1:]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                excluded.append(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

            if not changed:
                break

        # Score on the validation part with a model fitted on exactly the
        # final selection, so design matrix and parameters always agree.
        model = _fit_ols(X_train, y_train, included)
        y_val_pred = model.predict(_design_matrix(X_val, included))
        r2_scores.append(r2_score(y_val, y_val_pred))

    return included, threshold_in, threshold_out, r2_scores

# Load the training data: the last column is the target, all others are descriptors.
file_path_train = 'C:/Users/k/Desktop/硕士论文/ORP/ORP4/ORP_quantum_train.xlsx'
sheet_name_train = 'Sheet1'
data_train = pd.read_excel(file_path_train, sheet_name=sheet_name_train)
y_train = data_train.iloc[:, -1]
x_train = data_train.iloc[:, :-1]

# Filter descriptors by correlation with the target and with each other.
x_train_selected = feature_selection_corr(x_train, y_train)

# Run the stepwise multiple linear regression selection.
selected_features, best_threshold_in, best_threshold_out, r2_scores = stepwise_selection_cv(
    X=x_train_selected,
    y=y_train,
    cv=5,
    threshold_in=0.05,
    threshold_out=0.1,
    verbose=True,
)

# Report the validation R2 of every fold.
print("R2 scores for each fold:", r2_scores)


# Fit the final multiple linear regression on the selected descriptors.
# has_constant='add' guarantees an intercept column in both the training and
# the test design matrix, so their columns always line up with the parameters.
final_model = sm.OLS(
    y_train,
    sm.add_constant(x_train_selected[selected_features], has_constant='add'),
).fit()

# Print the regression summary.
print(final_model.summary())

# Load the test data using the same column layout as the training file.
file_path_test = 'C:/Users/k/Desktop/硕士论文/ORP/ORP4/ORP_quantum_test.xlsx'
sheet_name_test = 'Sheet1'
data_test = pd.read_excel(file_path_test, sheet_name=sheet_name_test)
y_test = data_test.iloc[:, -1]
x_test = data_test.iloc[:, :-1]

x_test_selected = x_test[x_train_selected.columns]  # same descriptors as the training set

# The default has_constant='skip' would drop the intercept if a selected
# descriptor happened to be constant in the test set and break predict().
x_test_with_const = sm.add_constant(x_test_selected[selected_features], has_constant='add')
y_test_pred = final_model.predict(x_test_with_const)

# Compute R2 and RMSE on the test set.
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = math.sqrt(mse_test)

print(f"测试集上的R2: {r2_test}")
print(f"测试集上的均方根误差 (RMSE): {rmse_test}")



Add  Vertical IP                    with p-value 9.8318e-69
R2 scores for each fold: [0.7453503139443016, 0.7933921778446675, 0.9238181336513002, 0.9014424239058801, 0.9254659864122293]
                            OLS Regression Results                            
Dep. Variable:                      E   R-squared:                       0.870
Model:                            OLS   Adj. R-squared:                  0.869
Method:                 Least Squares   F-statistic:                     1126.
Date:                Wed, 21 Feb 2024   Prob (F-statistic):           2.22e-76
Time:                        20:16:23   Log-Likelihood:                -31.901
No. Observations:                 170   AIC:                             67.80
Df Residuals:                     168   BIC:                             74.07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef  