In [8]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
from skopt import BayesSearchCV
import math

def feature_selection_corr(x, y, threshold=0.5, corr_threshold=0.6):
    """Keep features correlated with the target and prune mutually collinear ones.

    A feature is a candidate when the absolute Pearson correlation between it
    and ``y`` is strictly greater than ``threshold``. Among the candidates,
    whenever two features have an absolute pairwise correlation strictly
    greater than ``corr_threshold``, only the one more strongly correlated
    with the target survives.

    The pruning is greedy: candidates are visited from the strongest to the
    weakest target correlation, and a candidate is kept only if it is not
    highly correlated with any feature that has already been kept. This avoids
    discarding a feature merely because it is collinear with another feature
    that is itself being discarded.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Target values; correlated against each column of ``x``.
    threshold : float, default 0.5
        Minimum absolute feature/target correlation (exclusive).
    corr_threshold : float, default 0.6
        Maximum tolerated absolute feature/feature correlation (inclusive).

    Returns
    -------
    pandas.DataFrame
        ``x`` restricted to the surviving columns, in their original order.
    """
    # Absolute correlation of every feature with the target.
    target_strength = x.apply(lambda col: col.corr(y)).abs()

    # Candidates in original column order; NaN correlations compare False
    # and are therefore excluded.
    candidates = list(target_strength[target_strength > threshold].index)

    # Pairwise absolute correlations between candidates, computed once.
    inter_corr = x[candidates].corr().abs()

    # Strongest first. Feeding the reversed list into the stable sort makes the
    # later column win an exact tie, matching the previous tie-breaking rule.
    ranked = sorted(reversed(candidates), key=lambda name: target_strength[name], reverse=True)

    kept = []
    for name in ranked:
        collides = any(inter_corr.loc[name, other] > corr_threshold for other in kept)
        if not collides:
            kept.append(name)

    kept_lookup = set(kept)
    selected_features = [name for name in candidates if name in kept_lookup]

    return x[selected_features]


def stepwise_selection_cv(X, y, cv, threshold_in=0.05, threshold_out=0.1, verbose=True):
    """Bidirectional stepwise OLS feature selection driven by coefficient p-values.

    Each iteration performs a forward step (try every excluded column, add the
    one with the smallest p-value if it is below ``threshold_in``) followed by
    a backward step (refit on the included columns and remove the one with the
    largest p-value if it exceeds ``threshold_out``). Iteration stops when
    neither step changes the feature set.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : pandas.Series
        Target values.
    cv : int
        Accepted for signature compatibility; this implementation does not
        perform any cross-validation and ignores the value.
    threshold_in : float, default 0.05
        A feature is added when its p-value is strictly below this value.
    threshold_out : float, default 0.1
        A feature is removed when its p-value is strictly above this value.
    verbose : bool, default True
        Print every add/drop decision.

    Returns
    -------
    tuple
        ``(included, threshold_in, threshold_out)`` where ``included`` is the
        list of selected column names. The thresholds are returned unchanged;
        they are never tuned here.

    Notes
    -----
    An earlier version recomputed a training RMSE after every change using the
    model fitted *before* a backward drop together with the already-shrunk
    feature list, which mismatched the design-matrix width. The RMSE never
    influenced the result, so that bookkeeping has been removed.
    """
    included = []
    excluded = list(X.columns)

    while True:
        changed = False

        # Forward step: p-value of each excluded feature when added to the current model.
        if excluded:
            new_pval = pd.Series(index=excluded, dtype=float)
            for new_column in excluded:
                model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
                new_pval[new_column] = model.pvalues[new_column]

            best_pval = new_pval.min()
            # A NaN minimum compares False, so nothing is added in that case.
            if best_pval < threshold_in:
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                excluded.remove(best_feature)
                changed = True
                if verbose:
                    print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # Backward step: refit on the included set and look for an insignificant feature.
        if included:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # Position 0 is the intercept added by add_constant; skip it.
            pvalues = model.pvalues.iloc[1:]
            worst_pval = pvalues.max()

            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                excluded.append(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break

    return included, threshold_in, threshold_out

# Directory holding the train/test workbooks; change this single value when the data moves.
DATA_DIR = 'C:/Users/k/Desktop/硕士论文/ORP/ORP4'

# Load the training data: the last column is the target, all preceding columns are descriptors
file_path_train = f'{DATA_DIR}/ORP_quantum_train.xlsx'
sheet_name_train = 'Sheet1'
data_train = pd.read_excel(file_path_train, sheet_name=sheet_name_train)
y_train = data_train.iloc[:, -1]
x_train = data_train.iloc[:, 0:-1]

# Pre-filter the descriptors by correlation
x_train_selected = feature_selection_corr(x_train, y_train)

# Run the stepwise multiple linear regression feature selection
selected_features, best_threshold_in, best_threshold_out = stepwise_selection_cv(X=x_train_selected, y=y_train, cv=5, threshold_in=0.05, threshold_out=0.1, verbose=True)

# Fit the final multiple linear regression model on the selected features.
# has_constant='add' forces an intercept column even if some column happens to be
# constant; the default ('skip') would silently omit it and change the design width.
final_model = sm.OLS(y_train, sm.add_constant(x_train_selected[selected_features], has_constant='add')).fit()

# Print the regression summary
print(final_model.summary())

# Load the test data with the same column layout as the training workbook
file_path_test = f'{DATA_DIR}/ORP_quantum_test.xlsx'
sheet_name_test = 'Sheet1'
data_test = pd.read_excel(file_path_test, sheet_name=sheet_name_test)
y_test = data_test.iloc[:, -1]
x_test = data_test.iloc[:, 0:-1]

x_test_selected = x_test[x_train_selected.columns]  # keep the same descriptors as the training set

# Build the test design matrix exactly like the training one (intercept always added)
x_test_with_const = sm.add_constant(x_test_selected[selected_features], has_constant='add')
y_test_pred = final_model.predict(x_test_with_const)

# Compute R2 and RMSE on the test set
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = math.sqrt(mse_test)

print(f"测试集上的R2: {r2_test}")
print(f"测试集上的均方根误差 (RMSE): {rmse_test}")



Add  EHUMO                          with p-value 5.44442e-92
                            OLS Regression Results                            
Dep. Variable:                      E   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     1816.
Date:                Wed, 06 Mar 2024   Prob (F-statistic):           5.44e-92
Time:                        14:56:52   Log-Likelihood:                 4.4465
No. Observations:                 170   AIC:                            -4.893
Df Residuals:                     168   BIC:                             1.379
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------

In [9]:
# Display the feature names returned by the stepwise selection run
selected_features

['EHUMO']

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
from skopt import BayesSearchCV
import math

def feature_selection_corr(x, y, threshold=0.6):
    """Keep features correlated with the target and drop collinear later columns.

    A feature is a candidate when the absolute Pearson correlation between it
    and ``y`` is strictly greater than ``threshold``. A candidate is then
    dropped when its absolute correlation with any candidate that appears
    *earlier* in column order is strictly greater than the same ``threshold``
    (only the strict upper triangle of the correlation matrix is inspected).

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Target values; correlated against each column of ``x``.
    threshold : float, default 0.6
        Used both as the minimum feature/target correlation and as the
        maximum tolerated feature/feature correlation.

    Returns
    -------
    pandas.DataFrame
        ``x`` restricted to the surviving columns, in their original order.
        (Previously the order came from a ``set`` difference and could vary
        between interpreter sessions.)
    """
    # Correlation of every feature with the target.
    corr_with_target = x.apply(lambda col: col.corr(y))

    # Candidates in original column order.
    candidates = list(corr_with_target[corr_with_target.abs() > threshold].index)

    # Absolute pairwise correlations between candidates.
    corr_matrix = x[candidates].corr().abs()

    # Keep only entries strictly above the diagonal so each pair is seen once
    # and a column is only ever compared against columns that precede it.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    to_drop = {column for column in upper.columns if (upper[column] > threshold).any()}

    # Order-preserving filter keeps the result deterministic.
    selected_features = [name for name in candidates if name not in to_drop]

    return x[selected_features]

def stepwise_selection_cv(X, y, cv, threshold_in=0.05, threshold_out=0.1, verbose=True):
    """Stepwise OLS feature selection run over K folds, with a per-fold validation R2.

    The rows are split with a shuffled ``KFold`` (``random_state=42``). On each
    training fold a bidirectional stepwise search is run: add the excluded
    feature with the smallest p-value if it is below ``threshold_in``, then
    remove the included feature with the largest p-value if it exceeds
    ``threshold_out``, repeating until nothing changes. An OLS model on the
    resulting features is then scored on the held-out fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : pandas.Series
        Target values, positionally aligned with ``X``.
    cv : int
        Number of folds passed to ``KFold(n_splits=cv)``.
    threshold_in : float, default 0.05
        A feature is added when its p-value is strictly below this value.
    threshold_out : float, default 0.1
        A feature is removed when its p-value is strictly above this value.
    verbose : bool, default True
        Print every add/drop decision.

    Returns
    -------
    tuple
        ``(included, threshold_in, threshold_out, r2_scores)``. ``included`` is
        the feature list after the last fold, the thresholds are returned
        unchanged, and ``r2_scores`` holds one validation R2 per fold.

    Notes
    -----
    NOTE(review): ``included``/``excluded`` are initialised once and carried
    from fold to fold, so every fold after the first starts from the previous
    fold's selection. That behaviour is preserved here; confirm it is intended.

    An earlier version recomputed a training RMSE after each change using the
    model fitted before a backward drop together with the already-shrunk
    feature list, which mismatched the design-matrix width. The RMSE never
    influenced the result, so that bookkeeping has been removed.
    """
    included = []
    excluded = list(X.columns)
    r2_scores = []  # validation R2 of each fold

    # Shuffled K-fold split with a fixed seed for repeatability.
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        while True:
            changed = False

            # Forward step: p-value of each excluded feature when added to the current model.
            if excluded:
                new_pval = pd.Series(index=excluded, dtype=float)
                for new_column in excluded:
                    model = sm.OLS(y_train, sm.add_constant(pd.DataFrame(X_train[included + [new_column]]))).fit()
                    new_pval[new_column] = model.pvalues[new_column]

                best_pval = new_pval.min()
                # A NaN minimum compares False, so nothing is added in that case.
                if best_pval < threshold_in:
                    best_feature = new_pval.idxmin()
                    included.append(best_feature)
                    excluded.remove(best_feature)
                    changed = True
                    if verbose:
                        print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

            # Backward step: refit on the included set and look for an insignificant feature.
            if included:
                model = sm.OLS(y_train, sm.add_constant(pd.DataFrame(X_train[included]))).fit()
                # Position 0 is the intercept added by add_constant; skip it.
                pvalues = model.pvalues.iloc[1:]
                worst_pval = pvalues.max()

                if worst_pval > threshold_out:
                    changed = True
                    worst_feature = pvalues.idxmax()
                    included.remove(worst_feature)
                    excluded.append(worst_feature)
                    if verbose:
                        print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

            if not changed:
                break

        # Score the fold. The intercept is forced on both sides (has_constant='add')
        # so the train and validation design matrices always have the same width,
        # even if a column happens to be constant within a small fold.
        fold_model = sm.OLS(y_train, sm.add_constant(X_train[included], has_constant='add')).fit()
        x_val_with_const = sm.add_constant(X_val[included], has_constant='add')
        y_val_pred = fold_model.predict(x_val_with_const)
        r2_scores.append(r2_score(y_val, y_val_pred))

    return included, threshold_in, threshold_out, r2_scores

# Directory holding the train/test workbooks; change this single value when the data moves.
DATA_DIR = 'C:/Users/k/Desktop/硕士论文/ORP/ORP4'

# Load the training data: the last column is the target, all preceding columns are descriptors
file_path_train = f'{DATA_DIR}/ORP_quantum_train.xlsx'
sheet_name_train = 'Sheet1'
data_train = pd.read_excel(file_path_train, sheet_name=sheet_name_train)
y_train = data_train.iloc[:, -1]
x_train = data_train.iloc[:, 0:-1]

# Pre-filter the descriptors by correlation
x_train_selected = feature_selection_corr(x_train, y_train)

# Run the stepwise multiple linear regression feature selection
selected_features, best_threshold_in, best_threshold_out, r2_scores = stepwise_selection_cv(X=x_train_selected, y=y_train, cv=5, threshold_in=0.05, threshold_out=0.1, verbose=True)

# Report the validation R2 of every cross-validation fold
print("R2 scores for each fold:", r2_scores)


# Fit the final multiple linear regression model on the selected features.
# has_constant='add' forces an intercept column even if some column happens to be
# constant; the default ('skip') would silently omit it and change the design width.
final_model = sm.OLS(y_train, sm.add_constant(x_train_selected[selected_features], has_constant='add')).fit()

# Print the regression summary
print(final_model.summary())

# Load the test data with the same column layout as the training workbook
file_path_test = f'{DATA_DIR}/ORP_quantum_test.xlsx'
sheet_name_test = 'Sheet1'
data_test = pd.read_excel(file_path_test, sheet_name=sheet_name_test)
y_test = data_test.iloc[:, -1]
x_test = data_test.iloc[:, 0:-1]

x_test_selected = x_test[x_train_selected.columns]  # keep the same descriptors as the training set

# Build the test design matrix exactly like the training one (intercept always added)
x_test_with_const = sm.add_constant(x_test_selected[selected_features], has_constant='add')
y_test_pred = final_model.predict(x_test_with_const)

# Compute R2 and RMSE on the test set
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = math.sqrt(mse_test)

print(f"测试集上的R2: {r2_test}")
print(f"测试集上的均方根误差 (RMSE): {rmse_test}")



Add  EHUMO                          with p-value 1.95986e-84
R2 scores for each fold: [0.8034847932764174, 0.8673512233482636, 0.9678458126613109, 0.9427390234516702, 0.9493603962962314]
                            OLS Regression Results                            
Dep. Variable:                      E   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     1816.
Date:                Sun, 10 Mar 2024   Prob (F-statistic):           5.44e-92
Time:                        16:25:23   Log-Likelihood:                 4.4465
No. Observations:                 170   AIC:                            -4.893
Df Residuals:                     168   BIC:                             1.379
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef  

In [2]:
# Compute RMSE on the training set.
# Relies on final_model, x_train_selected, selected_features and y_train
# left in the kernel by the preceding training cell.
x_train_selected_with_const = sm.add_constant(x_train_selected[selected_features])
y_train_pred = final_model.predict(x_train_selected_with_const)
rmse_train = math.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"训练集上的均方根误差 (RMSE): {rmse_train}")

# Compute the adjusted R2 on the test set:
#   1 - (1 - R2) * (n - 1) / (n - k - 1)
# with n = number of test samples and k = number of selected features.
# Divides by zero when n == k + 1.
n = len(y_test)
k = len(selected_features)
adjusted_r2_test = 1 - ((1 - r2_test) * (n - 1) / (n - k - 1))
print(f"测试集上的校正后R2: {adjusted_r2_test}")


训练集上的均方根误差 (RMSE): 0.23572385428291753
测试集上的校正后R2: 0.9443260041539944


In [3]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error
# Compute MAPE of the fitted model on the training and test sets.
# Uses y_train_pred / y_test_pred produced by earlier cells.
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
print(f'MAPE (Training): {mape_train}')
print(f'MAPE (Testing): {mape_test}')

MAPE (Training): 0.07097620993666932
MAPE (Testing): 0.06307143512023944


In [7]:
# Function: compute predicted values from the single-descriptor linear model
def model_prediction(ehumo, slope=-27.21, intercept=-4.5):
    """Evaluate the linear model ``slope * ehumo + intercept``.

    Parameters
    ----------
    ehumo : scalar, numpy array or pandas Series
        Value(s) of the EHUMO descriptor; arithmetic is applied element-wise.
    slope : float, default -27.21
        Coefficient multiplying ``ehumo``.
        NOTE(review): presumably the rounded OLS coefficient - confirm against
        the regression summary.
    intercept : float, default -4.5
        Constant term. NOTE(review): presumably the rounded OLS intercept - confirm.

    Returns
    -------
    Same kind as ``ehumo``
        The predicted value(s).
    """
    return slope * ehumo + intercept
# Predictions of the hand-written linear model on the EHUMO column of each set
y_train_pred1 = model_prediction(x_train['EHUMO'])
y_test_pred1 = model_prediction(x_test['EHUMO'])
from sklearn.metrics import r2_score, mean_absolute_percentage_error
# Compute MAPE for the hand-written model.
# Note: this rebinds mape_train / mape_test set by the earlier MAPE cell.
mape_train = mean_absolute_percentage_error(y_train, y_train_pred1)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred1)
print(f'MAPE (Training): {mape_train}')
print(f'MAPE (Testing): {mape_test}')

MAPE (Training): 0.07626388777049935
MAPE (Testing): 0.0630709861549391


In [9]:
# Training-set RMSE of the hand-written linear model (uses y_train_pred1)
rmse_train1 = math.sqrt(mean_squared_error(y_train, y_train_pred1))
rmse_train1 

0.2537522887681248

In [10]:
# Test-set RMSE of the hand-written linear model (uses y_test_pred1)
rmse_test1 = math.sqrt(mean_squared_error(y_test, y_test_pred1))
rmse_test1

0.20738889270251223