In [None]:
import pandas as pd  
from sklearn.model_selection import KFold, cross_val_score  
from sklearn.preprocessing import StandardScaler  # 可选，用于标准化特征  
from sklearn.base import BaseEstimator, ClassifierMixin  # 假设我们使用的是分类器  
  
# 假设你有一个包含xls文件路径的列表  
xls_files = [
    # Replace with the path(s) of your own .xls file(s).
    "C:\\Users\\admin\\Desktop\\三个月学习\\算法学习测试数据集\\bal.xls",
]
  
# 定义一个函数来读取xls文件并返回数据  
def read_xls(file_path):
    """Load the spreadsheet at ``file_path`` with ``pandas.read_excel``.

    The call uses pandas' default options and the result is returned
    unchanged.
    """
    frame = pd.read_excel(file_path)
    return frame
  
# 定义一个函数来执行十次十折交叉验证  
def ten_times_ten_fold_cv(estimator, X, y, n_repeats=10, n_splits=10,
                          scoring='accuracy', random_state=42):
    """Run repeated, shuffled k-fold cross-validation and summarise the scores.

    Each repetition uses its own ``KFold`` splitter seeded with
    ``random_state + repetition_index``. Reusing one splitter with a fixed
    integer seed would yield the very same partition on every repetition,
    making the repetitions redundant; distinct seeds give different
    partitions while keeping the whole procedure reproducible.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model passed to ``cross_val_score`` (cloned and fitted per fold).
    X, y : array-like
        Features and target.
    n_repeats : int, default 10
        Number of times the k-fold procedure is repeated.
    n_splits : int, default 10
        Number of folds per repetition.
    scoring : str, default 'accuracy'
        Scoring metric forwarded to ``cross_val_score``.
    random_state : int or None, default 42
        Base seed. ``None`` leaves every repetition unseeded.

    Returns
    -------
    (mean_score, std_score)
        Mean and sample standard deviation (``ddof=1``, the pandas default)
        over all ``n_repeats * n_splits`` fold scores.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    all_fold_scores = []
    for repeat in range(n_repeats):
        # New seed per repetition -> new partition, still deterministic.
        seed = None if random_state is None else random_state + repeat
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        all_fold_scores.extend(
            cross_val_score(estimator, X, y, cv=kf, scoring=scoring)
        )

    scores = pd.Series(all_fold_scores, dtype=float)
    return scores.mean(), scores.std()
  
# Imports are done once here rather than on every loop iteration.
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Run the repeated cross-validation on every spreadsheet listed in xls_files.
for file in xls_files:
    data = read_xls(file)
    # Assumes the last column is the target and all other columns are features.
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # The scaler is part of the pipeline so that it is re-fitted on the
    # training portion of each fold only. Fitting it on the full dataset
    # beforehand would leak statistics of the held-out fold into training.
    # The random forest classifier is used as an example model.
    model = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

    # Repeated ten-fold cross-validation on the unscaled features.
    mean_score, std_score = ten_times_ten_fold_cv(model, X, y)

    print(f"Results for {file}: Mean Score = {mean_score:.4f}, Std Score = {std_score:.4f}")

# Note: the code above assumes a classification target. For a regression task,
# replace `RandomForestClassifier` with `RandomForestRegressor` and change the
# scoring metric used by `ten_times_ten_fold_cv` accordingly (for example
# 'neg_mean_squared_error').

In [8]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Seed the synthetic dataset too: without a seed every kernel run draws
# different data, so none of the printed scores can be reproduced.
X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Baseline: forest with default hyper-parameters, scored out-of-bag.
estimator = RandomForestClassifier(oob_score=True, random_state=1)
estimator.fit(X_train, y_train)
print(estimator.oob_score_)

# Step 1: tune the outer bagging layer, i.e. n_estimators only; every other
# hyper-parameter keeps its default value.
param_test1 = {'n_estimators': range(1, 101, 10)}
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=10,
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

# Step 2: tune max_features of the individual trees while n_estimators is
# fixed at the value selected in step 1.
best_n_estimators = grid_search.best_params_['n_estimators']
# Upper bound follows the actual number of feature columns (20 for the
# default make_classification output).
param_test2 = {'max_features': range(1, X_train.shape[1] + 1)}
grid_search_1 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=best_n_estimators, random_state=1),
    param_grid=param_test2,
    scoring='roc_auc',
    cv=10,
)
grid_search_1.fit(X_train, y_train)
print(grid_search_1.best_params_)
print(grid_search_1.best_score_)

# Step 3: refit on the training data with both tuned values and report the
# out-of-bag score as the generalisation estimate.
rfl = RandomForestClassifier(
    n_estimators=best_n_estimators,
    max_features=grid_search_1.best_params_['max_features'],
    oob_score=True,
    random_state=1,
)
rfl.fit(X_train, y_train)
print(rfl.oob_score_)


0.925
{'n_estimators': 61}
0.99375
{'max_features': 6}
1.0
0.9375


In [7]:
# Refit the forest with the tuned hyper-parameters and print its OOB score.
# Relies on grid_search, grid_search_1, X_train and y_train already being
# present in the kernel from the tuning cell.
rfl = RandomForestClassifier(
    random_state=1,
    oob_score=True,
    max_features=grid_search_1.best_params_['max_features'],
    n_estimators=grid_search.best_params_['n_estimators'],
).fit(X_train, y_train)
print(rfl.oob_score_)

0.7875


In [3]:
import numpy as np  
from sklearn.datasets import make_classification  
from sklearn.model_selection import train_test_split  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.model_selection import GridSearchCV  
  
# 生成数据集  
# Seeded so that the dataset, and therefore every score printed below, is
# the same on each run.
X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Joint parameter grid searched in a single pass.
param_grid = {
    'n_estimators': range(1, 101, 10),                # 1, 11, ..., 91
    'max_features': range(1, X_train.shape[1] + 1),   # 1 .. number of features
}

# The searched estimator does not request the OOB score: the search ranks
# candidates by cross-validated ROC AUC only, so computing OOB on every fit
# is wasted work and warns for very small forests (n_estimators=1).
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    param_grid=param_grid,
    scoring='roc_auc',  # rank candidates by ROC AUC
    cv=10,              # 10-fold cross-validation
)

# Run the grid search.
grid_search.fit(X_train, y_train)

# Report the best parameter combination and its cross-validated score.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))

# Refit with the best parameters, this time with OOB scoring enabled, and
# report the out-of-bag score.
best_rf = RandomForestClassifier(**grid_search.best_params_, random_state=1, oob_score=True)
best_rf.fit(X_train, y_train)
print("OOB score with best parameters: {:.4f}".format(best_rf.oob_score_))



Best parameters found:  {'max_features': 7, 'n_estimators': 31}
Best cross-validation score: 0.9081
OOB score with best parameters: 0.8000
