前面训练过程耗时比较长，建议老师直接加载训练好的模型进行测试，也就是运行第6和第7部分
如果不使用训练好的ada_best.pkl，那么请运行第1、2、4、7部分。

# 1、导入所需要的库

In [1]:
# 导入所需的库
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# 2、数据预处理

In [2]:
data = pd.read_csv('./train.csv') # 读取数据集
data = data.drop(data.columns[data.nunique() == 1], axis=1) # 删除重复的列
data = data.drop(data.columns[data.columns.str.contains('home')], axis=1)  # 删除包含图片地址的列
data = data.drop(data.columns[2], axis=1) # 删除全是字符串的一列
data = data.drop(data.columns[data.columns.str.contains('\(')], axis=1)  # 暂时删掉元组类型的数据
data.columns = [np.arange(0,98)]
data.astype(float)

# 分配自变量和因变量
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

# 训练集和测试集划分
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3、参数调优（这一步耗时较长）--参数调优的结果是n_estimators=500，learning_rate=0.9时模型最好
- 使用adaboost集成学习
- 学习率通过learning_rate缩小每个分类器的贡献程度。learning_rate和n_estimators之间存在权衡关系。
- n_estimators是终止推进的估计器的最大数目。如果完全拟合，学习过程就会提前停止。

In [3]:
# 构建AdaBoost模型 集成学习
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=8), random_state=42) 

# 定义要调参的参数
param_grid = {
    'n_estimators': [10,50,100,150,200,300,400,500],
    'learning_rate': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
}

# 使用GridSearchCV进行参数调优
grid_search = GridSearchCV(estimator=ada, param_grid=param_grid, cv=5, scoring='roc_auc',n_jobs=-1,verbose=2)
grid_search.fit(X_train, y_train)

# 输出最优参数
print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best parameters: {'learning_rate': 0.9, 'n_estimators': 500}


# 4、使用最优参数构建模型

In [3]:
# 使用最优参数构建模型
ada_best = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=8),
                              n_estimators=500,
                              learning_rate=0.9,
                              random_state=42)

# 训练模型并预测
ada_best.fit(X_train, y_train)
y_pred = ada_best.predict(X_test)


# 计算分类指标
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# 计算AUC
auc = roc_auc_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Accuracy: 0.92625
Precision: 0.9367311072056239
Recall: 0.9586330935251799
AUC: 0.9055460549593113


# 5、保存训练好的ada_best模型

In [4]:
import pickle

# 保存模型
with open('ada_best.pkl', 'wb') as f:
    pickle.dump(ada_best, f)

# 6、加载ada_best模型

In [3]:
import pickle
# 加载模型
with open('ada_best.pkl', 'rb') as f:
    ada_best = pickle.load(f)



# 7、测试
- 老师只需更改 path的值即可

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
def test_data(path):
    data = pd.read_csv(path) # 读取数据集
    data = data.drop(data.columns[data.nunique() == 1], axis=1) # 删除重复的列
    data = data.drop(data.columns[data.columns.str.contains('home')], axis=1)  # 删除包含图片地址的列
    data = data.drop(data.columns[2], axis=1) # 删除全是字符串的一列
    data = data.drop(data.columns[data.columns.str.contains('\(')], axis=1)  # 暂时删掉元组类型的数据
    data.columns = [np.arange(0,98)]
    data.astype(float)
    X = data.iloc[:,:-1]
    y = data.iloc[:,-1]
    return X,y

path='train.csv' # 请自行更改
newdata,newtarget = test_data(path)
new_pred = ada_best.predict(newdata)



# 计算分类指标
accuracy = accuracy_score(newtarget, new_pred)
precision = precision_score(newtarget, new_pred)
recall = recall_score(newtarget, new_pred)
# 计算AUC
auc = roc_auc_score(newtarget, new_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Accuracy: 0.9852426213106553
Precision: 0.9867011451791651
Recall: 0.9914625092798812
AUC: 0.9819275736583455
