## 先设定好要使用哪些模型

In [1]:
import os

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

from tuning_model import tuning_model
from default_model import default_model
from utils import write_report_result, filter_algo
import numpy as np
from hyperparams_grid import *
import pandas as pd


# Registry of the candidate classifiers used throughout the notebook.
# Each entry is [estimator instance, display name, grid_* object]. The grid_*
# names come from the star import of hyperparams_grid (presumably the
# hyper-parameter search grids -- confirm in that module).
# Entries that are commented out are currently excluded from every run.
algo=[
    #[BaggingClassifier(), 'BaggingClassifier', grid_bagging],
    [KNeighborsClassifier(), 'KNeighborsClassifier',grid_knn],
    [GradientBoostingClassifier(), 'GradientBoostingClassifier',grid_gbt],
    [AdaBoostClassifier(), 'AdaBoostClassifier', grid_adaboost],
    [RandomForestClassifier(), 'RandomForestClassifier',grid_rf],
    [DecisionTreeClassifier(), 'DecisionTreeClassifier',grid_dt],
    # [GaussianProcessClassifier(), 'GaussianProcessClassifier', grid_gaussian],  ## this one is problematic, so it is left out for now
    [SVC(probability=True), 'SVM tuning',grid_svm],
    [GaussianNB(), 'GaussianNB',grid_nb],
    [LogisticRegression(), 'LogisticRegression',grid_lr],
    [MLPClassifier(), 'MLPClassifier',grid_mlp]
    #[ExtraTreesClassifier(), 'ExtraTreesClassifier', grid_extratrees]
]

# Identifier that later cells interpolate into the tuning/default result paths.
smelltype = 'LargeClass'

def print_data_model(models=None):
    """Print the display name and current parameters of each model entry.

    Args:
        models: Optional iterable of entries whose item 0 is an estimator
            exposing ``get_params()`` and whose item 1 is its display name.
            When omitted, the notebook-level ``algo`` list is used, so
            existing ``print_data_model()`` calls behave exactly as before.
    """
    # Resolve the default at call time so the current `algo` is always used.
    entries = algo if models is None else models
    for entry in entries:
        estimator, label = entry[0], entry[1]
        print(label)
        print(estimator.get_params())


# Show each configured model's display name followed by its current parameters.
print_data_model()

KNeighborsClassifier
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
GradientBoostingClassifier
{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
AdaBoostClassifier
{'algorithm': 'SAMME.R', 'base_estimator': 'deprecated', 'estimator': None, 'learning_rate': 1.0, 'n_estimators': 50, 'random_state': None}
RandomForestClassifier
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impuri

## 数据导入和数据处理

In [None]:
from utils import data_processing

# Set the smell name first so the dataset path can be derived from it.
smelltype = 'LargeClass'
# Build the path with os.path.join instead of a hard-coded backslash literal:
# the old 'dataset\LargeClass.csv' relied on the invalid escape sequence '\L'
# (a warning on current Python versions) and only resolved to the intended
# file on Windows. On Windows the joined path is the same string as before.
X, y = data_processing(os.path.join('dataset', f'{smelltype}.csv'))

## 开始网格搜索，找到每个分类器在每个 fold 中的最佳参数

**注意：**
* 每个fold的最佳参数都不同
* 会保存每个分类器在每个fold中的最佳参数

In [None]:
# Filter first. The result is bound to its own name instead of overwriting
# `algo`: the ensemble cells further down build their models from `algo`, and
# rebinding it here would leave them with a reduced (possibly empty) list once
# this cell has been run. Presumably filter_algo drops the algorithms that
# already have results under `tuned_path` -- confirm in utils.
pending_algo = filter_algo(algo, tuned_path = f"tuning_model/{smelltype}/tuning")

if pending_algo:
    # Grid search.
    tuning_model_scores = tuning_model(X, y, smelltype, pending_algo)
    # os.path.join yields the same backslash path as before on Windows and a
    # valid directory path on other platforms.
    tuning_res = write_report_result(tuning_model_scores, os.path.join("tuning_model", smelltype))
    # Default parameters.
    default_model_scores = default_model(os.path.join("tuning_model", f"{smelltype}_data"), smelltype, pending_algo)
    default_res = write_report_result(default_model_scores, os.path.join("default_model", smelltype))
else:
    # Note: tuning_res / default_res are not (re)bound on this branch.
    print("All algorithms have been run.")

In [5]:
# Display the report for the tuned models; `tuning_res` is only bound when the
# tuning cell above ran with a non-empty algorithm list.
tuning_res

Unnamed: 0,Model,precision,recall,f1-score
0,KNeighborsClassifier,0.882353,0.384615,0.535714
1,GradientBoostingClassifier,0.885714,0.794872,0.837838
2,AdaBoostClassifier,0.842105,0.820513,0.831169
3,RandomForestClassifier,0.810811,0.769231,0.789474
4,DecisionTreeClassifier,0.885714,0.794872,0.837838
5,SVM tuning,0.794118,0.692308,0.739726
6,GaussianNB,0.568627,0.74359,0.644444
7,LogisticRegression,0.783784,0.74359,0.763158
8,MLPClassifier,0.794118,0.692308,0.739726


In [6]:
# Display the report for the default-parameter models; `default_res` is only
# bound when the tuning cell above ran with a non-empty algorithm list.
default_res

Unnamed: 0,Model,precision,recall,f1-score
0,KNeighborsClassifier,0.9,0.461538,0.610169
1,GradientBoostingClassifier,0.861111,0.794872,0.826667
2,AdaBoostClassifier,0.805556,0.74359,0.773333
3,RandomForestClassifier,0.868421,0.846154,0.857143
4,DecisionTreeClassifier,0.882353,0.769231,0.821918
5,SVM tuning,0.814815,0.564103,0.666667
6,GaussianNB,0.568627,0.74359,0.644444
7,LogisticRegression,0.827586,0.615385,0.705882
8,MLPClassifier,0.787879,0.666667,0.722222


## Begin using ensemble models

* 每一个fold应用不同模型参数设置

In [4]:
from sklearn.model_selection import KFold
import numpy as np

from ensemble import VoteEnsemble, StackingEnsemble
from hyperparams_grid import *
from utils import write_report_result, get_params_dict

# Build the stacking and voting ensembles from the notebook-level `algo` list.
# ensemble_predict() below reads these two module-level instances directly, so
# this cell must be run before it is called.
se_classifer = StackingEnsemble(algo)
ve_classifer = VoteEnsemble(algo)

def ensemble_predict(X, y, params_dict, default=False):
    """Cross-validate the stacking and voting ensembles over 10 shuffled folds.

    Uses the module-level ``se_classifer`` and ``ve_classifer`` instances.
    Before each fold both ensembles are reconfigured: with no arguments when
    ``default`` is true, otherwise with ``params_dict[fold - 1]``.

    Args:
        X: Feature container indexed with the integer index arrays produced
            by ``KFold.split``.
        y: Target container indexed the same way as ``X``.
        params_dict: Per-fold parameters, looked up with the zero-based fold
            number; not read when ``default`` is true.
        default: When true, call ``set_params()`` without arguments.

    Returns:
        A two-element list
        ``[[se_predictions, se_actuals, 'se'], [ve_predictions, ve_actuals, 've']]``
        where the arrays hold the out-of-fold values of all folds in order.
    """
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)

    # Accumulators start as empty arrays and grow fold by fold.
    se_preds_all, se_truth_all = np.array([]), np.array([])
    ve_preds_all, ve_truth_all = np.array([]), np.array([])

    for fold, (train_idx, test_idx) in enumerate(splitter.split(X), 1):
        # Configure both ensembles (stacking first, then voting) for this fold.
        for ensemble in (se_classifer, ve_classifer):
            if default:
                ensemble.set_params()
            else:
                ensemble.set_params(params_dict[fold-1])

        # Slice this fold's train/test partitions once and share them.
        fold_data = dict(
            train_X=X[train_idx],
            train_y=y[train_idx],
            test_X=X[test_idx],
            test_y=y[test_idx],
        )
        fold_truth = fold_data["test_y"]

        se_pred, se_acc, se_f1 = se_classifer.forward(**fold_data)
        ve_pred, ve_acc, ve_f1 = ve_classifer.forward(**fold_data)

        se_preds_all = np.append(se_preds_all, se_pred)
        se_truth_all = np.append(se_truth_all, fold_truth)

        print(f'Fold :{fold}\n Stacking Ensemble Accuracy: {se_acc}, Stacking Ensemble F1: {se_f1}\n Vote Ensemble Accuracy: {ve_acc}, Vote Ensemble F1: {ve_f1}')

        ve_preds_all = np.append(ve_preds_all, ve_pred)
        ve_truth_all = np.append(ve_truth_all, fold_truth)

    return [
        [se_preds_all, se_truth_all, 'se'],
        [ve_preds_all, ve_truth_all, 've'],
    ]

In [15]:
# Fetch the model parameters for every fold.
params_dict = get_params_dict(algo=algo, smelltype=smelltype)
# Evaluate the ensembles with the grid-searched per-fold parameters.
model_scores = ensemble_predict(X, y, params_dict)
# os.path.join yields the same backslash path as before on Windows and a valid
# directory path elsewhere; the call stays last so its result is displayed.
write_report_result(model_scores, os.path.join("tuning_model", smelltype))



Fold :1
 Stacking Ensemble Accuracy: 0.9444444444444444, Stacking Ensemble F1: 0.875
 Vote Ensemble Accuracy: 0.9444444444444444, Vote Ensemble F1: 0.875




Fold :2
 Stacking Ensemble Accuracy: 1.0, Stacking Ensemble F1: 1.0
 Vote Ensemble Accuracy: 1.0, Vote Ensemble F1: 1.0




Fold :3
 Stacking Ensemble Accuracy: 0.8888888888888888, Stacking Ensemble F1: 0.5
 Vote Ensemble Accuracy: 0.9166666666666666, Vote Ensemble F1: 0.6666666666666665




Fold :4
 Stacking Ensemble Accuracy: 1.0, Stacking Ensemble F1: 1.0
 Vote Ensemble Accuracy: 1.0, Vote Ensemble F1: 1.0




Fold :5
 Stacking Ensemble Accuracy: 0.9444444444444444, Stacking Ensemble F1: 0.6666666666666666
 Vote Ensemble Accuracy: 0.9444444444444444, Vote Ensemble F1: 0.6666666666666666




Fold :6
 Stacking Ensemble Accuracy: 1.0, Stacking Ensemble F1: 1.0
 Vote Ensemble Accuracy: 0.9722222222222222, Vote Ensemble F1: 0.6666666666666666




Fold :7
 Stacking Ensemble Accuracy: 1.0, Stacking Ensemble F1: 1.0
 Vote Ensemble Accuracy: 1.0, Vote Ensemble F1: 1.0




Fold :8
 Stacking Ensemble Accuracy: 0.8888888888888888, Stacking Ensemble F1: 0.6666666666666666
 Vote Ensemble Accuracy: 0.8888888888888888, Vote Ensemble F1: 0.6666666666666666




Fold :9
 Stacking Ensemble Accuracy: 0.9444444444444444, Stacking Ensemble F1: 0.8000000000000002
 Vote Ensemble Accuracy: 0.9444444444444444, Vote Ensemble F1: 0.8000000000000002
Fold :10
 Stacking Ensemble Accuracy: 1.0, Stacking Ensemble F1: 1.0
 Vote Ensemble Accuracy: 1.0, Vote Ensemble F1: 1.0
all
se                  
[[316   9]
 [  5  30]]
              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98       325
         1.0       0.77      0.86      0.81        35

    accuracy                           0.96       360
   macro avg       0.88      0.91      0.89       360
weighted avg       0.96      0.96      0.96       360

all
ve                  
[[316   9]
 [  5  30]]
              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98       325
         1.0       0.77      0.86      0.81        35

    accuracy                           0.96       360
   macro avg       0.88      0.91      0.89       360
weig



Unnamed: 0,Model,precision,recall,f1-score
0,se,0.769231,0.857143,0.810811
1,ve,0.769231,0.857143,0.810811


In [5]:
# Evaluate the ensembles with default parameters (params_dict is not read
# when default=True, so an empty dict is passed).
# NOTE(review): the saved output of this cell is an UnboundLocalError about
# 'classifier'; the failing code is not in this notebook -- check the helper
# modules this cell calls into.
model_scores = ensemble_predict(X, y, {}, default=True)
# os.path.join yields the same backslash path as before on Windows and a valid
# directory path elsewhere; the call stays last so its result is displayed.
write_report_result(model_scores, os.path.join("default_model", smelltype))

UnboundLocalError: local variable 'classifier' referenced before assignment