## 先设定好要使用哪些模型

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV



from hyperparams_grid import *

# Models used in this notebook. Each entry is a 3-item list:
#   [estimator instance built with its default arguments, display name, grid object]
# The grid_* names come from the star import of hyperparams_grid
# (presumably the hyperparameter search space for that model - confirm there).
# Entries that are commented out are currently disabled.
algo=[
    #[BaggingClassifier(), 'BaggingClassifier', grid_bagging],
    [KNeighborsClassifier(), 'KNeighborsClassifier',grid_knn],
    [GradientBoostingClassifier(), 'GradientBoostingClassifier',grid_gbt],
    #[AdaBoostClassifier(), 'AdaBoostClassifier', grid_adaboost],
    [RandomForestClassifier(), 'RandomForestClassifier',grid_rf],
    [DecisionTreeClassifier(), 'DecisionTreeClassifier',grid_dt],
    [GaussianProcessClassifier(), 'GaussianProcessClassifier', grid_gaussian],
    #[SVC(), 'SVM tuning',grid_svm],
    [GaussianNB(), 'GaussianNB',grid_nb],
    [LogisticRegression(), 'LogisticRegression',grid_lr],
    [MLPClassifier(), 'MLPClassifier',grid_mlp]
    #[ExtraTreesClassifier(), 'ExtraTreesClassifier', grid_extratrees]
]

# Name of the code-smell dataset this notebook works on; it is interpolated
# into the result paths passed to the helper functions in later cells.
smelltype = 'LongMethod'

def print_data_model(models=None):
    """Print the name and current parameters of every configured model.

    Args:
        models: Optional list of entries shaped like the items of ``algo``:
            index 0 is an object exposing ``get_params()`` and index 1 is the
            display name. When omitted, the notebook-level ``algo`` list is
            used, which keeps the original zero-argument call working.

    Returns:
        None. For each entry, the name is printed on one line and the
        ``get_params()`` result on the next.
    """
    if models is None:
        # Fall back to the notebook-wide model list (original behaviour).
        models = algo
    for entry in models:
        print(entry[1])
        print(entry[0].get_params())


# Print each configured model's name followed by its get_params() output.
print_data_model()

DecisionTreeClassifier
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}


## 数据导入和数据处理

In [3]:
import pandas as pd

# Load both code-smell datasets. Forward slashes work on every platform and
# avoid the invalid "\d" / "\L" escape sequences of the backslash spelling.
LargeClass = pd.read_csv('../dataset/LargeClass.csv', encoding='UTF8')
LongMethod = pd.read_csv('../dataset/LongMethod.csv', encoding='UTF8')

# Fill every NaN value in LongMethod with -1.
LongMethod = LongMethod.fillna(-1)
# Fill every NaN value in LargeClass with -1.
LargeClass = LargeClass.fillna(-1)

# Split the 'smell' label column off the LongMethod features.
LabelLongMethod = LongMethod['smell']
LongMethod = LongMethod.drop('smell', axis=1)

# Split the 'smell' label column off the LargeClass features.
LabelLargeClass = LargeClass['smell']
LargeClass = LargeClass.drop('smell', axis=1)

# Standardize LongMethod column-wise: (value - mean) / std.
# A zero standard deviation (constant column) is replaced by 1 so that the
# column becomes all zeros instead of NaN from a 0/0 division.
mean = LongMethod.mean()
std = LongMethod.std().replace(0, 1)
LongMethod = (LongMethod - mean) / std

# Standardize LargeClass the same way.
mean = LargeClass.mean()
std = LargeClass.std().replace(0, 1)
LargeClass = (LargeClass - mean) / std

## 开始网格搜索：找到每个分类器在每个fold中的最佳参数

**注意：**
* 每个fold的最佳参数都不同
* 会保存每个分类器在每个fold中的最佳参数

In [3]:
from tuning_model import tuning_model
from default_model import default_model
from utils import write_report_result, filter_algo

# Filter the model list first (presumably drops the algorithms whose results
# already exist - confirm in utils.filter_algo).
# NOTE(review): this rebinds the notebook-wide `algo`, so every later cell
# sees the filtered list.
algo = filter_algo(algo)
smelltype = 'LongMethod'
X = LongMethod.to_numpy()
y = LabelLongMethod.to_numpy()


if algo:
    # Grid search.
    tuning_model_scores = tuning_model(X, y, smelltype, algo)
    # Default parameters. The backslash is escaped ("\\") like in the two
    # paths below; the bare "\{" was an invalid escape sequence.
    default_model_scores = default_model(f"tuning_model\\{smelltype}_data", smelltype, algo)
    tuning_res = write_report_result(tuning_model_scores, f"tuning_model\\{smelltype}")
    default_res = write_report_result(default_model_scores, f"default_model\\{smelltype}")
else:
    print("All algorithms have been run.")
    # Keep both names bound so the cells that display them do not raise
    # NameError when there was nothing left to run.
    tuning_res = default_res = None

tuning_model/LongMethod/tuning dose not exist
DecisionTreeClassifier Accuracy: 1.0000
fold 1:
DecisionTreeClassifier(criterion='entropy', max_depth=4)
Accuracy: 1.0
f-score: 1.0
[[32  0]
 [ 0  4]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00         4

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

----------------------------------------------------------------------------------------------------
DecisionTreeClassifier Accuracy: 0.9722
fold 2:
DecisionTreeClassifier(criterion='entropy', max_depth=6)
Accuracy: 0.9722222222222222
f-score: 0.8571428571428571
[[32  1]
 [ 0  3]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        33
           1       0.75      1.00      0.86         3

    accuracy                  

In [None]:
# Display the value write_report_result returned for the grid-searched models.
tuning_res

In [None]:

# Display the value write_report_result returned for the default-parameter models.
default_res

## Begin using ensemble models

* 每一个fold应用不同模型参数设置

In [None]:
from sklearn.model_selection import KFold
import numpy as np

from ensemble import VoteEnsemble, StackingEnsemble
from hyperparams_grid import *
from utils import write_report_result, get_params_dict

# Build one stacking and one voting ensemble from the current `algo` list.
# Both objects are notebook-level globals that ensemble_predict re-configures
# and reuses on every fold.
se_classifer = StackingEnsemble(algo)
ve_classifer = VoteEnsemble(algo)

def ensemble_predict(X, y, params_dict, default=False):
    """Run both ensembles over a shuffled 10-fold split and pool their predictions.

    Works on the notebook-level ``se_classifer`` (stacking) and ``ve_classifer``
    (voting) objects: on every fold each one is configured through
    ``set_params`` and then run through ``forward``, which is expected to
    return a ``(predictions, accuracy, f1)`` triple.

    Args:
        X: Feature matrix; rows are selected with the integer index arrays
            produced by ``KFold.split``.
        y: Target vector aligned with the rows of ``X``.
        params_dict: Per-fold parameter settings, looked up by zero-based fold
            number. Not read when ``default`` is true.
        default: When true, ``set_params()`` is called without arguments
            instead of with the fold's entry of ``params_dict``.

    Returns:
        A two-item list
        ``[[se_predictions, se_targets, 'se'], [ve_predictions, ve_targets, 've']]``
        whose arrays hold the test-fold values appended in fold order.
    """
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    stack_pred, stack_true = np.array([]), np.array([])
    vote_pred, vote_true = np.array([]), np.array([])

    for fold_idx, (train_rows, test_rows) in enumerate(splitter.split(X)):
        # Configure both ensembles before either of them is run.
        if not default:
            se_classifer.set_params(params_dict[fold_idx])
            ve_classifer.set_params(params_dict[fold_idx])
        else:
            se_classifer.set_params()
            ve_classifer.set_params()

        # Train/test partitions of this fold, shared by both ensembles.
        fold_data = {
            'train_X': X[train_rows],
            'train_y': y[train_rows],
            'test_X': X[test_rows],
            'test_y': y[test_rows],
        }
        held_out_y = fold_data['test_y']

        stack_out, stack_acc, stack_f1 = se_classifer.forward(**fold_data)
        vote_out, vote_acc, vote_f1 = ve_classifer.forward(**fold_data)

        stack_pred = np.append(stack_pred, stack_out)
        stack_true = np.append(stack_true, held_out_y)

        # Fold numbers are reported starting from 1.
        print(f'Fold :{fold_idx + 1}\n Stacking Ensemble Accuracy: {stack_acc}, Stacking Ensemble F1: {stack_f1}\n Vote Ensemble Accuracy: {vote_acc}, Vote Ensemble F1: {vote_f1}')

        vote_pred = np.append(vote_pred, vote_out)
        vote_true = np.append(vote_true, held_out_y)

    return [
        [stack_pred, stack_true, 'se'],
        [vote_pred, vote_true, 've'],
    ]

In [15]:
# Feature matrix and labels of the LongMethod dataset as NumPy arrays.
X = LongMethod.to_numpy()
y = LabelLongMethod.to_numpy()

# Fetch the model parameters for every fold.
params_dict = get_params_dict(algo=algo, smelltype=smelltype)
# Ensemble models configured with the grid-searched parameters.
model_scores = ensemble_predict(X, y, params_dict)
write_report_result(model_scores, f"tuning_model\\{smelltype}")

{'DecisionTreeClassifier': {'criterion': 'entropy', 'max_depth': 4}}
Fold :1
 Stacking Ensemble Accuracy: 1.0, Stacking Ensemble F1: 1.0
 Vote Ensemble Accuracy: 1.0, Vote Ensemble F1: 1.0
{'DecisionTreeClassifier': {'criterion': 'entropy', 'max_depth': 6}}
Fold :2
 Stacking Ensemble Accuracy: 0.9722222222222222, Stacking Ensemble F1: 0.8571428571428571
 Vote Ensemble Accuracy: 1.0, Vote Ensemble F1: 1.0
{'DecisionTreeClassifier': {'criterion': 'entropy', 'max_depth': 2}}
Fold :3
 Stacking Ensemble Accuracy: 1.0, Stacking Ensemble F1: 1.0
 Vote Ensemble Accuracy: 1.0, Vote Ensemble F1: 1.0
{'DecisionTreeClassifier': {'criterion': 'entropy', 'max_depth': 4}}
Fold :4
 Stacking Ensemble Accuracy: 1.0, Stacking Ensemble F1: 1.0
 Vote Ensemble Accuracy: 1.0, Vote Ensemble F1: 1.0
{'DecisionTreeClassifier': {'criterion': 'entropy', 'max_depth': 4}}
Fold :5
 Stacking Ensemble Accuracy: 0.9722222222222222, Stacking Ensemble F1: 0.6666666666666666
 Vote Ensemble Accuracy: 0.9722222222222222, Vo

Unnamed: 0,Model,precision,recall,f1-score
0,se,0.925926,0.892857,0.909091
1,ve,0.925926,0.925926,0.925926


In [None]:
# Ensemble models with default parameters (default=True makes
# ensemble_predict call set_params() without arguments).

model_scores = ensemble_predict(X, y, params_dict, default=True)
write_report_result(model_scores, f"default_model\\{smelltype}")