# Boosting

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler


In [19]:
df=pd.read_csv("./Data/mushroom_cleaned.csv")

In [20]:
X = df.drop(columns=['class'])

In [21]:
# Chuẩn hóa
scaler = StandardScaler()
scaler.fit(X) # Tính trung bình và độ lệch chuẩn của từng đặc trưng
X = scaler.transform(X) # Trừ tb và chia cho độ lệch chuẩn

In [22]:
# Xử lý outlier

from scipy import stats

# Hàm để loại bỏ outlier bằng Z-score
def remove_outliers_zscore(df, column, threshold=2):
    z_scores = np.abs(stats.zscore(df[column]))
    return df[(z_scores < threshold)]

# Loại bỏ outlier trong các cột 'stem-width', 'stem-height'
df = remove_outliers_zscore(df, 'cap-diameter')
df = remove_outliers_zscore(df, 'stem-width')
df = remove_outliers_zscore(df, 'stem-height')

print(df.shape)  # Kích thước dữ liệu sau khi loại bỏ outlier

(46211, 9)


In [23]:
# chia tập 
X = df.drop(columns=['class'])
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0, stratify=y_train)

In [31]:
# Baseline
DT = DecisionTreeClassifier(max_depth=3)
DT_Adaboost = AdaBoostClassifier(DT, random_state=0)
DT_Adaboost.fit(X_train, y_train).score(X_val, y_val)



0.9598461168550132

In [34]:
# AdaBoost với Decision Tree
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1]
}

DT = DecisionTreeClassifier(max_depth=3)
DT_AdaBoost_GS = GridSearchCV(AdaBoostClassifier(DT, random_state=0), param_grid=param_grid)
DT_AdaBoost_GS.fit(X_train, y_train)
print(f'Accuracy: {DT_AdaBoost_GS.score(X_val, y_val)}')



Accuracy: 0.983649915845155


In [35]:
DT_AdaBoost_GS.best_params_

{'learning_rate': 1, 'n_estimators': 200}

In [25]:
# Adaboost với KNN
KNN = KNeighborsClassifier(n_neighbors=3)
KNN_AdaBoost = GridSearchCV(AdaBoostClassifier(DT, random_state=0), param_grid=param_grid)
KNN_AdaBoost.fit(X_train, y_train)
print(f'Accuracy: {KNN_AdaBoost.score(X_val, y_val)}')



Accuracy: 0.983649915845155


In [36]:
KNN_AdaBoost.best_params_

{'learning_rate': 1, 'n_estimators': 200}

In [26]:
GB_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [1, 3, 5, 7]
}

In [30]:
# Gradient Boosting với mặc định sử dụng cây quyết định
GB = GradientBoostingClassifier(random_state=0)
GB_grid_search = GridSearchCV(GB, param_grid=GB_param_grid, n_jobs=-1)
GB_grid_search.fit(X_train, y_train)
print(f'Accuracy: {GB_grid_search.score(X_val, y_val)}')

Accuracy: 0.9884587641259919


In [37]:
GB_grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}

In [28]:
# XGBoost 
import xgboost as xgb

# Sử dụng GPU để huấn luyện mô hình
xgb_model = xgb.XGBClassifier(use_label_encoder=False, tree_method='gpu_hist', eval_metric='logloss', random_state=0)
xgb_grid_search = GridSearchCV(xgb_model, param_grid=GB_param_grid, n_jobs=-1)
xgb_grid_search.fit(
    X_train, y_train,
    early_stopping_rounds=10, # Nếu không cải thiện sau 10 vòng lặp thì dừng
    eval_set=[(X_val, y_val)], # Đánh giá hiệu suất
    verbose=True # Hiển thị quá trình
)
print(f'Accuracy: {xgb_grid_search.score(X_val, y_val)}')

[0]	validation_0-logloss:0.64209
[1]	validation_0-logloss:0.60156
[2]	validation_0-logloss:0.56431
[3]	validation_0-logloss:0.53660
[4]	validation_0-logloss:0.51037
[5]	validation_0-logloss:0.48627
[6]	validation_0-logloss:0.46767
[7]	validation_0-logloss:0.44925
[8]	validation_0-logloss:0.42932
[9]	validation_0-logloss:0.40990
[10]	validation_0-logloss:0.39746
[11]	validation_0-logloss:0.38428
[12]	validation_0-logloss:0.36945
[13]	validation_0-logloss:0.35819
[14]	validation_0-logloss:0.34167
[15]	validation_0-logloss:0.33241
[16]	validation_0-logloss:0.32025
[17]	validation_0-logloss:0.31080
[18]	validation_0-logloss:0.30377
[19]	validation_0-logloss:0.29707
[20]	validation_0-logloss:0.29136



    E.g. tree_method = "hist", device = "cuda"



[21]	validation_0-logloss:0.28266
[22]	validation_0-logloss:0.27399
[23]	validation_0-logloss:0.26947
[24]	validation_0-logloss:0.26386
[25]	validation_0-logloss:0.25941
[26]	validation_0-logloss:0.25155
[27]	validation_0-logloss:0.24723
[28]	validation_0-logloss:0.24406
[29]	validation_0-logloss:0.24039
[30]	validation_0-logloss:0.23717
[31]	validation_0-logloss:0.23126
[32]	validation_0-logloss:0.22506
[33]	validation_0-logloss:0.22095
[34]	validation_0-logloss:0.21212
[35]	validation_0-logloss:0.20992
[36]	validation_0-logloss:0.20112
[37]	validation_0-logloss:0.19167
[38]	validation_0-logloss:0.18993
[39]	validation_0-logloss:0.18182
[40]	validation_0-logloss:0.17957
[41]	validation_0-logloss:0.17224
[42]	validation_0-logloss:0.17061
[43]	validation_0-logloss:0.16434
[44]	validation_0-logloss:0.16288
[45]	validation_0-logloss:0.15721
[46]	validation_0-logloss:0.15615
[47]	validation_0-logloss:0.15065
[48]	validation_0-logloss:0.14895
[49]	validation_0-logloss:0.14792
[50]	validatio


    E.g. tree_method = "hist", device = "cuda"



In [38]:
xgb_grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}

In [29]:
y_pre = xgb_grid_search.predict(X_test)
evaluate = classification_report(y_test, y_pre, output_dict=True)
xgb_grid_search_df = pd.DataFrame(evaluate).transpose()
xgb_grid_search_df

Unnamed: 0,precision,recall,f1-score,support
0,0.987494,0.990246,0.988868,2153.0
1,0.991474,0.989064,0.990268,2469.0
accuracy,0.989615,0.989615,0.989615,0.989615
macro avg,0.989484,0.989655,0.989568,4622.0
weighted avg,0.98962,0.989615,0.989616,4622.0
