# Boosting

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler


In [13]:
df=pd.read_csv("./Data/mushroom_cleaned.csv")

In [14]:
X = df.drop(columns=['class'])

In [15]:
# Chuẩn hóa
scaler = StandardScaler()
scaler.fit(X) # Tính trung bình và độ lệch chuẩn của từng đặc trưng
X = scaler.transform(X) # Trừ tb và chia cho độ lệch chuẩn

In [16]:
# Xử lý outlier

from scipy import stats

# Hàm để loại bỏ outlier bằng Z-score
def remove_outliers_zscore(df, column, threshold=2):
    z_scores = np.abs(stats.zscore(df[column]))
    return df[(z_scores < threshold)]

# Loại bỏ outlier trong các cột 'stem-width', 'stem-height'
df = remove_outliers_zscore(df, 'cap-diameter')
df = remove_outliers_zscore(df, 'stem-width')
df = remove_outliers_zscore(df, 'stem-height')

print(df.shape)  # Kích thước dữ liệu sau khi loại bỏ outlier

(46211, 9)


In [17]:
# chia tập 
X = df.drop(columns=['class'])
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0, stratify=y_train)

In [18]:
# AdaBoost với Decision Tree
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1]
}

DT = DecisionTreeClassifier(max_depth=3)
DT_AdaBoost = GridSearchCV(AdaBoostClassifier(DT, random_state=0), param_grid=param_grid)
DT_AdaBoost.fit(X_train, y_train).score(X_val, y_val)



0.983649915845155

In [19]:
y_pre = DT_AdaBoost.predict(X_test)
evaluate = classification_report(y_test, y_pre, output_dict=True)
DT_AdaBoost_df = pd.DataFrame(evaluate).transpose()
DT_AdaBoost_df

Unnamed: 0,precision,recall,f1-score,support
0,0.983302,0.984673,0.983987,2153.0
1,0.986618,0.985419,0.986018,2469.0
accuracy,0.985071,0.985071,0.985071,0.985071
macro avg,0.98496,0.985046,0.985003,4622.0
weighted avg,0.985074,0.985071,0.985072,4622.0


In [20]:
# Adaboost với KNN
KNN = KNeighborsClassifier(n_neighbors=3)
KNN_AdaBoost = GridSearchCV(AdaBoostClassifier(DT, random_state=0), param_grid=param_grid)
KNN_AdaBoost.fit(X_train, y_train).score(X_val, y_val)



0.983649915845155

In [21]:
y_pre = KNN_AdaBoost.predict(X_test)
evaluate = classification_report(y_test, y_pre, output_dict=True)
KNN_AdaBoost_df = pd.DataFrame(evaluate).transpose()
KNN_AdaBoost_df

Unnamed: 0,precision,recall,f1-score,support
0,0.983302,0.984673,0.983987,2153.0
1,0.986618,0.985419,0.986018,2469.0
accuracy,0.985071,0.985071,0.985071,0.985071
macro avg,0.98496,0.985046,0.985003,4622.0
weighted avg,0.985074,0.985071,0.985072,4622.0


In [22]:
# Gradient Boosting với mặc định sử dụng cây quyết định
GB_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [1, 3, 5, 7]
}

GB = GradientBoostingClassifier(max_depth=1,random_state=0)
# n_jobs=-1: Sử dụng tất cả các lõi CPU sẵn có để thực hiện việc tìm kiếm siêu tham số song song
GB_grid_search = GridSearchCV(GB, param_grid=GB_param_grid, n_jobs=-1)
GB_grid_search.fit(X_train, y_train).score(X_val, y_val)

0.9884587641259919

In [23]:
y_pre = GB_grid_search.predict(X_test)
evaluate = classification_report(y_test, y_pre, output_dict=True)
GB_grid_search_df = pd.DataFrame(evaluate).transpose()
GB_grid_search_df

Unnamed: 0,precision,recall,f1-score,support
0,0.989322,0.989782,0.989552,2153.0
1,0.991086,0.990684,0.990885,2469.0
accuracy,0.990264,0.990264,0.990264,0.990264
macro avg,0.990204,0.990233,0.990219,4622.0
weighted avg,0.990264,0.990264,0.990264,4622.0


In [24]:
# XGBoost 
import xgboost as xgb

# Sử dụng GPU để huấn luyện mô hình
xgb_model = xgb.XGBClassifier(use_label_encoder=False, tree_method='gpu_hist', eval_metric='logloss', random_state=0)
xgb_grid_search = GridSearchCV(xgb_model, param_grid=GB_param_grid, n_jobs=-1)
xgb_grid_search.fit(
    X_train, y_train,
    early_stopping_rounds=10, # Nếu không cải thiện sau 10 vòng lặp thì dừng
    eval_set=[(X_val, y_val)], # Đánh giá hiệu suất
    verbose=True # Hiển thị quá trình
)


    E.g. tree_method = "hist", device = "cuda"



[0]	validation_0-logloss:0.64209
[1]	validation_0-logloss:0.60156
[2]	validation_0-logloss:0.56431
[3]	validation_0-logloss:0.53660
[4]	validation_0-logloss:0.51037
[5]	validation_0-logloss:0.48627
[6]	validation_0-logloss:0.46767
[7]	validation_0-logloss:0.44925
[8]	validation_0-logloss:0.42932
[9]	validation_0-logloss:0.40990
[10]	validation_0-logloss:0.39746
[11]	validation_0-logloss:0.38428
[12]	validation_0-logloss:0.36945
[13]	validation_0-logloss:0.35819
[14]	validation_0-logloss:0.34167
[15]	validation_0-logloss:0.33241
[16]	validation_0-logloss:0.32025
[17]	validation_0-logloss:0.31080
[18]	validation_0-logloss:0.30377
[19]	validation_0-logloss:0.29707
[20]	validation_0-logloss:0.29136
[21]	validation_0-logloss:0.28266
[22]	validation_0-logloss:0.27399
[23]	validation_0-logloss:0.26947
[24]	validation_0-logloss:0.26386
[25]	validation_0-logloss:0.25941
[26]	validation_0-logloss:0.25155
[27]	validation_0-logloss:0.24723
[28]	validation_0-logloss:0.24406
[29]	validation_0-loglos

In [25]:
y_pre = xgb_grid_search.predict(X_test)
evaluate = classification_report(y_test, y_pre, output_dict=True)
xgb_grid_search_df = pd.DataFrame(evaluate).transpose()
xgb_grid_search_df


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Unnamed: 0,precision,recall,f1-score,support
0,0.987494,0.990246,0.988868,2153.0
1,0.991474,0.989064,0.990268,2469.0
accuracy,0.989615,0.989615,0.989615,0.989615
macro avg,0.989484,0.989655,0.989568,4622.0
weighted avg,0.98962,0.989615,0.989616,4622.0
