# Tugas 2
Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan performa antara algoritma Decision Tree dan AdaBoost. Gunakan tuning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [None]:
# Import Library
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import AdaBoostClassifier # import AdaBoost
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder # Kebutuhan encoding label

In [None]:
# Load the mushroom dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='mushrooms.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [None]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
class,0
cap-shape,0
cap-surface,0
cap-color,0
bruises,0
odor,0
gill-attachment,0
gill-spacing,0
gill-size,0
gill-color,0


In [None]:
# Preprocessing: integer-encode every column of df (features and target).
# A separate LabelEncoder is fitted per column and stored in `label_encoders`,
# so each integer code can be mapped back to its original category later,
# e.g. label_encoders['class'].classes_ or .inverse_transform(...).
# (Refitting one shared encoder on every column would keep only the mapping
# of the last column.)
label_encoders = {}

for column in df.columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [None]:
# Split df into the target (y) and the features (X).
# The target is selected by name instead of by position, so `class` cannot
# leak into X if the column order of the CSV ever changes.
y = df['class']
X = df.drop(columns=['class'])

X.shape

(8124, 22)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Decision Tree: tune hyperparameters with a 5-fold grid search, then evaluate

# Hyperparameter grid (5 * 3 * 3 * 2 = 90 candidate combinations)
dt_param_grid = dict(
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Base model with a fixed seed, searched over the grid using all CPU cores
dt = DecisionTreeClassifier(random_state=42)
dt_grid_search = GridSearchCV(
    estimator=dt,
    param_grid=dt_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
dt_grid_search.fit(X_train, y_train)

# Best hyperparameters, their cross-validation score, and the best estimator
dt_best_params = dt_grid_search.best_params_
dt_best_acc = dt_grid_search.best_score_
dt_best_model = dt_grid_search.best_estimator_

# Accuracy of the tuned tree on the training data and on the held-out test data
y_pred_dt = dt_best_model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_dt)
train_acc = accuracy_score(y_train, dt_best_model.predict(X_train))

print("Decision Tree Best Parameters:", dt_best_params)
print("Decision Tree Train Accuracy:", train_acc)
print("Decision Tree Test Accuracy:", test_acc)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
Decision Tree Best Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Decision Tree Train Accuracy: 1.0
Decision Tree Test Accuracy: 1.0


In [None]:
# AdaBoost: tune hyperparameters with a 5-fold grid search, then evaluate.
# No base estimator is passed, so scikit-learn's default weak learner is used
# (a decision tree, per the AdaBoostClassifier documentation).
# random_state is fixed, like the Decision Tree model, so that re-running the
# notebook reproduces the same result.
ada = AdaBoostClassifier(random_state=42)

# Hyperparameter grid (3 * 4 = 12 candidate combinations)
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1]
}

# Grid search over the grid, using all CPU cores like the Decision Tree search
grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=5, n_jobs=-1)
grid_search_ada.fit(X_train, y_train)

# Best hyperparameters, their cross-validation score, and the best estimator
ada_best_params = grid_search_ada.best_params_
ada_best_score = grid_search_ada.best_score_
ada_best_model = grid_search_ada.best_estimator_
# Legacy name kept for backward compatibility: despite the "_acc" suffix it
# has always held the best estimator, not an accuracy value.
ada_best_acc = ada_best_model

# Evaluate on the training and test data, mirroring the Decision Tree
# evaluation, so both models can be compared on the same held-out set.
y_pred_ada = ada_best_model.predict(X_test)
ada_train_acc = accuracy_score(y_train, ada_best_model.predict(X_train))
ada_test_acc = accuracy_score(y_test, y_pred_ada)

print("Best parameters for AdaBoost: ", ada_best_params)
print("Best accuracy for AdaBoost: ", ada_best_score)
print("AdaBoost Train Accuracy:", ada_train_acc)
print("AdaBoost Test Accuracy:", ada_test_acc)



Best parameters for AdaBoost:  {'learning_rate': 0.5, 'n_estimators': 100}
Best accuracy for AdaBoost:  1.0


In [None]:
# Side-by-side comparison of the two tuned models. Both values are the
# GridSearchCV best_score_ (cross-validation score), not test-set accuracy.
print(
    f"Accuracy Score Decision Tree: {dt_best_acc}",
    f"Accuracy Score AdaBoost: {ada_best_score}",
    sep="\n",
)

Accuracy Score Decision Tree: 1.0
Accuracy Score AdaBoost: 1.0
