## **Tugas Praktikum**

### **Tugas 1**

Terdapat dataset **mushroom**. Berdasarkan dataset yang tersebut, bandingkan peforma antara algoritma **Decision Tree** dan **RandomForest**. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

##### **Import Library**

In [95]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC # import SVM classifier
from sklearn.ensemble import VotingClassifier # import model Voting
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

##### **Data Preparation**

In [96]:
# Load Data

dataMshrm = pd.read_csv('../data/mushrooms.csv')

dataMshrm.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [97]:
dataMshrm.shape

(8124, 23)

In [98]:
dataMshrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [99]:
# Cek kolom null
dataMshrm.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

##### **Melakukan Encoding**

In [100]:
from sklearn.calibration import LabelEncoder

labelencoder=LabelEncoder()
for column in dataMshrm.columns:
    dataMshrm[column] = labelencoder.fit_transform(dataMshrm[column])

dataMshrm.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


##### **Seleksi Fitur**

In [101]:
X = dataMshrm.drop(['class'], axis=1)
y = dataMshrm['class']

##### **Split Data Training dan Testing**

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##### **Training Decision Tree**

In [103]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train, y_train)

# Memprediksi label set test
y_pred_dt = dt.predict(X_test)

#  menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy: {}%".format(round(dt.score(X_test,y_test)*100,2)))

Decision Tree Accuracy: 100.0%


##### **Hyperparameter tuning for Decision Tree**

In [104]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Membuat objek DecisionTreeClassifier
dt = DecisionTreeClassifier()

# Daftar hyperparameter yang akan diuji
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Membuat objek GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Melakukan pencarian hyperparameter terbaik pada set pelatihan
grid_search.fit(X_train, y_train)

# Mendapatkan hyperparameter terbaik
best_dt = grid_search.best_estimator_

# Memprediksi label set test dengan model terbaik
y_pred_dt = best_dt.predict(X_test)

# Menghitung akurasi
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy: {:.2f}%".format(acc_dt * 100))

# Menampilkan hyperparameter terbaik
print("Hyperparameter terbaik:")
print(grid_search.best_params_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


Decision Tree Accuracy: 100.00%
Hyperparameter terbaik:
{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}


##### **Training Random Forest**

In [105]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10, random_state=1)

# Sesuaikan dt ke set training
rf.fit(X_train, y_train)

# Memprediksi label set test
y_pred_rf = rf.predict(X_test)

#  menghitung set accuracy
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy: {}%".format(round(rf.score(X_test,y_test)*100,2)))

Random Forest Accuracy: 100.0%


##### **Hyperparameter tuning for Random Forest**

In [106]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Membuat objek RandomForestClassifier
rf = RandomForestClassifier()

# Daftar hyperparameter yang akan diuji
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Membuat objek GridSearchCV dengan n_jobs=-1 untuk memanfaatkan semua core CPU
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Melakukan pencarian hyperparameter terbaik pada set pelatihan
grid_search.fit(X_train, y_train)

# Mendapatkan hyperparameter terbaik
best_rf = grid_search.best_estimator_

# Memprediksi label set test dengan model terbaik
y_pred_rf = best_rf.predict(X_test)

# Menghitung akurasi
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy: {:.2f}%".format(acc_rf * 100))

# Menampilkan hyperparameter terbaik
print("Hyperparameter terbaik:")
print(grid_search.best_params_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Random Forest Accuracy: 100.00%
Hyperparameter terbaik:
{'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


### **Tugas 2**
Terdapat dataset **mushroom**. Berdasarkan dataset tersebut, bandingkan peforma antara algoritma **Decision Tree** dan **AdaBoost**. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

##### **Training Decision Tree**

In [107]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train, y_train)

# Memprediksi label set test
y_pred_dt = dt.predict(X_test)

#  menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy: {}%".format(round(dt.score(X_test,y_test)*100,2)))

Decision Tree Accuracy: 100.0%


##### **Hyperparameter tuning for Decision Tree**

In [108]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Membuat objek DecisionTreeClassifier
dt = DecisionTreeClassifier()

# Daftar hyperparameter yang akan diuji
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Membuat objek GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Melakukan pencarian hyperparameter terbaik pada set pelatihan
grid_search.fit(X_train, y_train)

# Mendapatkan hyperparameter terbaik
best_dt = grid_search.best_estimator_

# Memprediksi label set test dengan model terbaik
y_pred_dt = best_dt.predict(X_test)

# Menghitung akurasi
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy: {:.2f}%".format(acc_dt * 100))

# Menampilkan hyperparameter terbaik
print("Hyperparameter terbaik:")
print(grid_search.best_params_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


Decision Tree Accuracy: 100.00%
Hyperparameter terbaik:
{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}


##### **Training AdaBoost**

In [109]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=2)

# Sesuaikan dt ke set training
ada.fit(X_train, y_train)

# Memprediksi label set test
y_pred_ada = ada.predict(X_test)

#  menghitung set accuracy
acc_ada = accuracy_score(y_test, y_pred_ada)
print("AdaBoost Accuracy: {}%".format(round(ada.score(X_test,y_test)*100,2)))

AdaBoost Accuracy: 84.06%


##### **Hyperparameter tuning for AdaBoost**

In [110]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Membuat objek AdaBoostClassifier
ada = AdaBoostClassifier()

# Daftar hyperparameter yang akan diuji
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0, 2.0],
}

# Membuat objek GridSearchCV
grid_search = GridSearchCV(estimator=ada, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Melakukan pencarian hyperparameter terbaik pada set pelatihan
grid_search.fit(X_train, y_train)

# Mendapatkan hyperparameter terbaik
best_ada = grid_search.best_estimator_

# Memprediksi label set test dengan model terbaik
y_pred_ada = best_ada.predict(X_test)

# Menghitung akurasi
acc_ada = accuracy_score(y_test, y_pred_ada)
print("AdaBoost Accuracy: {:.2f}%".format(acc_ada * 100))

# Menampilkan hyperparameter terbaik
print("Hyperparameter terbaik:")
print(grid_search.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


AdaBoost Accuracy: 100.00%
Hyperparameter terbaik:
{'learning_rate': 1.0, 'n_estimators': 50}


### **Tugas 3**

Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma:
1. Logistic Regression
2. SVM kernel polynomial
3. Decission Tree

Boleh melakukan eksplorasi dengan melakukan tunning hyperparameter.

##### **Data Preparation**

In [136]:
# Load Data

dataDbt = pd.read_csv('../data/diabetes.csv')

dataDbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [137]:
# Cek nama kolom
dataDbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [138]:
# Cek kolom null
dataDbt.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [139]:
# Pada kasus ini, agak tidak masuk akal jika beberapa parameter bernilai 0
# sebagai contoh adalah nilai 'Glucose', 'BloodPlessure' ataupun 'Insulin'.
# Sekecil apapun nilainya, setiap manusia yang hidup pasti miliki nilai-nilai tersebut

# Kita akan manipulasi nilai yang 0 dengan melakukan 'imputasi' atau mengganti nilainya dengan nilai sintetis
# Pada kasus ini, kita akan menggunakan nilai mean

# Cek kolom neng nilai 0
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dataDbt.loc[dataDbt[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [140]:
# Impute nilai 0 dengan mean
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dataDbt[feature_columns] = fill_values.fit_transform(dataDbt[feature_columns])

In [141]:
dataDbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31.0,0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,4.494673,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


##### **Split Data Training dan Testing**

In [142]:
X = dataDbt[feature_columns]
y = dataDbt.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

##### **Standarisasi Fitur**

In [143]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [144]:
X_train_std

array([[-1.16422997, -0.89610788, -1.00440048, ..., -1.20403257,
        -0.61421636, -0.94861028],
       [ 0.22077929, -0.56399695, -0.02026586, ...,  0.66428525,
        -0.90973787, -0.43466673],
       [ 0.04580856,  0.43233584, -0.34831073, ...,  1.44035573,
        -0.30699103, -0.77729576],
       ...,
       [ 1.95204087, -0.69684133,  1.12789121, ...,  1.91462102,
         1.94892066,  0.42190587],
       [ 0.04580856,  0.63160239,  0.01296379, ...,  1.45472741,
        -0.77514391, -0.34900947],
       [ 0.04580856,  0.10022491,  1.9480034 , ..., -1.40523602,
        -0.60836445, -1.03426754]])

##### **Training SVM Kernel Polynomial**

In [131]:
# Model SVM linier tanpa tunnning hyperparameter
svm_poly = SVC(kernel='poly')

# Fit ke model
svm_poly.fit(X_train_std, y_train)

# Prediksi
y_pred_svm_poly = svm_poly.predict(X_test_std)

# Evaluasi akurasi testing data
acc_svm_poly = accuracy_score(y_test, y_pred_svm_poly)

# Print hasil evaluasi
print("Test set accuracy: {:.2f}".format(acc_svm_poly))
print(f"Test set accuracy: {acc_svm_poly}")

Test set accuracy: 0.70
Test set accuracy: 0.696969696969697


##### **Hyperparameter tuning for SVM Kernel Polynomial**

In [121]:
from sklearn.model_selection import GridSearchCV

# Inisialisasi model SVM dengan kernel polynomial
svm_poly = SVC(kernel='poly', random_state=42)

# Mendefinisikan kumpulan hyperparameter yang akan diuji
param_grid = {'C': [0.01, 1, 10],
              'gamma': [0.01, 0.1, 1],
              'degree': [1, 2, 3]}

# Membyat objek GridSearchCV untuk mencari kombinasi hyperparameter terbaik
grid_search = GridSearchCV(svm_poly, param_grid, cv=5, n_jobs=-1)

# Fit model ke data pelatihan
grid_search.fit(X_train_std, y_train)

# Cetak hyperparameter terbaik yang ditemukan oleh GridSearchCV
print("Hyperparameter terbaik untuk model SVM kernel polynomial:")
print(grid_search.best_params_)

# Prediksi dengan model terbaik
y_pred_svm_poly = grid_search.best_estimator_.predict(X_test_std)

# Evaluasi akurasi pada data pengujian
acc_svm_poly = accuracy_score(y_test, y_pred_svm_poly)

# Print hasil evaluasi
print("Akurasi pada data pengujian setelah tuning hyperparameter: {:.2f}".format(acc_svm_poly))

Hyperparameter terbaik untuk model SVM kernel polynomial:
{'C': 10, 'degree': 1, 'gamma': 1}
Akurasi pada data pengujian setelah tuning hyperparameter: 0.74


##### **Training Decision Tree**

In [132]:
dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train, y_train)

# Memprediksi label set test
y_pred_dt = dt.predict(X_test)

#  menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 0.72
Test set accuracy: 0.7229437229437229


##### **Training Logistic Regression**

**Menggunakan 2 Layer**

In [133]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegressionCV


layer_one_estimators = [
                        ('svm_poly', SVC(kernel='poly')),
                        ('dt',DecisionTreeClassifier())             
                       ]
layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier()),
                        ('rf_2', RandomForestClassifier(n_estimators=50, random_state=42)),
                       ]
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=LogisticRegressionCV())

clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print(f"Accuracy: {clf.fit(X_train, y_train).score(X_test, y_test)}")

Accuracy: 0.703125


**Menggunakan 1 Layer**

In [124]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegressionCV


layer_one_estimators = [
                        ('svm_poly', SVC(kernel='poly')),
                        ('dt',DecisionTreeClassifier())             
                       ]

clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=LogisticRegressionCV())

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print(f"Accuracy: {clf.fit(X_train, y_train).score(X_test, y_test)}")

Accuracy: 0.7395833333333334


##### **Training Ensemble Voting**

In [146]:
# Definisikan algoritma yang akan digunakan untuk voting
clf1 = LogisticRegressionCV()
clf2 = SVC(kernel='poly')
clf3 = DecisionTreeClassifier()

# model hard voting
voting = VotingClassifier(estimators=[('LogisticRegression', clf1), ('SVM-POLY', clf2), ('DecisionTree', clf3)], voting='hard')

# Fit model
voting.fit(X_train_std, y_train)

# Prediksi
y_pred_vt1 = voting.predict(X_test_std)

# Evaluasi akurasi testing data
acc_vt1 = accuracy_score(y_test, y_pred_vt1)

# Print hasil evaluasi
print('Voting Hard')
print("Test set accuracy: {:.2f}".format(acc_vt1))
print(f"Test set accuracy: {acc_vt1}")

Voting Hard
Test set accuracy: 0.74
Test set accuracy: 0.7402597402597403
