In [10]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier # import AdaBoost

### Tugas Bagging

Pada folder data, terdapat dataset jamur yang kita gunakan pada materi Decision Tree. Berdasarkan dataset yang sama, bandingkan peforma antara algoritma DT dan RandomForest. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [2]:
# Load data
dfm = pd.read_csv('data/mushrooms.csv')

dfm.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Cek kolom null
dfm.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [4]:
from sklearn.preprocessing import LabelEncoder

encode = LabelEncoder()
for col in dfm.columns:
    dfm[col] = encode.fit_transform(dfm[col])
dfm.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [5]:
# slice data
xmh = dfm.iloc[:,1:23].values
print(xmh)
ymh = dfm.iloc[:,0].values
ymh = ymh.reshape(len(ymh), 1)
print(ymh)

[[5 2 4 ... 2 3 5]
 [5 2 9 ... 3 2 1]
 [0 2 8 ... 3 2 3]
 ...
 [2 2 4 ... 0 1 2]
 [3 3 4 ... 7 4 2]
 [5 2 4 ... 4 1 2]]
[[1]
 [0]
 [0]
 ...
 [0]
 [1]
 [0]]


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(xmh, ymh, test_size=0.2, random_state=1)

### Decision Tree Bagging

In [8]:
# Secara default, DecisionTreeClassifier dari scikit-learn akan menggunakan nilai "Gini" untuk kriteria
# Terdapat beberapa "hyperparamater" yang dapat digunakan. Silahka baca dokumentasi
# Pada kasus ini kita akan menggunakan parameter default
dfm = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dfm.fit(X_train, y_train)

# Memprediksi label set test
y_pred_dfm = dfm.predict(X_test)

#  menghitung set accuracy
acc_dfm = accuracy_score(y_test, y_pred_dfm)
print("Test set accuracy: {:.2f}".format(acc_dfm))
print(f"Test set accuracy: {acc_dfm}")

Test set accuracy: 1.00
Test set accuracy: 1.0


### Random Forest Bagging

In [9]:
# Pada kasus kali ini kita akan menggunakan estimator pada RandomForest
# Untuk detail parameter (hyperparameter) silahkan cek dokumentasi
# n_estimator : berapa jumlah spilt pohon

rfm = RandomForestClassifier(n_estimators=10 , random_state=1)

# Sesuaikan dt ke set training
rfm.fit(X_train, y_train)

# Memprediksi label set test
y_pred_rfm = rfm.predict(X_test)

#  menghitung set accuracy
acc_rfm = accuracy_score(y_test, y_pred_rfm)
print("Test set accuracy: {:.2f}".format(acc_rfm))
print(f"Test set accuracy: {acc_rfm}")

Test set accuracy: 1.00
Test set accuracy: 1.0


  rfm.fit(X_train, y_train)


### Tugas Boosting

Pada folder data, terdapat dataset jamur yang kita gunakan pada materi Decision Tree. Berdasarkan dataset yang sama, bandingkan peforma antara algoritma DT dan AdaBoost. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

### AdaBoost Boosting

In [48]:
# Pada kasus kali ini kita akan menggunakan estimator pada AdaBoost
# Untuk detail parameter (hyperparameter) silahkan cek dokumentasi

ada = AdaBoostClassifier(n_estimators=2)

# Sesuaikan dt ke set training
ada.fit(X_train, y_train)

# Memprediksi label set test
y_pred_ada = ada.predict(X_test)

#  menghitung set accuracy
acc_ada = accuracy_score(y_test, y_pred_ada)
print("Test set accuracy: {:.2f}".format(acc_ada))
print(f"Test set accuracy: {acc_ada}")

Test set accuracy: 0.71
Test set accuracy: 0.7142857142857143


### Find Tunning Hyperparameter

In [51]:
from sklearn.model_selection import GridSearchCV

# defining the classifier
model = AdaBoostClassifier()
# creating a dic for the grid
grid = dict()
# estimator till 500
grid['n_estimators'] = [10, 50, 100, 200, 500]
# defining learning rate
grid['learning_rate'] = [0.0001, 0.01, 0.1, 1.0, 1.1, 1.2]
# initializing the grid search
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, scoring='accuracy')
# training the model on grid search for hyperparameter tuning of Adaboost
grid_result = grid_search.fit(X_train, y_train)
# finding the best results /hyperparameter tuning of Adaboost
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.763499 using {'learning_rate': 0.1, 'n_estimators': 200}


In [53]:
# Using tunning hyperparameter

ada = AdaBoostClassifier(learning_rate= 0.1, n_estimators=200)

# Sesuaikan dt ke set training
ada.fit(X_train, y_train)

# Memprediksi label set test
y_pred_ada = ada.predict(X_test)

#  menghitung set accuracy
acc_ada = accuracy_score(y_test, y_pred_ada)
print("Test set accuracy: {:.2f}".format(acc_ada))
print(f"Test set accuracy: {acc_ada}")

Test set accuracy: 0.74
Test set accuracy: 0.7445887445887446


## Tugas Stacking

Dengan menggunakan data yang sama, buatlan ensemble voting dengan algoritma
1. Logistic Regression
2. SVM kernel polynomial
3. Decission Tree

Anda boleh melakukan eksplorasi dengan melakukan tunning hyperparameter

In [16]:
# Load data
dbt = pd.read_csv('data/diabetes.csv')

dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
# Cek kolom null
dbt.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [18]:
# Cek nama kolom
dbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [19]:
# Pada kasus ini, agak tidak masuk akal jika beberapa parameter bernilai 0
# sebagai contoh adalah nilai 'Glucose', 'BloodPlessure' ataupun 'Insulin'.
# Sekecil apapun nilainya, setiap manusia yang hidup pasti miliki nilai-nilai tersebut

# Kita akan manipulasi nilai yang 0 dengan melakukan 'imputasi' atau mengganti nilainya dengan nilai sintetis
# Pada kasus ini, kita akan menggunakan nilai mean

# Cek kolom neng nilai 0
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [20]:
# Impute nilai 0 dengan mean
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

In [22]:
Xdt = dbt[feature_columns]
ydt = dbt.Outcome

Xdt_train, Xdt_test, ydt_train, ydt_test = train_test_split(Xdt, ydt, test_size=0.3, random_state=42)

### Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

lgc_rgrs = LogisticRegression()

# Fit dengan data yang telah di standarisasi
lgc_rgrs.fit(Xdt_train, ydt_train)

# Prediksi dengan data test
y_pred_lgc = lgc_rgrs.predict(Xdt_test)

# Evaluasi akurasi testing data
acc_lgc = accuracy_score(ydt_test, y_pred_lgc)

# Print hasil evaluasi
print("Test set accuracy: {:.2f}".format(acc_lgc))
print(f"Test set accuracy: {acc_lgc}")


Test set accuracy: 0.74
Test set accuracy: 0.7359307359307359


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### SVM Polynomial

In [26]:
from sklearn.svm import SVC # import SVM classifier

# Model SVM linier tanpa tunnning hyperparameter
svm_pol = SVC(kernel='poly')

# Fit ke model
svm_pol.fit(Xdt_train, ydt_train)

# Prediksi
y_pred_svm_pol = svm_pol.predict(Xdt_test)

# Evaluasi akurasi testing data
acc_svm_pol = accuracy_score(ydt_test, y_pred_svm_pol)

# Print hasil evaluasi
print("Test set accuracy: {:.2f}".format(acc_svm_pol))
print(f"Test set accuracy: {acc_svm_pol}")

Test set accuracy: 0.74
Test set accuracy: 0.7402597402597403


### Decision Tree

In [27]:
# Secara default, DecisionTreeClassifier dari scikit-learn akan menggunakan nilai "Gini" untuk kriteria
# Terdapat beberapa "hyperparamater" yang dapat digunakan. Silahka baca dokumentasi
# Pada kasus ini kita akan menggunakan parameter default
dbt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dbt.fit(Xdt_train, ydt_train)

# Memprediksi label set test
y_pred_dbt = dbt.predict(Xdt_test)

#  menghitung set accuracy
acc_dbt = accuracy_score(ydt_test, y_pred_dbt)
print("Test set accuracy: {:.2f}".format(acc_dbt))
print(f"Test set accuracy: {acc_dbt}")

Test set accuracy: 0.70
Test set accuracy: 0.7012987012987013


### Voting

In [29]:

from sklearn.ensemble import VotingClassifier # import model Voting

# Definisikan algoritma yang akan digunakan untuk voting

clf1 = LogisticRegression()
clf2 = SVC(kernel='poly')
clf3 = DecisionTreeClassifier()

# model hard voting
voting = VotingClassifier(estimators=[('LogisticRegression', clf1), ('SVM-POL', clf2), ('DecissionTreeClassifier', clf3)], voting='hard')

# Fit model
voting.fit(Xdt_train, ydt_train)

# Prediksi
y_pred_vt1 = voting.predict(Xdt_test)

# Evaluasi akurasi testing data
acc_vt1 = accuracy_score(ydt_test, y_pred_vt1)

# Print hasil evaluasi
print('Voting Hard')
print("Test set accuracy: {:.2f}".format(acc_vt1))
print(f"Test set accuracy: {acc_vt1}")

Voting Hard
Test set accuracy: 0.74
Test set accuracy: 0.7359307359307359


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
