# Irshandy Aditya Wicaksana
# TI-3A / 16

# **Tugas 1**
Terdapat dataset mushroom. Berdasarkan dataset yang tersebut, bandingkan peforma antara algoritma Decision Tree dan RandomForest. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [21]:
# import library
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

Persiapan data

In [22]:
# Load data
df = pd.read_csv('data/mushrooms.csv')

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [23]:
# Cek kolom null
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [24]:
# Encode data
from sklearn.preprocessing import LabelEncoder, StandardScaler
le = LabelEncoder() # membuat objek dari LabelEncoder
for col in df.columns:
    if df[col].dtype == 'object':  # Jika tipe data kolom adalah object/kategoris
        df[col] = le.fit_transform(df[col])

Seleksi fitur

In [25]:
# Slice dataframe mulai dari kolom 'radius_mean' sampai 'fractal_dimension_worst'
X = df.iloc[:,2:]
y = df['class']

# Cek jumlah fitur dan instance
X.shape

(8124, 21)

Split data training dan testing

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Traning Decision Tree

In [27]:
# Menentukan parameter untuk Grid Search
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

# Menggunakan GridSearchCV untuk mencari parameter terbaik
grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=5)
grid_search_dt.fit(X_train, y_train)

# Mendapatkan model terbaik dan akurasi
best_dt_model = grid_search_dt.best_estimator_
y_pred_dt = best_dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)

print("Best Decision Tree Parameters:", grid_search_dt.best_params_)
print("Decision Tree Accuracy:", dt_accuracy)

Best Decision Tree Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}
Decision Tree Accuracy: 1.0


Training RandomForest

In [28]:
# Menentukan parameter untuk Grid Search
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

# Menggunakan GridSearchCV untuk mencari parameter terbaik
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5)
grid_search_rf.fit(X_train, y_train)

# Mendapatkan model terbaik dan akurasi
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("Best Random Forest Parameters:", grid_search_rf.best_params_)
print("Random Forest Accuracy:", rf_accuracy)

Best Random Forest Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Random Forest Accuracy: 1.0


# **Tugas 2**
Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan peforma antara algoritma Decision Tree dan AdaBoost. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [35]:
# import library
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import AdaBoostClassifier # import AdaBoost
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

Persiapan Data

In [30]:
# Load data
df = pd.read_csv('data/mushrooms.csv')

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [31]:
# Encode data
from sklearn.preprocessing import LabelEncoder, StandardScaler
le = LabelEncoder() # membuat objek dari LabelEncoder
for col in df.columns:
    if df[col].dtype == 'object':  # Jika tipe data kolom adalah object/kategoris
        df[col] = le.fit_transform(df[col])

Seleksi fitur

In [32]:
# Slice dataframe mulai dari kolom 'radius_mean' sampai 'fractal_dimension_worst'
X = df.iloc[:,2:]
y = df['class']

# Cek jumlah fitur dan instance
X.shape

(8124, 21)

Training Decision Tree use Hyperparameter Tuning

In [33]:
# Menentukan parameter untuk Grid Search
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

# Menggunakan GridSearchCV untuk mencari parameter terbaik
grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=5)
grid_search_dt.fit(X_train, y_train)

# Mendapatkan model terbaik dan akurasi
best_dt_model = grid_search_dt.best_estimator_
y_pred_dt = best_dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)

print("Best Decision Tree Parameters:", grid_search_dt.best_params_)
print("Decision Tree Accuracy:", dt_accuracy)

Best Decision Tree Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}
Decision Tree Accuracy: 1.0


Training AdaBoost use Hyperparameter Tuning

In [36]:
# Definisikan parameter grid untuk tuning
ab_params = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1.0],
}

# Mengubah algoritma menjadi SAMME untuk menghindari FutureWarning dan gunakan estimator
ab_classifier = AdaBoostClassifier(estimator=DecisionTreeClassifier(), algorithm='SAMME')

# Menggunakan GridSearchCV untuk mencari parameter terbaik
ab_grid_search = GridSearchCV(ab_classifier, ab_params, cv=5, scoring='accuracy')
ab_grid_search.fit(X_train, y_train)

# Menampilkan hasil terbaik
print(f"Best AdaBoost Parameters: {ab_grid_search.best_params_}")
print(f"AdaBoost Accuracy: {ab_grid_search.best_score_}")

Best AdaBoost Parameters: {'learning_rate': 0.01, 'n_estimators': 50}
AdaBoost Accuracy: 1.0


# **Tugas 3**
Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma
1. Logistic Regression
2. SVM kernel polynomial
3. Decission Tree

Anda boleh melakukan eksplorasi dengan melakukan tunning hyperparameter

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

Persiapan Data

In [38]:
# Load Data

dbt = pd.read_csv('data/diabetes.csv')

dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [39]:
# Cek nama kolom
dbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [40]:
# Cek kolom null
dbt.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Split Data

In [43]:
# Pisahkan fitur dan target
X = dbt.drop('Outcome', axis=1)  # Asumsikan kolom target bernama 'Outcome'
y = dbt['Outcome']

# Membagi dataset menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Logistic Regression

In [45]:
# Membangun pipeline untuk menyekal data dan melakukan Logistic Regression
log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# Mendefinisikan hyperparameter untuk GridSearch
log_params = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__solver': ['liblinear', 'lbfgs']
}

# Melakukan GridSearchCV
log_grid_search = GridSearchCV(log_reg, log_params, cv=5, scoring='accuracy')
log_grid_search.fit(X_train, y_train)

# Menampilkan parameter terbaik
print("Best Logistic Regression Parameters:", log_grid_search.best_score_, log_grid_search.best_params_)

Best Logistic Regression Parameters: 0.7802007615091727 {'logisticregression__C': 0.1, 'logisticregression__solver': 'lbfgs'}


SVM dengan Kernel Polynomial

In [46]:
# Tuning hyperparameter untuk SVM
svm_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'degree': [2, 3, 4],
    'kernel': ['poly']
}

svm = SVC()
svm_grid_search = GridSearchCV(svm, svm_params, cv=5, scoring='accuracy')
svm_grid_search.fit(X_train, y_train)

print("Best SVM Parameters:", svm_grid_search.best_score_, svm_grid_search.best_params_)

Best SVM Parameters: 0.7689858082381447 {'C': 10, 'degree': 2, 'kernel': 'poly'}


Decision Tree

In [47]:
# Tuning hyperparameter Decision Tree
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None] + list(range(1, 11)),
    'min_samples_split': [2, 5, 10]
}

dt_classifier = DecisionTreeClassifier()
dt_grid_search = GridSearchCV(dt_classifier, dt_params, cv=5, scoring='accuracy')
dt_grid_search.fit(X_train, y_train)

print("Best Decision Tree Parameters:", dt_grid_search.best_score_, dt_grid_search.best_params_)

Best Decision Tree Parameters: 0.7522845275181724 {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 10}


Membuat Ensemble Voting Classifier

In [50]:
# Menggunakan model terbaik dari grid search
best_log_reg = log_grid_search.best_estimator_
best_svm = svm_grid_search.best_estimator_
best_dt = dt_grid_search.best_estimator_

# Membuat ensemble voting classifier
voting_classifier = VotingClassifier(estimators=[
    ('log_reg', best_log_reg),
    ('svm', best_svm),
    ('decision_tree', best_dt)
], voting='hard')

# Melatih model ensemble
voting_classifier.fit(X_train, y_train)

# Evaluasi akurasi
y_pred = voting_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", log_grid_search.best_score_)
print("SVM Accuracy:", svm_grid_search.best_score_)
print("Decision Tree Accuracy:", dt_grid_search.best_score_)
print("Ensemble Voting Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.7802007615091727
SVM Accuracy: 0.7689858082381447
Decision Tree Accuracy: 0.7522845275181724
Ensemble Voting Accuracy: 0.7532467532467533
              precision    recall  f1-score   support

           0       0.80      0.83      0.82       151
           1       0.66      0.60      0.63        80

    accuracy                           0.75       231
   macro avg       0.73      0.72      0.72       231
weighted avg       0.75      0.75      0.75       231

