# Tugas Praktikum 

## 1. Perbandingan Performa RandomForest dan Decision Tree dengan HyperTuning

In [2]:
# import library
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DT # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier as RF # Import Random Forest Classifier
from sklearn.model_selection import train_test_split as tts 
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [3]:
# read dataset
df = pd.read_csv('mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

-> tidak ada data yang null

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [6]:
# membuat data menjadi kategorikal dengan sebuah function
def make_categorical(data, columns):
    for column in columns:
        data[column] = pd.Categorical(data[column])
    return data

In [7]:
columns = df.columns
data = make_categorical(df, columns)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   class                     8124 non-null   category
 1   cap-shape                 8124 non-null   category
 2   cap-surface               8124 non-null   category
 3   cap-color                 8124 non-null   category
 4   bruises                   8124 non-null   category
 5   odor                      8124 non-null   category
 6   gill-attachment           8124 non-null   category
 7   gill-spacing              8124 non-null   category
 8   gill-size                 8124 non-null   category
 9   gill-color                8124 non-null   category
 10  stalk-shape               8124 non-null   category
 11  stalk-root                8124 non-null   category
 12  stalk-surface-above-ring  8124 non-null   category
 13  stalk-surface-below-ring  8124 non-null   catego

In [9]:
columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

#### Encoding data
Karena kolom fitur cukup banyak, maka perlu untuk menggunakan function khusus yang digunakan untuk melakukan encoding pada kolom-kolom tersebut.

In [10]:
# encoding data yang bersifat kategorikal menjadi numerik dengan function 
from sklearn import  preprocessing

def label_encoding(data, columns):
    result = data.copy()
    encoders = {}
    for column in columns:
        encoder = preprocessing.LabelEncoder()
        result[column] = encoder.fit_transform(result[column])
        encoders[column] = encoder
    return result, encoders

In [11]:
df1, encoders1 = label_encoding(data, columns)
df1.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


#### Pembagian fitur dan target

In [12]:
X = df1.iloc[:,1:] 
y = df1[['class']]  

#### Splitting Dataset

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Decision Tree

In [14]:
# inisiasi model DecisionTree()
dt_clf_model = DT() 

In [15]:
# definisi parameter
param_grid_dt ={
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1, 2, 4]
}

In [16]:
# inisiasi GridSearchCV
from sklearn.model_selection import GridSearchCV
model_grs_dt = GridSearchCV(estimator=dt_clf_model, param_grid=param_grid_dt, cv=5, n_jobs=-1)

In [17]:
# fitting dengan hyperparamter tuning
model_grs_dt.fit(X_train, y_train)

In [18]:
# menampilkan parameter terbaik
print("Best parameters:", model_grs_dt.best_params_)

Best parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [19]:
# membuat model berdasar parameter terbaik 
best_model_dt = model_grs_dt.best_estimator_

In [20]:
# prediksi pada data tests
y_pred_dt = best_model_dt.predict(X_test)

In [21]:
# Hitung Jumlah Akurasi best model decision tree
accuracy_dt = accuracy_score(y_test, y_pred_dt)

In [22]:
print(f"Akurasi Best Model Decision Tree : {accuracy_dt}")

Akurasi Best Model Decision Tree : 1.0


### Random Forest

In [23]:
# Inisialiasi model Random Forest
rf_clf_model = RF()

In [24]:
# inisialisasi param grid random forest
param_grid_rf  = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split':[2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features':['sqrt']
}

In [25]:
# inisiasi GridSearchCV
model_grs_rf = GridSearchCV(estimator=rf_clf_model, param_grid=param_grid_rf, cv=5, n_jobs=-1)

In [26]:
# fitting model 
model_grs_rf.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [27]:
model_grs_rf.best_params_

{'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 50}

In [28]:
# best estimator
best_model_rf = model_grs_rf.best_estimator_

In [29]:
# prediksi
y_pred_rf = best_model_rf.predict(X_test)

In [30]:
# perhitungan akurasi 
accuracy_rf = accuracy_score(y_test, y_pred_rf)

In [31]:
print(f"Akurasi Best Model Random Forest : {accuracy_rf}")

Akurasi Best Model Random Forest : 1.0


## 2.  Perbandingan peforma antara algoritma Decision Tree dan AdaBoost dengan HyperTuning

In [32]:
# Inisialiasi model AdaBoost
from sklearn.ensemble import AdaBoostClassifier

ada_clf_model = AdaBoostClassifier()

In [33]:
# param grid for adaboost
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
}

In [34]:
# inisialisasi model gridSearch
model_grs_ada = GridSearchCV(estimator=ada_clf_model, param_grid=param_grid_ada, cv=5, n_jobs=-1)

In [35]:
# fitting model hypertunning
model_grs_ada.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [36]:
# best params
model_grs_ada.best_params_

{'learning_rate': 1.0, 'n_estimators': 50}

In [37]:
# prediksi oleh best model ada
best_model_ada = model_grs_ada.best_estimator_
y_pred_ada = best_model_ada.predict(X_test)

In [38]:
# perhitungan evaluasi hasil model adaboost
acc_ada = accuracy_score(y_test, y_pred_ada)

In [39]:
print(f'Akurasi Best Model AdaBoost : {acc_ada}')

Akurasi Best Model AdaBoost : 1.0


## 3. Membuat Ensemble Voting dengan algoritma LogisticRegression, SVM(kernel polinomial), dan Decission Tree

#### Membaca Dataset

In [40]:
# membaca dataset
dataframe = pd.read_csv('diabetes.csv')
dataframe.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### Melakukan imputasi pada data yang memiliki nilai 0

In [43]:
# melakukan impute pada nilai yang 0 dengan nilai mean, 
# hal ini dilakkan karena ada beberapa nilai pada kolom yang nilainya 0, yang mana hal tersebut menjadi hal yang kurang masuk akal jika dijadikan menjadi dataframe pada machine learning
from sklearn.impute import SimpleImputer

feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

fill_values = SimpleImputer(missing_values=0, strategy='mean', copy=False)
dataframe[feature_columns] = fill_values.fit_transform(dataframe[feature_columns])

#### Splitting data training dan data testing

In [45]:
X = dataframe[feature_columns]
y = dataframe[['Outcome']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Standarisasi Fitur

In [46]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.fit_transform(X_test)

#### Inisiasi modul yang diperlukan

In [47]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [55]:
# inisiasi model 
log_reg = LogisticRegression()
svm_poly = SVC(kernel='poly')
dt = DecisionTreeClassifier()

#### Menyusun Model yang akan digunakan ke dalam VotingClassifier

In [56]:
# menyusun model kedalam VotingClassifier
voting_clf = VotingClassifier(
    estimators = [
        ('lr', log_reg),
        ('svm', svm_poly),
        ('dt', dt)
    ],
    voting='hard'
)

#### Tuning parameter untuk masing-masing model yang akan dimasukkan ke dalam VotingClassifier

In [60]:
# Tuning parameter untuk masing-masing model 
param_grid_vt = {
    'lr__C':[0.001, 0.01, 0.1, 1, 10],
    'svm__C':[0.001, 0.01, 0.1, 1, 10],
    'svm__degree':[2, 3, 4],
    'dt__max_depth':[3,4,5,6,7]
}

#### Inisiasi Model GridSearchCV untuk melakukan hyperparameter tuning

In [61]:
# inisialisasi model GridSearchCV
model_grs = GridSearchCV(estimator=voting_clf, param_grid=param_grid_vt, scoring='accuracy', cv=5)


#### Melakukan Fitting Model dengan data training

In [62]:
# fitting model 
model_grs.fit(X_train_std, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.clas

#### Mendapatkan model terbaik dari hasil hyperparameter tuning

In [64]:
best_model_vt = model_grs.best_estimator_

#### Melakukan prediksi dengan data testing

In [65]:
y_pred_vt = best_model_vt.predict(X_test_std)

#### Menghitung akurasi dari model yang telah dibuat

In [66]:
acc_model_vt = accuracy_score(y_test, y_pred_vt)

In [68]:
print("Akurasi Model Ensemble Voting:", acc_model_vt)

Akurasi Model Ensemble Voting: 0.7922077922077922
