<a href="https://colab.research.google.com/github/integer-class/machine-learning/blob/main/AL_AZHAR_RRF/jobsheet_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🚗 Lab Work 1
Bagging

Bagging using Random Forest
In this case, we will use one of the bagging methods, which is RandomForest, to classify the type of tumor. In this exercise, you will train with the Wisconsin Breast Cancer Dataset from the UCI Machine Learning Repository. This exercise will involve predicting whether a tumor is malignant or benign.

We will compare the performance of the Decision Tree and Random Forest algorithms in this case.

### Step 1 - Import Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Step 2 - Data Preparation

#### Step 2.1 - Load Data

In [2]:
# Load data
df = pd.read_csv('data/wbc.csv')

df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


#### Step 2.2  - Check Null Values

In [3]:
# Check null
df.isnull().sum()

Unnamed: 0,0
id,0
diagnosis,0
radius_mean,0
texture_mean,0
perimeter_mean,0
area_mean,0
smoothness_mean,0
compactness_mean,0
concavity_mean,0
concave points_mean,0


#### Step 2.3 - Select The Features

In [4]:
# Features selection

# Slice dataframe from 'radius_mean' to 'fractal_dimension_worst'
X = df.iloc[:,3:-1]
y = df['diagnosis']
y = y.map({'M':1, 'B':0}) # Encode label

# Check features numbers and instance numbers
X.shape

(569, 29)

### Step 3 - Split Data

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Step 4 - Train Decision Tree Classifier Model

In [6]:
# By default, DT in scikit-learn will use "Gini" as split criteria
# Read the documentation for more detail
dt = DecisionTreeClassifier()

# Fitting / train DT model
dt.fit(X_train, y_train)

# Predict test set
y_pred_dt = dt.predict(X_test)

#  Calculate the accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 0.95
Test set accuracy: 0.9473684210526315


### Step 5 - Train Random Forest Classifier Model

In [7]:
# We will use estimator=10 in this model
# You can read the documentation to understand the hyperparameter of RF in scikit-learn

rf = RandomForestClassifier(n_estimators=10, random_state=1)

# Fitting / training
rf.fit(X_train, y_train)

# Predict the test set
y_pred_rf = rf.predict(X_test)

#  Calculate the accuracy
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Test set accuracy: {:.2f}".format(acc_rf))
print(f"Test set accuracy: {acc_rf}")

Test set accuracy: 0.96
Test set accuracy: 0.956140350877193


# 🚕 Lab Work 2
Boosting

Boosting using AdaBoost
In this case, we will use one of the boosting methods, AdaBoost, to classify the types of Iris flowers. In this exercise, we will use the widely-used Iris dataset. This exercise will involve predicting the three types of Iris flowers: Iris Setosa, Iris Versicolor, and Iris Virginica, based on the length and width of their sepals and petals.

We will compare the performance of the Decision Tree and AdaBoost algorithms in this case.

### Step 1 - Import Library

In [8]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import AdaBoostClassifier # import AdaBoost
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder # Kebutuhan encoding label

### Step 2 - Data Preparation

#### Step 2.1 - Load Data

In [9]:
# Load data
df = pd.read_csv('data/iris.csv')

df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


#### Step 2.2 - Check Null Values

In [10]:
# Cek kolom null
df.isnull().sum()

Unnamed: 0,0
Id,0
SepalLengthCm,0
SepalWidthCm,0
PetalLengthCm,0
PetalWidthCm,0
Species,0


#### Step 2.3 - Feature Selection

In [11]:
# Features selection
X = df.iloc[:,2:-1]
y = df['Species']

# Label encoding
ec = LabelEncoder()
y = ec.fit_transform(y)

# Check the number of features and instances
print(X.shape)

# Check instances label
print(y)

(150, 3)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


### Step 3 - Split Data

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Step 4 - Build Decision Tree Model

In [13]:
dt = DecisionTreeClassifier()

# Fit model
dt.fit(X_train, y_train)

# Predict the test set
y_pred_dt = dt.predict(X_test)

# Calculate the accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 0.97
Test set accuracy: 0.9666666666666667


### Step 5 - Build AdaBoost Model

In [14]:
ada = AdaBoostClassifier(n_estimators=2)

# Fit to AdaBoost Model
ada.fit(X_train, y_train)

# Predict the test set
y_pred_ada = ada.predict(X_test)

# Calculate the accuracy
acc_ada = accuracy_score(y_test, y_pred_ada)
print("Test set accuracy: {:.2f}".format(acc_ada))
print(f"Test set accuracy: {acc_ada}")

Test set accuracy: 0.97
Test set accuracy: 0.9666666666666667




# 🚙 Lab Work 3
Stacking

Stacking
In this lab work, you need to build a stacking model using this following code. You need to train the 'iris' data based on this following code. You can also perform data preparation process base on the previous lab work.

In [15]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier



layer_one_estimators = [
                        ('rf_1', RandomForestClassifier(n_estimators=10, random_state=42)),
                        ('knn_1', KNeighborsClassifier(n_neighbors=5))
                       ]
layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier()),
                        ('rf_2', RandomForestClassifier(n_estimators=50, random_state=42)),
                       ]
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=LogisticRegression())


clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
clf.fit(X_train, y_train).score(X_test, y_test)

0.8947368421052632

# 🚓 Lab Work 4
Voting

Introduction

In this case, we will use voting to classify patients with diabetes based on several features. Patients will be classified as either having diabetes (1) or not having diabetes (0). First, we will use several separate classification algorithms, including Naive Bayes, Linear SVM, and RBF SVM. After that, we will combine the performance of these three algorithms using the ensemble voting method.

### Step 1 - Import Libray

In [16]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB # import Naive Bayes model Gaussian (asumsi data terdistribusi normal)
from sklearn.svm import SVC # import SVM classifier
from sklearn.ensemble import VotingClassifier # import model Voting
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Step 2 - Data Preparation

#### Step 2.1 - Load Data

In [18]:
# Load Data
dbt = pd.read_csv('data/diabetes.csv')

dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### Step 2.2 - Check Columns Name

In [19]:
# Check columns name
dbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

#### Step 2.3 - Check Null Values

In [20]:
# Check null values
dbt.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


#### Step 2.4 - Data Imputation

In this case, it doesn't make sense for certain parameters to have a value of 0. For example, values like 'Glucose,' 'Blood Pressure,' or 'Insulin' should have non-zero values for every living person, no matter how small.

We will manipulate the 0 values by performing 'imputation,' which means replacing them with synthetic values. In this case, we will use the mean value.

In [21]:
# Check column with 0 value
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


Perform data imputation,

In [22]:
# Impute 0 with mean value
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

### Step 3 - Split Data

In [23]:
X = dbt[feature_columns]
y = dbt.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Step 4 - Build Gaussian NB Model

#### Step 4.1 - Standarized The Features

In [24]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

#### Step 4.2 - Train and Evaluate The Model

In [25]:
gnb_std = GaussianNB()

# Fit with standarized features
gnb_std.fit(X_train_std, y_train)

# Predict the test set
y_pred_gnb = gnb_std.predict(X_test_std)

# Evaluate test set
acc_gnb = accuracy_score(y_test, y_pred_gnb)

# Print the result
print("Test set accuracy: {:.2f}".format(acc_gnb))
print(f"Test set accuracy: {acc_gnb}")

Test set accuracy: 0.74
Test set accuracy: 0.7359307359307359


### Step 5 - Build SVM Linear Model

In [26]:
svm_lin = SVC(kernel='linear')

# Fitting
svm_lin.fit(X_train_std, y_train)

# Predict
y_pred_svm_lin = svm_lin.predict(X_test_std)

# Evaluate test set
acc_svm_lin = accuracy_score(y_test, y_pred_svm_lin)

# Show the result
print("Test set accuracy: {:.2f}".format(acc_svm_lin))
print(f"Test set accuracy: {acc_svm_lin}")

Test set accuracy: 0.74
Test set accuracy: 0.7402597402597403


### Step 6 - Build SVM RBF Model

In [27]:
svm_rbf = SVC(kernel='rbf')

# Fitting
svm_rbf.fit(X_train_std, y_train)

# Predict
y_pred_svm_rbf = svm_rbf.predict(X_test_std)

# Evaluate test set
acc_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)

# Print
print("Test set accuracy: {:.2f}".format(acc_svm_rbf))
print(f"Test set accuracy: {acc_svm_rbf}")

Test set accuracy: 0.72
Test set accuracy: 0.7229437229437229


### Step 7 - Build Voting Model

In [28]:
# Define meta classifier
clf1 = GaussianNB()
clf2 = SVC(kernel='linear')
clf3 = SVC(kernel='rbf', probability=True)

# Define voting model --> Hard
voting = VotingClassifier(estimators=[('GaussianNB', clf1), ('SVM-LIN', clf2), ('SVM-RBF', clf3)], voting='hard')

# Fitting
voting.fit(X_train_std, y_train)

# Predict
y_pred_vt1 = voting.predict(X_test_std)

# Evaluate
acc_vt1 = accuracy_score(y_test, y_pred_vt1)

# Show the result
print('Voting Hard')
print("Test set accuracy: {:.2f}".format(acc_vt1))
print(f"Test set accuracy: {acc_vt1}")

Voting Hard
Test set accuracy: 0.74
Test set accuracy: 0.7402597402597403


# 🤩 Lab Assignment