# Lab Assignment

## Task 1

Step 0 - Import the Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

Step 1 - Load the dataset

In [2]:
# Load the dataset
df = pd.read_csv("mushrooms.csv")
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Features selection

# Features selection from 'cap-shape' to 'habitat'
X = df.iloc[:, 1:] 
y = df['class']
X = pd.get_dummies(X, drop_first=True) # Encode label

# Check features numbers and instance numbers
X.shape

(8124, 95)

Step 2 - Split data

In [4]:
# Split the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Step 3 - Train Decision Tree Classifier Model

In [5]:
# By default, DT in scikit-learn will use "Gini" as split criteria
# Read the documentation for more detail
dt = DecisionTreeClassifier()

# Fitting / train DT model
dt.fit(X_train, y_train)

# Predict test set
y_pred_dt = dt.predict(X_test)

#  Calculate the accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Result, ")
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Result, 
Test set accuracy: 1.00
Test set accuracy: 1.0


Step 4 - Train Random Forest Classifier Model

In [6]:
# We will use estimator=10 in this model
# You can read the documentation to understand the hyperparameter of RF in scikit-learn

rf = RandomForestClassifier(n_estimators=10, random_state=1)

# Fitting / training
rf.fit(X_train, y_train)

# Predict the test set
y_pred_rf = rf.predict(X_test)

#  Calculate the accuracy
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Test set accuracy: {:.2f}".format(acc_rf))
print(f"Test set accuracy: {acc_rf}")

Test set accuracy: 1.00
Test set accuracy: 1.0


Step 5 - Hyperparameter Tuning
Decision Tree

In [7]:
from sklearn.model_selection import GridSearchCV

# Define Decision Tree model and hyperparameter grid for tuning
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV for Decision Tree
dt_grid_search = GridSearchCV(dt, dt_param_grid, cv=5, scoring='accuracy')
dt_grid_search.fit(X_train, y_train)

# Get the best hyperparameters for Decision Tree
best_params_dt = dt_grid_search.best_params_
best_accuracy_dt = dt_grid_search.best_score_
print("Best Decision Tree Parameters:", best_params_dt)
print("Best Decision Tree Accuracy:", best_accuracy_dt)

Best Decision Tree Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Decision Tree Accuracy: 0.9996923076923077


Hyperparameter Tuning Random Forest

In [8]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV for Decision Tree
rf_grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(X_train, y_train)

# Get the best hyperparameters for Decision Tree
best_params_rf = rf_grid_search.best_params_
best_accuracy_rf = rf_grid_search.best_score_
print("Best Random Forest Parameters:", best_params_rf)
print("Best Random Forest Accuracy:", best_accuracy_rf)

Best Random Forest Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Random Forest Accuracy: 1.0


## Task 2
Step 0 - Import Library

In [9]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

Step 1 - Load Data

In [10]:
# Load the dataset
df = pd.read_csv("mushrooms.csv")
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [11]:
# Features selection

# Features selection from 'cap-shape' to 'habitat'
X = df.iloc[:, 1:] 
y = df['class']
X = pd.get_dummies(X, drop_first=True) # Encode label

# Check features numbers and instance numbers
X.shape

(8124, 95)

Step 2 - Split data

In [12]:
# Split the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Step 3 - Train Decision Tree Classifier Model

In [13]:
# By default, DT in scikit-learn will use "Gini" as split criteria
# Read the documentation for more detail
dt = DecisionTreeClassifier()

# Fitting / train DT model
dt.fit(X_train, y_train)

# Predict test set
y_pred_dt = dt.predict(X_test)

#  Calculate the accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Result, ")
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Result, 
Test set accuracy: 1.00
Test set accuracy: 1.0


Step 4 - Train AdaBoost Model

In [14]:
ada = AdaBoostClassifier(n_estimators=2)

# Fit to AdaBoost Model
ada.fit(X_train, y_train)

# Predict the test set
y_pred_ada = ada.predict(X_test)

# Calculate the accuracy
acc_ada = accuracy_score(y_test, y_pred_ada)
print("Test set accuracy: {:.2f}".format(acc_ada))
print(f"Test set accuracy: {acc_ada}")

Test set accuracy: 0.88
Test set accuracy: 0.88


Step 5 - Hyperparameter Tuning Decision Tree

In [15]:
from sklearn.model_selection import GridSearchCV

# Define Decision Tree model and hyperparameter grid for tuning
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV for Decision Tree
dt_grid_search = GridSearchCV(dt, dt_param_grid, cv=5, scoring='accuracy')
dt_grid_search.fit(X_train, y_train)

# Get the best hyperparameters for Decision Tree
best_params_dt = dt_grid_search.best_params_
best_accuracy_dt = dt_grid_search.best_score_
print("Best Decision Tree Parameters:", best_params_dt)
print("Best Decision Tree Accuracy:", best_accuracy_dt)

Best Decision Tree Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Decision Tree Accuracy: 0.9996923076923077


Hyperparameter Tuning AdaBoost

In [16]:
#Define AdaBoost model and hyperparameter grid for tuning
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), random_state=42)
ada_param_grid = {
    'n_estimators': [50, 100, 200]
}

# Perform GridSearchCV for AdaBoost
ada_grid_search = GridSearchCV(ada, ada_param_grid, cv=5, scoring='accuracy')
ada_grid_search.fit(X_train, y_train)

# Get the best hyperparameters for AdaBoost
best_params_ada = ada_grid_search.best_params_
best_accuracy_ada = ada_grid_search.best_score_
print("Best AdaBoost Parameters:", best_params_ada)
print("Best AdaBoost Accuracy:", best_accuracy_ada)



Best AdaBoost Parameters: {'n_estimators': 50}
Best AdaBoost Accuracy: 0.9996923076923077


## Task 3

Step 0 - Import the Library

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

Step 1 - Load the Dataset

In [18]:
# Load the diabetes dataset
dfx = pd.read_csv('diabetes.csv')
dfx.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Step 2 - Check Columns Name

In [19]:
# Check columns name
dfx.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

Step 2.1 - Data Imputation

In [20]:
# Check column with 0 value
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dfx.loc[dfx[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [21]:
# Impute 0 with mean value
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dfx[feature_columns] = fill_values.fit_transform(dfx[feature_columns])

Step 3 - Split Data

In [22]:
X = dfx[feature_columns]
y = dfx.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Step 4 - Logistic Regression

In [23]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression()

logistic_regression.fit(X_train_std, y_train)

y_pred_lr = logistic_regression.predict(X_test_std)

acc_lr = accuracy_score(y_test, y_pred_lr)

print("Test set accuracy using Logistic Regression: {:.2f}".format(acc_lr))
print(f"Test set accuracy using Logistic Regression: {acc_lr}")


Test set accuracy using Logistic Regression: 0.74
Test set accuracy using Logistic Regression: 0.7359307359307359


Step 5 - SVM (Polynomial Kernel)

In [24]:
svm_poly = SVC(kernel='poly')

svm_poly.fit(X_train_std, y_train)

y_pred_svm_poly = svm_poly.predict(X_test_std)

acc_svm_poly = accuracy_score(y_test, y_pred_svm_poly)

print("Test set accuracy: {:.2f}".format(acc_svm_poly))
print(f"Test set accuracy: {acc_svm_poly}")

Test set accuracy: 0.70
Test set accuracy: 0.696969696969697


Step 6 - Decision Tree

In [25]:
# By default, DT in scikit-learn will use "Gini" as split criteria
# Read the documentation for more detail
from sklearn.tree import DecisionTreeClassifier


dt = DecisionTreeClassifier()

# Fitting / train DT model
dt.fit(X_train, y_train)

# Predict test set
y_pred_dt = dt.predict(X_test)

#  Calculate the accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 0.71
Test set accuracy: 0.7056277056277056


Step 7 - Build Voting Model

In [26]:
from sklearn.pipeline import Pipeline

# Define classifiers
clf1 = ('logistic_regression', LogisticRegression())
clf2 = ('svm_poly', SVC(kernel='poly'))
clf3 = ('decision_tree', DecisionTreeClassifier())

# Create a pipeline with StandardScaler and the classifier
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    clf1  # Classifier
])

# Create the Voting Classifier with 'hard' voting
voting = VotingClassifier(estimators=[clf1, clf2, clf3], voting='hard')

# Fit the ensemble on the training data
voting.fit(X_train_std, y_train)

# Predict on the test set
y_pred_vt1 = voting.predict(X_test_std)

# Evaluate the ensemble on the test data
acc_vt1 = accuracy_score(y_test, y_pred_vt1)

# Print the results
print('Voting Hard')
print("Test set accuracy: {:.2f}".format(acc_vt1))
print(f"Test set accuracy: {acc_vt1}")

Voting Hard
Test set accuracy: 0.74
Test set accuracy: 0.7402597402597403
