In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from numpy.random import shuffle
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
def get_data(path_data='/kaggle/input/att-images/ATT images', test_size=0.2, is_shuffle=True):
    folders = [f's{i+1}' for i in range(40)]
    X, y = [], []

    for i in range(1, 41):
        path = path_data + f'/s{i}'
        files = os.listdir(path)
        num_files = len(files)

        if is_shuffle:
            files = shuffle(files)

        for file in os.listdir(path):
            path = path_data + f'/s{i}/' + file
            image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
            pixels = np.reshape(image, [1, image.shape[0] * image.shape[1]])
            pixels = np.asarray(pixels)

            if len(X) == 0:
                X = pixels
            else:
                X = np.vstack([X, pixels])
            y.append(f's{i}')

    return X, np.asarray(y)

In [3]:
def get_length(n):
    start = int(np.sqrt(n))
    while start > 1:
        if n % start == 0:
            break
        else:
            start -= 1
    return start, n // start

In [4]:
def cut_image(X, num_image=4, shape=(112,92)):
    height, width = shape
    num_width, num_height = get_length(num_image)
    part_width = width // num_width
    part_height = height // num_height

    X_cut = []

    for i in range(X.shape[0]):
        image = X[i].reshape(shape)
        for k in range(num_width):
            for j in range(num_height):
                left = j * part_width
                upper = k * part_height
                right = left + part_width
                lower = upper + part_height

                # Crop the image to get the part
                part = image[upper:lower, left:right]

                pixels = np.reshape(part, [1, part_width * part_height])
                pixels = np.asarray(pixels)

                if k == 0 and j == 0:
                    X_i = pixels
                else:
                    X_i = np.hstack([X_i, pixels])

        if len(X_cut) == 0:
            X_cut = X_i
        else:
            X_cut = np.vstack([X_cut, X_i])

    return X_cut

In [5]:
X, y = get_data()

n_components_pca = 50
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [6]:
class ModularPCA:
    def __init__(self, n_components, num_image=4, shape_image=(112,92)):
        self.n_components = n_components
        self.shape = shape_image
        self.num_image = num_image

    def fit(self, X):
        self.X = cut_image(X, self.num_image, self.shape)
        self.pca = PCA(n_components=self.n_components, random_state=42)
        self.pca.fit(self.X)

    def transform(self, X):
        X_cut = cut_image(X, self.num_image, self.shape)
        return self.pca.transform(X_cut)

    def fit_transform(self, X):
        self.fit(X)
        return self.pca.fit_transform(self.X)

In [7]:
# accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# for train_index, test_index in kf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     mpca = ModularPCA(n_components=n_components_pca)
#     X_train_reduced = mpca.fit_transform(X_train)
#     X_test_reduced = mpca.transform(X_test)

#     # Euclidean
#     y_pred = []
#     for i in range(80):
#         min_ = np.argmin(np.sqrt(np.sum((X_train_reduced - X_test_reduced[i])**2,axis=1)))
#         y_pred.append(y_train[min_])

#     # Evaluate the model
#     accuracy = accuracy_score(y_test, y_pred)
#     accuracy_scores.append(accuracy)

# print(np.mean(accuracy_scores))

In [8]:
# accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# for train_index, test_index in kf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     mpca = ModularPCA(n_components=n_components_pca)
#     X_train_reduced = mpca.fit_transform(X_train)
#     X_test_reduced = mpca.transform(X_test)

#     # Logistic
#     lr = LogisticRegression(multi_class='ovr', solver='liblinear')
#     lr.fit(X_train_reduced, y_train)
#     y_pred = lr.predict(X_test_reduced)

#     # Evaluate the model
#     accuracy = accuracy_score(y_test, y_pred)
#     accuracy_scores.append(accuracy)

# print(np.mean(accuracy_scores))

In [9]:
X, y = get_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# num_image = 4

In [10]:
n_components_pca = 50
pca = ModularPCA(n_components=n_components_pca, num_image=4)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

## Modeling

In [11]:
from sklearn.model_selection import KFold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [12]:
lr = LogisticRegression()
lr.fit(X_train_reduced, y_train)
y_pred = lr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Random Forest

In [13]:
prediction_results = {}

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

**Normal**

In [15]:
rfr = RandomForestClassifier(random_state=42)
rfr.fit(X_train_reduced,y_train)
y_pred = rfr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.9875
recall: 0.9722222222222222
precision: 0.9629629629629629
f1-score: 0.9666666666666666
roc_auc:  0.7339875119474435


  _warn_prf(average, modifier, msg_start, len(result))


**Tunning**

In [16]:
import optuna


def objective(trial):
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
        'n_jobs': -1
    }

    model = RandomForestClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [17]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

[I 2024-02-22 17:58:13,665] A new study created in memory with name: no-name-f3b6df15-a9cb-4508-82c2-1fb4f33c171b
[I 2024-02-22 17:58:29,542] Trial 0 finished with value: 0.8 and parameters: {'n_estimators': 858, 'max_depth': 16, 'min_samples_split': 9, 'random_state': 42, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8.
[I 2024-02-22 17:58:43,044] Trial 1 finished with value: 0.6875 and parameters: {'n_estimators': 812, 'max_depth': 13, 'min_samples_split': 9, 'random_state': 42, 'min_samples_leaf': 16}. Best is trial 0 with value: 0.8.
[I 2024-02-22 17:58:50,688] Trial 2 finished with value: 0.75 and parameters: {'n_estimators': 431, 'max_depth': 26, 'min_samples_split': 3, 'random_state': 42, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.8.
[I 2024-02-22 17:58:58,549] Trial 3 finished with value: 0.746875 and parameters: {'n_estimators': 450, 'max_depth': 13, 'min_samples_split': 17, 'random_state': 42, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8.
[I 202

Best params found : {'n_estimators': 999, 'max_depth': 36, 'min_samples_split': 2, 'random_state': 42, 'min_samples_leaf': 1}


0.9875

In [18]:
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['random forest'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.9875
recall: 0.9722222222222222
precision: 0.9629629629629629
f1-score: 0.9666666666666666
roc_auc:  0.7339875119474435


  _warn_prf(average, modifier, msg_start, len(result))


## Gassian NB

**Pre-tunning**

In [19]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_reduced, y_train)

y_pred = gnb.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.9625
recall: 0.9444444444444444
precision: 0.9375
f1-score: 0.9358465608465608
roc_auc:  0.7266591260476291


  _warn_prf(average, modifier, msg_start, len(result))


**Tunning**

In [20]:
def objective(trial):
    hyperparams = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-9, 1e-4, log = True)
    }

    model = GaussianNB(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [21]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = GaussianNB(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['gnb'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 18:22:54,706] A new study created in memory with name: no-name-b0650398-98bf-47be-ba0d-bfda17ed5891
[I 2024-02-22 18:22:54,750] Trial 0 finished with value: 0.75 and parameters: {'var_smoothing': 2.164577155632319e-07}. Best is trial 0 with value: 0.75.
[I 2024-02-22 18:22:54,793] Trial 1 finished with value: 0.75 and parameters: {'var_smoothing': 2.6176271490287392e-05}. Best is trial 0 with value: 0.75.
[I 2024-02-22 18:22:54,836] Trial 2 finished with value: 0.75 and parameters: {'var_smoothing': 1.7597885171421148e-05}. Best is trial 0 with value: 0.75.
[I 2024-02-22 18:22:54,880] Trial 3 finished with value: 0.75 and parameters: {'var_smoothing': 2.0269086391550877e-05}. Best is trial 0 with value: 0.75.
[I 2024-02-22 18:22:54,924] Trial 4 finished with value: 0.75 and parameters: {'var_smoothing': 3.8393547935591016e-07}. Best is trial 0 with value: 0.75.
[I 2024-02-22 18:22:54,967] Trial 5 finished with value: 0.75 and parameters: {'var_smoothing': 8.60961871750368

Best params found : {'var_smoothing': 9.776722182627445e-05}
accuracy: 0.9625
recall: 0.9444444444444444
precision: 0.9375
f1-score: 0.9358465608465608
roc_auc:  0.7266591260476291


  _warn_prf(average, modifier, msg_start, len(result))


## Logistic Regression

**Normal**

In [22]:
lr = LogisticRegression()
lr.fit(X_train_reduced, y_train)
y_pred = lr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Tunning**

In [23]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'saga']),
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'multi_class': trial.suggest_categorical('multi_class', ['ovr']),
        'C': trial.suggest_loguniform("C", 1e-3, 1e3),
        'n_jobs': -1
    }

    model = LogisticRegression(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [24]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = LogisticRegression(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['logistic regression'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 18:23:00,821] A new study created in memory with name: no-name-e3ba8ff3-87fb-4036-8c33-d6e96999f78a
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://sc

Best params found : {'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.07218415055662711}
accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


In [25]:
# lr = LogisticRegression(multi_class='ovr', solver='liblinear')
# lr.fit(X_train_reduced, y_train)
# y_pred = lr.predict(X_test_reduced)

## KNN Classifier

**Normal**

In [26]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train_reduced, y_train)
y_pred = knn.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.9125
recall: 0.9375
precision: 0.9143518518518517
f1-score: 0.9068783068783068
roc_auc:  0.5771545864579081


**Tunning**

In [27]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'n_neighbors': trial.suggest_int("n_neighbors", 5, 100),
        'weights': trial.suggest_categorical("weights", ["uniform", "distance"]),
        'metric': trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"]),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'n_jobs': -1
    }

    # Create KNN model with tuned hyperparameters
    model = KNeighborsClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [28]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model =KNeighborsClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['KNN'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 18:30:23,357] A new study created in memory with name: no-name-6c9ae3ac-1293-43c2-aba7-bb113276c8e7
[I 2024-02-22 18:30:23,411] Trial 0 finished with value: 0.103125 and parameters: {'n_neighbors': 63, 'weights': 'uniform', 'metric': 'euclidean', 'algorithm': 'brute'}. Best is trial 0 with value: 0.103125.
[I 2024-02-22 18:30:23,511] Trial 1 finished with value: 0.653125 and parameters: {'n_neighbors': 25, 'weights': 'distance', 'metric': 'minkowski', 'algorithm': 'ball_tree'}. Best is trial 1 with value: 0.653125.
[I 2024-02-22 18:30:23,607] Trial 2 finished with value: 0.753125 and parameters: {'n_neighbors': 13, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'ball_tree'}. Best is trial 2 with value: 0.753125.
[I 2024-02-22 18:30:23,704] Trial 3 finished with value: 0.665625 and parameters: {'n_neighbors': 31, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'ball_tree'}. Best is trial 2 with value: 0.753125.
[I 2024-02-22 18:30:23,763] Trial 4 fin

Best params found : {'n_neighbors': 5, 'weights': 'distance', 'metric': 'minkowski', 'algorithm': 'auto'}
accuracy: 0.9875
recall: 0.9930555555555556
precision: 0.9930555555555556
f1-score: 0.9920634920634921
roc_auc:  0.9963474025974026


## Support Vector Machine 

**Normal**

In [29]:
from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(X_train_reduced, y_train)
y_pred = svm.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


**Tunning**

In [30]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

def objective(trial):
    hyperparams = {
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
        'C': trial.suggest_loguniform('C', 1e-5, 1e5),
        'gamma': trial.suggest_loguniform('gamma', 1e-5, 1e5),
        # 'degree': trial.suggest_int('degree', 2, 5),  # for polynomial kernel
        'tol': trial.suggest_loguniform('tol', 1e-4, 1e-2),
        'shrinking': trial.suggest_categorical('shrinking', [True, False]),
    }
    
    # Create KNN model with tuned hyperparameters
    model = SVC(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [31]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = SVC(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['svc'] = list(y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 18:30:31,640] A new study created in memory with name: no-name-fc0eb23a-40e8-4e45-b16a-0f88c05cdb0b
[I 2024-02-22 18:30:31,743] Trial 0 finished with value: 0.859375 and parameters: {'kernel': 'poly', 'C': 136.39282458257222, 'gamma': 1.282588265172263, 'tol': 0.0003305938953843011, 'shrinking': True}. Best is trial 0 with value: 0.859375.
[I 2024-02-22 18:30:31,835] Trial 1 finished with value: 0.946875 and parameters: {'kernel': 'linear', 'C': 0.04834977976971953, 'gamma': 14393.550761044286, 'tol': 0.00012788319825127054, 'shrinking': True}. Best is trial 1 with value: 0.946875.
[I 2024-02-22 18:30:31,950] Trial 2 finished with value: 0.0 and parameters: {'kernel': 'rbf', 'C': 5603.979219761246, 'gamma': 0.0007976637162598171, 'tol': 0.0001162209069282289, 'shrinking': True}. Best is trial 1 with value: 0.946875.
[I 2024-02-22 18:30:32,026] Trial 3 finished with value: 0.946875 and parameters: {'kernel': 'linear', 'C': 66.48016482666914, 'gamma': 0.005909812245984741, 

Best params found : {'kernel': 'linear', 'C': 0.04834977976971953, 'gamma': 14393.550761044286, 'tol': 0.00012788319825127054, 'shrinking': True}
accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


## Decision Tree

**Normal**

In [32]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_reduced, y_train)
y_pred = dt.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

svm = SVC(kernel='linear')
svm.fit(X_train_reduced, y_train)
y_pred = svm.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.6
recall: 0.55
precision: 0.5641666666666667
f1-score: 0.5229761904761905
roc_auc:  0.5452392744810545
accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


**Tunning**

In [33]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        "max_depth" : trial.suggest_int("max_depth", 2, 10),
        "min_samples_split" : trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf" : trial.suggest_int("min_samples_leaf", 1, 10),
        "criterion" : trial.suggest_categorical("criterion", ["gini", "entropy"]),
        'random_state': trial.suggest_categorical('random_state', [42])
    }

    # Create KNN model with tuned hyperparameters
    model = DecisionTreeClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [34]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = DecisionTreeClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['decison tree'] = list(y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 18:30:43,729] A new study created in memory with name: no-name-f943cfaf-78b5-4ad3-8c35-54c94b16446f
[I 2024-02-22 18:30:43,889] Trial 0 finished with value: 0.28125 and parameters: {'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 9, 'criterion': 'entropy', 'random_state': 42}. Best is trial 0 with value: 0.28125.
[I 2024-02-22 18:30:44,079] Trial 1 finished with value: 0.328125 and parameters: {'max_depth': 6, 'min_samples_split': 16, 'min_samples_leaf': 1, 'criterion': 'entropy', 'random_state': 42}. Best is trial 1 with value: 0.328125.
[I 2024-02-22 18:30:44,169] Trial 2 finished with value: 0.425 and parameters: {'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1, 'criterion': 'gini', 'random_state': 42}. Best is trial 2 with value: 0.425.
[I 2024-02-22 18:30:44,326] Trial 3 finished with value: 0.265625 and parameters: {'max_depth': 5, 'min_samples_split': 17, 'min_samples_leaf': 9, 'criterion': 'entropy', 'random_state': 42}. Best is trial 

Best params found : {'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'random_state': 42}
accuracy: 0.6875
recall: 0.7361111111111112
precision: 0.7138888888888889
f1-score: 0.6833633958633959
roc_auc:  0.5365064934594339


Tuning rồi lưu kết quả tuning vào bên dưới

# num_image = 9

In [35]:
n_components_pca = 50
pca = ModularPCA(n_components=n_components_pca, num_image=9)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

In [36]:
# for train_index, test_index in skf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     pca = PCA(n_components=n_components_pca)
#     X_train_reduced = pca.fit_transform(X_train)
#     X_test_reduced = pca.transform(X_test)

#     # Euclidean
#     y_pred = []
#     for i in range(320):
#         min_ = np.argmin(np.sqrt(np.sum((X_train_reduced - X_test_reduced[i])**2,axis=1)))
#         y_pred.append(y_train[min_])

#     # Evaluate the model
#     accuracy = accuracy_score(y_test, y_pred)
#     accuracy_scores.append(accuracy)

# print(np.mean(accuracy_scores))

## Modeling

In [37]:
from sklearn.model_selection import KFold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

## Random Forest

In [38]:
prediction_results = {}

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

**Normal**

In [40]:
rfr = RandomForestClassifier(random_state=42)
rfr.fit(X_train_reduced,y_train)
y_pred = rfr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.9875
recall: 0.9722222222222222
precision: 0.9652777777777778
f1-score: 0.9682539682539684
roc_auc:  0.7339828150168876


**Tunning**

In [41]:
import optuna


def objective(trial):
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
        'n_jobs': -1
    }

    model = RandomForestClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [42]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

[I 2024-02-22 18:31:09,031] A new study created in memory with name: no-name-b959e357-42f3-4115-9ed9-8110f2ccdda1
[I 2024-02-22 18:31:14,277] Trial 0 finished with value: 0.715625 and parameters: {'n_estimators': 281, 'max_depth': 36, 'min_samples_split': 19, 'random_state': 42, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.715625.
[I 2024-02-22 18:31:20,203] Trial 1 finished with value: 0.65 and parameters: {'n_estimators': 326, 'max_depth': 16, 'min_samples_split': 31, 'random_state': 42, 'min_samples_leaf': 17}. Best is trial 0 with value: 0.715625.
[I 2024-02-22 18:31:23,742] Trial 2 finished with value: 0.703125 and parameters: {'n_estimators': 183, 'max_depth': 24, 'min_samples_split': 23, 'random_state': 42, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.715625.
[I 2024-02-22 18:31:38,848] Trial 3 finished with value: 0.5875 and parameters: {'n_estimators': 857, 'max_depth': 45, 'min_samples_split': 4, 'random_state': 42, 'min_samples_leaf': 23}. Best is trial 0 w

Best params found : {'n_estimators': 977, 'max_depth': 28, 'min_samples_split': 2, 'random_state': 42, 'min_samples_leaf': 1}


0.9875

In [43]:
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['random forest'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.9875
recall: 0.9722222222222222
precision: 0.9629629629629629
f1-score: 0.9666666666666666
roc_auc:  0.7339803738490329


## Gassian NB

**Pre-tunning**

In [44]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_reduced, y_train)

y_pred = gnb.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.975
recall: 0.9583333333333334
precision: 0.9527777777777777
f1-score: 0.9506172839506173
roc_auc:  0.7339875119474435


**Tunning**

In [45]:
def objective(trial):
    hyperparams = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-9, 1e-4, log = True)
    }

    model = GaussianNB(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [46]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = GaussianNB(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['gnb'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 18:56:27,428] A new study created in memory with name: no-name-3cadf7cd-8bdc-4c2f-9899-f6bd8276548b
[I 2024-02-22 18:56:27,472] Trial 0 finished with value: 0.73125 and parameters: {'var_smoothing': 1.1127213414869509e-05}. Best is trial 0 with value: 0.73125.
[I 2024-02-22 18:56:27,515] Trial 1 finished with value: 0.728125 and parameters: {'var_smoothing': 2.4566749177817425e-08}. Best is trial 0 with value: 0.73125.
[I 2024-02-22 18:56:27,558] Trial 2 finished with value: 0.734375 and parameters: {'var_smoothing': 5.683468202522304e-05}. Best is trial 2 with value: 0.734375.
[I 2024-02-22 18:56:27,600] Trial 3 finished with value: 0.728125 and parameters: {'var_smoothing': 5.4312250074831164e-09}. Best is trial 2 with value: 0.734375.
[I 2024-02-22 18:56:27,645] Trial 4 finished with value: 0.728125 and parameters: {'var_smoothing': 9.451709982395127e-08}. Best is trial 2 with value: 0.734375.
[I 2024-02-22 18:56:27,688] Trial 5 finished with value: 0.728125 and parame

Best params found : {'var_smoothing': 8.80143748920862e-05}
accuracy: 0.975
recall: 0.9583333333333334
precision: 0.9527777777777777
f1-score: 0.9506172839506173
roc_auc:  0.7339875119474435


## Logistic Regression

**Normal**

In [47]:
lr = LogisticRegression()
lr.fit(X_train_reduced, y_train)
y_pred = lr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


**Tunning**

In [48]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'saga']),
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'multi_class': trial.suggest_categorical('multi_class', ['ovr']),
        'C': trial.suggest_loguniform("C", 1e-3, 1e3),
        'n_jobs': -1
    }

    model = LogisticRegression(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [49]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = LogisticRegression(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['logistic regression'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 18:56:34,448] A new study created in memory with name: no-name-9e2bba89-9703-452f-8763-a6b29dcad851
[I 2024-02-22 18:56:51,007] Trial 0 finished with value: 0.909375 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 1.7542754666298335}. Best is trial 0 with value: 0.909375.
[I 2024-02-22 18:56:52,256] Trial 1 finished with value: 0.79375 and parameters: {'solver': 'saga', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.020403063384519533}. Best is trial 0 with value: 0.909375.
[I 2024-02-22 18:57:07,928] Trial 2 finished with value: 0.953125 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.006374879670841058}. Best is trial 2 with value: 0.953125.
[I 2024-02-22 18:57:09,160] Trial 3 finished with value: 0.79375 and parameters: {'solver': 'saga', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.1408541887838225}. Best is trial 2 with value: 0.953125.
[I 2024-02-22 18:57:18,460] Trial 4 finished with value: 0

Best params found : {'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.018671802271092063}
accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


In [50]:
# lr = LogisticRegression(multi_class='ovr', solver='liblinear')
# lr.fit(X_train_reduced, y_train)
# y_pred = lr.predict(X_test_reduced)

## KNN Classifier

**Normal**

In [51]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train_reduced, y_train)
y_pred = knn.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.9125
recall: 0.9375
precision: 0.9143518518518517
f1-score: 0.9068783068783068
roc_auc:  0.5771545864579081


**Tunning**

In [52]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'n_neighbors': trial.suggest_int("n_neighbors", 5, 100),
        'weights': trial.suggest_categorical("weights", ["uniform", "distance"]),
        'metric': trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"]),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'n_jobs': -1
    }

    # Create KNN model with tuned hyperparameters
    model = KNeighborsClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [53]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model =KNeighborsClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['KNN'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 19:11:15,510] A new study created in memory with name: no-name-bb78bbfc-a0dd-4a61-8cb0-206cb4d00756
[I 2024-02-22 19:11:15,623] Trial 0 finished with value: 0.065625 and parameters: {'n_neighbors': 89, 'weights': 'uniform', 'metric': 'euclidean', 'algorithm': 'ball_tree'}. Best is trial 0 with value: 0.065625.
[I 2024-02-22 19:11:15,722] Trial 1 finished with value: 0.625 and parameters: {'n_neighbors': 48, 'weights': 'distance', 'metric': 'minkowski', 'algorithm': 'ball_tree'}. Best is trial 1 with value: 0.625.
[I 2024-02-22 19:11:15,834] Trial 2 finished with value: 0.184375 and parameters: {'n_neighbors': 46, 'weights': 'uniform', 'metric': 'euclidean', 'algorithm': 'kd_tree'}. Best is trial 1 with value: 0.625.
[I 2024-02-22 19:11:15,934] Trial 3 finished with value: 0.60625 and parameters: {'n_neighbors': 62, 'weights': 'distance', 'metric': 'minkowski', 'algorithm': 'ball_tree'}. Best is trial 1 with value: 0.625.
[I 2024-02-22 19:11:15,966] Trial 4 finished with v

Best params found : {'n_neighbors': 5, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'auto'}
accuracy: 0.9875
recall: 0.9930555555555556
precision: 0.9930555555555556
f1-score: 0.9920634920634921
roc_auc:  0.9963474025974026


## Support Vector Machine 

**Normal**

In [54]:
from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(X_train_reduced, y_train)
y_pred = svm.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


**Tunning**

In [55]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

def objective(trial):
    hyperparams = {
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
        'C': trial.suggest_loguniform('C', 1e-5, 1e5),
        'gamma': trial.suggest_loguniform('gamma', 1e-5, 1e5),
        # 'degree': trial.suggest_int('degree', 2, 5),  # for polynomial kernel
        'tol': trial.suggest_loguniform('tol', 1e-4, 1e-2),
        'shrinking': trial.suggest_categorical('shrinking', [True, False]),
    }
    
    # Create KNN model with tuned hyperparameters
    model = SVC(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [56]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = SVC(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['svc'] = list(y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 19:11:23,565] A new study created in memory with name: no-name-b0d92dd5-34b0-49db-82b4-45ad86c1a29b
[I 2024-02-22 19:11:23,644] Trial 0 finished with value: 0.946875 and parameters: {'kernel': 'linear', 'C': 0.010852370050640879, 'gamma': 635.7365296941003, 'tol': 0.0015648068333601914, 'shrinking': False}. Best is trial 0 with value: 0.946875.
[I 2024-02-22 19:11:23,722] Trial 1 finished with value: 0.0 and parameters: {'kernel': 'rbf', 'C': 0.0007890596953535686, 'gamma': 2.087773938653386, 'tol': 0.008769227945934509, 'shrinking': True}. Best is trial 0 with value: 0.946875.
[I 2024-02-22 19:11:23,800] Trial 2 finished with value: 0.0 and parameters: {'kernel': 'rbf', 'C': 1.699720366123846e-05, 'gamma': 0.9835841636277356, 'tol': 0.002649724508235706, 'shrinking': False}. Best is trial 0 with value: 0.946875.
[I 2024-02-22 19:11:23,896] Trial 3 finished with value: 0.0 and parameters: {'kernel': 'rbf', 'C': 0.0897477713717745, 'gamma': 0.09966291882815484, 'tol': 0.00

Best params found : {'kernel': 'linear', 'C': 0.010852370050640879, 'gamma': 635.7365296941003, 'tol': 0.0015648068333601914, 'shrinking': False}
accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


## Decision Tree

**Normal**

In [57]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_reduced, y_train)
y_pred = dt.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

svm = SVC(kernel='linear')
svm.fit(X_train_reduced, y_train)
y_pred = svm.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.6625
recall: 0.6527777777777777
precision: 0.6435185185185186
f1-score: 0.6233906525573192
roc_auc:  0.545608933463109
accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0
roc_auc:  1.0


**Tunning**

In [58]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        "max_depth" : trial.suggest_int("max_depth", 2, 10),
        "min_samples_split" : trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf" : trial.suggest_int("min_samples_leaf", 1, 10),
        "criterion" : trial.suggest_categorical("criterion", ["gini", "entropy"])
    }

    # Create KNN model with tuned hyperparameters
    model = DecisionTreeClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [59]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = DecisionTreeClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['decison tree'] = list(y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
#                               average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 19:11:36,253] A new study created in memory with name: no-name-7c1bbf31-b743-468e-bd5c-98d2a1f4bd11
[I 2024-02-22 19:11:36,327] Trial 0 finished with value: 0.325 and parameters: {'max_depth': 10, 'min_samples_split': 18, 'min_samples_leaf': 7, 'criterion': 'gini'}. Best is trial 0 with value: 0.325.
[I 2024-02-22 19:11:36,406] Trial 1 finished with value: 0.40625 and parameters: {'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 6, 'criterion': 'gini'}. Best is trial 1 with value: 0.40625.
[I 2024-02-22 19:11:36,524] Trial 2 finished with value: 0.03125 and parameters: {'max_depth': 2, 'min_samples_split': 2, 'min_samples_leaf': 6, 'criterion': 'entropy'}. Best is trial 1 with value: 0.40625.
[I 2024-02-22 19:11:36,699] Trial 3 finished with value: 0.3125 and parameters: {'max_depth': 8, 'min_samples_split': 17, 'min_samples_leaf': 4, 'criterion': 'entropy'}. Best is trial 1 with value: 0.40625.
[I 2024-02-22 19:11:36,764] Trial 4 finished with value: 0.28125

Best params found : {'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy'}
accuracy: 0.5375
recall: 0.4764957264957266
precision: 0.497008547008547
f1-score: 0.455962555962556


Tuning rồi lưu kết quả tuning vào bên dưới

# num_image = 16

In [60]:
n_components_pca = 50
pca = ModularPCA(n_components=n_components_pca, num_image=16)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

In [61]:
# accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []
# n_components_pca = 50
# n_splits = 5
# skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [62]:
# for train_index, test_index in skf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     pca = PCA(n_components=n_components_pca)
#     X_train_reduced = pca.fit_transform(X_train)
#     X_test_reduced = pca.transform(X_test)

#     # Euclidean
#     y_pred = []
#     for i in range(320):
#         min_ = np.argmin(np.sqrt(np.sum((X_train_reduced - X_test_reduced[i])**2,axis=1)))
#         y_pred.append(y_train[min_])

#     # Evaluate the model
#     accuracy = accuracy_score(y_test, y_pred)
#     accuracy_scores.append(accuracy)

# print(np.mean(accuracy_scores))

## Modeling

In [63]:
from sklearn.model_selection import KFold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

## Random Forest

In [64]:
prediction_results = {}

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

**Normal**

In [66]:
rfr = RandomForestClassifier(random_state=42)
rfr.fit(X_train_reduced,y_train)
y_pred = rfr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.9875
recall: 0.9722222222222222
precision: 0.9629629629629629
f1-score: 0.9666666666666666
roc_auc:  0.7339875119474435


**Tunning**

In [67]:
import optuna


def objective(trial):
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
        'n_jobs': -1
    }

    model = RandomForestClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [68]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

[I 2024-02-22 19:12:03,032] A new study created in memory with name: no-name-b4b28254-091b-477d-a7c9-c69125975f62
[I 2024-02-22 19:12:18,567] Trial 0 finished with value: 0.725 and parameters: {'n_estimators': 837, 'max_depth': 47, 'min_samples_split': 20, 'random_state': 42, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.725.
[I 2024-02-22 19:12:35,427] Trial 1 finished with value: 0.73125 and parameters: {'n_estimators': 950, 'max_depth': 45, 'min_samples_split': 3, 'random_state': 42, 'min_samples_leaf': 11}. Best is trial 1 with value: 0.73125.
[I 2024-02-22 19:12:44,465] Trial 2 finished with value: 0.771875 and parameters: {'n_estimators': 501, 'max_depth': 42, 'min_samples_split': 7, 'random_state': 42, 'min_samples_leaf': 8}. Best is trial 2 with value: 0.771875.
[I 2024-02-22 19:12:57,466] Trial 3 finished with value: 0.68125 and parameters: {'n_estimators': 754, 'max_depth': 25, 'min_samples_split': 31, 'random_state': 42, 'min_samples_leaf': 10}. Best is trial 2 with 

Best params found : {'n_estimators': 916, 'max_depth': 21, 'min_samples_split': 2, 'random_state': 42, 'min_samples_leaf': 1}


0.9875

In [69]:
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['random forest'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# # One-hot encoding for probability calculation (adapt if necessary)
# y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
# y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

accuracy: 0.9875
recall: 0.9722222222222222
precision: 0.9629629629629629
f1-score: 0.9666666666666666


## Gassian NB

**Pre-tunning**

In [70]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_reduced, y_train)

y_pred = gnb.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# # One-hot encoding for probability calculation (adapt if necessary)
# y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
# y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)


accuracy: 0.9625
recall: 0.9444444444444444
precision: 0.9375
f1-score: 0.9358465608465608


**Tunning**

In [71]:
def objective(trial):
    hyperparams = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-9, 1e-4, log = True)
    }

    model = GaussianNB(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [72]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = GaussianNB(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['gnb'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# # One-hot encoding for probability calculation (adapt if necessary)
# y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
# y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 19:33:38,109] A new study created in memory with name: no-name-a92c3424-e235-4002-95bf-924ea9e98b0a
[I 2024-02-22 19:33:38,153] Trial 0 finished with value: 0.75 and parameters: {'var_smoothing': 2.735103004904717e-05}. Best is trial 0 with value: 0.75.
[I 2024-02-22 19:33:38,196] Trial 1 finished with value: 0.75 and parameters: {'var_smoothing': 3.2723782616696995e-07}. Best is trial 0 with value: 0.75.
[I 2024-02-22 19:33:38,238] Trial 2 finished with value: 0.75 and parameters: {'var_smoothing': 5.649537574672336e-06}. Best is trial 0 with value: 0.75.
[I 2024-02-22 19:33:38,280] Trial 3 finished with value: 0.75 and parameters: {'var_smoothing': 2.5300102939958998e-05}. Best is trial 0 with value: 0.75.
[I 2024-02-22 19:33:38,322] Trial 4 finished with value: 0.75625 and parameters: {'var_smoothing': 4.786323657403775e-05}. Best is trial 4 with value: 0.75625.
[I 2024-02-22 19:33:38,364] Trial 5 finished with value: 0.75 and parameters: {'var_smoothing': 3.5716899386

Best params found : {'var_smoothing': 9.807041097906174e-05}
accuracy: 0.9625
recall: 0.9444444444444444
precision: 0.9375
f1-score: 0.9358465608465608


## Logistic Regression

**Normal**

In [73]:
lr = LogisticRegression()
lr.fit(X_train_reduced, y_train)
y_pred = lr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# # One-hot encoding for probability calculation (adapt if necessary)
# y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
# y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0


**Tunning**

In [74]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'saga']),
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'multi_class': trial.suggest_categorical('multi_class', ['ovr']),
        'C': trial.suggest_loguniform("C", 1e-3, 1e3),
        'n_jobs': -1
    }

    model = LogisticRegression(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [75]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = LogisticRegression(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['logistic regression'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# # One-hot encoding for probability calculation (adapt if necessary)
# y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
# y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 19:33:45,511] A new study created in memory with name: no-name-dbb1ae29-ad51-413e-a461-79104247bc10
[I 2024-02-22 19:34:00,166] Trial 0 finished with value: 0.896875 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.12245382830212904}. Best is trial 0 with value: 0.896875.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/l

Best params found : {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.0033108212264938495}
accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0


In [76]:
# lr = LogisticRegression(multi_class='ovr', solver='liblinear')
# lr.fit(X_train_reduced, y_train)
# y_pred = lr.predict(X_test_reduced)

## KNN Classifier

**Normal**

In [77]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train_reduced, y_train)
y_pred = knn.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# # One-hot encoding for probability calculation (adapt if necessary)
# y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
# y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)


accuracy: 0.9125
recall: 0.9375
precision: 0.9143518518518517
f1-score: 0.9068783068783068


**Tunning**

In [78]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'n_neighbors': trial.suggest_int("n_neighbors", 5, 100),
        'weights': trial.suggest_categorical("weights", ["uniform", "distance"]),
        'metric': trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"]),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'n_jobs': -1
    }

    # Create KNN model with tuned hyperparameters
    model = KNeighborsClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [79]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model =KNeighborsClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['KNN'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# # One-hot encoding for probability calculation (adapt if necessary)
# y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
# y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 19:52:12,248] A new study created in memory with name: no-name-91241e7c-b466-49d3-a4fa-d50d592f6205
[I 2024-02-22 19:52:12,347] Trial 0 finished with value: 0.59375 and parameters: {'n_neighbors': 71, 'weights': 'distance', 'metric': 'minkowski', 'algorithm': 'kd_tree'}. Best is trial 0 with value: 0.59375.
[I 2024-02-22 19:52:12,385] Trial 1 finished with value: 0.59375 and parameters: {'n_neighbors': 70, 'weights': 'distance', 'metric': 'minkowski', 'algorithm': 'brute'}. Best is trial 0 with value: 0.59375.
[I 2024-02-22 19:52:12,426] Trial 2 finished with value: 0.584375 and parameters: {'n_neighbors': 54, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'auto'}. Best is trial 0 with value: 0.59375.
[I 2024-02-22 19:52:12,461] Trial 3 finished with value: 0.7375 and parameters: {'n_neighbors': 14, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'auto'}. Best is trial 3 with value: 0.7375.
[I 2024-02-22 19:52:12,566] Trial 4 finished with value: 0.

Best params found : {'n_neighbors': 7, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'ball_tree'}
accuracy: 0.925
recall: 0.949074074074074
precision: 0.9291666666666667
f1-score: 0.9256613756613759


## Support Vector Machine 

**Normal**

In [80]:
from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(X_train_reduced, y_train)
y_pred = svm.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
#                               average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0


**Tunning**

In [81]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

def objective(trial):
    hyperparams = {
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
        'C': trial.suggest_loguniform('C', 1e-5, 1e5),
        'gamma': trial.suggest_loguniform('gamma', 1e-5, 1e5),
        # 'degree': trial.suggest_int('degree', 2, 5),  # for polynomial kernel
        'tol': trial.suggest_loguniform('tol', 1e-4, 1e-2),
        'shrinking': trial.suggest_categorical('shrinking', [True, False]),
    }
    
    # Create KNN model with tuned hyperparameters
    model = SVC(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [82]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = SVC(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['svc'] = list(y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
#                               average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 19:52:23,839] A new study created in memory with name: no-name-093bad13-e7aa-432f-a9a0-b01d91e93fca
[I 2024-02-22 19:52:23,917] Trial 0 finished with value: 0.0 and parameters: {'kernel': 'rbf', 'C': 4.592749442766192e-05, 'gamma': 0.6505271188616003, 'tol': 0.00011198397951952586, 'shrinking': False}. Best is trial 0 with value: 0.0.
[I 2024-02-22 19:52:24,018] Trial 1 finished with value: 0.859375 and parameters: {'kernel': 'poly', 'C': 1.065123994697636, 'gamma': 241.98406701191826, 'tol': 0.0004034202091747687, 'shrinking': True}. Best is trial 1 with value: 0.859375.
[I 2024-02-22 19:52:24,127] Trial 2 finished with value: 0.0 and parameters: {'kernel': 'rbf', 'C': 9.257455569884044, 'gamma': 0.6899578769012508, 'tol': 0.0001240811100340618, 'shrinking': False}. Best is trial 1 with value: 0.859375.
[I 2024-02-22 19:52:24,226] Trial 3 finished with value: 0.0 and parameters: {'kernel': 'rbf', 'C': 687.5696207675261, 'gamma': 0.004725632729467525, 'tol': 0.00615358272

Best params found : {'kernel': 'linear', 'C': 4.14442691595589, 'gamma': 0.005332912971643873, 'tol': 0.000587601229759673, 'shrinking': False}
accuracy: 1.0
recall: 1.0
precision: 1.0
f1-score: 1.0


## Decision Tree

**Normal**

In [83]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_reduced, y_train)
y_pred = dt.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.6
recall: 0.55
precision: 0.5641666666666667
f1-score: 0.5229761904761905
roc_auc:  0.5452392744810545


**Tunning**

In [84]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        "max_depth" : trial.suggest_int("max_depth", 2, 10),
        "min_samples_split" : trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf" : trial.suggest_int("min_samples_leaf", 1, 10),
        "criterion" : trial.suggest_categorical("criterion", ["gini", "entropy"]),
        'random_state': trial.suggest_categorical('random_state', [42])
    }

    # Create KNN model with tuned hyperparameters
    model = DecisionTreeClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [85]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = DecisionTreeClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['decison tree'] = list(y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
#                               average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

[I 2024-02-22 19:52:37,257] A new study created in memory with name: no-name-e4e6553a-0317-499d-9df1-d05052db94b8
[I 2024-02-22 19:52:37,320] Trial 0 finished with value: 0.23125 and parameters: {'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 9, 'criterion': 'gini', 'random_state': 42}. Best is trial 0 with value: 0.23125.
[I 2024-02-22 19:52:37,386] Trial 1 finished with value: 0.30625 and parameters: {'max_depth': 10, 'min_samples_split': 12, 'min_samples_leaf': 9, 'criterion': 'gini', 'random_state': 42}. Best is trial 1 with value: 0.30625.
[I 2024-02-22 19:52:37,456] Trial 2 finished with value: 0.209375 and parameters: {'max_depth': 6, 'min_samples_split': 16, 'min_samples_leaf': 2, 'criterion': 'gini', 'random_state': 42}. Best is trial 1 with value: 0.30625.
[I 2024-02-22 19:52:37,527] Trial 3 finished with value: 0.296875 and parameters: {'max_depth': 9, 'min_samples_split': 16, 'min_samples_leaf': 7, 'criterion': 'gini', 'random_state': 42}. Best is trial 1 with 

Best params found : {'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy', 'random_state': 42}
accuracy: 0.6875
recall: 0.7314814814814815
precision: 0.7282407407407407
f1-score: 0.6792328042328042
