In [105]:
import os
import cv2
import numpy as np
import pandas as pd
from numpy.random import shuffle
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [106]:
def get_data(path_data='../ATT images', test_size=0.2, is_shuffle=True):
    folders = [f's{i+1}' for i in range(40)]
    X, y = [], []

    for i in range(1, 41):
        path = path_data + f'/s{i}'
        files = os.listdir(path)
        num_files = len(files)

        if is_shuffle:
            files = shuffle(files)

        for file in os.listdir(path):
            path = path_data + f'/s{i}/' + file
            image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
            pixels = np.reshape(image, [1, image.shape[0] * image.shape[1]])
            pixels = np.asarray(pixels)

            if len(X) == 0:
                X = pixels
            else:
                X = np.vstack([X, pixels])
            y.append(f's{i}')

    return X, np.asarray(y)

In [107]:
def get_length(n):
    start = int(np.sqrt(n))
    while start > 1:
        if n % start == 0:
            break
        else:
            start -= 1
    return start, n // start

In [108]:
def cut_image(X, num_image=4, shape=(112,92)):
    height, width = shape
    num_width, num_height = get_length(num_image)
    part_width = width // num_width
    part_height = height // num_height

    X_cut = []

    for i in range(X.shape[0]):
        image = X[i].reshape(shape)
        for k in range(num_width):
            for j in range(num_height):
                left = j * part_width
                upper = k * part_height
                right = left + part_width
                lower = upper + part_height

                # Crop the image to get the part
                part = image[upper:lower, left:right]

                pixels = np.reshape(part, [1, part_width * part_height])
                pixels = np.asarray(pixels)

                if k == 0 and j == 0:
                    X_i = pixels
                else:
                    X_i = np.hstack([X_i, pixels])

        if len(X_cut) == 0:
            X_cut = X_i
        else:
            X_cut = np.vstack([X_cut, X_i])

    return X_cut

In [109]:
X, y = get_data()

n_components_pca = 50
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [110]:
class ModularPCA:
    def __init__(self, n_components, num_image=4, shape_image=(112,92)):
        self.n_components = n_components
        self.shape = shape_image
        self.num_image = num_image

    def fit(self, X):
        self.X = cut_image(X, self.num_image, self.shape)
        self.pca = PCA(n_components=self.n_components, random_state=42)
        self.pca.fit(self.X)

    def transform(self, X):
        X_cut = cut_image(X, self.num_image, self.shape)
        return self.pca.transform(X_cut)

    def fit_transform(self, X):
        self.fit(X)
        return self.pca.fit_transform(self.X)

In [111]:
# accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# for train_index, test_index in kf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     mpca = ModularPCA(n_components=n_components_pca)
#     X_train_reduced = mpca.fit_transform(X_train)
#     X_test_reduced = mpca.transform(X_test)

#     # Euclidean
#     y_pred = []
#     for i in range(80):
#         min_ = np.argmin(np.sqrt(np.sum((X_train_reduced - X_test_reduced[i])**2,axis=1)))
#         y_pred.append(y_train[min_])

#     # Evaluate the model
#     accuracy = accuracy_score(y_test, y_pred)
#     accuracy_scores.append(accuracy)

# print(np.mean(accuracy_scores))

In [112]:
# accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# for train_index, test_index in kf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     mpca = ModularPCA(n_components=n_components_pca)
#     X_train_reduced = mpca.fit_transform(X_train)
#     X_test_reduced = mpca.transform(X_test)

#     # Logistic
#     lr = LogisticRegression(multi_class='ovr', solver='liblinear')
#     lr.fit(X_train_reduced, y_train)
#     y_pred = lr.predict(X_test_reduced)

#     # Evaluate the model
#     accuracy = accuracy_score(y_test, y_pred)
#     accuracy_scores.append(accuracy)

# print(np.mean(accuracy_scores))

In [113]:
X, y = get_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# num_image = 4

In [114]:
n_components_pca = 50
pca = ModularPCA(n_components=n_components_pca, num_image=4)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

# Modeling

In [115]:
from sklearn.model_selection import KFold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [116]:
lr = LogisticRegression()
lr.fit(X_train_reduced, y_train)
y_pred = lr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.975
recall: 0.9791666666666666
precision: 0.9837962962962963
f1-score: 0.9772486772486771
roc_auc:  0.9892248954748956


# Random Forest

In [117]:
prediction_results = {}

In [118]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

**Normal**

In [119]:
rfr = RandomForestClassifier(random_state=42)
rfr.fit(X_train_reduced,y_train)
y_pred = rfr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.925
recall: 0.9351851851851851
precision: 0.9097222222222222
f1-score: 0.9089947089947089
roc_auc:  0.7622988574972966


**Tunning**

In [120]:
import optuna


def objective(trial):
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
        'n_jobs': -1
    }

    model = RandomForestClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [121]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

[I 2024-02-16 13:07:44,088] A new study created in memory with name: no-name-f6136bad-4c9d-4ded-9534-68b225e5dadf
[I 2024-02-16 13:07:49,317] Trial 0 finished with value: 0.8 and parameters: {'n_estimators': 471, 'max_depth': 19, 'min_samples_split': 5, 'random_state': 42, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.8.
[I 2024-02-16 13:07:59,126] Trial 1 finished with value: 0.709375 and parameters: {'n_estimators': 916, 'max_depth': 32, 'min_samples_split': 32, 'random_state': 42, 'min_samples_leaf': 12}. Best is trial 0 with value: 0.8.
[I 2024-02-16 13:08:00,078] Trial 2 finished with value: 0.475 and parameters: {'n_estimators': 101, 'max_depth': 39, 'min_samples_split': 3, 'random_state': 42, 'min_samples_leaf': 31}. Best is trial 0 with value: 0.8.
[I 2024-02-16 13:08:09,829] Trial 3 finished with value: 0.653125 and parameters: {'n_estimators': 840, 'max_depth': 14, 'min_samples_split': 21, 'random_state': 42, 'min_samples_leaf': 21}. Best is trial 0 with value: 0.8.
[

Best params found : {'n_estimators': 620, 'max_depth': 49, 'min_samples_split': 4, 'random_state': 42, 'min_samples_leaf': 1}


0.9375

In [122]:
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['random forest'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.9375
recall: 0.949074074074074
precision: 0.9611111111111111
f1-score: 0.9430555555555556
roc_auc:  0.9736420986420986


# Gassian NB

**Pre-tunning**

In [123]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_reduced, y_train)

y_pred = gnb.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.925
recall: 0.9099099099099098
precision: 0.9121621621621622
f1-score: 0.8972972972972973
roc_auc:  0.7181602364188973


**Tunning**

In [124]:
def objective(trial):
    hyperparams = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-9, 1e-4, log = True)
    }

    model = GaussianNB(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [125]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = GaussianNB(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['gnb'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-16 13:16:38,201] A new study created in memory with name: no-name-94442f31-9760-4425-af12-bc6fddd690bc
[I 2024-02-16 13:16:38,230] Trial 0 finished with value: 0.746875 and parameters: {'var_smoothing': 9.698745441905131e-06}. Best is trial 0 with value: 0.746875.
[I 2024-02-16 13:16:38,258] Trial 1 finished with value: 0.746875 and parameters: {'var_smoothing': 1.2487384454792585e-05}. Best is trial 0 with value: 0.746875.
[I 2024-02-16 13:16:38,287] Trial 2 finished with value: 0.746875 and parameters: {'var_smoothing': 3.007912407322358e-09}. Best is trial 0 with value: 0.746875.
[I 2024-02-16 13:16:38,316] Trial 3 finished with value: 0.746875 and parameters: {'var_smoothing': 1.8770914153606374e-05}. Best is trial 0 with value: 0.746875.
[I 2024-02-16 13:16:38,342] Trial 4 finished with value: 0.746875 and parameters: {'var_smoothing': 4.404188767484831e-06}. Best is trial 0 with value: 0.746875.
[I 2024-02-16 13:16:38,369] Trial 5 finished with value: 0.746875 and para

Best params found : {'var_smoothing': 9.591934675048554e-05}
accuracy: 0.9375
recall: 0.9166666666666666
precision: 0.9144144144144145
f1-score: 0.9051480051480051
roc_auc:  0.721808267361232


# Logistic Regression

**Normal**

In [126]:
lr = LogisticRegression()
lr.fit(X_train_reduced, y_train)
y_pred = lr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.975
recall: 0.9791666666666666
precision: 0.9837962962962963
f1-score: 0.9772486772486771
roc_auc:  0.9892248954748956


**Tunning**

In [127]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'saga']),
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'multi_class': trial.suggest_categorical('multi_class', ['ovr']),
        'C': trial.suggest_loguniform("C", 1e-3, 1e3),
        'n_jobs': -1
    }

    model = LogisticRegression(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [128]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = LogisticRegression(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['logistic regression'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-16 13:16:41,041] A new study created in memory with name: no-name-86780fbc-5473-4ea8-a224-77adc73cc013
[I 2024-02-16 13:16:44,816] Trial 0 finished with value: 0.928125 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 4.683582694858897}. Best is trial 0 with value: 0.928125.
[I 2024-02-16 13:16:51,410] Trial 1 finished with value: 0.928125 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 656.3940294185421}. Best is trial 0 with value: 0.928125.
[I 2024-02-16 13:16:55,929] Trial 2 finished with value: 0.978125 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.013487194026077051}. Best is trial 2 with value: 0.978125.
[I 2024-02-16 13:16:59,952] Trial 3 finished with value: 0.928125 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 17.35892870483305}. Best is trial 2 with value: 0.978125.
[I 2024-02-16 13:17:00,446] Trial 4 finished with v

Best params found : {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.013487194026077051}
accuracy: 0.9875
recall: 0.9861111111111112
precision: 0.9907407407407407
f1-score: 0.9851851851851853
roc_auc:  0.9928774928774929


In [129]:
# lr = LogisticRegression(multi_class='ovr', solver='liblinear')
# lr.fit(X_train_reduced, y_train)
# y_pred = lr.predict(X_test_reduced)

# KNN Classifier

**Normal**

In [130]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train_reduced, y_train)
y_pred = knn.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.925
recall: 0.9421296296296295
precision: 0.9495370370370372
f1-score: 0.9279761904761905
roc_auc:  0.9699917551974515


**Tunning**

In [131]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'n_neighbors': trial.suggest_int("n_neighbors", 5, 100),
        'weights': trial.suggest_categorical("weights", ["uniform", "distance"]),
        'metric': trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"]),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'n_jobs': -1
    }

    # Create KNN model with tuned hyperparameters
    model = KNeighborsClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [132]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model =KNeighborsClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['KNN'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-16 13:20:39,779] A new study created in memory with name: no-name-d4402308-02ae-4c92-b85c-63f9a4e6f22a
[I 2024-02-16 13:20:39,964] Trial 0 finished with value: 0.559375 and parameters: {'n_neighbors': 96, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'ball_tree'}. Best is trial 0 with value: 0.559375.
[I 2024-02-16 13:20:40,002] Trial 1 finished with value: 0.69375 and parameters: {'n_neighbors': 30, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'brute'}. Best is trial 1 with value: 0.69375.
[I 2024-02-16 13:20:40,167] Trial 2 finished with value: 0.584375 and parameters: {'n_neighbors': 96, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'kd_tree'}. Best is trial 1 with value: 0.69375.
[I 2024-02-16 13:20:40,320] Trial 3 finished with value: 0.5875 and parameters: {'n_neighbors': 97, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'ball_tree'}. Best is trial 1 with value: 0.69375.
[I 2024-02-16 13:20:40,478] Trial 4 finished w

Best params found : {'n_neighbors': 5, 'weights': 'uniform', 'metric': 'minkowski', 'algorithm': 'auto'}
accuracy: 0.925
recall: 0.9421296296296295
precision: 0.9495370370370372
f1-score: 0.9279761904761905
roc_auc:  0.9699917551974515


Tuning rồi lưu kết quả tuning vào bên dưới

## num_image = 9

In [133]:
n_components_pca = 50
pca = ModularPCA(n_components=n_components_pca, num_image=9)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

In [134]:
# for train_index, test_index in skf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     pca = PCA(n_components=n_components_pca)
#     X_train_reduced = pca.fit_transform(X_train)
#     X_test_reduced = pca.transform(X_test)

#     # Euclidean
#     y_pred = []
#     for i in range(320):
#         min_ = np.argmin(np.sqrt(np.sum((X_train_reduced - X_test_reduced[i])**2,axis=1)))
#         y_pred.append(y_train[min_])

#     # Evaluate the model
#     accuracy = accuracy_score(y_test, y_pred)
#     accuracy_scores.append(accuracy)

# print(np.mean(accuracy_scores))

# Modeling

In [135]:
from sklearn.model_selection import KFold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Random Forest

In [136]:
prediction_results = {}

In [137]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

**Normal**

In [138]:
rfr = RandomForestClassifier(random_state=42)
rfr.fit(X_train_reduced,y_train)
y_pred = rfr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.9
recall: 0.9143518518518517
precision: 0.9120370370370369
f1-score: 0.8899470899470899
roc_auc:  0.9557535028262877


**Tunning**

In [139]:
import optuna


def objective(trial):
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
        'n_jobs': -1
    }

    model = RandomForestClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [140]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

[I 2024-02-16 13:20:43,860] A new study created in memory with name: no-name-96d79994-79ac-43f6-8029-7df0541dce74
[I 2024-02-16 13:20:47,015] Trial 0 finished with value: 0.784375 and parameters: {'n_estimators': 372, 'max_depth': 32, 'min_samples_split': 3, 'random_state': 42, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.784375.
[I 2024-02-16 13:20:50,086] Trial 1 finished with value: 0.51875 and parameters: {'n_estimators': 427, 'max_depth': 44, 'min_samples_split': 31, 'random_state': 42, 'min_samples_leaf': 31}. Best is trial 0 with value: 0.784375.
[I 2024-02-16 13:20:52,985] Trial 2 finished with value: 0.7125 and parameters: {'n_estimators': 358, 'max_depth': 14, 'min_samples_split': 27, 'random_state': 42, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.784375.
[I 2024-02-16 13:20:57,600] Trial 3 finished with value: 0.7375 and parameters: {'n_estimators': 604, 'max_depth': 39, 'min_samples_split': 26, 'random_state': 42, 'min_samples_leaf': 11}. Best is trial 0 

Best params found : {'n_estimators': 996, 'max_depth': 21, 'min_samples_split': 2, 'random_state': 42, 'min_samples_leaf': 1}


0.925

In [141]:
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['random forest'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.925
recall: 0.9421296296296295
precision: 0.9472222222222223
f1-score: 0.9298280423280423
roc_auc:  0.969994067699764


# Gassian NB

**Pre-tunning**

In [142]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_reduced, y_train)

y_pred = gnb.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.9
recall: 0.8851351351351351
precision: 0.9022522522522523
f1-score: 0.8757614757614758
roc_auc:  0.7146765079185132


**Tunning**

In [143]:
def objective(trial):
    hyperparams = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-9, 1e-4, log = True)
    }

    model = GaussianNB(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [144]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = GaussianNB(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['gnb'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-16 13:32:44,726] A new study created in memory with name: no-name-1b45c393-c3bc-485d-9df9-b0fee11a1cbc
[I 2024-02-16 13:32:44,755] Trial 0 finished with value: 0.759375 and parameters: {'var_smoothing': 2.3165040123622865e-05}. Best is trial 0 with value: 0.759375.
[I 2024-02-16 13:32:44,784] Trial 1 finished with value: 0.75625 and parameters: {'var_smoothing': 1.0358278426045092e-08}. Best is trial 0 with value: 0.759375.
[I 2024-02-16 13:32:44,820] Trial 2 finished with value: 0.75625 and parameters: {'var_smoothing': 5.521505986703645e-08}. Best is trial 0 with value: 0.759375.
[I 2024-02-16 13:32:44,849] Trial 3 finished with value: 0.76875 and parameters: {'var_smoothing': 8.199523154052229e-05}. Best is trial 3 with value: 0.76875.
[I 2024-02-16 13:32:44,903] Trial 4 finished with value: 0.75625 and parameters: {'var_smoothing': 9.72574556979045e-09}. Best is trial 3 with value: 0.76875.
[I 2024-02-16 13:32:44,940] Trial 5 finished with value: 0.75625 and parameters: 

Best params found : {'var_smoothing': 9.821045494689508e-05}
accuracy: 0.9125
recall: 0.894144144144144
precision: 0.9040540540540541
f1-score: 0.8858000858000857
roc_auc:  0.7146765079185131


# Logistic Regression

**Normal**

In [145]:
lr = LogisticRegression()
lr.fit(X_train_reduced, y_train)
y_pred = lr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.975
recall: 0.9791666666666666
precision: 0.9837962962962963
f1-score: 0.9772486772486771
roc_auc:  0.9892248954748956


**Tunning**

In [146]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'saga']),
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'multi_class': trial.suggest_categorical('multi_class', ['ovr']),
        'C': trial.suggest_loguniform("C", 1e-3, 1e3),
        'n_jobs': -1
    }

    model = LogisticRegression(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [147]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = LogisticRegression(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['logistic regression'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-16 13:32:47,578] A new study created in memory with name: no-name-071f9abe-ec4b-43cd-9a5d-3415065d9128
[I 2024-02-16 13:32:53,287] Trial 0 finished with value: 0.95625 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 1.7274641203365648}. Best is trial 0 with value: 0.95625.
[I 2024-02-16 13:32:56,517] Trial 1 finished with value: 0.915625 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 193.76773143465692}. Best is trial 0 with value: 0.95625.
[I 2024-02-16 13:32:57,097] Trial 2 finished with value: 0.971875 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.5943618660640917}. Best is trial 2 with value: 0.971875.
[I 2024-02-16 13:32:57,695] Trial 3 finished with value: 0.978125 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.0035945991220888815}. Best is trial 3 with value: 0.978125.
[I 2024-02-16 13:33:02,027] Trial 4 finished with value: 0

Best params found : {'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.0035945991220888815}
accuracy: 0.975
recall: 0.9791666666666666
precision: 0.9837962962962963
f1-score: 0.9772486772486771
roc_auc:  0.9892248954748956


In [148]:
# lr = LogisticRegression(multi_class='ovr', solver='liblinear')
# lr.fit(X_train_reduced, y_train)
# y_pred = lr.predict(X_test_reduced)

# KNN Classifier

**Normal**

In [149]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train_reduced, y_train)
y_pred = knn.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.925
recall: 0.9421296296296295
precision: 0.9495370370370372
f1-score: 0.9279761904761905
roc_auc:  0.9699917551974515


**Tunning**

In [150]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'n_neighbors': trial.suggest_int("n_neighbors", 5, 100),
        'weights': trial.suggest_categorical("weights", ["uniform", "distance"]),
        'metric': trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"]),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'n_jobs': -1
    }

    # Create KNN model with tuned hyperparameters
    model = KNeighborsClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [151]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model =KNeighborsClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['KNN'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-16 13:34:40,751] A new study created in memory with name: no-name-085fd718-b917-4648-8fa9-4e9b9004edd9
[I 2024-02-16 13:34:40,797] Trial 0 finished with value: 0.609375 and parameters: {'n_neighbors': 60, 'weights': 'distance', 'metric': 'minkowski', 'algorithm': 'auto'}. Best is trial 0 with value: 0.609375.
[I 2024-02-16 13:34:40,962] Trial 1 finished with value: 0.521875 and parameters: {'n_neighbors': 99, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'ball_tree'}. Best is trial 0 with value: 0.609375.
[I 2024-02-16 13:34:40,990] Trial 2 finished with value: 0.2375 and parameters: {'n_neighbors': 40, 'weights': 'uniform', 'metric': 'minkowski', 'algorithm': 'auto'}. Best is trial 0 with value: 0.609375.
[I 2024-02-16 13:34:41,148] Trial 3 finished with value: 0.584375 and parameters: {'n_neighbors': 88, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'ball_tree'}. Best is trial 0 with value: 0.609375.
[I 2024-02-16 13:34:41,179] Trial 4 finished wi

Best params found : {'n_neighbors': 6, 'weights': 'distance', 'metric': 'minkowski', 'algorithm': 'brute'}
accuracy: 0.925
recall: 0.9421296296296295
precision: 0.9537037037037036
f1-score: 0.9296296296296297
roc_auc:  0.9699917551974515


Tuning rồi lưu kết quả tuning vào bên dưới

## num_image = 16

In [152]:
n_components_pca = 50
pca = ModularPCA(n_components=n_components_pca, num_image=16)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

In [153]:
# accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []
# n_components_pca = 50
# n_splits = 5
# skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [154]:
# for train_index, test_index in skf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     pca = PCA(n_components=n_components_pca)
#     X_train_reduced = pca.fit_transform(X_train)
#     X_test_reduced = pca.transform(X_test)

#     # Euclidean
#     y_pred = []
#     for i in range(320):
#         min_ = np.argmin(np.sqrt(np.sum((X_train_reduced - X_test_reduced[i])**2,axis=1)))
#         y_pred.append(y_train[min_])

#     # Evaluate the model
#     accuracy = accuracy_score(y_test, y_pred)
#     accuracy_scores.append(accuracy)

# print(np.mean(accuracy_scores))

# Modeling

In [155]:
from sklearn.model_selection import KFold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Random Forest

In [156]:
prediction_results = {}

In [157]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

**Normal**

In [158]:
rfr = RandomForestClassifier(random_state=42)
rfr.fit(X_train_reduced,y_train)
y_pred = rfr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.925
recall: 0.9351851851851851
precision: 0.9097222222222222
f1-score: 0.9089947089947089
roc_auc:  0.7622988574972966


**Tunning**

In [159]:
import optuna


def objective(trial):
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
        'n_jobs': -1
    }

    model = RandomForestClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [160]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

[I 2024-02-16 13:34:46,108] A new study created in memory with name: no-name-c4d9aa94-5313-44b7-a599-90d0c3f70381
[I 2024-02-16 13:34:47,258] Trial 0 finished with value: 0.603125 and parameters: {'n_estimators': 146, 'max_depth': 37, 'min_samples_split': 3, 'random_state': 42, 'min_samples_leaf': 24}. Best is trial 0 with value: 0.603125.
[I 2024-02-16 13:34:52,077] Trial 1 finished with value: 0.6375 and parameters: {'n_estimators': 641, 'max_depth': 21, 'min_samples_split': 2, 'random_state': 42, 'min_samples_leaf': 23}. Best is trial 1 with value: 0.6375.
[I 2024-02-16 13:34:54,389] Trial 2 finished with value: 0.778125 and parameters: {'n_estimators': 268, 'max_depth': 30, 'min_samples_split': 13, 'random_state': 42, 'min_samples_leaf': 10}. Best is trial 2 with value: 0.778125.
[I 2024-02-16 13:34:56,010] Trial 3 finished with value: 0.734375 and parameters: {'n_estimators': 188, 'max_depth': 28, 'min_samples_split': 13, 'random_state': 42, 'min_samples_leaf': 13}. Best is trial 

Best params found : {'n_estimators': 643, 'max_depth': 48, 'min_samples_split': 4, 'random_state': 42, 'min_samples_leaf': 1}


0.9375

In [161]:
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['random forest'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.9375
recall: 0.949074074074074
precision: 0.9611111111111111
f1-score: 0.9430555555555556
roc_auc:  0.9736420986420986


# Gassian NB

**Pre-tunning**

In [162]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_reduced, y_train)

y_pred = gnb.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.925
recall: 0.9099099099099098
precision: 0.9121621621621622
f1-score: 0.8972972972972973
roc_auc:  0.7181602364188973


**Tunning**

In [163]:
def objective(trial):
    hyperparams = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-9, 1e-4, log = True)
    }

    model = GaussianNB(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [164]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = GaussianNB(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['gnb'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-16 13:41:22,778] A new study created in memory with name: no-name-5f8ecb24-affd-4425-a67e-2ebcb27949bd
[I 2024-02-16 13:41:22,807] Trial 0 finished with value: 0.746875 and parameters: {'var_smoothing': 5.839523690895988e-08}. Best is trial 0 with value: 0.746875.
[I 2024-02-16 13:41:22,834] Trial 1 finished with value: 0.746875 and parameters: {'var_smoothing': 1.3881904701595762e-06}. Best is trial 0 with value: 0.746875.
[I 2024-02-16 13:41:22,865] Trial 2 finished with value: 0.746875 and parameters: {'var_smoothing': 5.63962904029412e-09}. Best is trial 0 with value: 0.746875.
[I 2024-02-16 13:41:22,896] Trial 3 finished with value: 0.746875 and parameters: {'var_smoothing': 2.9645765235779734e-06}. Best is trial 0 with value: 0.746875.
[I 2024-02-16 13:41:22,928] Trial 4 finished with value: 0.746875 and parameters: {'var_smoothing': 2.708548092316079e-09}. Best is trial 0 with value: 0.746875.
[I 2024-02-16 13:41:22,957] Trial 5 finished with value: 0.746875 and param

Best params found : {'var_smoothing': 9.555448986550812e-05}
accuracy: 0.9375
recall: 0.9166666666666666
precision: 0.9144144144144145
f1-score: 0.9051480051480051
roc_auc:  0.721808267361232


# Logistic Regression

**Normal**

In [165]:
lr = LogisticRegression()
lr.fit(X_train_reduced, y_train)
y_pred = lr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.975
recall: 0.9791666666666666
precision: 0.9837962962962963
f1-score: 0.9772486772486771
roc_auc:  0.9892248954748956


**Tunning**

In [166]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'saga']),
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'multi_class': trial.suggest_categorical('multi_class', ['ovr']),
        'C': trial.suggest_loguniform("C", 1e-3, 1e3),
        'n_jobs': -1
    }

    model = LogisticRegression(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [167]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = LogisticRegression(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['logistic regression'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-16 13:41:25,991] A new study created in memory with name: no-name-39bb7bfb-1863-43f7-a9b5-478d0ba63f76
[I 2024-02-16 13:41:26,577] Trial 0 finished with value: 0.846875 and parameters: {'solver': 'saga', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 176.10033805703344}. Best is trial 0 with value: 0.846875.
[I 2024-02-16 13:41:27,190] Trial 1 finished with value: 0.846875 and parameters: {'solver': 'saga', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 46.62012583227125}. Best is trial 0 with value: 0.846875.
[I 2024-02-16 13:41:30,973] Trial 2 finished with value: 0.925 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.2969974718905308}. Best is trial 2 with value: 0.925.
[I 2024-02-16 13:41:34,050] Trial 3 finished with value: 0.86875 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 473.45133644380525}. Best is trial 2 with value: 0.925.
[I 2024-02-16 13:41:38,463] Trial 4 finished with value: 0.96875 and parame

Best params found : {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.004559581926751442}
accuracy: 0.9875
recall: 0.9861111111111112
precision: 0.9907407407407407
f1-score: 0.9851851851851853
roc_auc:  0.9928774928774929


In [168]:
# lr = LogisticRegression(multi_class='ovr', solver='liblinear')
# lr.fit(X_train_reduced, y_train)
# y_pred = lr.predict(X_test_reduced)

# KNN Classifier

**Normal**

In [169]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train_reduced, y_train)
y_pred = knn.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.925
recall: 0.9421296296296295
precision: 0.9495370370370372
f1-score: 0.9279761904761905
roc_auc:  0.9699917551974515


**Tunning**

In [170]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'n_neighbors': trial.suggest_int("n_neighbors", 5, 100),
        'weights': trial.suggest_categorical("weights", ["uniform", "distance"]),
        'metric': trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"]),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'n_jobs': -1
    }

    # Create KNN model with tuned hyperparameters
    model = KNeighborsClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [171]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model =KNeighborsClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['KNN'] = y_pred

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

[I 2024-02-16 13:48:05,686] A new study created in memory with name: no-name-623a6daf-eb90-499d-8e1d-08e25c16f4ac
[I 2024-02-16 13:48:05,889] Trial 0 finished with value: 0.284375 and parameters: {'n_neighbors': 34, 'weights': 'uniform', 'metric': 'euclidean', 'algorithm': 'ball_tree'}. Best is trial 0 with value: 0.284375.
[I 2024-02-16 13:48:06,130] Trial 1 finished with value: 0.184375 and parameters: {'n_neighbors': 54, 'weights': 'uniform', 'metric': 'manhattan', 'algorithm': 'kd_tree'}. Best is trial 0 with value: 0.284375.
[I 2024-02-16 13:48:06,285] Trial 2 finished with value: 0.596875 and parameters: {'n_neighbors': 65, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'kd_tree'}. Best is trial 2 with value: 0.596875.
[I 2024-02-16 13:48:06,439] Trial 3 finished with value: 0.584375 and parameters: {'n_neighbors': 93, 'weights': 'distance', 'metric': 'minkowski', 'algorithm': 'kd_tree'}. Best is trial 2 with value: 0.596875.
[I 2024-02-16 13:48:06,596] Trial 4 finish

Best params found : {'n_neighbors': 5, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'auto'}
accuracy: 0.9375
recall: 0.949074074074074
precision: 0.9564814814814816
f1-score: 0.9393518518518518
roc_auc:  0.9736443526000489
