<a href="https://www.kaggle.com/code/hnaw257/facerecognition-task3-tunning-pca?scriptVersionId=164377468" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.datasets import fetch_lfw_people 
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [2]:
lfw_people = fetch_lfw_people(min_faces_per_person = 70, funneled=False) 
  
n_samples, h, w = lfw_people.images.shape 
  
X = lfw_people.data 
n_features = X.shape[1] 
  
y = lfw_people.target 
target_names = lfw_people.target_names 
n_classes = target_names.shape[0] 
  
# Print Details about dataset 
print("Number of Data Samples: % d" % n_samples) 
print("Size of a data sample: % d" % n_features) 
print("Number of Class Labels: % d" % n_classes)

Number of Data Samples:  1288
Size of a data sample:  2914
Number of Class Labels:  7


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42, stratify=y) 

In [4]:
n_components = 150
  
pca = PCA(n_components=n_components)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

# Euclidean

In [5]:
y_predict = []
for i in range(len(X_test_reduced)):
    min_ = np.argmin(np.sqrt(np.sum((X_train_reduced - X_test_reduced[i])**2,axis=1)))
    y_predict.append(y_train[min_])

In [6]:
print(classification_report(y_test, y_predict, target_names = target_names)) 

                   precision    recall  f1-score   support

     Ariel Sharon       0.29      0.26      0.28        19
     Colin Powell       0.47      0.54      0.50        59
  Donald Rumsfeld       0.40      0.40      0.40        30
    George W Bush       0.59      0.64      0.62       133
Gerhard Schroeder       0.11      0.07      0.09        27
      Hugo Chavez       0.25      0.11      0.15        18
       Tony Blair       0.37      0.39      0.38        36

         accuracy                           0.47       322
        macro avg       0.36      0.35      0.35       322
     weighted avg       0.45      0.47      0.46       322



****

**Set up dataframe for model results storage**

In [7]:
model_scores = {}
prediction_results = {}

# Logistic Regression

In [8]:
lr = LogisticRegression(multi_class='ovr', solver='liblinear')
lr.fit(X_train_reduced, y_train)
y_predict = lr.predict(X_test_reduced)

In [9]:
print(classification_report(y_test, y_predict, target_names = target_names)) 

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.85      0.90      0.88        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.86      0.93      0.89       133
Gerhard Schroeder       0.77      0.63      0.69        27
      Hugo Chavez       0.82      0.50      0.62        18
       Tony Blair       0.79      0.83      0.81        36

         accuracy                           0.84       322
        macro avg       0.83      0.76      0.78       322
     weighted avg       0.84      0.84      0.83       322



*****

# PCA - TUNNING PROCESS

In [10]:
from sklearn.model_selection import KFold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Support Vector Machine 

**Normal**

In [11]:
from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(X_train_reduced, y_train)
y_pred = svm.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.7763975155279503
recall: 0.6983450353082059
precision: 0.7211901751698047
f1-score: 0.7068685505938718
roc_auc:  0.8281798224076501


**Tunning**

In [12]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

def objective(trial):
    hyperparams = {
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
        'C': trial.suggest_loguniform('C', 1e-5, 1e5),
        'gamma': trial.suggest_loguniform('gamma', 1e-5, 1e5),
        # 'degree': trial.suggest_int('degree', 2, 5),  # for polynomial kernel
        'tol': trial.suggest_loguniform('tol', 1e-4, 1e-2),
        'shrinking': trial.suggest_categorical('shrinking', [True, False]),
    }
    
    # Create KNN model with tuned hyperparameters
    model = SVC(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [13]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = SVC(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['svc'] = list(y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'SVM'
model_scores[model_name] = [accuracy,recall,f1,precision,roc_auc_macro]

[I 2024-02-26 11:51:15,519] A new study created in memory with name: no-name-44405c63-23dc-438f-a10f-a659b16c53f7
[I 2024-02-26 11:51:15,733] Trial 0 finished with value: 0.41097163613054855 and parameters: {'kernel': 'poly', 'C': 2.9988357850868615e-05, 'gamma': 0.00014300316235684303, 'tol': 0.0001275602923327324, 'shrinking': True}. Best is trial 0 with value: 0.41097163613054855.
[I 2024-02-26 11:51:16,221] Trial 1 finished with value: 0.41097163613054855 and parameters: {'kernel': 'rbf', 'C': 0.0001541043402108083, 'gamma': 1100.40771996776, 'tol': 0.0008872407213702487, 'shrinking': True}. Best is trial 0 with value: 0.41097163613054855.
[I 2024-02-26 11:51:16,697] Trial 2 finished with value: 0.625260402756263 and parameters: {'kernel': 'rbf', 'C': 41.13044372693996, 'gamma': 0.027607349304346787, 'tol': 0.005492749308638338, 'shrinking': False}. Best is trial 2 with value: 0.625260402756263.
[I 2024-02-26 11:51:16,935] Trial 3 finished with value: 0.7463757277923188 and paramet

Best params found : {'kernel': 'linear', 'C': 0.21896984117938287, 'gamma': 28630.419844329415, 'tol': 0.00011304346810759962, 'shrinking': True}
accuracy: 0.8229813664596274
recall: 0.7535562069014344
precision: 0.8012223029568261
f1-score: 0.7723198490479504
roc_auc:  0.8596759409820124


In [14]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

SVM

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.85      0.90      0.88        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.86      0.93      0.89       133
Gerhard Schroeder       0.77      0.63      0.69        27
      Hugo Chavez       0.82      0.50      0.62        18
       Tony Blair       0.79      0.83      0.81        36

         accuracy                           0.84       322
        macro avg       0.83      0.76      0.78       322
     weighted avg       0.84      0.84      0.83       322



# Decision Tree

**Normal**

In [15]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_reduced, y_train)
y_pred = dt.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

svm = SVC(kernel='linear')
svm.fit(X_train_reduced, y_train)
y_pred = svm.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.39751552795031053
recall: 0.24681811469755907
precision: 0.24931683784624964
f1-score: 0.2449976381444227
roc_auc:  0.5666445162506387
accuracy: 0.7763975155279503
recall: 0.6983450353082059
precision: 0.7211901751698047
f1-score: 0.7068685505938718
roc_auc:  0.8281798224076501


**Tunning**

In [16]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        "max_depth" : trial.suggest_int("max_depth", 2, 10),
        "min_samples_split" : trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf" : trial.suggest_int("min_samples_leaf", 1, 10),
        "criterion" : trial.suggest_categorical("criterion", ["gini", "entropy"]),
        'random_state': trial.suggest_categorical('random_state', [42])

    }

    # Create KNN model with tuned hyperparameters
    model = DecisionTreeClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [17]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = DecisionTreeClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['decison tree'] = list(y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'Decision Tree'
model_scores[model_name] = [accuracy, recall,f1,precision,roc_auc_macro]

[I 2024-02-26 11:51:49,038] A new study created in memory with name: no-name-f4d13263-2131-4272-b79e-523f3cac92cd
[I 2024-02-26 11:51:49,345] Trial 0 finished with value: 0.3861171945943059 and parameters: {'max_depth': 6, 'min_samples_split': 20, 'min_samples_leaf': 7, 'criterion': 'gini', 'random_state': 42}. Best is trial 0 with value: 0.3861171945943059.
[I 2024-02-26 11:51:49,479] Trial 1 finished with value: 0.40785214465039255 and parameters: {'max_depth': 2, 'min_samples_split': 6, 'min_samples_leaf': 8, 'criterion': 'gini', 'random_state': 42}. Best is trial 1 with value: 0.40785214465039255.
[I 2024-02-26 11:51:49,798] Trial 2 finished with value: 0.3715933977885796 and parameters: {'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 4, 'criterion': 'gini', 'random_state': 42}. Best is trial 1 with value: 0.40785214465039255.
[I 2024-02-26 11:51:50,428] Trial 3 finished with value: 0.3809465306340473 and parameters: {'max_depth': 6, 'min_samples_split': 11, 'min_sampl

Best params found : {'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 10, 'criterion': 'gini', 'random_state': 42}
accuracy: 0.43788819875776397
recall: 0.254194573061656
precision: 0.3003118464641978
f1-score: 0.24500657969456083
roc_auc:  0.5696421059906465


In [18]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

Decision Tree

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.85      0.90      0.88        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.86      0.93      0.89       133
Gerhard Schroeder       0.77      0.63      0.69        27
      Hugo Chavez       0.82      0.50      0.62        18
       Tony Blair       0.79      0.83      0.81        36

         accuracy                           0.84       322
        macro avg       0.83      0.76      0.78       322
     weighted avg       0.84      0.84      0.83       322



# KNN Classifier

**Normal**

In [19]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train_reduced, y_train)
y_pred = knn.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.5248447204968945
recall: 0.38306868192811866
precision: 0.4871229432436732
f1-score: 0.38043121586788226
roc_auc:  0.6448149449183017


**Tunning**

In [20]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

def objective(trial):
    hyperparams = {
        'n_neighbors': trial.suggest_int("n_neighbors", 5, 100),
        'weights' : trial.suggest_categorical("weights", ["uniform", "distance"]),
        'metric' : trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"]),
        'algorithm':trial.suggest_categorical('algorithm',['auto', 'ball_tree', 'kd_tree', 'brute']),
        'n_jobs': -1
    }
    
    # Create KNN model with tuned hyperparameters
    model = KNeighborsClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [21]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model =KNeighborsClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['KNN'] = y_pred 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'K-Nearest Neighbors'
model_scores[model_name] = [accuracy, recall,f1,precision,roc_auc_macro]

[I 2024-02-26 11:52:19,533] A new study created in memory with name: no-name-79a53a55-cce4-4f8c-b027-dd1b30c13ae5
[I 2024-02-26 11:52:19,749] Trial 0 finished with value: 0.4492548474974627 and parameters: {'n_neighbors': 81, 'weights': 'uniform', 'metric': 'manhattan', 'algorithm': 'ball_tree'}. Best is trial 0 with value: 0.4492548474974627.
[I 2024-02-26 11:52:19,998] Trial 1 finished with value: 0.4502911169275145 and parameters: {'n_neighbors': 71, 'weights': 'uniform', 'metric': 'manhattan', 'algorithm': 'kd_tree'}. Best is trial 1 with value: 0.4502911169275145.
[I 2024-02-26 11:52:20,034] Trial 2 finished with value: 0.4616740558730837 and parameters: {'n_neighbors': 50, 'weights': 'distance', 'metric': 'minkowski', 'algorithm': 'auto'}. Best is trial 2 with value: 0.4616740558730837.
[I 2024-02-26 11:52:20,060] Trial 3 finished with value: 0.5103626943005182 and parameters: {'n_neighbors': 9, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'auto'}. Best is trial 3 w

Best params found : {'n_neighbors': 9, 'weights': 'uniform', 'metric': 'manhattan', 'algorithm': 'kd_tree'}
accuracy: 0.5652173913043478
recall: 0.3466107763355119
precision: 0.517234924783587
f1-score: 0.3564377371073326
roc_auc:  0.625251229808976


In [22]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

K-Nearest Neighbors

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.85      0.90      0.88        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.86      0.93      0.89       133
Gerhard Schroeder       0.77      0.63      0.69        27
      Hugo Chavez       0.82      0.50      0.62        18
       Tony Blair       0.79      0.83      0.81        36

         accuracy                           0.84       322
        macro avg       0.83      0.76      0.78       322
     weighted avg       0.84      0.84      0.83       322



****

# Gassian NB

**Pre-tunning**

In [23]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_reduced, y_train)

y_pred = gnb.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovr')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)


accuracy: 0.7018633540372671
recall: 0.6062613488253861
precision: 0.6822034588396305
f1-score: 0.6265597633503699


**Tunning**

In [24]:
def objective(trial):
    hyperparams = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-9, 1e-4, log = True)
    }
    
    model = GaussianNB(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [25]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = GaussianNB(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['gnb'] = y_pred 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'Gaussian Naives Bayes'
model_scores[model_name] = [accuracy, recall,f1,precision,roc_auc_macro]

[I 2024-02-26 11:52:35,834] A new study created in memory with name: no-name-c997f65e-7a25-42dc-97b0-5e12e458453c
[I 2024-02-26 11:52:35,852] Trial 0 finished with value: 0.6211260082260563 and parameters: {'var_smoothing': 1.4445475674382134e-05}. Best is trial 0 with value: 0.6211260082260563.
[I 2024-02-26 11:52:35,868] Trial 1 finished with value: 0.6221622776561082 and parameters: {'var_smoothing': 9.164535262186155e-08}. Best is trial 1 with value: 0.6221622776561082.
[I 2024-02-26 11:52:35,889] Trial 2 finished with value: 0.6221622776561082 and parameters: {'var_smoothing': 3.4231558978384805e-08}. Best is trial 1 with value: 0.6221622776561082.
[I 2024-02-26 11:52:35,906] Trial 3 finished with value: 0.6221622776561082 and parameters: {'var_smoothing': 8.685609467917242e-07}. Best is trial 1 with value: 0.6221622776561082.
[I 2024-02-26 11:52:35,923] Trial 4 finished with value: 0.6221622776561082 and parameters: {'var_smoothing': 1.9811042507744773e-08}. Best is trial 1 with 

Best params found : {'var_smoothing': 9.62524252094234e-05}
accuracy: 0.7049689440993789
recall: 0.6015716247843176
precision: 0.6822317232925549
f1-score: 0.6220105792857565
roc_auc:  0.7720775719427777


In [26]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

Gaussian Naives Bayes

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.85      0.90      0.88        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.86      0.93      0.89       133
Gerhard Schroeder       0.77      0.63      0.69        27
      Hugo Chavez       0.82      0.50      0.62        18
       Tony Blair       0.79      0.83      0.81        36

         accuracy                           0.84       322
        macro avg       0.83      0.76      0.78       322
     weighted avg       0.84      0.84      0.83       322



****

# Logistic Regression

**Normal**

In [27]:
lr = LogisticRegression()
lr.fit(X_train_reduced, y_train)
y_pred = lr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# # One-hot encoding for probability calculation (adapt if necessary)
# y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
# y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

accuracy: 0.8229813664596274
recall: 0.7423265926451857
precision: 0.7984699942150627
f1-score: 0.7638405921853345


**Tunning**

In [28]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'saga']),
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'multi_class': trial.suggest_categorical('multi_class', ['ovr']),
        'C': trial.suggest_loguniform("C", 1e-3, 1e3),
        'n_jobs': -1
    }

    model = LogisticRegression(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [29]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = LogisticRegression(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['logistic regression'] = y_pred 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'Logistic Regression'
model_scores[model_name] = [accuracy, recall,f1,precision,roc_auc_macro]

[I 2024-02-26 11:52:38,838] A new study created in memory with name: no-name-2805e459-c9da-4de0-abf8-8a647b6e0615
[I 2024-02-26 11:52:40,405] Trial 0 finished with value: 0.7080658084504032 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.05484102257712073}. Best is trial 0 with value: 0.7080658084504032.
[I 2024-02-26 11:52:40,776] Trial 1 finished with value: 0.5455210725922761 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.007789526569870762}. Best is trial 0 with value: 0.7080658084504032.
[I 2024-02-26 11:52:43,261] Trial 2 finished with value: 0.7629506970781476 and parameters: {'solver': 'saga', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 188.18671077703118}. Best is trial 2 with value: 0.7629506970781476.
[I 2024-02-26 11:52:43,433] Trial 3 finished with value: 0.510319961540516 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.005234650629286082}. Best is trial 2 with va

Best params found : {'solver': 'liblinear', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.706368277303258}
accuracy: 0.84472049689441
recall: 0.7625656824331476
precision: 0.8360151055217219
f1-score: 0.7909415034686162
roc_auc:  0.8660207422847463


In [30]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

Logistic Regression

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.85      0.90      0.88        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.86      0.93      0.89       133
Gerhard Schroeder       0.77      0.63      0.69        27
      Hugo Chavez       0.82      0.50      0.62        18
       Tony Blair       0.79      0.83      0.81        36

         accuracy                           0.84       322
        macro avg       0.83      0.76      0.78       322
     weighted avg       0.84      0.84      0.83       322



***

# Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

**Normal**

In [32]:
rfr = RandomForestClassifier(random_state=42)
rfr.fit(X_train_reduced,y_train)
y_pred = rfr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.515527950310559
recall: 0.2420775506158455
precision: 0.5628264208909369
f1-score: 0.23636595327435472
roc_auc:  0.5684446083195608


**Tunning**

In [33]:
import optuna


def objective(trial):
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
        'n_jobs': -1
    }

    model = RandomForestClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [34]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

[I 2024-02-26 11:55:20,775] A new study created in memory with name: no-name-89bd3598-694a-4397-88ba-827190e9d602
[I 2024-02-26 11:55:38,040] Trial 0 finished with value: 0.44618877196730944 and parameters: {'n_estimators': 749, 'max_depth': 21, 'min_samples_split': 13, 'random_state': 42, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.44618877196730944.
[I 2024-02-26 11:55:55,466] Trial 1 finished with value: 0.4171892527108595 and parameters: {'n_estimators': 975, 'max_depth': 37, 'min_samples_split': 12, 'random_state': 42, 'min_samples_leaf': 32}. Best is trial 0 with value: 0.44618877196730944.
[I 2024-02-26 11:55:58,221] Trial 2 finished with value: 0.4389295443619464 and parameters: {'n_estimators': 128, 'max_depth': 36, 'min_samples_split': 31, 'random_state': 42, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.44618877196730944.
[I 2024-02-26 11:56:01,399] Trial 3 finished with value: 0.4420490358421024 and parameters: {'n_estimators': 152, 'max_depth': 31, 'min_s

Best params found : {'n_estimators': 108, 'max_depth': 41, 'min_samples_split': 2, 'random_state': 42, 'min_samples_leaf': 1}


0.515527950310559

In [35]:
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['random forest'] = y_pred 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'Random Forest'
model_scores[model_name] = [accuracy, recall,f1,precision,roc_auc_macro]

accuracy: 0.515527950310559
recall: 0.2381900068977933
precision: 0.5197278911564627
f1-score: 0.22529692853671196
roc_auc:  0.5610324692864168


In [36]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

Random Forest

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.85      0.90      0.88        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.86      0.93      0.89       133
Gerhard Schroeder       0.77      0.63      0.69        27
      Hugo Chavez       0.82      0.50      0.62        18
       Tony Blair       0.79      0.83      0.81        36

         accuracy                           0.84       322
        macro avg       0.83      0.76      0.78       322
     weighted avg       0.84      0.84      0.83       322



****

****

# SUMMARY

**Classification over all models**

In [37]:
results_df = pd.DataFrame(model_scores, index=['Accuracy', 'Recall','F1 Score','Precision','ROC AUC']).T
results_df

Unnamed: 0,Accuracy,Recall,F1 Score,Precision,ROC AUC
SVM,0.822981,0.753556,0.77232,0.801222,0.859676
Decision Tree,0.437888,0.254195,0.245007,0.300312,0.569642
K-Nearest Neighbors,0.565217,0.346611,0.356438,0.517235,0.625251
Gaussian Naives Bayes,0.704969,0.601572,0.622011,0.682232,0.772078
Logistic Regression,0.84472,0.762566,0.790942,0.836015,0.866021
Random Forest,0.515528,0.23819,0.225297,0.519728,0.561032


In [38]:
# dump prediction results
# import json
# with open("/kaggle/working/predict.json", "w") as json_file:
#     json.dump(prediction_results, json_file)

In [39]:
predictions = {}
for k in prediction_results.keys():
    predictions[k] = list(prediction_results[k])
predictions

{'svc': [1,
  1,
  3,
  3,
  1,
  3,
  3,
  3,
  3,
  1,
  3,
  0,
  6,
  5,
  3,
  3,
  3,
  5,
  4,
  3,
  3,
  2,
  1,
  4,
  1,
  1,
  1,
  0,
  3,
  1,
  1,
  3,
  3,
  1,
  2,
  4,
  2,
  1,
  1,
  3,
  3,
  1,
  1,
  1,
  5,
  6,
  1,
  2,
  3,
  3,
  3,
  0,
  1,
  1,
  3,
  0,
  3,
  4,
  3,
  3,
  1,
  5,
  3,
  3,
  1,
  3,
  3,
  1,
  6,
  3,
  1,
  3,
  1,
  2,
  3,
  6,
  6,
  3,
  6,
  3,
  3,
  1,
  3,
  1,
  0,
  4,
  2,
  1,
  5,
  3,
  3,
  2,
  6,
  5,
  3,
  1,
  2,
  3,
  2,
  3,
  1,
  3,
  3,
  4,
  6,
  3,
  3,
  1,
  3,
  3,
  1,
  5,
  3,
  3,
  3,
  2,
  3,
  1,
  6,
  3,
  0,
  1,
  3,
  3,
  2,
  2,
  1,
  3,
  3,
  3,
  5,
  6,
  2,
  2,
  1,
  0,
  3,
  1,
  3,
  0,
  1,
  3,
  3,
  3,
  1,
  2,
  6,
  3,
  6,
  3,
  1,
  1,
  3,
  2,
  2,
  3,
  3,
  3,
  6,
  3,
  3,
  6,
  6,
  3,
  3,
  3,
  3,
  3,
  6,
  3,
  3,
  3,
  4,
  3,
  2,
  3,
  1,
  1,
  3,
  3,
  0,
  1,
  3,
  6,
  2,
  6,
  4,
  4,
  3,
  6,
  0,
  3,
  0,
  2,
  3,
  3,
  4,
  3,
  1