<a href="https://www.kaggle.com/code/hnaw257/facerecognition-task3-tunning-pca?scriptVersionId=164376844" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.datasets import fetch_lfw_people 
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [2]:
lfw_people = fetch_lfw_people(min_faces_per_person = 70, funneled=False) 
  
n_samples, h, w = lfw_people.images.shape 
  
X = lfw_people.data 
n_features = X.shape[1] 
  
y = lfw_people.target 
target_names = lfw_people.target_names 
n_classes = target_names.shape[0] 
  
# Print Details about dataset 
print("Number of Data Samples: % d" % n_samples) 
print("Size of a data sample: % d" % n_features) 
print("Number of Class Labels: % d" % n_classes)

Number of Data Samples:  1288
Size of a data sample:  2914
Number of Class Labels:  7


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42, stratify=y) 

In [4]:
n_components = 150
  
pca = PCA(n_components=n_components)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

# Euclidean

In [5]:
y_predict = []
for i in range(len(X_test_reduced)):
    min_ = np.argmin(np.sqrt(np.sum((X_train_reduced - X_test_reduced[i])**2,axis=1)))
    y_predict.append(y_train[min_])

In [6]:
print(classification_report(y_test, y_predict, target_names = target_names)) 

                   precision    recall  f1-score   support

     Ariel Sharon       0.29      0.26      0.28        19
     Colin Powell       0.47      0.54      0.50        59
  Donald Rumsfeld       0.40      0.40      0.40        30
    George W Bush       0.59      0.64      0.62       133
Gerhard Schroeder       0.11      0.07      0.09        27
      Hugo Chavez       0.25      0.11      0.15        18
       Tony Blair       0.37      0.39      0.38        36

         accuracy                           0.47       322
        macro avg       0.36      0.35      0.35       322
     weighted avg       0.45      0.47      0.46       322



****

**Set up dataframe for model results storage**

In [7]:
model_scores = {}
prediction_results = {}

# Logistic Regression

In [8]:
lr = LogisticRegression(multi_class='ovr', solver='liblinear')
lr.fit(X_train_reduced, y_train)
y_predict = lr.predict(X_test_reduced)

In [9]:
print(classification_report(y_test, y_predict, target_names = target_names)) 

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.84      0.90      0.87        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.85      0.92      0.88       133
Gerhard Schroeder       0.75      0.56      0.64        27
      Hugo Chavez       0.80      0.44      0.57        18
       Tony Blair       0.74      0.86      0.79        36

         accuracy                           0.83       322
        macro avg       0.81      0.74      0.76       322
     weighted avg       0.82      0.83      0.82       322



*****

# PCA - TUNNING PROCESS

In [10]:
from sklearn.model_selection import KFold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Support Vector Machine 

**Normal**

In [11]:
from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(X_train_reduced, y_train)
y_pred = svm.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.7763975155279503
recall: 0.6788319669229571
precision: 0.7165616133267115
f1-score: 0.692277642195528
roc_auc:  0.8182494777228536


**Tunning**

In [12]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

def objective(trial):
    hyperparams = {
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
        'C': trial.suggest_loguniform('C', 1e-5, 1e5),
        'gamma': trial.suggest_loguniform('gamma', 1e-5, 1e5),
        # 'degree': trial.suggest_int('degree', 2, 5),  # for polynomial kernel
        'tol': trial.suggest_loguniform('tol', 1e-4, 1e-2),
        'shrinking': trial.suggest_categorical('shrinking', [True, False]),
    }
    
    # Create KNN model with tuned hyperparameters
    model = SVC(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [13]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = SVC(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['svc'] = list(y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'SVM'
model_scores[model_name] = [accuracy,recall,f1,precision,roc_auc_macro]

[I 2024-02-26 11:46:29,441] A new study created in memory with name: no-name-8119eac5-afde-4eb0-992d-bced06c0de61
[I 2024-02-26 11:46:29,984] Trial 0 finished with value: 0.5941990278297099 and parameters: {'kernel': 'poly', 'C': 0.005267334937338961, 'gamma': 29.125195404576928, 'tol': 0.0023255926188676876, 'shrinking': False}. Best is trial 0 with value: 0.5941990278297099.
[I 2024-02-26 11:46:30,715] Trial 1 finished with value: 0.41097163613054855 and parameters: {'kernel': 'rbf', 'C': 0.07018785707164503, 'gamma': 0.05364250584437953, 'tol': 0.006676391954969385, 'shrinking': False}. Best is trial 0 with value: 0.5941990278297099.
[I 2024-02-26 11:46:31,312] Trial 2 finished with value: 0.540318359062016 and parameters: {'kernel': 'poly', 'C': 0.0002598680701578184, 'gamma': 0.32023208990299146, 'tol': 0.00010279226446484342, 'shrinking': True}. Best is trial 0 with value: 0.5941990278297099.
[I 2024-02-26 11:46:32,413] Trial 3 finished with value: 0.41097163613054855 and paramet

Best params found : {'kernel': 'linear', 'C': 0.08254220895932002, 'gamma': 0.0006989255985752181, 'tol': 0.0014439693184457882, 'shrinking': True}
accuracy: 0.8291925465838509
recall: 0.7199094386490844
precision: 0.8075463520343958
f1-score: 0.7520955596908846
roc_auc:  0.842857487974295


In [14]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

SVM

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.84      0.90      0.87        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.85      0.92      0.88       133
Gerhard Schroeder       0.75      0.56      0.64        27
      Hugo Chavez       0.80      0.44      0.57        18
       Tony Blair       0.74      0.86      0.79        36

         accuracy                           0.83       322
        macro avg       0.81      0.74      0.76       322
     weighted avg       0.82      0.83      0.82       322



# Decision Tree

**Normal**

In [15]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_reduced, y_train)
y_pred = dt.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

svm = SVC(kernel='linear')
svm.fit(X_train_reduced, y_train)
y_pred = svm.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
# Assuming 'label_' prefix for clarity
y_test_onehot = pd.get_dummies(y_test, prefix='label_')
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot,
                              average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.38509316770186336
recall: 0.25636636654860184
precision: 0.2626379958012611
f1-score: 0.25608003225241727
roc_auc:  0.5702417346949636
accuracy: 0.7763975155279503
recall: 0.6788319669229571
precision: 0.7165616133267115
f1-score: 0.692277642195528
roc_auc:  0.8182494777228536


**Tunning**

In [16]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        "max_depth" : trial.suggest_int("max_depth", 2, 10),
        "min_samples_split" : trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf" : trial.suggest_int("min_samples_leaf", 1, 10),
        "criterion" : trial.suggest_categorical("criterion", ["gini", "entropy"]),
        'random_state': trial.suggest_categorical('random_state', [42])

    }

    # Create KNN model with tuned hyperparameters
    model = DecisionTreeClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [17]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = DecisionTreeClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['decison tree'] = list(y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'Decision Tree'
model_scores[model_name] = [accuracy, recall,f1,precision,roc_auc_macro]

[I 2024-02-26 11:47:18,623] A new study created in memory with name: no-name-93b8b9f6-6708-46e4-a1da-fa9d58a0b98e
[I 2024-02-26 11:47:19,580] Trial 0 finished with value: 0.37270979114363545 and parameters: {'max_depth': 7, 'min_samples_split': 13, 'min_samples_leaf': 3, 'criterion': 'entropy', 'random_state': 42}. Best is trial 0 with value: 0.37270979114363545.
[I 2024-02-26 11:47:20,526] Trial 1 finished with value: 0.3561294802628065 and parameters: {'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 5, 'criterion': 'entropy', 'random_state': 42}. Best is trial 0 with value: 0.37270979114363545.
[I 2024-02-26 11:47:20,741] Trial 2 finished with value: 0.3985470861599274 and parameters: {'max_depth': 3, 'min_samples_split': 12, 'min_samples_leaf': 10, 'criterion': 'gini', 'random_state': 42}. Best is trial 2 with value: 0.3985470861599274.
[I 2024-02-26 11:47:21,220] Trial 3 finished with value: 0.3591581646279579 and parameters: {'max_depth': 10, 'min_samples_split': 6, 'm

Best params found : {'max_depth': 4, 'min_samples_split': 19, 'min_samples_leaf': 7, 'criterion': 'gini', 'random_state': 42}
accuracy: 0.4409937888198758
recall: 0.2566158805677093
precision: 0.30715459477114176
f1-score: 0.24834292259608387
roc_auc:  0.5710434318768943


In [18]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

Decision Tree

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.84      0.90      0.87        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.85      0.92      0.88       133
Gerhard Schroeder       0.75      0.56      0.64        27
      Hugo Chavez       0.80      0.44      0.57        18
       Tony Blair       0.74      0.86      0.79        36

         accuracy                           0.83       322
        macro avg       0.81      0.74      0.76       322
     weighted avg       0.82      0.83      0.82       322



# KNN Classifier

**Normal**

In [19]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train_reduced, y_train)
y_pred = knn.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)


accuracy: 0.5124223602484472
recall: 0.3618398633117635
precision: 0.46301590879008053
f1-score: 0.35788584229132325
roc_auc:  0.6328040377715103


**Tunning**

In [20]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

def objective(trial):
    hyperparams = {
        'n_neighbors': trial.suggest_int("n_neighbors", 5, 100),
        'weights' : trial.suggest_categorical("weights", ["uniform", "distance"]),
        'metric' : trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"]),
        'algorithm':trial.suggest_categorical('algorithm',['auto', 'ball_tree', 'kd_tree', 'brute']),
        'n_jobs': -1
    }
    
    # Create KNN model with tuned hyperparameters
    model = KNeighborsClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [21]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model =KNeighborsClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['KNN'] = y_pred 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'K-Nearest Neighbors'
model_scores[model_name] = [accuracy, recall,f1,precision,roc_auc_macro]

[I 2024-02-26 11:48:01,854] A new study created in memory with name: no-name-d57b20e7-c985-4ae7-9e7d-ab120b7c505b
[I 2024-02-26 11:48:02,050] Trial 0 finished with value: 0.4482292612574114 and parameters: {'n_neighbors': 79, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'ball_tree'}. Best is trial 0 with value: 0.4482292612574114.
[I 2024-02-26 11:48:02,151] Trial 1 finished with value: 0.49687516692484374 and parameters: {'n_neighbors': 36, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'brute'}. Best is trial 1 with value: 0.49687516692484374.
[I 2024-02-26 11:48:02,318] Trial 2 finished with value: 0.48028951444901447 and parameters: {'n_neighbors': 46, 'weights': 'uniform', 'metric': 'manhattan', 'algorithm': 'brute'}. Best is trial 1 with value: 0.49687516692484374.
[I 2024-02-26 11:48:02,547] Trial 3 finished with value: 0.47617648629880877 and parameters: {'n_neighbors': 32, 'weights': 'uniform', 'metric': 'euclidean', 'algorithm': 'ball_tree'}. Best is

Best params found : {'n_neighbors': 7, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'kd_tree'}
accuracy: 0.5900621118012422
recall: 0.37410923731173507
precision: 0.6076981737364312
f1-score: 0.3928655321471505
roc_auc:  0.6424217283935005


In [22]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

K-Nearest Neighbors

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.84      0.90      0.87        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.85      0.92      0.88       133
Gerhard Schroeder       0.75      0.56      0.64        27
      Hugo Chavez       0.80      0.44      0.57        18
       Tony Blair       0.74      0.86      0.79        36

         accuracy                           0.83       322
        macro avg       0.81      0.74      0.76       322
     weighted avg       0.82      0.83      0.82       322



****

# Gassian NB

**Pre-tunning**

In [23]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_reduced, y_train)

y_pred = gnb.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovr')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)


accuracy: 0.6925465838509317
recall: 0.5774975507102434
precision: 0.6571381886087767
f1-score: 0.5999113367284531


**Tunning**

In [24]:
def objective(trial):
    hyperparams = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-9, 1e-4, log = True)
    }
    
    model = GaussianNB(**hyperparams)
    scores = cross_val_score(model, X_train_reduced, y_train, cv = kf,scoring = 'accuracy')
    return np.mean(scores)

In [25]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = GaussianNB(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['gnb'] = y_pred 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'Gaussian Naives Bayes'
model_scores[model_name] = [accuracy, recall,f1,precision,roc_auc_macro]

[I 2024-02-26 11:48:23,397] A new study created in memory with name: no-name-242e2c67-20f9-4921-9ecd-db8bb62d803a
[I 2024-02-26 11:48:23,422] Trial 0 finished with value: 0.6128679023556434 and parameters: {'var_smoothing': 2.963800079585228e-09}. Best is trial 0 with value: 0.6128679023556434.
[I 2024-02-26 11:48:23,448] Trial 1 finished with value: 0.6128679023556434 and parameters: {'var_smoothing': 5.856044765270406e-06}. Best is trial 0 with value: 0.6128679023556434.
[I 2024-02-26 11:48:23,474] Trial 2 finished with value: 0.6128679023556434 and parameters: {'var_smoothing': 1.163741705794585e-06}. Best is trial 0 with value: 0.6128679023556434.
[I 2024-02-26 11:48:23,499] Trial 3 finished with value: 0.6128679023556434 and parameters: {'var_smoothing': 2.082383492300755e-06}. Best is trial 0 with value: 0.6128679023556434.
[I 2024-02-26 11:48:23,523] Trial 4 finished with value: 0.6128679023556434 and parameters: {'var_smoothing': 2.43056603915848e-06}. Best is trial 0 with valu

Best params found : {'var_smoothing': 8.345522752574468e-05}
accuracy: 0.6956521739130435
recall: 0.5785716645663121
precision: 0.6606521416513624
f1-score: 0.601532880245184
roc_auc:  0.7599155772868146


In [26]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

Gaussian Naives Bayes

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.84      0.90      0.87        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.85      0.92      0.88       133
Gerhard Schroeder       0.75      0.56      0.64        27
      Hugo Chavez       0.80      0.44      0.57        18
       Tony Blair       0.74      0.86      0.79        36

         accuracy                           0.83       322
        macro avg       0.81      0.74      0.76       322
     weighted avg       0.82      0.83      0.82       322



****

# Logistic Regression

**Normal**

In [27]:
lr = LogisticRegression()
lr.fit(X_train_reduced, y_train)
y_pred = lr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# # One-hot encoding for probability calculation (adapt if necessary)
# y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
# y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# # ROC AUC score with multiclass handling
# roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
# print('roc_auc: ', roc_auc_macro)

accuracy: 0.8167701863354038
recall: 0.718686142313006
precision: 0.7784214626924362
f1-score: 0.7397297737529582


**Tunning**

In [28]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


def objective(trial):
    hyperparams = {
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'saga']),
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'multi_class': trial.suggest_categorical('multi_class', ['ovr']),
        'C': trial.suggest_loguniform("C", 1e-3, 1e3),
        'n_jobs': -1
    }

    model = LogisticRegression(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [29]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = LogisticRegression(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['logistic regression'] = y_pred 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'Logistic Regression'
model_scores[model_name] = [accuracy, recall,f1,precision,roc_auc_macro]

[I 2024-02-26 11:48:27,835] A new study created in memory with name: no-name-71e7e17a-4696-43a8-90c6-7a829a866b27
[I 2024-02-26 11:48:29,166] Trial 0 finished with value: 0.6770097751188504 and parameters: {'solver': 'saga', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.03291871097263936}. Best is trial 0 with value: 0.6770097751188504.
[I 2024-02-26 11:48:30,156] Trial 1 finished with value: 0.6739116500186957 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.0147746080153082}. Best is trial 0 with value: 0.6770097751188504.
[I 2024-02-26 11:48:34,742] Trial 2 finished with value: 0.7442978473372148 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 19.544082687256193}. Best is trial 2 with value: 0.7442978473372148.
[I 2024-02-26 11:48:41,104] Trial 3 finished with value: 0.7391378665669569 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 487.014499598736}. Best is trial 2 with val

Best params found : {'solver': 'liblinear', 'penalty': 'l2', 'multi_class': 'ovr', 'C': 0.6005023185342078}
accuracy: 0.8229813664596274
recall: 0.7255385908947779
precision: 0.8031825703049271
f1-score: 0.7523791312674002
roc_auc:  0.8456018713582493


In [30]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

Logistic Regression

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.84      0.90      0.87        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.85      0.92      0.88       133
Gerhard Schroeder       0.75      0.56      0.64        27
      Hugo Chavez       0.80      0.44      0.57        18
       Tony Blair       0.74      0.86      0.79        36

         accuracy                           0.83       322
        macro avg       0.81      0.74      0.76       322
     weighted avg       0.82      0.83      0.82       322



***

# Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

**Normal**

In [32]:
rfr = RandomForestClassifier(random_state=42)
rfr.fit(X_train_reduced,y_train)
y_pred = rfr.predict(X_test_reduced)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# One-hot encoding for probability calculation (adapt if necessary)
y_test_onehot = pd.get_dummies(y_test, prefix='label_')  # Assuming 'label_' prefix for clarity
y_pred_onehot = pd.get_dummies(y_pred, prefix='label_')

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')  # Specify 'ovo' or 'ovr'

print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

accuracy: 0.5062111801242236
recall: 0.2170433145009416
precision: 0.25053549258496255
f1-score: 0.18517447464815887
roc_auc:  0.4181080412865555


**Tunning**

In [33]:
import optuna


def objective(trial):
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
        'n_jobs': -1
    }

    model = RandomForestClassifier(**hyperparams)
    scores = cross_val_score(model, X_train_reduced,
                             y_train, cv=kf, scoring='accuracy')
    return np.mean(scores)

In [34]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
best_params = study.best_params
print("Best params found :", best_params)

final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

[I 2024-02-26 11:51:50,140] A new study created in memory with name: no-name-b022ee12-cff9-4e85-959c-322387e4a7db
[I 2024-02-26 11:52:11,728] Trial 0 finished with value: 0.4337588804016879 and parameters: {'n_estimators': 893, 'max_depth': 40, 'min_samples_split': 16, 'random_state': 42, 'min_samples_leaf': 17}. Best is trial 0 with value: 0.4337588804016879.
[I 2024-02-26 11:52:33,670] Trial 1 finished with value: 0.43893488595694674 and parameters: {'n_estimators': 877, 'max_depth': 20, 'min_samples_split': 11, 'random_state': 42, 'min_samples_leaf': 11}. Best is trial 1 with value: 0.43893488595694674.
[I 2024-02-26 11:52:43,737] Trial 2 finished with value: 0.4482559692324128 and parameters: {'n_estimators': 369, 'max_depth': 30, 'min_samples_split': 13, 'random_state': 42, 'min_samples_leaf': 6}. Best is trial 2 with value: 0.4482559692324128.
[I 2024-02-26 11:52:53,442] Trial 3 finished with value: 0.4327226109716361 and parameters: {'n_estimators': 403, 'max_depth': 27, 'min_sa

Best params found : {'n_estimators': 124, 'max_depth': 47, 'min_samples_split': 4, 'random_state': 42, 'min_samples_leaf': 2}


0.5093167701863354

In [35]:
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_reduced, y_train)
y_pred = final_model.predict(X_test_reduced)
prediction_results['random forest'] = y_pred 

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


# Fit a label binarizer to get all possible classes
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_test)

# Transform y_test and y_pred to one-hot encoding with the same set of columns
y_test_onehot = pd.DataFrame(label_binarizer.transform(
    y_test), columns=label_binarizer.classes_)
y_pred_onehot = pd.DataFrame(label_binarizer.transform(
    y_pred), columns=label_binarizer.classes_)

# Ensure both sets have the same columns
all_columns = set(y_test_onehot.columns).union(set(y_pred_onehot.columns))
y_test_onehot = y_test_onehot.reindex(columns=all_columns, fill_value=0)
y_pred_onehot = y_pred_onehot.reindex(columns=all_columns, fill_value=0)

# ROC AUC score with multiclass handling
roc_auc_macro = roc_auc_score(
    y_test_onehot, y_pred_onehot, average='macro', multi_class='ovo')


print("accuracy:", accuracy)
print("recall:", recall)
print("precision:", precision)
print("f1-score:", f1)
print('roc_auc: ', roc_auc_macro)

# Save model result
model_name = 'Random Forest'
model_scores[model_name] = [accuracy, recall,f1,precision,roc_auc_macro]

accuracy: 0.5093167701863354
recall: 0.2245621114934228
precision: 0.48932676518883417
f1-score: 0.2049401506967847
roc_auc:  0.5526746191712363


In [36]:
print(f'{model_name}\n')
print('=' * 50)
print(classification_report(y_test, y_predict, target_names=target_names))

Random Forest

                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.74      0.80        19
     Colin Powell       0.84      0.90      0.87        59
  Donald Rumsfeld       0.82      0.77      0.79        30
    George W Bush       0.85      0.92      0.88       133
Gerhard Schroeder       0.75      0.56      0.64        27
      Hugo Chavez       0.80      0.44      0.57        18
       Tony Blair       0.74      0.86      0.79        36

         accuracy                           0.83       322
        macro avg       0.81      0.74      0.76       322
     weighted avg       0.82      0.83      0.82       322



****

****

# SUMMARY

**Classification over all models**

In [37]:
results_df = pd.DataFrame(model_scores, index=['Accuracy', 'Recall','F1 Score','Precision','ROC AUC']).T
results_df

Unnamed: 0,Accuracy,Recall,F1 Score,Precision,ROC AUC
SVM,0.829193,0.719909,0.752096,0.807546,0.842857
Decision Tree,0.440994,0.256616,0.248343,0.307155,0.571043
K-Nearest Neighbors,0.590062,0.374109,0.392866,0.607698,0.642422
Gaussian Naives Bayes,0.695652,0.578572,0.601533,0.660652,0.759916
Logistic Regression,0.822981,0.725539,0.752379,0.803183,0.845602
Random Forest,0.509317,0.224562,0.20494,0.489327,0.552675


In [38]:
# dump prediction results
# import json
# with open("/kaggle/working/predict.json", "w") as json_file:
#     json.dump(prediction_results, json_file)

In [39]:
prediction_results

{'svc': [1,
  1,
  3,
  3,
  1,
  3,
  3,
  3,
  3,
  1,
  3,
  0,
  6,
  5,
  3,
  3,
  3,
  5,
  4,
  3,
  3,
  2,
  1,
  3,
  1,
  1,
  1,
  3,
  3,
  1,
  1,
  3,
  2,
  1,
  2,
  4,
  2,
  1,
  1,
  3,
  3,
  1,
  1,
  1,
  5,
  6,
  3,
  2,
  3,
  3,
  3,
  0,
  1,
  1,
  3,
  0,
  3,
  4,
  3,
  5,
  1,
  3,
  3,
  3,
  1,
  3,
  3,
  1,
  6,
  3,
  1,
  3,
  1,
  2,
  3,
  6,
  6,
  3,
  6,
  3,
  3,
  1,
  3,
  1,
  0,
  4,
  2,
  1,
  5,
  3,
  3,
  2,
  6,
  3,
  3,
  1,
  3,
  3,
  3,
  3,
  1,
  3,
  3,
  3,
  4,
  3,
  3,
  1,
  3,
  3,
  1,
  3,
  3,
  3,
  3,
  6,
  3,
  1,
  3,
  3,
  0,
  1,
  1,
  3,
  2,
  2,
  1,
  3,
  3,
  3,
  5,
  6,
  2,
  2,
  1,
  3,
  3,
  1,
  3,
  0,
  1,
  3,
  3,
  3,
  1,
  2,
  6,
  3,
  6,
  3,
  1,
  1,
  3,
  2,
  2,
  3,
  3,
  3,
  6,
  3,
  3,
  6,
  6,
  3,
  3,
  3,
  3,
  3,
  6,
  3,
  3,
  3,
  4,
  3,
  2,
  3,
  1,
  1,
  3,
  3,
  0,
  2,
  3,
  6,
  2,
  6,
  3,
  4,
  3,
  6,
  0,
  3,
  0,
  2,
  3,
  3,
  4,
  3,
  1