![Facial Recognition](facialrecognition.jpg)

You are a member of an elite group of data scientists, specialising in advanced facial recognition technology, this firm is dedicated to identifying and safeguarding prominent individuals from various spheres—ranging from entertainment and sports to politics and philanthropy. The team's mission is to deploy AI-driven solutions that can accurately distinguish between images of notable personalities and the general populace, enhancing the personal security of such high-profile individuals. You're to focus on Arnold Schwarzenegger, a figure whose accomplishments span from bodybuilding champion to Hollywood icon, and from philanthropist to the Governor of California. 

### **The Data**
The `data/lfw_arnie_nonarnie.csv` dataset contains processed facial image data derived from the "Labeled Faces in the Wild" (LFW) dataset, focusing specifically on images of Arnold Schwarzenegger and other individuals not identified as him. This dataset has been prepared to aid in the development and evaluation of facial recognition models. There are 40 images of Arnold Schwarzenegger and 150 of other people.

| Column Name | Description |
|-------------|-------------|
| PC1, PC2, ... PCN | Principal components from PCA, capturing key image features. |
| Label | Binary indicator: `1` for Arnold Schwarzenegger, `0` for others. |

In [115]:
# Import required libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Read the CSV file 
df = pd.read_csv("data/lfw_arnie_nonarnie.csv")

# Seperate the predictor and class label
X = df.drop('Label', axis=1)
y = df['Label'] 

# Split the data into training and testing sets using stratify to balance the class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

In [116]:
print(X_train);


            0         1         2  ...       147       148       149
63  -0.174255 -2.970224  3.230451  ... -0.232865 -0.202193  0.221698
20  -3.533020  0.858683  1.185077  ... -0.023084  0.023557 -0.005008
125 -5.771363 -2.005809  1.249855  ... -0.049232  0.067747 -0.169667
130  9.150897 -7.013354  2.596765  ... -0.001627 -0.044008  0.101841
41   1.872624 -0.253442  2.596265  ... -0.093455  0.008534  0.185321
..        ...       ...       ...  ...       ...       ...       ...
153 -1.124056  0.537134  2.017247  ... -0.176542 -0.036645  0.027890
79  -3.552765  1.089497 -2.182357  ... -0.169197  0.153629 -0.130576
157 -1.908888  0.837640  2.345791  ... -0.376907  0.153506  0.278081
64  -1.400702  1.999302  4.259822  ... -0.138655 -0.241056 -0.108794
137  8.415617 -2.587975 -2.370433  ... -0.150624 -0.061419 -0.105073

[152 rows x 150 columns]


In [117]:
print(y_train)
from sklearn.linear_model import LogisticRegression;


63     0
20     1
125    0
130    0
41     0
      ..
153    0
79     0
157    0
64     0
137    0
Name: Label, Length: 152, dtype: int64


In [118]:
logisticRegression = LogisticRegression();
logisticRegression.fit(X_train,y_train);

In [119]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}

# Create a GridSearchCV object
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best estimator
best_logistic_regression = grid_search.best_estimator_

# Predict using the best estimator
preds = best_logistic_regression.predict(X_test)

# Print the classification report
from sklearn.metrics import classification_report
print(f"{best_logistic_regression} Results:\n{classification_report(y_test, preds)}", sep="\n\n")

LogisticRegression(C=0.1, solver='newton-cg') Results:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94        30
           1       0.83      0.62      0.71         8

    accuracy                           0.89        38
   macro avg       0.87      0.80      0.82        38
weighted avg       0.89      0.89      0.89        38



In [120]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the parameter grid correctly for DecisionTreeClassifier
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

tree = DecisionTreeClassifier()
gridsearch1 = GridSearchCV(tree, param_grid, cv=5, scoring="accuracy")
gridsearch1.fit(X_train, y_train)
bestDecisionTree = gridsearch1.best_estimator_
predTree = bestDecisionTree.predict(X_test)
print(f"{bestDecisionTree} Results:\n{classification_report(y_test, predTree)}", sep="\n\n")

DecisionTreeClassifier() Results:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80        30
           1       0.25      0.25      0.25         8

    accuracy                           0.68        38
   macro avg       0.53      0.53      0.53        38
weighted avg       0.68      0.68      0.68        38



In [121]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize models dictionary
models = {
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression())
    ]),
    'Decision Tree': Pipeline([
        ('classifier', DecisionTreeClassifier())
    ]),
    'KNeighbors': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', KNeighborsClassifier())
    ])
}

# Initialize variables to store the best model information
best_model_name = None
best_model_info = None
best_model_cv_score = 0

# Perform cross-validation and determine the best model
for model_name, model_pipeline in models.items():
    cv_scores = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    mean_cv_score = cv_scores.mean()
    
    if mean_cv_score > best_model_cv_score:
        best_model_name = model_name
        best_model_info = model_pipeline.get_params()
        best_model_cv_score = mean_cv_score

# Fit the best model on the training data
best_model = models[best_model_name]
best_model.fit(X_train, y_train)

# Predict on the test set
test_preds = best_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, test_preds)
precision = precision_score(y_test, test_preds, average='weighted')
recall = recall_score(y_test, test_preds, average='weighted')
f1 = f1_score(y_test, test_preds, average='weighted')

# Store the best accuracy score
score = accuracy

# Output the results
best_model_name, best_model_info, best_model_cv_score, accuracy, precision, recall, f1, score

('Logistic Regression',
 {'memory': None,
  'steps': [('scaler', StandardScaler()),
   ('classifier', LogisticRegression())],
  'verbose': False,
  'scaler': StandardScaler(),
  'classifier': LogisticRegression(),
  'scaler__copy': True,
  'scaler__with_mean': True,
  'scaler__with_std': True,
  'classifier__C': 1.0,
  'classifier__class_weight': None,
  'classifier__dual': False,
  'classifier__fit_intercept': True,
  'classifier__intercept_scaling': 1,
  'classifier__l1_ratio': None,
  'classifier__max_iter': 100,
  'classifier__multi_class': 'auto',
  'classifier__n_jobs': None,
  'classifier__penalty': 'l2',
  'classifier__random_state': None,
  'classifier__solver': 'lbfgs',
  'classifier__tol': 0.0001,
  'classifier__verbose': 0,
  'classifier__warm_start': False},
 0.8221505376344087,
 0.8157894736842105,
 0.8506401137980085,
 0.8157894736842105,
 0.7537749847254953,
 0.8157894736842105)

In [122]:
# Assuming y_true and y_pred are defined somewhere in the notebook
# If not, define them here for the sake of completeness
# y_true = ...
# y_pred = ...

# Save the best model's parameters as 'best_model_info'
best_model_info = {
    'accuracy': knn_accuracy,
    'precision': knn_precision,
    'recall': knn_recall,
    'f1': knn_f1

}

# Save the best model's cross-validation score as 'best_model_cv_score'
best_model_cv_score = best_model_cv_scores

# Display the best model information
best_model_info

{'accuracy': 0.7368421052631579,
 'precision': 0.6795665634674922,
 'recall': 0.7368421052631579,
 'f1': 0.7012061403508771}