In [1]:
from collections import Counter
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import make_scorer, precision_score, accuracy_score
from scipy.stats import entropy

In [2]:
# load labeled dataset

# Local cache location and remote source of the labeled embeddings.
LOCAL_DATASET_PATH = "../data/lfw_facenet_embeddings_label_spreading.parquet"
REMOTE_DATASET_PATH = "hf://datasets/lajota13/lfw_facenet_embeddings/lfw_facenet_embeddings_label_spreading.parquet"

if os.path.exists(LOCAL_DATASET_PATH):
    # try local path
    df = pd.read_parquet(LOCAL_DATASET_PATH)
else:
    # download from Hugging Face
    df = pd.read_parquet(REMOTE_DATASET_PATH)
    # The parquet writer does not create missing parent directories, so make
    # sure the cache folder exists before saving the local copy.
    os.makedirs(os.path.dirname(LOCAL_DATASET_PATH), exist_ok=True)
    df.to_parquet(LOCAL_DATASET_PATH)

df.head()

Unnamed: 0,original_path,embedding,name,season,macroseason,macrolabel
2,data/lfw-deepfunneled/Aaron_Eckhart/Aaron_Eckh...,"[-0.008572585880756378, -0.09566288441419601, ...",Aaron Eckhart,true-spring-celebrities,spring,2
97,data/lfw-deepfunneled/Adriana_Lima/Adriana_Lim...,"[0.034277111291885376, -0.037622272968292236, ...",Adriana Lima,soft-summer-celebrities,summer,1
100,data/lfw-deepfunneled/Adrien_Brody/Adrien_Brod...,"[-0.028514988720417023, -0.07194817066192627, ...",Adrien Brody,bright-winter-celebrities,winter,0
101,data/lfw-deepfunneled/Adrien_Brody/Adrien_Brod...,"[-0.04232393950223923, -0.03775602579116821, -...",Adrien Brody,bright-winter-celebrities,winter,0
102,data/lfw-deepfunneled/Adrien_Brody/Adrien_Brod...,"[-0.02667202800512314, -0.035010598599910736, ...",Adrien Brody,bright-winter-celebrities,winter,0


In [3]:
# Split into training and test sets by celebrity name rather than by row:
# 20% of the distinct names are held out, so all rows of a given name land
# on exactly one side of the split.

train_names, test_names = train_test_split(
    df[["name"]].drop_duplicates(),
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# An inner join on "name" keeps only the rows belonging to the selected names.
train_df = pd.merge(df, train_names, on="name", how="inner")
test_df = pd.merge(df, test_names, on="name", how="inner")

# Stack the per-row embedding vectors into one 2-D array per split.
np_train = np.vstack(list(train_df["embedding"]))
np_test = np.vstack(list(test_df["embedding"]))

In [4]:
def max_accuracy_dummy_classifier(clf, X: np.ndarray, y: np.ndarray) -> float:
    """Baseline score computed from the label distribution alone.

    The ``(clf, X, y)`` signature matches what ``cross_validate`` passes to a
    scoring callable, but the estimator and the features are ignored.

    Parameters
    ----------
    clf
        Ignored.
    X
        Ignored.
    y
        Iterable of hashable class labels.

    Returns
    -------
    float
        ``sum_k p_k ** 2``, where ``p_k`` is the relative frequency of class
        ``k`` in ``y``. This equals the expected accuracy of guessing each
        label at random according to the class frequencies of ``y``.

    NOTE(review): despite the name, this is not ``max_k p_k`` (the accuracy
    of always predicting the most frequent class) -- confirm which baseline
    is intended.
    """
    del clf, X  # only the labels matter for this baseline

    class_counts = np.array(list(Counter(y).values()))
    total = class_counts.sum()
    frequencies = class_counts / total
    return np.square(frequencies).sum()


def evaluate_classifier(
    clf,
    train_df: pd.DataFrame,
    label: str = "macroseason",
    cv=None,
    group_col=None,
) -> dict:
    """Cross-validate ``clf`` on the embeddings of ``train_df`` and average the results.

    Parameters
    ----------
    clf
        Unfitted scikit-learn style classifier. It must provide
        ``predict_proba``, which the entropy metric calls.
    train_df
        Frame with an ``"embedding"`` column holding one vector per row and
        a target column named by ``label``.
    label
        Name of the target column in ``train_df``.
    cv
        Forwarded unchanged to ``cross_validate``. ``None`` (default) keeps
        that function's default splitting strategy.
    group_col
        Optional name of a column of ``train_df`` whose values are forwarded
        as ``groups`` to ``cross_validate``. Combine it with a group-aware
        splitter in ``cv`` (e.g. ``GroupKFold``) to keep all rows sharing a
        value, such as a celebrity name, in the same fold. ``None``
        (default) passes no groups.

    Returns
    -------
    dict
        The mean over folds of every array returned by ``cross_validate``
        (fit/score times plus the ``train_``/``test_`` value of each metric),
        and a ``"classifier"`` entry holding ``str(clf)``.
    """

    def _min_precision(_clf, X, y):
        # Per-class precision; report the worst class.
        return precision_score(y, _clf.predict(X), average=None).min()

    def _mean_prediction_entropy(_clf, X, y):
        # Mean Shannon entropy, in bits, of the predicted class distributions.
        return entropy(_clf.predict_proba(X), base=2, axis=1).mean()

    features = np.vstack(train_df["embedding"].tolist())
    groups = None if group_col is None else train_df[group_col]

    metrics = cross_validate(
        clf,
        features,
        train_df[label],
        groups=groups,
        cv=cv,
        return_train_score=True,
        scoring={
            "accuracy": make_scorer(accuracy_score),
            "macro_precision": make_scorer(precision_score, average="macro"),
            "min_precision": _min_precision,
            "H(Y|X) [bits]": _mean_prediction_entropy,
            "max_accuracy_dummy_classifier": max_accuracy_dummy_classifier,
        },
    )

    # Collapse the per-fold arrays into a single average per entry.
    metrics_d = {k: x.mean() for k, x in metrics.items()}
    metrics_d["classifier"] = str(clf)
    return metrics_d

In [5]:
from sklearn.svm import SVC
from sklearn.neighbors import NeighborhoodComponentsAnalysis, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from tqdm.notebook import tqdm


# Candidate classifiers; the stochastic ones are seeded for reproducibility.

# Gaussian naive Bayes with default settings.
gnb = GaussianNB()

# Random forest with default hyper-parameters.
rf = RandomForestClassifier(random_state=42)

# Neighborhood components analysis followed by a 5-nearest-neighbours vote.
nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
nca_knn = Pipeline(steps=[("nca", nca), ("knn", knn)])

# probability=True enables predict_proba, which evaluate_classifier calls.
svc = SVC(probability=True, random_state=42)

# Small multi-layer perceptron with two hidden layers (16 and 8 units).
mlp = MLPClassifier(hidden_layer_sizes=(16, 8), max_iter=1000, random_state=42)

# nca_knn is defined above but currently left out of the comparison.
classifiers = [gnb, rf, svc, mlp]


# One Series of averaged cross-validation metrics per classifier, laid out as
# columns and then transposed so that each classifier becomes a row.
metrics = (pd.Series(evaluate_classifier(clf, train_df)) for clf in tqdm(classifiers))
metrics_df = pd.concat(metrics, axis=1).T
metrics_df

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_macro_precision,train_macro_precision,test_min_precision,train_min_precision,test_H(Y|X) [bits],train_H(Y|X) [bits],test_max_accuracy_dummy_classifier,train_max_accuracy_dummy_classifier,classifier
0,0.069232,0.077839,0.408927,0.546728,0.4243,0.546054,0.330142,0.413625,0.174184,0.151418,0.269779,0.269778,GaussianNB()
1,19.553989,0.192264,0.512002,1.0,0.517593,1.0,0.425137,1.0,1.848524,0.954931,0.269779,0.269778,RandomForestClassifier(random_state=42)
2,117.98671,18.349576,0.572406,0.917118,0.581007,0.909931,0.456266,0.878296,1.041937,0.635293,0.269779,0.269778,"SVC(probability=True, random_state=42)"
3,35.101092,0.061888,0.476895,0.743838,0.472948,0.722929,0.39327,0.649359,1.089404,0.959213,0.269779,0.269778,"MLPClassifier(hidden_layer_sizes=(16, 8), max_..."
