In [None]:
import os
import json
from itertools import chain

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import classification_report

## Dataset preparation

In [None]:
# Load the LFW face embeddings: reuse the local parquet copy when it exists,
# otherwise download the file from Hugging Face and cache it locally.

local_embeddings_path = "../data/lfw_facenet_embeddings.parquet"
if not os.path.exists(local_embeddings_path):
    # no local copy yet: fetch the remote file, then persist it for later runs
    remote_embeddings_path = "hf://datasets/lajota13/lfw_facenet_embeddings/lfw_facenet_embeddings.parquet"
    embedding_df = pd.read_parquet(remote_embeddings_path)
    embedding_df.to_parquet(local_embeddings_path)
else:
    embedding_df = pd.read_parquet(local_embeddings_path)

In [None]:
# Build the "name" column from "label" (underscores become spaces) and
# remove the original "label" column from the frame.

celeb_names = embedding_df.pop("label").str.replace("_", " ")  # pop drops "label" in place
embedding_df["name"] = celeb_names

In [None]:
# Load the seasonal color analysis annotations.
#
# The JSON file is read as a mapping from a season key to an iterable of
# celebrity names; it is flattened into one row per (name, season) pair.

# JSON is UTF-8 by specification: decode explicitly rather than relying on
# the platform's locale default encoding.
with open("../data/celebrities.json", encoding="utf-8") as fid:
    annotations = json.load(fid)

# Explicit columns keep the frame well-formed even when the mapping is empty.
annotations_df = pd.DataFrame(
    [
        {"name": celeb, "season": season}
        for season, celebs in annotations.items()
        for celeb in celebs
    ],
    columns=["name", "season"],
)

# The macroseason is the middle token of a three-part, hyphen-separated,
# lowercase season key; keys that do not match this pattern yield NaN.
# expand=False returns a Series, which is what a single-column assignment expects.
annotations_df["macroseason"] = annotations_df["season"].str.extract(
    "[a-z]+-([a-z]+)-[a-z]+", expand=False
)

In [None]:
# Attach the annotations to the embeddings and encode each macroseason as an
# integer "macrolabel". Left joins keep every embedding row; rows without a
# matching macroseason end up with macrolabel -1.

macrolabel_map_df = pd.DataFrame(
    [
        {"macrolabel": 0, "macroseason": "winter"},
        {"macrolabel": 1, "macroseason": "summer"},
        {"macrolabel": 2, "macroseason": "spring"},
        {"macrolabel": 3, "macroseason": "autumn"},
    ]
)

embedding_annotated_df = (
    embedding_df
    .merge(annotations_df, on="name", how="left")
    .merge(macrolabel_map_df, on="macroseason", how="left")
)
# missing labels become the -1 sentinel before casting back to integers
embedding_annotated_df["macrolabel"] = (
    embedding_annotated_df["macrolabel"].fillna(-1).astype(int)
)

In [None]:
# Split the labeled rows into training and test sets by celebrity name, so
# that all rows sharing a name land on the same side of the split.
# Every unlabeled row (macrolabel == -1) is appended to the training set.

has_label = embedding_annotated_df["macrolabel"] != -1
labeled_df = embedding_annotated_df[has_label].copy()
unlabeled_df = embedding_annotated_df[~has_label].copy()

# one row per distinct name; the split is drawn over names, not over rows
unique_names = labeled_df[["name"]].drop_duplicates()
train_names, test_names = train_test_split(unique_names, test_size=0.2, random_state=42)

test_df = labeled_df.merge(test_names, on="name", how="inner")
train_df = pd.concat(
    [labeled_df.merge(train_names, on="name", how="inner"), unlabeled_df]
)

## Label propagation

In [None]:
# Fit a LabelSpreading model with scikit-learn's default settings on the
# training set, which mixes labeled rows with unlabeled rows (label -1).

# stack the per-row embeddings into a single feature array
X_train = np.asarray(train_df["embedding"].to_list())
y_train = train_df["macrolabel"].to_numpy().astype(np.int8)

ls = LabelSpreading()
ls.fit(X_train, y_train)

In [None]:
# Evaluate the fitted model on the held-out labeled test set and print the
# per-class classification report.

X_test = np.asarray(test_df["embedding"].to_list())
y_test = test_df["macrolabel"].to_numpy().astype(np.int8)
y_pred = ls.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Predict a macrolabel for every unlabeled row, translate it back to its
# macroseason name, and save labeled plus newly labeled rows to parquet.

unlabeled_embeddings = np.vstack(unlabeled_df["embedding"].tolist())
unlabeled_df["macrolabel"] = ls.predict(unlabeled_embeddings).astype(np.int8)

# replace the empty macroseason column with the name matching the predicted label
unlabeled_df = (
    unlabeled_df
    .drop(columns="macroseason")
    .merge(macrolabel_map_df, on="macrolabel", how="left")
)

label_propagated_df = pd.concat([labeled_df, unlabeled_df])
label_propagated_df.to_parquet("../data/lfw_facenet_embeddings_label_propagated.parquet")

In [None]:
# Show how many rows fall into each macroseason once the annotated and the
# predicted labels are combined.

macroseason_counts = label_propagated_df["macroseason"].value_counts()
macroseason_counts