In [None]:
from itertools import product, chain
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Parquet file with the per-image colour features; read with pd.read_parquet below.
SRC_PATH = "../data/lfw-colors.parquet"
# JSON file parsed with json.load below; it is iterated as a
# {season: [celebrity name, ...]} mapping.
CELEBS_SEASONS_PATH = "../data/celebrities.json"

In [None]:
# Load the {season: [celebrity, ...]} mapping from disk.
with open(CELEBS_SEASONS_PATH) as fid:
    celebs_seasons = json.load(fid)

# Flatten the mapping into one record per (season, celebrity) pair; names are
# lower-cased so they can be matched against the lower-cased names derived
# from the image paths.
celebs_seasons_list = [
    {"season": season, "name": celeb.lower()}
    for season, celebs in celebs_seasons.items()
    for celeb in celebs
]
df_celebs_seasons = pd.DataFrame(celebs_seasons_list)
df_celebs_seasons.head()

In [None]:
df_colors = pd.read_parquet(SRC_PATH)

# Filter out the rows whose src_path occurs more than once (presumably images
# in which several faces were detected — confirm against the extraction step).
# The count column is named explicitly: a bare
# `value_counts().to_frame().reset_index()` only yields "src_path"/"count"
# columns on pandas >= 2.0, so the merge and the filter below would break on
# older versions.
detections_per_image = (
    df_colors["src_path"]
    .value_counts()
    .rename_axis("src_path")
    .reset_index(name="count")
)
df_colors_w_count = df_colors.merge(detections_per_image, on="src_path")
df_colors_filtered = (
    df_colors_w_count
    .loc[df_colors_w_count["count"] == 1, :]
    .drop(columns="count")
)

# Derive the person name from the directory component of the path:
# lower-case it, turn underscores into spaces, then capture the folder name.
# NOTE(review): the pattern only accepts a-z and spaces, so folder names that
# contain other characters (e.g. hyphens) produce NaN and are later dropped
# by the inner merge on "name" — confirm this is intended.
df_colors_filtered["name"] = (
    df_colors_filtered["src_path"]
    .str.lower()
    .str.replace("_", " ", regex=False)
    .str.extract("data/lfw-deepfunneled/([a-z ]+)/.*", expand=False)
)
df_colors_filtered.head()

In [None]:
# Keep only the images whose extracted name appears in the celebrity/season
# table (inner join on the lower-cased name), then show how many rows remain.
df = pd.merge(df_colors_filtered, df_celebs_seasons, how="inner", on="name")
len(df)

In [None]:
# The macroseason is the second "-"-separated token of the season label.
df["macroseason"] = df["season"].map(lambda season: season.split("-")[1])
# Class balance of the resulting target.
df["macroseason"].value_counts()

In [None]:
# Pairwise view of the per-region "h"/"s"/"v" columns, coloured by macroseason.
REGIONS = ["face"]
FEATURES = [
    f"{region}-{channel}"
    for region in REGIONS
    for channel in ("h", "s", "v")
]
g = sns.PairGrid(df[[*FEATURES, "macroseason"]], hue="macroseason")
# Histograms (with KDE overlay) on the diagonal, scatter plots elsewhere.
g.map_diag(sns.histplot, kde=True)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
# Map the hue through a sine so that the feature is periodic.
# NOTE(review): the 2*pi factor assumes "face-h" is normalised to [0, 1] — confirm.
hue_radians = df["face-h"] * 2 * np.pi
df["sin-face-h"] = np.sin(hue_radians)

In [None]:
# Same pair plot as before, with the raw hue column swapped for its sine.
REGIONS = ["face"]
FEATURES = [
    f"{region}-{channel}"
    for region in REGIONS
    for channel in ("h", "s", "v")
]
FEATURES.remove("face-h")
FEATURES.append("sin-face-h")
g = sns.PairGrid(df[[*FEATURES, "macroseason"]], hue="macroseason")
# Histograms (with KDE overlay) on the diagonal, scatter plots elsewhere.
g.map_diag(sns.histplot, kde=True)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
from sklearn.feature_selection import f_classif

# ANOVA F-test of every feature against the macroseason label.
# f_classif returns (F statistics, p-values), one entry per feature column.
# The test is run once and its p-values reused; the statistics get a real name
# instead of `_`, which IPython reserves for the last cell output.
np_f_scores, np_p_values = f_classif(df[FEATURES], df["macroseason"])
p_values_f_test = pd.Series(np_p_values, index=FEATURES).to_frame()
p_values_f_test.columns = ["p_value"]
sns.heatmap(
    p_values_f_test,
    annot=True,
)

In [None]:
from scipy.stats import normaltest

# Normality test (scipy.stats.normaltest) of every feature within each
# macroseason; the result has one row per macroseason and one column per
# feature, holding the p-values.
# The feature columns are selected *after* the groupby so the function only
# ever receives numeric columns. Passing the string "macroseason" column to
# normaltest relied on pandas retrying the call without the grouping column.
p_values_normality = df.groupby("macroseason")[FEATURES].apply(
    lambda group: pd.Series(normaltest(group.to_numpy())[-1], index=FEATURES)
)
sns.heatmap(
    p_values_normality,
    annot=True,
)

In [None]:
# Correlation matrix of the currently selected features, drawn as an
# annotated heatmap.
corr = df.loc[:, FEATURES].corr()
sns.heatmap(corr, annot=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Feature columns fed to the classifiers from here on.
FEATURES = ["face-v", "sin-face-h"]

# Shuffled 70/30 split, stratified on the macroseason label, with a fixed seed
# so the split is reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df["macroseason"],
    shuffle=True,
    random_state=42,
)
# Plain NumPy views of the feature columns for each split.
np_train, np_test = (split[FEATURES].values for split in (train_df, test_df))

In [None]:
def evaluate_model(clf, train_df, test_df, label: str, features=None):
    """Fit ``clf`` on the training split and report train and test performance.

    For each split the function prints a classification report and draws a
    confusion matrix (train first, then test).

    Parameters
    ----------
    clf
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    train_df, test_df
        DataFrames holding the feature columns and the ``label`` column.
    label : str
        Name of the target column.
    features : sequence of str, optional
        Feature columns to use. Defaults to the notebook-level ``FEATURES``
        list, which keeps existing four-argument calls working unchanged.

    Returns
    -------
    None
    """
    # Resolve the feature list once so both splits are guaranteed to use the
    # same columns even if the global FEATURES is reassigned later.
    feature_cols = FEATURES if features is None else list(features)
    clf.fit(train_df[feature_cols].values, train_df[label])
    _report_split(clf, train_df, feature_cols, label, "Train")
    _report_split(clf, test_df, feature_cols, label, "Test")


def _report_split(clf, split_df, feature_cols, label, split_name):
    """Print the classification report and plot the confusion matrix of one split."""
    print(f"{split_name} report")
    predictions = clf.predict(split_df[feature_cols].values)
    print(classification_report(split_df[label], predictions))
    cm_display = ConfusionMatrixDisplay.from_predictions(split_df[label], predictions)
    # Title the figure so the train and test matrices can be told apart.
    cm_display.ax_.set_title(f"{split_name} confusion matrix")

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier, evaluated on the macroseason target.
gnb = GaussianNB()

evaluate_model(clf=gnb, train_df=train_df, test_df=test_df, label="macroseason")