In [None]:
import matplotlib
from itertools import product

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Parquet file that the next cell reads into `df`; the path is relative, so it
# is resolved against the kernel's working directory.
SRC_PATH = "../data/celebrities_seasons_colors.parquet"

In [None]:
# Read the parquet file at SRC_PATH and preview its first five rows.
df = pd.read_parquet(path=SRC_PATH)
df.head(n=5)

In [None]:
# The macro label is the second "-"-separated token of the third
# "/"-separated component of each src_path.
df["macro_label"] = [
    path.split("/")[2].split("-")[1] for path in df["src_path"]
]

In [None]:
# Pairwise view of the h/s/v columns of every region, coloured by macro label:
# histograms (with KDE) on the diagonal, scatter plots elsewhere.
REGIONS = ["face"]
FEATURES = [
    f"{region}-{channel}"
    for region in REGIONS
    for channel in ("h", "s", "v")
]
g = sns.PairGrid(data=df[[*FEATURES, "macro_label"]], hue="macro_label")
g.map_diag(sns.histplot, kde=True)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
# Sine of the face hue scaled by 2*pi (presumably hue is stored as a fraction
# of a full turn -- TODO confirm the value range of "face-h").
face_hue_angle = df["face-h"] * 2 * np.pi
df["sin-face-h"] = np.sin(face_hue_angle)

In [None]:
# Same pair plot as before, but with the raw face hue swapped for its sine.
REGIONS = ["face"]
FEATURES = [
    f"{region}-{channel}"
    for region in REGIONS
    for channel in ("h", "s", "v")
]
FEATURES.remove("face-h")
FEATURES.append("sin-face-h")
g = sns.PairGrid(data=df[[*FEATURES, "macro_label"]], hue="macro_label")
g.map_diag(sns.histplot, kde=True)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
from sklearn.feature_selection import f_classif

# One-way ANOVA F-test of every feature against the macro label.
# f_classif returns (F statistics, p-values); it is evaluated once and only the
# p-values are kept (previously the test was run twice and the first result
# was never used).
np_p_values = f_classif(df[FEATURES], df["macro_label"])[1]

# One row per feature, in the same order as FEATURES.
p_values_f_test = pd.Series(np_p_values, index=FEATURES).to_frame("p_value")
sns.heatmap(p_values_f_test, annot=True)

In [None]:
from scipy.stats import normaltest

# Normality test (scipy.stats.normaltest) of every feature inside every macro
# label. Rows of the result are macro labels, columns are features.
#
# The feature columns are selected on the groupby object so that each group
# handed to the lambda contains numeric feature columns only. Leaving the
# string grouping column in the frame made the outcome depend on how the
# installed pandas version treats grouping columns inside `apply`.
p_values_normality = df.groupby("macro_label")[FEATURES].apply(
    lambda group: pd.Series(normaltest(group.to_numpy()).pvalue, index=FEATURES)
)
sns.heatmap(p_values_normality, annot=True)

In [None]:
# Pairwise correlation between the currently selected features, shown as an
# annotated heatmap.
corr = df.loc[:, FEATURES].corr()
sns.heatmap(data=corr, annot=True)

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the samples in feature space when the macro labels are
# treated as cluster assignments.
silhouette_score(X=df[FEATURES], labels=df["macro_label"])

In [None]:
from sklearn.cluster import KMeans
from tqdm.notebook import tqdm


# Reference value for reading silhouette scores: 1000 uniformly random 2-D
# points partitioned into 4 clusters by KMeans.
# Both the data and KMeans are seeded so the score is identical on every
# Restart & Run All (they were unseeded before).
rng = np.random.default_rng(42)
X = rng.random((1000, 2))
kmeans = KMeans(n_clusters=4, random_state=42)
y = kmeans.fit_predict(X)
silhouette_score(X, y)

In [None]:
# Second reference value: the same random points `X`, but with labels drawn
# uniformly from {0, 1, 2, 3} independently of the coordinates.
# A dedicated seeded generator keeps the score reproducible, and the label
# count follows X instead of a hard-coded 1000.
y = np.random.default_rng(0).integers(0, 4, size=X.shape[0])
silhouette_score(X, y)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split

# Features used by the classifiers below, and a 70/30 split of `df` that is
# stratified on the macro label.
FEATURES = ["face-v", "sin-face-h"]
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    shuffle=True,
    stratify=df["macro_label"],
    random_state=42,
)
# Plain-array views of the selected feature columns for each split.
np_train, np_test = (split[FEATURES].values for split in (train_df, test_df))

In [None]:
def evaluate_model(clf, train_df, test_df, label: str, features=None):
    """Fit ``clf`` on the training split and report train and test performance.

    For each split (train first, then test) this prints a header line and the
    scikit-learn classification report, and draws a titled confusion matrix.

    Parameters
    ----------
    clf
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on the training split.
    train_df, test_df
        DataFrames containing the feature columns and the ``label`` column.
    label
        Name of the target column.
    features
        Column names used as model inputs. When omitted, the notebook-level
        ``FEATURES`` list is used, looked up at call time.

    Returns
    -------
    None
    """
    if features is None:
        # Backward-compatible default: use whatever FEATURES currently holds.
        features = FEATURES
    features = list(features)

    # The estimator is trained on plain arrays (no column names attached).
    clf.fit(train_df[features].values, train_df[label])

    for split_name, split_df in (("Train", train_df), ("Test", test_df)):
        print(f"{split_name} report")
        predictions = clf.predict(split_df[features].values)
        print(classification_report(split_df[label], predictions))
        display = ConfusionMatrixDisplay.from_predictions(split_df[label], predictions)
        # Title the figure so the train and test matrices can be told apart.
        display.ax_.set_title(f"{split_name} confusion matrix")

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the selected features, evaluated on both splits.
gnb = GaussianNB()
evaluate_model(clf=gnb, train_df=train_df, test_df=test_df, label="macro_label")

In [None]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier with default settings, evaluated on both splits.
svc = LinearSVC()
evaluate_model(clf=svc, train_df=train_df, test_df=test_df, label="macro_label")

In [None]:
# Display the feature list currently in effect (evaluate_model reads this global).
FEATURES

In [None]:
from sklearn.inspection import DecisionBoundaryDisplay

# Decision regions of the fitted LinearSVC (`svc`) with the training samples
# drawn on top. This cell used to be a pasted function body that referenced
# names never defined in the notebook (ax, clf, support_vectors, scatter,
# long_title, kernel) and plotted the random baseline arrays X / y instead of
# the training data; it now runs on its own after the LinearSVC cell.

# DecisionBoundaryDisplay works on exactly two input features.
assert len(FEATURES) == 2, "decision-boundary plot needs exactly two features"

fig, ax = plt.subplots(figsize=(6, 4.5))

# `svc` was fitted on plain arrays inside evaluate_model, so plain arrays are
# passed here as well. Axis limits are derived from the data by the display.
boundary_X = train_df[FEATURES].values
boundary_labels = train_df["macro_label"]
class_names = list(svc.classes_)

# One discrete colour per class, shared by the regions and the sample markers.
# Predicted labels are encoded by the display as indices into svc.classes_, so
# class k maps to colour k with the value range below.
class_cmap = plt.get_cmap("viridis", len(class_names))
DecisionBoundaryDisplay.from_estimator(
    svc,
    boundary_X,
    response_method="predict",
    plot_method="pcolormesh",
    alpha=0.3,
    cmap=class_cmap,
    vmin=-0.5,
    vmax=len(class_names) - 0.5,
    ax=ax,
)

# Margin lines at decision_function == -1 / 0 / +1 are only defined for a
# binary problem (a single decision value per sample).
if len(class_names) == 2:
    DecisionBoundaryDisplay.from_estimator(
        svc,
        boundary_X,
        response_method="decision_function",
        plot_method="contour",
        levels=[-1, 0, 1],
        colors=["k", "k", "k"],
        linestyles=["--", "-", "--"],
        ax=ax,
    )

# LinearSVC does not expose `support_vectors_`, so no support vectors are drawn.

# Training samples, one scatter call per class so every class gets a legend entry.
for class_index, class_name in enumerate(class_names):
    in_class = (boundary_labels == class_name).to_numpy()
    ax.scatter(
        boundary_X[in_class, 0],
        boundary_X[in_class, 1],
        color=class_cmap(class_index),
        s=30,
        edgecolors="k",
        label=class_name,
    )

ax.legend(loc="upper right", title="Classes")
ax.set(
    xlabel=FEATURES[0],
    ylabel=FEATURES[1],
    title="Decision boundaries of LinearSVC",
)
plt.show()