In [None]:
import pandas as pd


In [None]:
# Relative path of the parquet dataset that the next cell loads into `df`.
SRC_PATH = "../data/celebrities_seasons.parquet"

In [None]:
# Load the dataset. Later cells read its "label", "embedding" and "src_path" columns.
df = pd.read_parquet(SRC_PATH)
# Preview the first rows.
df.head()

In [None]:
# Shorten each label to the first two hyphen-separated lowercase tokens of an
# `a-b-c` match, then keep the second token as the coarse `macro_label`.
# `expand=False` makes the single-group extraction return a Series directly.
# NOTE: values that do not match the pattern become NaN, so re-running this cell
# on the already shortened labels blanks them; reload `df` before re-running.
df["label"] = df["label"].str.extract("([a-z]+-[a-z]+)-[a-z]+", expand=False)
df["macro_label"] = df["label"].str.extract("[a-z]+-([a-z]+)", expand=False)
df.head()

In [None]:
# Class balance of the coarse labels derived above.
df["macro_label"].value_counts()

In [None]:
# Class balance of the fine-grained labels (used to stratify the train/test split).
df["label"].value_counts()

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on the fine-grained label so both parts
# keep the same label proportions; the seed makes the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    shuffle=True,
    random_state=42,
)

# Stack the per-row embeddings into 2-D feature matrices, row-aligned with the
# corresponding frame (row i of np_train belongs to train_df.iloc[i]).
np_train, np_test = (
    np.vstack(part["embedding"].tolist()) for part in (train_df, test_df)
)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, precision_score, accuracy_score
from collections import Counter
from scipy.stats import entropy


def max_accuracy_dummy_classifier(clf, X: np.ndarray, y: np.ndarray) -> float:
    """Best accuracy reachable by a classifier that ignores the features.

    The signature follows the ``scorer(estimator, X, y)`` convention so the
    function can be placed in a ``scoring`` mapping, but ``clf`` and ``X`` are
    deliberately unused: the result depends only on the labels in ``y``.

    A feature-blind classifier maximises accuracy by always predicting the most
    frequent class, so the score is that class's relative frequency. (The sum
    of squared class frequencies is the *expected* accuracy of guessing labels
    at random in proportion to their frequency, which never exceeds the
    majority-class baseline.)

    Args:
        clf: Ignored; present only for scorer-signature compatibility.
        X: Ignored; present only for scorer-signature compatibility.
        y: Iterable of hashable ground-truth labels.

    Returns:
        Fraction of the samples in ``y`` that belong to the most frequent
        class, or ``0.0`` when ``y`` is empty.
    """
    _ = clf, X
    counts = np.array(list(Counter(y).values()))
    if counts.size == 0:
        # No labels, no baseline; 0.0 matches what this function returned before.
        return 0.0
    return float(counts.max() / counts.sum())


def evaluate_classifier(clf, train_df: pd.DataFrame, label: str = "macro_label") -> dict:
    """Cross-validate ``clf`` on the embeddings of ``train_df`` and average the results.

    Args:
        clf: Estimator to evaluate. It must offer both ``predict`` and
            ``predict_proba`` because the scorers below call both.
        train_df: Frame with an ``"embedding"`` column (stacked row-wise into
            the feature matrix) and the target column named by ``label``.
        label: Name of the target column.

    Returns:
        A dict mapping every key produced by ``cross_validate`` (train and test
        scores were requested) to its mean over the folds, plus a
        ``"classifier"`` entry holding ``str(clf)``.
    """

    def lowest_class_precision(estimator, X, y):
        # Precision of the worst class: average=None yields one value per class.
        return precision_score(y, estimator.predict(X), average=None).min()

    def mean_predictive_entropy(estimator, X, y):
        # Mean base-2 entropy of the predicted class distributions (one row each).
        return entropy(estimator.predict_proba(X), base=2, axis=1).mean()

    scorers = {
        "accuracy": make_scorer(accuracy_score),
        "macro_precision": make_scorer(precision_score, average="macro"),
        "min_precision": lowest_class_precision,
        "H(Y|X) [bits]": mean_predictive_entropy,
        "max_accuracy_dummy_classifier": max_accuracy_dummy_classifier,
    }

    features = np.vstack(train_df["embedding"])
    cv_results = cross_validate(
        clf,
        features,
        train_df[label],
        scoring=scorers,
        return_train_score=True,
    )

    summary = {}
    for name, per_fold in cv_results.items():
        summary[name] = per_fold.mean()
    summary["classifier"] = str(clf)
    return summary

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import NeighborhoodComponentsAnalysis, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from tqdm.notebook import tqdm


# Candidate models; every estimator that accepts a seed gets a fixed one.
gnb = GaussianNB()

rf = RandomForestClassifier(random_state=42)

# k-NN applied after a learned NCA transformation of the embeddings.
nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
nca_knn = Pipeline([('nca', nca), ('knn', knn)])

# probability=True enables predict_proba, which evaluate_classifier's entropy
# scorer calls on every model.
svc = SVC(random_state=42, probability=True)
mlp = MLPClassifier(hidden_layer_sizes=(16, 8), max_iter=1000, random_state=42)

classifiers = [
    gnb,
    rf,
    nca_knn,
    svc,
    mlp
]

# One record (dict of fold-averaged metrics) per classifier. Building the frame
# from the list of records keeps the numeric columns numeric; concatenating
# Series that mix floats with the classifier name would make every column
# object-typed.
metrics = [evaluate_classifier(clf, train_df) for clf in tqdm(classifiers)]
metrics_df = pd.DataFrame(metrics)
metrics_df

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Number of neighbours, used both for the k-NN vote and for the neighbour lookup below.
K = 4

nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(n_neighbors=K)
nca_knn = Pipeline([('nca', nca), ('knn', knn)])
nca_knn.fit(np_train, train_df["macro_label"])

# Caveat: when predicting on the training data each sample can be retrieved as
# its own nearest neighbour, so the train report is optimistic.
print("Train report")
train_pred = nca_knn.predict(np_train)
print(classification_report(train_df["macro_label"], train_pred))
# Title the plot so the two confusion matrices can be told apart in the output.
ConfusionMatrixDisplay.from_predictions(train_df["macro_label"], train_pred).ax_.set_title("Train confusion matrix")

print("Test report")
np_test_pred = nca_knn.predict(np_test)
# Distances to, and positions (rows of np_train / train_df) of, the K nearest
# training samples in NCA space. This relies on the pipeline having fitted the
# `nca` and `knn` objects themselves rather than clones of them.
np_knn_dist, np_knn_idx = knn.kneighbors(nca.transform(np_test), K)
print(classification_report(test_df["macro_label"], np_test_pred))
ConfusionMatrixDisplay.from_predictions(test_df["macro_label"], np_test_pred).ax_.set_title("Test confusion matrix")

# Mean base-2 entropy of the predicted class distributions on the test set.
p = nca_knn.predict_proba(np_test)
h = entropy(p, base=2, axis=1)
conditional_h = h.mean()
print("H(Y|X):", conditional_h, "bits")

# Per-sample test results: prediction plus each sample's neighbour distances
# and neighbour positions, stored as one list per row.
pred_df = test_df[["src_path", "macro_label"]].copy()
pred_df["pred"] = np_test_pred
pred_df["knn_dist"] = np_knn_dist.tolist()
pred_df["knn_idx"] = np_knn_idx.tolist()

In [None]:
# Confusion to inspect in the next cell: test samples whose true macro label is
# `gt_label` but which were predicted as `pred_label`.
gt_label = "winter"
pred_label = "autumn"

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid


def _show_thumbnail(ax, rel_path: str) -> None:
    """Draw the image stored at ``../<rel_path>`` on ``ax`` as a 128x128 thumbnail."""
    # The context manager closes the file handle once the pixels have been
    # copied into the array, so handles do not accumulate across the grid.
    with Image.open(f"../{rel_path}") as img:
        ax.imshow(np.array(img.resize((128, 128))))


# Test samples of class `gt_label` that the model predicted as `pred_label`.
misclassified_paths = pred_df.loc[(pred_df["macro_label"] == gt_label) & (pred_df["pred"] == pred_label), ["src_path"]]

if misclassified_paths.empty:
    # A grid with zero rows cannot be built; report instead of failing.
    print(f"No test samples with ground truth {gt_label!r} predicted as {pred_label!r}")
else:
    fig = plt.figure(figsize=(15, 20))
    # One row per misclassified sample: the sample itself followed by its K
    # nearest training neighbours.
    grid = ImageGrid(
        fig,
        111,  # similar to subplot(111)
        nrows_ncols=(len(misclassified_paths), K + 1),
        axes_pad=0.2,  # pad between Axes in inch.
    )
    # ImageGrid iterates row by row by default, so consuming the axes in order
    # fills each row from left to right.
    axes = iter(grid)
    for t, row in misclassified_paths.iterrows():
        knn_dist = pred_df.loc[t, "knn_dist"]
        knn_idx = pred_df.loc[t, "knn_idx"]
        # knn_idx holds positions in the training matrix, hence positional .iloc.
        knn_src_paths = train_df.iloc[knn_idx]["src_path"]

        _show_thumbnail(next(axes), row["src_path"])
        for nn_dist, nn_src_path in zip(knn_dist, knn_src_paths):
            ax = next(axes)
            _show_thumbnail(ax, nn_src_path)
            # Label each neighbour with its distance to the misclassified sample.
            ax.set_title(f"{nn_dist:.2f}", fontdict={"fontsize": 7})