# Evaluate the latent space on unseen data

Use a video sequence from a different specimen the model has not been trained on to explore how well the model generalizes.


In [None]:
import sys

sys.path.insert(0, "/home/katharina/vame_approach/VAME")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from vame.analysis.kinutils import KinVideo, create_grid_video
import os
from datetime import datetime
from vame.util.auxiliary import read_config
import logging
import re
from pathlib import Path
from IPython import display
from sklearn.manifold import TSNE
import umap
from fcmeans import FCM
from ipywidgets import Output, GridspecLayout
from scipy.spatial.distance import pdist, squareform
from vame.analysis.utils import (
    create_aligned_mouse_video,
    create_pose_snipplet,
    create_visual_comparison,
    thin_dataset_iteratively,
    find_percentile_threshold,
    estimate_fuzzifier,
    fukuyama_sugeno_index,
)
from matplotlib import cm
import seaborn as sns
from vame.initialize_project.themis_new import get_video_metadata

# Seed NumPy's global random generator so that the np.random.choice draws in the
# cells below pick the same samples on every run.
np.random.seed(42)

%reload_ext autoreload
%autoreload 2

Load the latent vectors corresponding to landmarks that were unseen during training and the latent vectors corresponding to landmarks the
model was trained on.

In [None]:
# Project configuration and loading of the latent vectors of both sequences.
#
# NOTE(review): the original comment said the model was trained on video
# sequence 0057 of rat H01 and is evaluated on video sequence 0089 of rat H06,
# but the file names below use 0089 as the "trained on" data and 0087 as the
# "unseen" data -- confirm which pairing is intended.
#
# Alternative project, kept for reference (it used to be assigned here and was
# immediately overwritten by the line below):
# PROJECT_PATH = "/home/katharina/vame_approach/themis_tail_belly_align"
PROJECT_PATH = "/home/katharina/vame_approach/tb_align_0089"

# path where the original videos are stored
VIDEO_ROOT = "/media/Themis/Data/Video"

# if True, the videos in <PROJECT_PATH>/videos/aligned_videos are used instead of
# the original recordings below VIDEO_ROOT
USE_ALIGNED_VIDEO = True

# Model folders are named after their training time stamp. Entries in the model
# folder that do not follow this pattern (hidden files, notes, ...) are skipped
# instead of aborting the cell with a ValueError from strptime.
MODEL_TIMESTAMP_FORMAT = "%m-%d-%Y-%H-%M"
model_root = os.path.join(PROJECT_PATH, "model")
trained_models = []
for element in os.listdir(model_root):
    try:
        trained_models.append(
            (datetime.strptime(element, MODEL_TIMESTAMP_FORMAT), element)
        )
    except ValueError:
        continue
if not trained_models:
    raise FileNotFoundError(
        f"No model folder named like '{MODEL_TIMESTAMP_FORMAT}' found in {model_root}"
    )
# sort by time step and pick the most recent model
trained_models.sort(key=lambda x: x[0])
latest_model = trained_models[-1][-1]

config_file = os.path.join(model_root, latest_model, "config.yaml")
config = read_config(config_file)


data_file_trained_on = "landmarks_0089_3843S2B10Gaussians_E149_confidece"
data_file_unseen = "landmarks_0087_3843S2B10Gaussians_E149_confidece"

# both latent vector files are stored in the inference results of the latest model
results_dir = os.path.join(PROJECT_PATH, "inference", "results", latest_model)
latent_vectors_trained_on = np.load(
    os.path.join(results_dir, "latent_vectors_" + data_file_trained_on + ".npy")
)
latent_vectors_unseen = np.load(
    os.path.join(results_dir, "latent_vectors_" + data_file_unseen + ".npy")
)

## 1) Compare video clips of randomly sampled anchors vs neighbors and distant samples
## from the dataset unseen during training

Sample several anchors from the latent space and visualize them with their close neighbors vs distant samples

In [None]:
# Resolve the video file of each landmark file; the videos are needed to show
# video snippets in the cells below.
video_info = get_video_metadata(VIDEO_ROOT, None)
# the video id is the second "_"-separated token of the landmark file name
video_id_trained_on = data_file_trained_on.split("_")[1]
video_id_unseen = data_file_unseen.split("_")[1]


def _original_video_path(video_id):
    """Return the path of `<video_id>.MP4` in the folder `video_info` lists for it."""
    file_name = video_id + ".MP4"
    folder = video_info[video_info["vid_file"] == file_name]["vid_folder"].values[0]
    return os.path.join(folder, file_name)


if USE_ALIGNED_VIDEO:
    # aligned videos live inside the project and carry an "a" prefix
    aligned_video_dir = os.path.join(PROJECT_PATH, "videos", "aligned_videos")
    video_file_trained_on = os.path.join(
        aligned_video_dir, "a" + video_id_trained_on + ".MP4"
    )
    video_file_unseen = os.path.join(
        aligned_video_dir, "a" + video_id_unseen + ".MP4"
    )
else:
    video_file_trained_on = _original_video_path(video_id_trained_on)
    video_file_unseen = _original_video_path(video_id_unseen)

In [None]:
# how many anchor ids to pick randomly
pick_n_anchors = 3
# config["time_window"] * min_dist_nn_factor defines the min distance in time
# points between the anchor and between sampled neighbors
min_dist_nn_factor = 1

# the name of the folder that contains the video is passed to KinVideo as `view`
_, _, camera_pos, _ = Path(video_file_unseen).parts[-4:]

# drawing from an int n behaves like drawing from np.arange(n)
random_anchor_ids = np.random.choice(
    latent_vectors_unseen.shape[0], pick_n_anchors, replace=False
)
min_frame_distance = int(config["time_window"] * min_dist_nn_factor)

video = KinVideo(video_file_unseen, view=camera_pos)
video.probevid()
# length of one model input window, converted from frames to seconds
video_clip_duration = config["time_window"] / video.getfps()

video_stack = []
for a_idx in random_anchor_ids:
    comparison_videos = create_visual_comparison(
        a_idx,
        latent_vectors_unseen,
        min_frame_distance,
        video_file_unseen,
        video_clip_duration,
        upper_dist_percentile=80,
    )
    video_stack.append(comparison_videos)

# plot next to each other: left side: anchor and its 8 closest neighbors; right
# side: anchor and 8 samples belonging to the 20% of the most distant latent
# vectors wrt. the anchor embedding
video_display_options = dict(
    embed=True, html_attributes="loop autoplay", width=450, height=450
)
grid = GridspecLayout(pick_n_anchors, 2)
for i_row, video_pair in enumerate(video_stack):
    for j_vid, video_f in enumerate(video_pair):
        out = Output()
        with out:
            display.display(display.Video(video_f, **video_display_options))
        grid[i_row, j_vid] = out
grid

## 2) Dilution of latent vectors

Dilute the latent vectors, since they correspond to heavily overlapping time series of the landmarks; hence two latent vectors can essentially correspond to the same input time series just shifted by a single frame.

In [None]:
# Thin out the latent vectors of the unseen sequence: remove vectors which are
# temporally close to a sampled anchor if they belong to its closest N% percentile
# of embeddings in the latent space.
min_dist_in_frames = config["time_window"]
# Subsampling factor for the neighbor percentile estimation (saves memory).
# Clamped to 1 because a time window shorter than 10 frames would otherwise give
# a factor of 0, and slicing with a step of 0 raises a ValueError.
sub_sampling_factor = max(1, config["time_window"] // 10)
neighbor_percentile_unseen = find_percentile_threshold(
    latent_vectors_unseen[::sub_sampling_factor],
    config["time_window"],
    # original time index of every subsampled vector
    time_idx=np.arange(0, len(latent_vectors_unseen))[::sub_sampling_factor],
    test_fraction=0.01 * sub_sampling_factor,
)
print(f"Selected neigbor percentile unseen dataset: {neighbor_percentile_unseen}")

min_remaining_dataset = 0.001  # minimum fraction of remaining samples  e.g. 0.1 = 10%
remaining_embeddings_unseen, remaining_time_ids_unseen = thin_dataset_iteratively(
    latent_vectors_unseen,
    min_remaining_dataset,
    neighbor_percentile_unseen,
    min_dist_in_frames,
)
# share of the original dataset that survived the thinning, in percent
percentile_data_unseen = np.round(
    len(remaining_embeddings_unseen) / len(latent_vectors_unseen) * 100, 2
)
print(
    f"{len(remaining_embeddings_unseen)} remaining samples from orignially {len(latent_vectors_unseen)}. So just {percentile_data_unseen}% of the original dataset"
)

In [None]:
# Same thinning as for the unseen sequence, now for the latent vectors of the
# sequence the model was trained on. Reuses sub_sampling_factor,
# min_remaining_dataset and min_dist_in_frames from the previous cell.
subsampled_vectors_trained_on = latent_vectors_trained_on[::sub_sampling_factor]
subsampled_time_idx_trained_on = np.arange(0, len(latent_vectors_trained_on))[
    ::sub_sampling_factor
]
neighbor_percentile_trained_on = find_percentile_threshold(
    subsampled_vectors_trained_on,
    config["time_window"],
    time_idx=subsampled_time_idx_trained_on,
    test_fraction=0.01 * sub_sampling_factor,
)
print(
    f"Selected neigbor percentile trained on dataset: {neighbor_percentile_trained_on}"
)

thinned_trained_on = thin_dataset_iteratively(
    latent_vectors_trained_on,
    min_remaining_dataset,
    neighbor_percentile_trained_on,
    min_dist_in_frames,
)
remaining_embeddings_trained_on, remaining_time_ids_trained_on = thinned_trained_on

# share of the original dataset that survived the thinning, in percent
percentile_data_trained_on = np.round(
    len(remaining_embeddings_trained_on) / len(latent_vectors_trained_on) * 100, 2
)
print(
    f"{len(remaining_embeddings_trained_on)} remaining samples from orignially {len(latent_vectors_trained_on)}. So just {percentile_data_trained_on}% of the original dataset"
)

## 3) Clustering the data


### 3.1) Estimate the fuzzifier and the number of clusters
Use the diluted latent vectors corresponding to the time series the model was trained on and estimate the fuzzifier and the number of clusters for the fuzzy-c-means clustering method. Use the Fukuyama Sugeno index to find the number of needed clusters.

In [None]:
# Fit one fuzzy-c-means model per candidate cluster count (1..19) on the thinned
# "trained on" embeddings and score each model with the Fukuyama-Sugeno index.
test_n_clusters = np.arange(1, 20)
N_samples, M_feat = remaining_embeddings_trained_on.shape
# the fuzzifier is estimated from the number of samples and feature dimensions
m = estimate_fuzzifier(N_samples, M_feat)
print(m)
fcm_models = [FCM(n_clusters=i_clusters, m=m) for i_clusters in test_n_clusters]
for i_model, fcm_model in enumerate(fcm_models):
    # report progress for every fifth model
    if i_model % 5 == 0:
        print(f"fitting model: {i_model}/{len(test_n_clusters)}")
    fcm_model.fit(remaining_embeddings_trained_on)

# soft memberships of the training samples, one entry per fitted model
all_labels = [
    fcm_model.soft_predict(remaining_embeddings_trained_on)
    for fcm_model in fcm_models
]
# one Fukuyama-Sugeno score per fitted model, in the order of test_n_clusters
fs_scores = [
    fukuyama_sugeno_index(
        remaining_embeddings_trained_on, labels, fcm_model.centers, m
    )
    for fcm_model, labels in zip(fcm_models, all_labels)
]

In [None]:
%matplotlib widget

# Plot the Fukuyama-Sugeno index against the cluster counts that were actually
# tested. Plotting fs_scores alone would use the list position (starting at 0) as
# x value, which is shifted by one against the tested counts (starting at 1).
plt.plot(test_n_clusters, fs_scores, ".")
plt.title("Fukuyama Sugeno Index")
plt.xlabel("N Clusters")

### 3.2) Train a UMAP projection and visualize the learned clusters
Train a UMAP projection on the diluted latent vectors corresponding to the time series the model has been trained on and use this UMAP projection for the diluted latent vectors corresponding to the unseen time series.

In [None]:
# Fit the 2D UMAP projection on the thinned "trained on" embeddings only ...
umap_trained_on = umap.UMAP(
    n_components=2,
    min_dist=0.0001,
    n_neighbors=30,
    random_state=config["random_state"],
)
umap_trained_on.fit(remaining_embeddings_trained_on)

# ... and project both thinned datasets with this one fitted projection
umap_embedding_trained_on = umap_trained_on.transform(remaining_embeddings_trained_on)
umap_embedding_unseen = umap_trained_on.transform(remaining_embeddings_unseen)

In [None]:
# Select the fitted fuzzy-c-means model with the chosen number of clusters.
n_clusters_fcm = 16
# position of the chosen cluster count within the tested cluster counts
matching_positions = np.where(test_n_clusters == n_clusters_fcm)[0]
idx = matching_positions[0]
fcm = fcm_models[idx]

# The output is [N, K]: N is the number of latent embeddings and K the number of
# clusters; each entry is a membership score between 0...1.
fcm_labels_soft_trained_on = fcm.soft_predict(remaining_embeddings_trained_on)
fcm_labels_soft_unseen = fcm.soft_predict(remaining_embeddings_unseen)

Explore the fuzzy clusters that were found in the high-dimensional latent space by plotting, for each sample, its membership encoded as alpha (transparency) in the UMAP projection.

In [None]:
%matplotlib widget
# One panel per cluster: all unseen samples in the UMAP projection, with the
# membership score of the panel's cluster encoded as alpha (transparency).
n_cols = int(n_clusters_fcm ** 0.5)
n_rows = int(np.ceil(n_clusters_fcm / n_cols))


# squeeze=False keeps `ax` two-dimensional even for a single row or column, so
# the [row, column] indexing below works for every number of clusters
fig, ax = plt.subplots(n_rows, n_cols, sharex=True, sharey=True, squeeze=False)
fig.set_size_inches(9, (9 / n_cols) * n_rows)
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
cmap = plt.get_cmap("rainbow", n_clusters_fcm)
for i_cluster in range(n_clusters_fcm):
    i_row, i_col = divmod(i_cluster, n_cols)
    panel = ax[i_row, i_col]
    # white background markers keep samples with a low membership visible
    panel.scatter(
        umap_embedding_unseen[:, 0],
        umap_embedding_unseen[:, 1],
        color="w",
        edgecolor="k",
    )
    # same samples in the cluster color, with the membership score as alpha
    panel.scatter(
        umap_embedding_unseen[:, 0],
        umap_embedding_unseen[:, 1],
        color=cmap(i_cluster),
        alpha=list(fcm_labels_soft_unseen[:, i_cluster]),
        edgecolor="k",
    )

plt.tight_layout()

In [None]:
# Turn the soft memberships into hard labels: a latent vector gets the cluster
# with its highest membership score if that score exceeds the threshold,
# otherwise the label -1. Labels are stored as floats.
cluster_membership_thr = 0.6

confident_unseen = np.max(fcm_labels_soft_unseen, axis=1) > cluster_membership_thr
fcm_labels_unseen = np.where(
    confident_unseen,
    np.argmax(fcm_labels_soft_unseen, axis=1).astype(float),
    -1.0,
)

confident_trained_on = (
    np.max(fcm_labels_soft_trained_on, axis=1) > cluster_membership_thr
)
fcm_labels_trained_on = np.where(
    confident_trained_on,
    np.argmax(fcm_labels_soft_trained_on, axis=1).astype(float),
    -1.0,
)

In [None]:
# Show a grid video with samples of one cluster from the unseen sequence.
cluster_id = 14

# draw up to 16 time points, without replacement, from the thinned unseen
# samples that were assigned to the selected cluster
in_cluster_unseen = fcm_labels_unseen == cluster_id
time_ids_cluster_unseen = np.random.choice(
    remaining_time_ids_unseen[in_cluster_unseen],
    min(16, np.sum(in_cluster_unseen)),
    replace=False,
)

# one (video file, time id / fps, (0, 0, width, height)) tuple per drawn sample
# NOTE(review): presumably (file, start time in seconds, crop region) -- confirm
# against create_grid_video
video_clip_data_cluster_unseen = []
for t_id in time_ids_cluster_unseen:
    video_clip_data_cluster_unseen.append(
        (video_file_unseen, t_id / video.getfps(), (0, 0, video.width, video.height))
    )

# duration is in seconds!!
grid_video_cluster = create_grid_video(
    video_clip_data_cluster_unseen, video_clip_duration, speed=0.5, nrows=4, ncols=4
)
display.Video(
    grid_video_cluster,
    embed=True,
    html_attributes="loop autoplay",
    width=600,
    height=600,
)

In [None]:
## Cluster the data not trained on using the clusters found on the data the model was trained on. Then compare samples belonging
## to the same cluster that originate from the video data unseen by the model vs. the video data the model was trained on

In [None]:
# Show samples for each cluster. Left: samples from the video sequence not used in
# training; right: samples from the video sequence used in training.
# NOTE(review): fps, width and height of `video` (opened on the unseen video) are
# also used for the clips of the "trained on" video -- this assumes both videos
# share frame rate and frame size; confirm.

grid = GridspecLayout(n_clusters_fcm, 2)
grid_video_options = dict(speed=0.5, nrows=4, ncols=4)
video_display_options = dict(
    embed=True, html_attributes="loop autoplay", width=450, height=450
)

for i_cluster_id in range(n_clusters_fcm):
    # --- left column: data NOT used in training ---------------------------------
    # draw up to 16 time points of the current cluster without replacement
    in_cluster_unseen = fcm_labels_unseen == i_cluster_id
    time_ids_cluster_unseen = np.random.choice(
        remaining_time_ids_unseen[in_cluster_unseen],
        min(16, np.sum(in_cluster_unseen)),
        replace=False,
    )
    video_clip_data_cluster_unseen = [
        (video_file_unseen, t_id / video.getfps(), (0, 0, video.width, video.height))
        for t_id in time_ids_cluster_unseen
    ]
    # a cluster without samples leaves its grid cell empty
    if len(time_ids_cluster_unseen) > 0:
        grid_video_cluster_unseen = create_grid_video(
            video_clip_data_cluster_unseen, video_clip_duration, **grid_video_options
        )
        out = Output()
        with out:
            display.display(
                display.Video(grid_video_cluster_unseen, **video_display_options)
            )
        grid[i_cluster_id, 0] = out

    # --- right column: data used in training ------------------------------------
    in_cluster_trained_on = fcm_labels_trained_on == i_cluster_id
    time_ids_cluster_trained_on = np.random.choice(
        remaining_time_ids_trained_on[in_cluster_trained_on],
        min(16, np.sum(in_cluster_trained_on)),
        replace=False,
    )
    if len(time_ids_cluster_trained_on) > 0:
        video_clip_data_cluster_trained_on = [
            (
                video_file_trained_on,
                t_id / video.getfps(),
                (0, 0, video.width, video.height),
            )
            for t_id in time_ids_cluster_trained_on
        ]
        grid_video_cluster_trained_on = create_grid_video(
            video_clip_data_cluster_trained_on,
            video_clip_duration,
            **grid_video_options,
        )
        out = Output()
        with out:
            display.display(
                display.Video(grid_video_cluster_trained_on, **video_display_options)
            )
        grid[i_cluster_id, 1] = out
grid

In [None]:
## 5) Compare the behavior over time? - cluster the full latent space and then see how the cluster labels progress over time
# Unlike the cells above, the selected fuzzy-c-means model is applied to ALL latent
# vectors here, not only to the thinned subsets. Each result holds one membership
# score per latent vector and cluster.
clusters_all_trained_on = fcm.soft_predict(latent_vectors_trained_on)
clusters_all_unseen = fcm.soft_predict(latent_vectors_unseen)

In [None]:
%matplotlib widget
# Membership score of every cluster along the index of the latent vectors.
# Top: sequence the model was trained on; bottom: sequence unseen during training.
fig, ax = plt.subplots(2, 1, sharex=True, sharey=True)
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
cmap = plt.get_cmap("rainbow", n_clusters_fcm)

for i_cluster in range(n_clusters_fcm):
    # one line per cluster, same color in both panels
    ax[0].plot(clusters_all_trained_on[:, i_cluster], c=cmap(i_cluster))
    ax[1].plot(clusters_all_unseen[:, i_cluster], c=cmap(i_cluster))
ax[0].set_title("data trained on")
ax[1].set_title("data unseen during training")

## 4) Reduce the latent space using a PCA and then apply the clustering again

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN

# Reduce the thinned latent vectors to 11 principal components. The PCA is fitted
# on the "trained on" data and then applied unchanged to the unseen data.
pca = PCA(n_components=11)
pca_vectors_trained_on = pca.fit_transform(remaining_embeddings_trained_on)
pca_vectors_unseen = pca.transform(remaining_embeddings_unseen)
dbscan = DBSCAN(eps=2.2)

# DBSCAN labels its clusters 0..K-1 and marks noise samples with -1
dbscan_labels_trained_on = dbscan.fit_predict(pca_vectors_trained_on)
# NOTE: scikit-learn's DBSCAN offers no predict() for new samples, hence the
# unseen data is not labeled here:
# dbscan_labels_unseen = dbscan.predict(pca_vectors_unseen)

# Number of clusters found. The labels start at 0, so the count is the largest
# label plus one; using the largest label alone would drop the last cluster.
# It is 0 when every sample is noise.
n_dbscan_labels = int(np.max(dbscan_labels_trained_on)) + 1

In [None]:
print(np.unique(dbscan_labels_trained_on))

In [None]:
# Show samples of every DBSCAN cluster found on the "trained on" data.
# Only the right column is filled; the left column is reserved for the unseen
# data, which has no DBSCAN labels.
# NOTE(review): n_dbscan_labels has to equal the number of DBSCAN clusters
# (largest label + 1) for every cluster to get a row here.
grid = GridspecLayout(n_dbscan_labels, 2)
grid_video_options = dict(speed=0.5, nrows=4, ncols=4)
video_display_options = dict(
    embed=True, html_attributes="loop autoplay", width=450, height=450
)

for i_cluster_id in range(n_dbscan_labels):
    # draw up to 16 time points of the current cluster without replacement from
    # the data used in training
    in_cluster_trained_on = dbscan_labels_trained_on == i_cluster_id
    time_ids_cluster_trained_on = np.random.choice(
        remaining_time_ids_trained_on[in_cluster_trained_on],
        min(16, np.sum(in_cluster_trained_on)),
        replace=False,
    )
    if len(time_ids_cluster_trained_on) > 0:
        video_clip_data_cluster_trained_on = [
            (
                video_file_trained_on,
                t_id / video.getfps(),
                (0, 0, video.width, video.height),
            )
            for t_id in time_ids_cluster_trained_on
        ]
        grid_video_cluster_trained_on = create_grid_video(
            video_clip_data_cluster_trained_on,
            video_clip_duration,
            **grid_video_options,
        )
        out = Output()
        with out:
            display.display(
                display.Video(grid_video_cluster_trained_on, **video_display_options)
            )
        grid[i_cluster_id, 1] = out

grid

In [None]:
# Print the cumulated explained variance of a 20-component PCA, first for the
# thinned unseen embeddings and then for the thinned "trained on" embeddings.
# After the loop, pca_dim_red holds the PCA fitted on the "trained on" embeddings.
for embeddings in (remaining_embeddings_unseen, remaining_embeddings_trained_on):
    pca_dim_red = PCA(n_components=20)
    pca_dim_red.fit(embeddings)
    print(
        f"Explained variance cumulated over the dimensions: {pca_dim_red.explained_variance_ratio_.cumsum()}"
    )