## Embed multiple rats in the same latent space

Motivation: if the learned embeddings are more or less independent of the individual rat, embeddings from different rats shouldn't end up
in different parts of the latent space - apart from extreme behaviors which occur only in a single rat, e.g. seizures.

In [None]:
import sys

sys.path.insert(0, "/home/katharina/vame_approach/VAME")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from vame.analysis.kinutils import KinVideo, create_grid_video
import os
from datetime import datetime
from vame.util.auxiliary import read_config
import logging
import re
from pathlib import Path
from IPython import display
from sklearn.manifold import TSNE
import umap
from fcmeans import FCM
from ipywidgets import Output, GridspecLayout
from scipy.spatial.distance import pdist, squareform
from vame.analysis.visualize import (
    create_aligned_mouse_video,
    create_pose_snipplet,
    create_visual_comparison,
    thin_dataset_iteratively,
    find_percentile_threshold,
    estimate_fuzzifier,
    fukuyama_sugeno_index,
)
from matplotlib import cm
import seaborn as sns
from vame.initialize_project.themis_new import get_video_metadata

np.random.seed(42)

%reload_ext autoreload
%autoreload 2

In [None]:
# Project folder and the time stamp of the trained model whose inference
# results are analysed in this notebook.
PROJECT_PATH = "/home/katharina/vame_approach/tb_align_0044_0089"
MODEL_TIME_STAMP = "06-01-2022-16-40"

# path where the original videos are stored
VIDEO_ROOT = "/media/Themis/Data/Video"

USE_ALIGNED_VIDEO = True

config_file = os.path.join(PROJECT_PATH, "model", MODEL_TIME_STAMP, "config.yaml")
config = read_config(config_file)

latent_vec_dir = os.path.join(PROJECT_PATH, "inference", "results", MODEL_TIME_STAMP)
# os.listdir returns entries in arbitrary order. Sort them so that the dict
# order - and with it the label order, the colour assignment and the row order
# of the pooled embeddings - is the same on every machine and every run.
latent_vec_files = [
    os.path.join(latent_vec_dir, file) for file in sorted(os.listdir(latent_vec_dir))
]
# Key every array by the fourth "_"-separated token of its file name
# (presumably the video id - confirm against the naming scheme of the
# inference results).
# NOTE(review): files sharing that token silently overwrite each other.
latent_vectors = {
    os.path.basename(file).split("_")[3]: np.load(file) for file in latent_vec_files
}

In [None]:
# Sanity check: list the keys under which the latent vector arrays were stored.
print(latent_vectors.keys())

In [None]:
# Dilute the embeddings of every recording separately.
# Per video id the following is kept:
#   neighbor_percentiles   - threshold returned by find_percentile_threshold
#   latent_vectors_diluted - embeddings remaining after thinning
#   time_ids_diluted       - time ids belonging to the remaining embeddings
# NOTE(review): the meaning of the 0.00001 argument is defined by
# thin_dataset_iteratively in vame.analysis.visualize - confirm there.
neighbor_percentiles, latent_vectors_diluted, time_ids_diluted = {}, {}, {}
for video_id, latent_vec in latent_vectors.items():
    time_window = config["time_window"]
    percentile_threshold = find_percentile_threshold(latent_vec, time_window)
    neighbor_percentiles[video_id] = percentile_threshold

    remaining_embeddings, remaining_time_ids = thin_dataset_iteratively(
        latent_vec, 0.00001, percentile_threshold, time_window
    )
    latent_vectors_diluted[video_id] = remaining_embeddings
    time_ids_diluted[video_id] = remaining_time_ids

In [None]:
# Split the dict into two parallel tuples: the video ids and, in the same
# order, their diluted embeddings.
labels, all_latent_vectors = zip(*latent_vectors_diluted.items())

In [None]:
# One label per embedding: repeat every video id once for each of its diluted
# embeddings, in the same order in which the embeddings are concatenated below.
labels_full = np.array(
    [
        label
        for label in labels
        for _ in range(len(latent_vectors_diluted[label]))
    ]
)
# Pool the embeddings of all recordings into a single array.
all_latent_vectors = np.concatenate(all_latent_vectors)

In [None]:
# Learn one shared 2D UMAP projection from the pooled embeddings of all
# recordings; the random state comes from the project config.
umap_trafo = umap.UMAP(
    random_state=config["random_state"],
    n_neighbors=30,
    min_dist=0.001,
    n_components=2,
)
umap_trafo.fit(all_latent_vectors)

In [None]:
%matplotlib widget
print(all_latent_vectors.shape)
# Project the pooled embeddings with the fitted UMAP transformation.
umap_embeddings = umap_trafo.transform(all_latent_vectors)

# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap takes the same (name, lut) arguments and works on old and
# new releases alike. One discrete colour per video id.
cmap = plt.get_cmap("rainbow", len(labels))

# Start a fresh figure so the points are never drawn into a figure that is
# still open from an earlier cell (interactive backends keep figures alive).
plt.figure()
for label_idx, label in enumerate(labels):
    print(label)
    # Boolean mask selecting the embeddings that belong to this video id.
    is_label = labels_full == label
    plt.scatter(
        umap_embeddings[is_label, 0],
        umap_embeddings[is_label, 1],
        color=cmap(label_idx),
        edgecolor="k",
        label=label,
    )
plt.legend()

In [None]:
%matplotlib widget
from sklearn.decomposition import PCA

# Linear 2D projection of the pooled embeddings, as a comparison to UMAP.
pca = PCA(n_components=2)

pca_vectors = pca.fit_transform(all_latent_vectors)

# Build the colour map here instead of relying on the one left behind by the
# UMAP cell, so this cell also works when run on its own. pyplot.get_cmap is
# used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
cmap = plt.get_cmap("rainbow", len(labels))
label_colors = {label: cmap(label_idx) for label_idx, label in enumerate(labels)}
# Per-point RGBA colours, retained under their original name; the dict lookup
# replaces a linear labels.index() search for every single point.
colors = np.stack([label_colors[label] for label in labels_full])

# Fresh figure so the PCA points are not drawn on top of the UMAP plot.
plt.figure()
# One scatter call per video id: `label` has to be a single string per artist
# to give a usable legend entry (previously the whole per-point label array
# was passed and no legend was shown).
for label in labels:
    is_label = labels_full == label
    plt.scatter(
        pca_vectors[is_label, 0],
        pca_vectors[is_label, 1],
        color=label_colors[label],
        label=label,
    )
plt.legend()