In [None]:
import h5py
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.dummy
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_mutual_info_score

import numpy as np
import sys

import pathlib
from pathlib import Path
import skvideo
import skvideo.io
import umap
import hdbscan

import torch
from openTSNE import TSNE

from unsupervised_behaviors.constants import DanceLabels

In [None]:
# Batch size handed to model.get_representations when computing representations.
batch_size = 128
# Torch device string: the loaded model is moved to it and it is also passed to
# model.get_representations.
device = "cuda:1"

# File read with torch.load to obtain `latents`.
latents_path = "/storage/mi/jennyonline/data/latents_videos.pt"
# HDF5 file: its "labels" dataset is read when loading data, and extract_video
# reads the "images" dataset (plus the mask datasets when requested) from it.
videos_path = "/storage/mi/jennyonline/data/videos_2019_10000.h5"
# File read with torch.load and unpacked as (model, _, losses).
model_path = "/storage/mi/jennyonline/data/cpc_ben.pt"

## Loss

In [None]:
# Latents exactly as stored on disk.
latents = torch.load(latents_path)

# Only the label array is needed here, so read it fully and close the file again.
with h5py.File(videos_path, "r") as f:
    labels = f["labels"][:]

# Deserialize onto the CPU first so that loading does not depend on the GPU the
# checkpoint was saved from being available; the model is moved to `device`
# immediately afterwards. The middle element of the stored tuple is unused here.
# NOTE(review): torch.load unpickles arbitrary objects - only load trusted files.
model, _, losses = torch.load(model_path, map_location="cpu")
model = model.to(device)

# Plot the training loss smoothed with a 128-step rolling mean and save the figure.
plt.plot(pd.Series(losses).rolling(128).mean())
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.savefig('/storage/mi/jennyonline/data/loss_cpc_ben.pdf', bbox_inches='tight')

## t-SNE

In [None]:
# Compute the model's representations of the latents; these feed the t-SNE,
# classifier and clustering cells further down.
reps = model.get_representations(latents, batch_size, device)

In [None]:
# Seeded so the embedding (and every plot built from it) is reproducible across
# runs, consistent with the seeded KMeans and UMAP calls in this notebook.
embedding = TSNE(n_jobs=-1, random_state=0).fit(reps)

In [None]:
# Unlabelled overview: first embedding column against the second, tiny markers.
tsne_x = embedding[:, 0]
tsne_y = embedding[:, 1]
plt.scatter(tsne_x, tsne_y, s=1)

In [None]:
# One palette entry per DanceLabels member; entries are looked up by member value.
colors = sns.color_palette(n_colors=len(DanceLabels))

plt.figure(figsize=(12, 6))
for label in DanceLabels:
    # Embedding rows whose ground-truth label equals this member's value.
    has_label = labels == label.value
    elems = embedding[has_label]
    scatter = plt.scatter(
        elems[:, 0],
        elems[:, 1],
        s=3,
        c=[colors[label.value]],
        label=label.name,
    )

# Axis labels, title and legend, then write the figure to disk.
plt.xlabel('First t-SNE dimension')
plt.ylabel('Second t-SNE dimension')
plt.title("very deep VAE -> CPC -> TSNE")
plt.legend()
plt.savefig('/storage/mi/jennyonline/data/t_sne.pdf')

## Linear Classifier

In [None]:
# Linear probe on the latents averaged over axis 1: multinomial logistic
# regression, scored with one-vs-one ROC AUC and averaged over the CV splits.
linear = sklearn.linear_model.LogisticRegression(multi_class="multinomial", max_iter=1000, n_jobs=4)
sklearn.model_selection.cross_val_score(
    linear,
    latents.mean(axis=1),
    labels,
    # Seeded so the reported score is reproducible from run to run.
    cv=sklearn.model_selection.StratifiedShuffleSplit(random_state=0),
    # Predefined scorer: ROC AUC (multi_class="ovo") on predicted probabilities.
    # Replaces make_scorer(..., needs_proba=True), whose `needs_proba` argument is
    # deprecated and later removed in newer scikit-learn releases.
    scoring="roc_auc_ovo",
    n_jobs=-1,
).mean()

In [None]:
# Linear probe on the model representations: multinomial logistic regression,
# scored with one-vs-one ROC AUC and averaged over the CV splits.
linear = sklearn.linear_model.LogisticRegression(multi_class="multinomial", max_iter=1000, n_jobs=4)
sklearn.model_selection.cross_val_score(
    linear,
    reps,
    labels,
    # Seeded so the reported score is reproducible from run to run.
    cv=sklearn.model_selection.StratifiedShuffleSplit(random_state=0),
    # Predefined scorer: ROC AUC (multi_class="ovo") on predicted probabilities.
    # Replaces make_scorer(..., needs_proba=True), whose `needs_proba` argument is
    # deprecated and later removed in newer scikit-learn releases.
    scoring="roc_auc_ovo",
    n_jobs=-1,
).mean()

In [None]:
# Baseline: a DummyClassifier with default settings, evaluated on the same
# inputs and with the same metric as the linear probe on the representations.
sklearn.model_selection.cross_val_score(
    sklearn.dummy.DummyClassifier(),
    reps,
    labels,
    # Seeded so the reported score is reproducible from run to run.
    cv=sklearn.model_selection.StratifiedShuffleSplit(random_state=0),
    # Predefined scorer: ROC AUC (multi_class="ovo") on predicted probabilities.
    # Replaces make_scorer(..., needs_proba=True), whose `needs_proba` argument is
    # deprecated and later removed in newer scikit-learn releases.
    scoring="roc_auc_ovo",
).mean()

## Clustering

In [None]:
# Positions of every sample whose label is 1 or 2; only these samples are used
# when computing the adjusted mutual information (AMI) scores.
idx_for_ami = [pos for pos, value in enumerate(labels) if value == 1 or value == 2]
labels_for_ami = labels[idx_for_ami]

## kMeans

In [None]:
# Fit a seeded 3-cluster k-means on the representations, then assign every
# sample to its cluster.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(reps)
clusters_kmeans = np.array(kmeans.predict(reps))

# Keep only the cluster ids of the samples selected for the AMI evaluation.
clusters_for_ami = clusters_kmeans[idx_for_ami]

In [None]:
# Adjusted mutual information between the selected ground-truth labels and the
# current cluster assignment; the bare name displays the score as cell output.
ami = adjusted_mutual_info_score(
    labels_true=labels_for_ami,
    labels_pred=clusters_for_ami,
)
ami

## kMeans on latents

In [None]:
# Use a single frame (index 16 along axis 1) of every sample as clustering input.
# NOTE(review): 16 is the middle frame only if axis 1 has about 32 entries - confirm.
single_frame_latents = latents[:, [16], :]
# Flatten to one row per sample. The row count comes from the data itself rather
# than the previously hard-coded 10000 x 160, so other dataset sizes work too.
latents_for_clustering = single_frame_latents.reshape(latents.shape[0], -1)

kmeans = KMeans(n_clusters=3, random_state=0).fit(latents_for_clustering)
clusters_kmeans_latents = np.array(kmeans.predict(latents_for_clustering))

# Keep only the cluster ids of the samples selected for the AMI evaluation.
clusters_for_ami = clusters_kmeans_latents[idx_for_ami]

In [None]:
# AMI of the selected ground-truth labels against the cluster ids currently held
# in clusters_for_ami; the last line shows the score as the cell's output.
ami = adjusted_mutual_info_score(
    labels_true=labels_for_ami,
    labels_pred=clusters_for_ami,
)
ami

In [None]:
# Average every sample over axis 1 (all frames) and use that as clustering input.
latents_for_clustering = np.mean(latents, axis=1)
print(latents_for_clustering.shape)

# Seeded 3-cluster k-means on the averaged latents.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(latents_for_clustering)
clusters_kmeans_latents = np.array(kmeans.predict(latents_for_clustering))

# Keep only the cluster ids of the samples selected for the AMI evaluation.
clusters_for_ami = clusters_kmeans_latents[idx_for_ami]

In [None]:
# Score the current clustering against the selected ground-truth labels with
# adjusted mutual information and display the value.
ami = adjusted_mutual_info_score(
    labels_true=labels_for_ami,
    labels_pred=clusters_for_ami,
)
ami

## Logistic Regression on latents

In [None]:
# Linear probe on `latents_for_clustering` as left by the clustering cells above
# (the axis-1 mean when the notebook is run top to bottom): multinomial logistic
# regression, scored with one-vs-one ROC AUC and averaged over the CV splits.
linear = sklearn.linear_model.LogisticRegression(multi_class="multinomial", max_iter=1000, n_jobs=4)
sklearn.model_selection.cross_val_score(
    linear,
    latents_for_clustering,
    labels,
    # Seeded so the reported score is reproducible from run to run.
    cv=sklearn.model_selection.StratifiedShuffleSplit(random_state=0),
    # Predefined scorer: ROC AUC (multi_class="ovo") on predicted probabilities.
    # Replaces make_scorer(..., needs_proba=True), whose `needs_proba` argument is
    # deprecated and later removed in newer scikit-learn releases.
    scoring="roc_auc_ovo",
    n_jobs=-1,
).mean()

## kMeans using UMAP

In [None]:
# Seeded UMAP projection of the representations to 30 components, used as the
# input for the UMAP-based clustering cells.
umap_reducer = umap.UMAP(n_neighbors=30, min_dist=0.0, n_components=30, random_state=42)
clusterable_embedding = umap_reducer.fit_transform(reps)

In [None]:
# Seeded 3-cluster k-means on the UMAP embedding.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(clusterable_embedding)
clusters_kmeans_umap = np.array(kmeans.predict(clusterable_embedding))

# Keep only the cluster ids of the samples selected for the AMI evaluation.
clusters_for_ami = clusters_kmeans_umap[idx_for_ami]

In [None]:
# Adjusted mutual information of the selected labels versus the cluster ids in
# clusters_for_ami; ending the cell with the name displays the score.
ami = adjusted_mutual_info_score(
    labels_true=labels_for_ami,
    labels_pred=clusters_for_ami,
)
ami

## HDBSCAN

In [None]:
# Density-based clustering of the representations with HDBSCAN.
# NOTE(review): HDBSCAN reports noise points with the label -1; the AMI computed
# from these ids treats them like any other cluster - confirm this is intended.
hdb_clusterer = hdbscan.HDBSCAN(min_cluster_size=200)
clusters_hdb = hdb_clusterer.fit_predict(reps)

# Keep only the cluster ids of the samples selected for the AMI evaluation.
clusters_for_ami = clusters_hdb[idx_for_ami]

In [None]:
# AMI between the selected ground-truth labels and the current cluster ids,
# displayed as the cell's output.
ami = adjusted_mutual_info_score(
    labels_true=labels_for_ami,
    labels_pred=clusters_for_ami,
)
ami

## HDBSCAN using UMAP

In [None]:
# Density-based clustering of the UMAP embedding with HDBSCAN.
# NOTE(review): HDBSCAN reports noise points with the label -1; the AMI computed
# from these ids treats them like any other cluster - confirm this is intended.
hdb_umap_clusterer = hdbscan.HDBSCAN(min_cluster_size=200)
clusters_hdb_umap = hdb_umap_clusterer.fit_predict(clusterable_embedding)

# Keep only the cluster ids of the samples selected for the AMI evaluation.
clusters_for_ami = clusters_hdb_umap[idx_for_ami]

In [None]:
# Compare the selected ground-truth labels with the current cluster ids using
# adjusted mutual information and show the resulting score.
ami = adjusted_mutual_info_score(
    labels_true=labels_for_ami,
    labels_pred=clusters_for_ami,
)
ami

## Choosing and creating videos

In [None]:
# Sample indices per k-means cluster (computed on the representations).
# Cluster ids 0 and 1 get their own lists; every other id lands in clusters_two.
clusters_zero = np.flatnonzero(clusters_kmeans == 0).tolist()
clusters_one = np.flatnonzero(clusters_kmeans == 1).tolist()
clusters_two = np.flatnonzero((clusters_kmeans != 0) & (clusters_kmeans != 1)).tolist()

In [None]:
def extract_video(
    h5_path: pathlib.Path, video_idx: int, output_path: pathlib.Path, with_mask: bool = False
):
    """Write one video from the h5 file to a compressed video file.

    Parameters
    ----------
    h5_path: pathlib.Path
        Path of the h5 file that holds the videos.
    video_idx: int
        Index of the video inside the file's "images" dataset.
    output_path: pathlib.Path
        Where the encoded video is written.
    with_mask: bool
        When True, the video is multiplied in place by the product of its
        "tag_masks" and "loss_masks" entries before it is encoded.
    """
    # Encoder settings passed to ffmpeg: libx264, crf 0, preset veryslow, and an
    # fps=6 video filter.
    ffmpeg_output_options = {
        "-c:v": "libx264",
        "-crf": "0",
        "-preset": "veryslow",
        "-filter:v": "fps=6",
    }

    with h5py.File(h5_path, "r") as h5_file:
        frames = h5_file["images"][video_idx]

        if with_mask:
            frames *= h5_file["tag_masks"][video_idx] * h5_file["loss_masks"][video_idx]

        with skvideo.io.FFmpegWriter(output_path, outputdict=ffmpeg_output_options) as writer:
            for frame in frames:
                # Append a trailing axis and repeat it three times, so each frame
                # is written with three identical channels.
                three_channel_frame = frame[:, :, None].repeat(3, axis=-1)
                writer.writeFrame(three_channel_frame)

In [None]:
# Export up to `n_examples` example videos for each k-means cluster.
n_examples = 10
cluster_members = {"zero": clusters_zero, "one": clusters_one, "two": clusters_two}

for i in range(n_examples):
    for cluster_name, members in cluster_members.items():
        # A cluster may contain fewer than n_examples videos; skip the missing
        # ones instead of failing with an IndexError halfway through the export.
        if i >= len(members):
            continue
        out_path = '/storage/mi/jennyonline/videos/videos1_' + cluster_name + '_' + str(i) + '.mp4'
        extract_video(videos_path, members[i], out_path)