In [3]:
import numpy as np
import cv2
from matplotlib import pyplot as plt
from IPython.display import Video
import plotly.express as px
import pandas as pd
import timeit
import os
import pickle

In [54]:
from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans, DBSCAN
from sklearn.neighbors.kde import KernelDensity
from scipy.signal import argrelextrema

def kde_cluster(Xdf, kernel="gaussian", bandwidth=10, plot=False, num_samples=50):
    """
    Split the 1-D values in ``Xdf['x']`` into clusters at the local minima of
    a kernel density estimate.

    A ``KernelDensity`` model is fitted to the values and scored on an evenly
    spaced grid from 0 to ``1.5 * max(values)``. Every strict local minimum of
    the scores on that grid becomes a boundary; the values falling into each
    half-open interval ``[lower, upper)`` between consecutive boundaries form
    one cluster.

    Parameters
    ----------
    Xdf : mapping / DataFrame with an ``'x'`` entry holding the values.
    kernel : kernel name forwarded to ``KernelDensity``.
    bandwidth : kernel bandwidth forwarded to ``KernelDensity``.
    plot : if true, plot the scores over the grid and print a per-cluster
        summary.
    num_samples : number of grid points the density is evaluated on. The
        default (50) matches the previous hard-coded ``np.linspace`` default;
        raise it when the grid spacing is coarse compared to ``bandwidth``.

    Returns
    -------
    list of 1-D arrays, one per interval, each holding the sorted unique
    values inside that interval (an array may be empty). An empty input
    yields an empty list.

    Notes
    -----
    Values outside ``[0, 1.5 * max(values))`` (e.g. negative values) are not
    placed in any cluster because the grid starts at 0.
    """
    values = np.array(Xdf['x'])
    if values.size == 0:
        # np.max and the KDE fit both reject empty input; there is nothing to cluster.
        return []

    # TODO There is probably bias at the boundaries, should mirror X
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(values.reshape(-1, 1))
    s = np.linspace(0, np.max(values) * 1.5, num_samples)
    e = kde.score_samples(s.reshape(-1, 1))

    # Translate the indices of the local minima back into grid positions
    minima = s[argrelextrema(e, np.less)[0]]
    # (0, minima 1), (minima 1, minima 2), ... (minima n-1, minima n), (minima n, end)
    edges = np.concatenate(([0], minima, [s[-1]]))

    clusters = [
        np.unique(values[(values >= lower) & (values < upper)])
        for lower, upper in zip(edges[:-1], edges[1:])
    ]

    if plot:
        plt.plot(s, e)
        plt.show()
        print(f"Number of clusters: {len(clusters)}")
        for c in clusters:
            print("\t", len(c), np.unique([int(x) for x in c]))

    return clusters

def frame_idxs_clusters(avg_clusters, df):
    """
    Label every row of ``df`` with the index of the cluster its ``'x'`` value
    falls into.

    Parameters
    ----------
    avg_clusters : sequence of array-likes of values (as returned by
        ``kde_cluster``). Cluster ``i`` claims every row whose ``'x'`` lies in
        the closed range ``[min(cluster), max(cluster)]``.
    df : DataFrame with an ``'x'`` column. It is not modified.

    Returns
    -------
    A copy of ``df`` with an extra integer ``'cluster'`` column. Rows that
    match no cluster keep the label ``-1``. If ranges overlap, the cluster
    that comes later in ``avg_clusters`` wins.
    """
    df_2 = df.copy()
    values = df['x'].to_numpy()
    labels = np.full(len(values), -1, dtype=int)

    for cluster_number, c in enumerate(avg_clusters):
        c = np.asarray(c)
        if c.size == 0:
            # An empty cluster has no min/max and therefore claims no rows.
            continue
        # Positional boolean mask: independent of the DataFrame's index labels.
        labels[(values >= c.min()) & (values <= c.max())] = cluster_number

    # Assign the whole column at once instead of chained per-cell assignment.
    df_2['cluster'] = labels
    return df_2

In [16]:
# Path of the pickled episode collection, relative to the working directory.
CACHE = "./jre-episodes.pickle"

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load cache files that come from a trusted source.
with open(CACHE, "rb") as f:
    episodes = pickle.load(f)

print(f"Number of loaded episodes: {len(episodes)}")

Number of loaded episodes: 2455


In [23]:
# Keep only the main episodes that have video averages available (`eps`), then
# display (number of main episodes, number of main episodes with video averages).
eps = [ep for ep in episodes if ep.video_averages is not None and ep.is_main_episode]
len([ep for ep in episodes if ep.is_main_episode]), len(eps)

(1463, 1373)

In [30]:
# `video_averages` unpacks into two items: the sequence of averages and
# (presumably, going by the name) the total number of frames in the video.
averages, total_frames = eps[0].video_averages

In [33]:
# Compare how many averages are stored against the total frame count.
len(averages), total_frames

(12639, 239820)

In [47]:
from ipywidgets import interact, IntSlider
def show_frame_averages(x):
    """
    Scatter-plot the averages stored in ``eps[x].video_averages[0]``.

    Each point is coloured by its position ``n`` in the sequence, and the
    figure is titled with ``str(eps[x])``.
    """
    episode = eps[x]
    frame_averages = episode.video_averages[0]
    df = pd.DataFrame({
        'x': frame_averages,
        'n': range(len(frame_averages)),
    })
    px.scatter(df, x='x', color='n', title=str(episode)).show()

# Slider over every valid index of `eps`, starting at episode 402.
interact(show_frame_averages, x=IntSlider(value=402, min=0, max=len(eps) - 1))

interactive(children=(IntSlider(value=402, description='x', max=1372), Output()), _dom_classes=('widget-intera…

<function __main__.show_frame_averages(x)>

In [67]:
# TODO How to estimate bandwidth for 2d data?
# Kernel bandwidth passed to kde_cluster by show_frame_averages below.
BANDWIDTH = 2

from ipywidgets import interact, IntSlider
def show_frame_averages(x):
    """
    Cluster the averages in ``eps[x].video_averages[0]`` and plot them.

    The values are clustered with ``kde_cluster`` (bandwidth ``BANDWIDTH``,
    with its diagnostic plot enabled), each row is labelled through
    ``frame_idxs_clusters``, and the result is shown as a scatter plot
    coloured by cluster label and titled with ``str(eps[x])``.
    """
    episode = eps[x]
    frame_averages = episode.video_averages[0]
    df = pd.DataFrame({
        'x': frame_averages,
        'n': range(len(frame_averages)),
    })

    avg_clusters = kde_cluster(df, bandwidth=BANDWIDTH, plot=True)
    clustered = frame_idxs_clusters(avg_clusters, df)

    # String labels make plotly pick a discrete colour scale
    clustered['cluster'] = clustered['cluster'].astype(str)

    px.scatter(clustered, x='x', color='cluster', title=str(episode)).show()

# Slider over every valid index of `eps`, starting at episode 215.
interact(show_frame_averages, x=IntSlider(value=215, min=0, max=len(eps) - 1))

interactive(children=(IntSlider(value=215, description='x', max=1372), Output()), _dom_classes=('widget-intera…

<function __main__.show_frame_averages(x)>

In [None]:
# NOTE(review): `df_avg` is not defined in any cell visible here and this cell
# has no execution count — confirm where it comes from before relying on it.
# This call uses kde_cluster's default bandwidth (10), not BANDWIDTH.
df_2 = frame_idxs_clusters(kde_cluster(df_avg, plot=True), df_avg)