In [3]:
import numpy as np
import cv2
from matplotlib import pyplot as plt
from IPython.display import Video
import plotly.express as px
import pandas as pd
import timeit
import os
import pickle

In [5]:
from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans, DBSCAN
from sklearn.neighbors.kde import KernelDensity
from scipy.signal import argrelextrema

def kde_cluster(Xdf, kernel="gaussian", bandwidth=10, plot=False, n_samples=50):
    """
    Split the 1-D values in ``Xdf['x']`` into clusters with a kernel density
    estimate (KDE).

    A KDE is fitted to the values and its log-density is evaluated on an
    evenly spaced grid. Grid positions where the log-density has a local
    minimum become cut points, and the values lying between two consecutive
    cut points form one cluster.

    Parameters
    ----------
    Xdf : DataFrame or mapping
        Anything indexable with ``'x'`` that yields a sequence of numbers.
    kernel : str
        Kernel name forwarded to ``KernelDensity``.
    bandwidth : float
        Kernel bandwidth forwarded to ``KernelDensity``.
    plot : bool
        When true, plot the log-density curve and print the size and the
        integer-truncated members of every cluster.
    n_samples : int
        Number of grid points the density is evaluated on. The default (50)
        is the grid size this function has always used; raise it to locate
        cut points more precisely.

    Returns
    -------
    list of numpy.ndarray
        One sorted array of unique values per interval between cut points
        (an interval containing no value gives an empty array). An empty
        list is returned when ``Xdf['x']`` holds no values.
    """
    values = np.asarray(Xdf['x']).reshape(-1)
    if values.size == 0:
        # Neither the KDE fit nor max()/min() below is defined for an empty sample.
        return []

    # TODO There is probably bias at the boundaries, should mirror X
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(values.reshape(-1, 1))

    # Evaluation grid. For non-negative data with a positive maximum this is
    # exactly [0, 1.5 * max]. The min()/max() guards keep every value inside
    # the grid when the data contains negatives or has a non-positive maximum
    # (where 1.5 * max would fall at or below the maximum itself).
    lower = min(0, values.min())
    upper = max(values.max() * 1.5, values.max())
    s = np.linspace(lower, upper, n_samples)
    e = kde.score_samples(s.reshape(-1, 1))

    # Map the indexes of the local minima back to positions on the value axis.
    minima = s[argrelextrema(e, np.less)[0]]
    # (lower, minima 1), (minima 1, minima 2), ... (minima n, upper)
    edges = np.concatenate(([lower], minima, [upper]))

    clusters = []
    last = len(edges) - 2
    for i, (m1, m2) in enumerate(zip(edges[:-1], edges[1:])):
        # Intervals are half-open so a value sitting on a cut point is counted
        # once; the final interval is closed so a value equal to ``upper``
        # (possible only when the grid could not be padded) is not lost.
        below = values <= m2 if i == last else values < m2
        clusters.append(np.unique(values[(values >= m1) & below]))

    if plot:
        plt.plot(s, e)
        plt.show()
        print(f"Number of clusters: {len(clusters)}")
        for c in clusters:
            print("\t", len(c), np.unique([int(x) for x in c]))

    return clusters

# Store frame numbers in the clusters instead of the average of the last frame
def frame_idxs_clusters(aavg_clusters, df):
    """
    Label every row of ``df`` with the index of the cluster its ``'x'`` value
    falls in.

    A row belongs to cluster ``i`` when its ``'x'`` value lies within the
    closed range ``[min(cluster_i), max(cluster_i)]``. If the ranges of two
    clusters overlap, the later cluster wins. Rows that match no cluster are
    labelled ``-1``.

    Parameters
    ----------
    aavg_clusters : sequence of array-like
        Clusters of ``'x'`` values, e.g. the list returned by ``kde_cluster``.
        Empty clusters are skipped. (The parameter name is kept as-is so that
        existing keyword callers keep working.)
    df : pandas.DataFrame
        Frame with an ``'x'`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added integer ``'cluster'`` column.
    """
    labelled = df.copy()
    x = labelled['x'].to_numpy()

    # Build the labels in a plain array and assign the column once; this
    # avoids chained-indexing writes into the frame.
    labels = np.full(len(labelled), -1, dtype=int)
    for i, members in enumerate(aavg_clusters):
        members = np.asarray(members)
        if members.size == 0:
            # min()/max() are undefined for an empty cluster; it matches no row.
            continue
        in_range = (x >= members.min()) & (x <= members.max())
        labels[in_range] = i

    labelled['cluster'] = labels
    return labelled



In [16]:
# Path of the pickled episode cache, relative to the current working directory.
CACHE = "./jre-episodes.pickle"

# NOTE(security): unpickling can execute arbitrary code, so only load a cache
# file that was produced locally or comes from a trusted source.
with open(CACHE, "rb") as f:
    # `episodes` is whatever object was pickled; the code below only needs it
    # to support len() and iteration.
    episodes = pickle.load(f)

# Quick sanity check of how much was loaded.
print(f"Number of loaded episodes: {len(episodes)}")

Number of loaded episodes: 2455


In [17]:
# Show the episodes whose `video_averages` attribute is set (not None).
[episode for episode in episodes if episode.video_averages is not None]

[[EED-LEWOCv0] Joe Rogan Experience #1539 - Jenny Kleeman,
 [t7uqHosIj4s] Joe Rogan Experience #1538 - Douglas Murray,
 [9iOGfAAticY] Joe Rogan Experience #1537 - Lex Fridman,
 [_Rl82OQDoOc] Joe Rogan Experience #1536 - Edward Snowden,
 [TDKYSLDq6es] Joe Rogan Experience #1535 - Tim Kennedy,
 [1mgN8r1mwvM] Joe Rogan Experience #1534 - Ron White,
 [fTPQ9KR5j8k] Joe Rogan Experience #1533 - Adam Curry,
 [hcPUoxTvw5g] Joe Rogan Experience #1532 - Mike Tyson,
 [D7WUMXKV-FE] Joe Rogan Experience #1531 - Miley Cyrus,
 [8xRz8ra9mdI] Joe Rogan Experience #1530 - Duncan Trussell,
 [5UXpbbX9-Wo] Joe Rogan Experience #1529 - Whitney Cummings & Annie Lederman,
 [KUCkrZKslr4] Joe Rogan Experience #1528 - Nikki Glaser,
 [NY3Zg37nIHo] Joe Rogan Experience #1527 - David Blaine,
 [7eRR7j1OCOs] Joe Rogan Experience #1526 - Ali Macofsky,
 [h9XzuUXj6Gc] Joe Rogan Experience #1525 - Tim Dillon,
 [PjLW5irADoM] Joe Rogan Experience #1524 - Ron Funches,
 [INSy7D2LBfU] Joe Rogan Experience #1523 - Joey Diaz & 

In [None]:
# 'x' holds the per-frame averages and 'n' a running index 0..len(frames)-2.
# Assumes normal_averages has len(frames) - 1 entries; pandas raises otherwise.
df = pd.DataFrame(
    {
        'x': normal_averages,  # alternative series: blurred_averages
        'n': range(len(frames) - 1),
    }
)

# Scatter of the averages, coloured by the running index.
fig = px.scatter(df, x='x', color='n')
fig.show()

In [None]:
# TODO How to estimate bandwidth for 2d data?
avg_clusters = kde_cluster(df, plot=True, bandwidth=5)
# TODO Actually returns a df now
# frame_idxs_clusters requires the source frame as its second positional
# argument; calling it with the clusters alone raises a TypeError.
clusters = frame_idxs_clusters(avg_clusters, df)

# Convert the result to a plain array and peek at its shape and one entry.
clusters = np.array(clusters)
clusters.shape, clusters[1][0]

In [None]:
# Cluster the 'x' values of df_avg with the default KDE settings (plotting the
# density), then label the rows of df_avg with the resulting clusters.
df_2 = frame_idxs_clusters(
    kde_cluster(df_avg, plot=True),
    df_avg,
)