In [1]:
import numpy as np
import cv2
from matplotlib import pyplot as plt
from IPython.display import Video
import plotly.express as px
import pandas as pd
import time
import os
import pickle
from tqdm import tqdm
from math import floor

In [2]:
WEBSITE = "../../jre-vis/public/"
# TODO How to estimate bandwidth for 2d data?
BANDWIDTH = .1
AVGS_LOCATION = "averages_scale.25.0_color"

In [3]:
def show_frame_from(ep, frame_no):
    video_name = [v for v in video_files if ep.title in v][0]
    cap = cv2.VideoCapture(video_name)
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)    
    plt.title(as_timestamp(frame_no, fps))
    _, frame = cap.read()
    plt.imshow(frame)
    cap.release()
    
def as_timestamp(frame_no, fps):
    sec = frame_no / fps
    return f"{floor(sec / 60 / 60)}:{floor(sec / 60) % 60}:{round(sec % 60, 1)}"    

In [4]:
from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans, DBSCAN
from sklearn.neighbors.kde import KernelDensity
from scipy.signal import argrelextrema

def kde_cluster(Xdf, kernel="gaussian", bandwidth=10, plot=False):
    """
    Uses KDE to create clusters out of word vecs
    """
    X = np.array(Xdf['x'])
    X = X.reshape(-1, 1)

    # TODO There is probably bias at the boundaries, should mirror X
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(X)
    s = np.linspace(0, np.max(X)*1.5)
    e = kde.score_samples(s.reshape(-1, 1))

    # Reshape back to a 1 by N array
    X = X.reshape(1, -1)

    minima = argrelextrema(e, np.less)[0]
    # Use the linspace to convert back into word indexes
    minima = [s[m] for m in minima]
    # (0, minima 1), (minima 1, minima 2), ... (minima n-1, minima n), (minima n, end)
    minima_pairs = list(zip(np.insert(minima, 0, 0), np.append(minima, s[-1])))

    clusters = [
      np.unique(X[np.logical_and(X >= m1, X < m2)]) for m1, m2 in minima_pairs
    ]

    if plot:
        plt.plot(s, e)
        plt.show()
        print(f"Number of clusters: {len(clusters)}")
        for c in clusters:
            print("\t", len(c), np.unique([int(x) for x in c]))

    return clusters

# Store frame numbers in the clusters instead of the average of the last frame
def frame_idxs_clusters(avg_clusters, df):
    df['cluster'] = np.full_like((len(avg_clusters)), -1)

    for cluster_number, c in enumerate(avg_clusters):
        cluster_cond = (df['x'] >= np.min(c)) & (df['x'] <= np.max(c))
        df.loc[cluster_cond, 'cluster'] = cluster_number
  
    return df



In [5]:
CACHE = "./jre-episodes.pickle"

with open(CACHE, "rb") as f:
    episodes = pickle.load(f)

print(f"Number of loaded episodes: {len(episodes)}")

Number of loaded episodes: 2462


## Load Averages 

In [6]:
import re
AVERAGES = f"../data/jre/{AVGS_LOCATION}"

for file in os.listdir(AVERAGES):
    matches = re.search(r"-([A-Za-z0-9-_]*).npy", file)
    video_id = matches[1][-11:]
    
    try:
        episode = [e for e in episodes if e.video_id == video_id][0]
        episode.video_averages = np.load(
            f"{AVERAGES}/{file}", allow_pickle=True
        )
    except Exception as e:
        print("Could not load video average data for ", video_id)

Main episodes vs main episodes with video averages

In [7]:
eps = [ep for ep in episodes if ep.video_averages is not None and ep.is_main_episode]
len([ep for ep in episodes if ep.is_main_episode]), len(eps)

(1470, 1465)

### Generate Clusters For Each Episode

In [8]:
def get_frame_count(ep):
    try:
        video_name = [v for v in video_files if ep.title in v][0]
    except IndexError as e:
        print("Could not find video", ep)
        raise e

    cap = cv2.VideoCapture(video_name)
    frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()
    return frames, fps

def create_episode_clusters():
    pd.options.mode.chained_assignment = None  # default='warn'

    episode_clusters = []
    for ep in tqdm(eps):
        averages, total_frames = ep.video_averages
        df = pd.DataFrame({
          'x': averages,
          'n': range(len(averages)),
        })

        avg_clusters = kde_cluster(df, bandwidth=BANDWIDTH, plot=False)
        df = frame_idxs_clusters(avg_clusters, df)

        clusters = {}
        curr_cluster = df['cluster'][0]        
        playhead_start = 0
        skip_amount = total_frames / len(averages)
        playhead = lambda cluster_idx: (cluster_idx * skip_amount) / total_frames
        
        for i, cluster_num in enumerate(df['cluster']):
            if cluster_num != curr_cluster:
                clusters[curr_cluster] = clusters.get(curr_cluster, []) + [(playhead_start, playhead(i - 1))]
                playhead_start = playhead(i)
                curr_cluster = cluster_num
                
        episode_clusters.append((ep, clusters))
        
    return episode_clusters

In [9]:
CACHE = f"./episode-clusters___{AVGS_LOCATION}.pkl"
# LOAD = True
LOAD = False

episode_clusters = []

if LOAD:
    with open(CACHE, "rb") as f:
        episode_clusters = pickle.load(f)
else:
    episode_clusters = create_episode_clusters()
    with open(CACHE, "wb") as f:
        pickle.dump(episode_clusters, f)

100%|██████████| 1465/1465 [14:08<00:00,  1.73it/s]


In [37]:
print("Total ep clusters")
print(len(missing_ep_clusters))

missing_ep_clusters = [e for e,c in episode_clusters if len(c.items()) == 0]

print("Missing ep clusters")
len(missing_ep_clusters), [e.title for e in missing_ep_clusters]

Total ep clusters
19
Missing ep clusters


(19,
 ['Joe Rogan Experience #868 - John Dudley (Audio Only)',
  'Joe Rogan Experience #834 - Dan Doty',
  'Joe Rogan Experience #827 - Twitter Q&A with Joe',
  'Joe Rogan Experience #722 - Tony Hinchcliffe (Audio Only)',
  'Joe Rogan Experience #211 - Ari Shaffir (Part 2)',
  'Joe Rogan Experience #185 - Tom Segura',
  'Joe Rogan Experience #143 - Mayhem Miller',
  'Joe Rogan Experience #140 - Brendon Walsh (Part 2)',
  'Joe Rogan Experience #132 - Bert Kreischer',
  'Joe Rogan Experience #119 - Jan Irvin',
  'Joe Rogan Experience #47 - Michael Schiavello',
  'Joe Rogan Experience #39 - Joey Diaz, Eddie Bravo (Part 1)',
  'Joe Rogan Experience #37 - Ricky Schroder',
  'Joe Rogan Experience #33 -- Dane Cook',
  'Joe Rogan Experience #31 -- Mayhem Miller',
  'Joe Rogan Experience #29 - Brian Redban',
  'Joe Rogan Experience #26 - Bill Burr',
  'Joe Rogan Experience #15 - Brian Redban',
  'Joe Rogan Experience #14 - Brian Redban'])

# Generate pie data

In [10]:
pie_rows = []
for ep, cluster_timestamps in episode_clusters:
    data = {}
    for i, t in cluster_timestamps.items():
        data[i] = 0
        for start, end in t:
            data[i] += end - start

    pie_rows.append([ep.video_id, dict(data)])
            
pie_data = pd.DataFrame(pie_rows, columns=["id", "data"])
pie_data.to_csv(WEBSITE + "screen_time.csv")
len(pie_data)

1465

In [None]:
type(episode_clusters[0][1].keys()), episode_clusters[0][1].keys()

In [None]:
# Protobufs are 16mb.. json is 60ish
import json

def key_to_json(data):
    if data is None or isinstance(data, (bool, int, str)):
        return data
    if isinstance(data, (tuple, frozenset)):
        return str(data)
    if isinstance(data, np.int64):
        return int(data)
    raise Exception("Type error", (data, type(data)))

def to_json(data):
    if data is None or isinstance(data, (bool, int, tuple, range, str, list)):
        return data
    if isinstance(data, (set, frozenset)):
        return sorted(data)
    if isinstance(data, dict):
        return {key_to_json(key): to_json(data[key]) for key in data}
    raise TypeError

with open(PROTO_OUT + "screen_time_timelines.json", "w") as f:
    if False:
        json_ep_clusters = json.dumps(to_json({e.video_id:cl for e, cl in episode_clusters}))
        f.write(json_ep_clusters)

# Get pictures for each cluster

In [61]:
import cv2
from glob import glob
import imutils

video_files = list(glob("/Volumes/JRE/jre-bucket/jre/videos/*.mp4"))

BLACK_AND_WHITE = True
RESCALE_AMOUNT = .5

def rescale_frame(frame, amount=.75):
    width = int(frame.shape[1] * amount)
    height = int(frame.shape[0] * amount)
    dim = (width, height)
    return cv2.resize(frame, dim, interpolation =cv2.INTER_AREA)

video_cluster_images = {}
total = 0
for ep, cluster_timestamps in tqdm(episode_clusters):
    video_cluster_images[ep] = {}
    
    try:
        video_name = [v for v in video_files if ep.video_id == v[-15:-4]][0]
    except IndexError as e:
        print("Could not find video", ep)
        continue
    
    # Get a frame for the middle of each cluster timestamp
    for i, t in cluster_timestamps.items():
        cap = cv2.VideoCapture(video_name)
        frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
       
        # Get the segment in the cluster with the longest time on that shot
        start, end = t[np.argmax([e - s for s, e in t])]
        middle = (end - start) / 2 + start
        # Don't get pictures for items with less than .1% of the total time
        if end - start < .001:            
            continue
        total += 1
        
        # Read frame        
        frame_no = int(frames * middle)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
        
        averages, _ = ep.video_averages
        
        ret, frame = cap.read()                

        frame = rescale_frame(frame, amount=RESCALE_AMOUNT)
        if BLACK_AND_WHITE: frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            
        video_cluster_images[ep][i] = {'frame':frame, 'frame_no':frame_no, 'frames':frames, 'fps':fps}        
        
        cap.release()

100%|██████████| 1465/1465 [14:55<00:00,  1.64it/s]


In [62]:
missing_video_cluster_images = [k for k,v in video_cluster_images.items() if len(v.items()) == 0]
len(video_cluster_images), len(missing_video_cluster_images)

(1465, 19)

## Save Images

In [63]:
for ep, clusters in video_cluster_images.items():
    for cluster_n, image in clusters.items():
        cv2.imwrite(f"{WEBSITE}/images/{ep.video_id}.cluster.{cluster_n}.jpg", image['frame'])

### Create timeline data

In [11]:
# protoc --python_out=./ ./screen-time-timeline.proto
# pbf screen-time-timeline.proto --browser > ../../jre-vis/src/lib/proto/screen-time.js
import screen_time_timeline_pb2 as timeline_proto

In [58]:
timelines = timeline_proto.Timelines()

for ep, cluster_timestamps in episode_clusters:
    if len(list(video_cluster_images[ep].values())) == 0:
        continue
        
    timeline = timelines.timelines.add()
    timeline.id = ep.video_id
    timeline.frames = list(video_cluster_images[ep].values())[0]['frames']    
    
    for i, t in cluster_timestamps.items():
        # If it doesn't have a picture, don't bother
        if not i in video_cluster_images[ep]:
            continue
        
        cluster = timeline.clusters.add()
        cluster.id = i
        for start, end in t:
            ts = cluster.timestamps.add()
            ts.start = start
            ts.end = end

In [59]:
PROTO_OUT = "../../jre-vis/public/"

with open(PROTO_OUT + "screen_time_timelines", "wb") as f:
    f.write(timelines.SerializeToString())

## View Images
Image is shown above scatterplot
Its corressponding cluster is highlighted in green
The orange dot indicates the frame it was taken from

In [60]:
from ipywidgets import interact, IntSlider, Dropdown
%matplotlib inline
from ipywidgets.widgets.interaction import show_inline_matplotlib_plots
import ipywidgets as widgets
from IPython.display import clear_output

def show_video_segments():
    def get_title(ep, cluster, timestamp):
        try:
            percentage = pie_data.loc[pie_data['id'] == ep.video_id]['data'].iloc[0][cluster]
        except Exception as e:
            percentage = 0
            print("Could not get percentage for", ep.number)
        return f"{ep.title} -- {cluster} ({round(percentage * 100)}%, {timestamp})"
    
    cluster = choose_cluster.value        
    ep, cluster_images = list(video_cluster_images.items())[ep_slider.value]
    img = cluster_images[cluster]
    
    averages = ep.video_averages[0]
    df = pd.DataFrame({ 'x': averages, 'n': range(len(averages))})
    avg_clusters = kde_cluster(df, bandwidth=BANDWIDTH, plot=False)
    df = frame_idxs_clusters(avg_clusters, df)
    df['color'] = np.where(df['cluster'] == cluster, 'green', 'black')

    with out:
        clear_output()                
        fig, (img_ax, clus_ax) = plt.subplots(2)
        fig.suptitle(get_title(ep, cluster, as_timestamp(img['frame_no'], img['fps'])))        
        
        # Show frame image
        if BLACK_AND_WHITE: img_ax.imshow(img['frame'], cmap='gray')
        else: img_ax.imshow(img['frame'])

        # Show scatter plot of clusters
        # Highlight the frame that was used as orange
        clus_ax.scatter(df['x'], y=df['n'], c=df['color'])        
        cluster_frame = int(img['frame_no'] / img['frames'] * len(df['color']))
        clus_ax.scatter(df.loc[cluster_frame, 'x'], y=df.loc[cluster_frame, 'n'], c='orange')

        show_inline_matplotlib_plots()
        
    
def on_cluster_change(change):
    show_video_segments()
    
def on_ep_change(change):
    new_x = change['new']
    ep, clusters = list(video_cluster_images.items())[new_x]
    choose_cluster.options = clusters.keys()
    choose_cluster.value = list(clusters.keys())[0]    
    show_video_segments()

initial_ep = 0

out = widgets.Output()
display(out)
    
choose_cluster = Dropdown(description='Cluster:')
choose_cluster.observe(on_cluster_change, names='value')

ep_slider = IntSlider(min=0, max=len(video_cluster_images) - 1)
ep_slider.observe(on_ep_change, names='value')
ep_slider.value = initial_ep
on_ep_change({'new': initial_ep})

display(ep_slider, choose_cluster)

Output()

IntSlider(value=0, max=1464)

Dropdown(description='Cluster:', options=(0, 1), value=0)

# Useful for checking out output of kde 
...With different values for the bandwidth

In [74]:
import plotly as py
import plotly.graph_objs as go
from ipywidgets import interact, IntSlider
py.offline.init_notebook_mode(connected = True)

bandwidth = BANDWIDTH
bandwidth = .01
frame_no = 194000

def show_frame_from(ep, frame_no):
    video_name = [v for v in video_files if ep.video_id == v[-15:-4]][0]
    cap = cv2.VideoCapture(video_name)
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)    
    plt.title(as_timestamp(frame_no, fps))
    _, frame = cap.read()
    plt.imshow(frame)
    cap.release()

def show_frame_averages(x):
    df = pd.DataFrame({
      'x': eps[x].video_averages[0],
      'n': range(len(eps[x].video_averages[0])),
    })
    
    avg_clusters = kde_cluster(df, plot=False, bandwidth=bandwidth)

    df = frame_idxs_clusters(avg_clusters, df)
    # For a discrete color scale
    df['cluster'] = df['cluster'].astype(str)
    
    show_frame_from(eps[x], frame_no)

    title = f"{str(eps[x])} {len(df['x'])}"
    fig = px.scatter(df, x='x', color='cluster', title=title)
    fig.show()
        

interact(show_frame_averages, x=IntSlider(value=9, min=0, max=len(eps) - 1))


interactive(children=(IntSlider(value=9, description='x', max=1464), Output()), _dom_classes=('widget-interact…

<function __main__.show_frame_averages(x)>

In [None]:
# TODO Need some way to flag large deviations in std for clusters