In [28]:
import ast
import pickle as pkl

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm as tqdm

In [29]:
# Zero vectors used as placeholder embeddings for videos missing from `video_db`.
# NOTE(review): 768 presumably matches the dimensionality of the stored
# title/description embeddings -- confirm against the embedding model.
EMBEDDING_DIM = 768
empty_desc = np.zeros(EMBEDDING_DIM)
empty_title = np.zeros(EMBEDDING_DIM)


class Video:
    """Metadata and embeddings of one video, looked up by id in the global `video_db`.

    When `uid` is not a key of `video_db`, every metadata attribute is None,
    `title_emb` / `description_emb` are the shared zero placeholders and
    `tags_emb` is an empty list.

    Two instances compare equal (and hash equal) when their `uid` matches, so
    they deduplicate correctly in sets and as dict keys.
    """

    def __init__(self, uid):
        """Populate the attributes from `video_db[uid]`, or placeholders if absent."""
        self.uid = uid
        if uid not in video_db:
            self.title = self.tags = self.description = None
            self.category = self.channel_title = None
            # The zero arrays are shared by every "not found" instance:
            # treat them as read-only.
            self.title_emb, self.tags_emb, self.description_emb = empty_title, [], empty_desc
            return

        record = video_db[uid]

        snippet = record['snippet']
        self.title = snippet['title']
        self.tags = snippet.get('tags', [])  # 'tags' is the only optional snippet key here
        self.description = snippet['description']
        self.category = snippet['categoryId']
        self.channel_title = snippet['channelTitle']

        embeddings = record['embeddings']
        self.title_emb = embeddings['title']
        self.tags_emb = embeddings['tags']
        self.description_emb = embeddings['description']

    def __repr__(self):
        """Return the title, or 'Video not found' for a video without metadata."""
        if self.title is None:
            return 'Video not found'
        return self.title

    def __str__(self):
        """Same text as `__repr__`."""
        return self.__repr__()

    def __eq__(self, other):
        """Videos are equal when their ids are equal (kept consistent with `__hash__`)."""
        if not isinstance(other, Video):
            return NotImplemented
        return self.uid == other.uid

    def __hash__(self):
        """Hash on the video id only."""
        return hash(self.uid)
    
def get_id(url):
    """Extract the value of the `v=` parameter from a URL string.

    Returns the text between the first occurrence of 'v=' and the next '&'
    (or the next 'v=', or the end of the string). Returns None when `url`
    is not a string (e.g. None or a NaN cell in a DataFrame) or contains
    no 'v='.

    Note: the match is purely textual, so any parameter whose name ends in
    'v=' would be picked up as well.
    """
    if not isinstance(url, str) or 'v=' not in url:
        return None
    return url.split('v=')[1].split('&')[0]

def decode_upnext(urls):
    """Decode a UTF-8 byte string holding a Python list literal of URLs into video ids.

    Each URL is passed through `get_id`, so the result has one entry per URL
    and contains None wherever `get_id` finds no id.

    Raises ValueError or SyntaxError if the decoded text is not a valid
    Python literal.
    """
    # SECURITY: this used to be `eval`, which executes arbitrary code found in
    # the stored payload. `ast.literal_eval` only accepts Python literals.
    parsed_urls = ast.literal_eval(urls.decode('utf-8'))
    return [get_id(u) for u in parsed_urls]

In [30]:
# Input artifacts, resolved relative to the notebook's working directory.
SESSIONS_PATH = 'all_sessions'
VIDEO_DB_PATH = 'video-dump-with-embeddings+transcript'

# SECURITY: unpickling can execute arbitrary code -- only load files that were
# produced by a trusted pipeline.
sessions = pd.read_pickle(SESSIONS_PATH)
video_db = pd.read_pickle(VIDEO_DB_PATH)

In [31]:
# Single Step Data (only access to this video)
# Pair every video in a session with the URL of the video that follows it.
ss_session = []
for session in tqdm(sessions):
    for video, next_video in zip(session[:-1], session[1:]):
        # Copy the record so the raw `sessions` data is not mutated; this keeps
        # the cell idempotent. Assumes each record is dict-like -- TODO confirm.
        step = dict(video)
        step['selected'] = next_video['url']
        ss_session.append(step)

ss_frame = pd.DataFrame(ss_session)

# Reduce URLs to video ids and keep only the three columns used downstream.
data = pd.DataFrame({
    'video': ss_frame['url'].apply(get_id),
    'upnext': ss_frame['upnext_content'].apply(decode_upnext),
    'selected': ss_frame['selected'].apply(get_id),
})
data.head()

  0%|          | 0/6762 [00:00<?, ?it/s]

Unnamed: 0,video,upnext,selected
0,489B7RNudAQ,"[ZOZyo6YOAu4, VtYi8AR11WM, Q32BGQmVLJ0, h1BsKI...",Q32BGQmVLJ0
1,jnXeE4TY2so,"[MeH-4wEuvZs, h1BsKIP4uYM, dqbyJIKLxok, VX4n8w...",HtyVbMZegn4
2,NQPNRwpGWXc,"[2PopspP_DbI, VPrrIGxjubI, 4s2ynUAJ5ZU, dEk8Hb...",VPrrIGxjubI
3,8tsnuvfMmtc,"[4iYl9oSjffQ, mTHedRdHJgk, aeWyp2vXxqA, 6S7VkI...",aeWyp2vXxqA
4,LcW4MCa5YCQ,"[-TgFz3qmE9U, 9scfWN6aXaU, O7VaXlMvAvk, h1BsKI...",z0Xpye7Ltlo


In [32]:
# Turn every single-step record into Video objects: the playing video, the
# list of up-next suggestions, and the video that was actually selected.
video_sessions = []
ss_sessions = data.to_dict('records')

for session in tqdm(ss_sessions):
    # The playing video is built once (the earlier version constructed it
    # twice and discarded the first instance).
    video_sessions.append({
        'video': Video(session['video']),
        'upnext': [Video(upnext_id) for upnext_id in session['upnext']],
        'selected': Video(session['selected']),
    })

  0%|          | 0/8563 [00:00<?, ?it/s]

# Similarities between video and selected

In [33]:
# Cosine similarity between two embedding vectors (used below for both title and description embeddings)

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(a, b):
    """Return the cosine similarity of two vectors as a single scalar.

    Both inputs are reshaped to one-row matrices because
    `cosine_similarity` works on 2-D inputs; the only entry of the
    resulting matrix is returned.
    """
    row_a = a.reshape(1, -1)
    row_b = b.reshape(1, -1)
    sim_matrix = cosine_similarity(row_a, row_b)
    return sim_matrix[0][0]

In [36]:
def _embedding_similarities(playing, selected, suggested, emb_attr):
    """Compare the playing video's `emb_attr` embedding with the other videos.

    Returns a dict with:
      'suggested' -- list of cosine similarities, one per suggested video, in order
      'selected'  -- cosine similarity with the selected video
    """
    anchor = getattr(playing, emb_attr)
    return {
        'suggested': [cosine_sim(anchor, getattr(video, emb_attr)) for video in suggested],
        'selected': cosine_sim(anchor, getattr(selected, emb_attr)),
    }


# One entry per session under each key. 'tags' is left empty: tag similarity is
# not computed yet (it needs a similarity function for lists of tag embeddings).
similarities = {'title': [], 'tags': [], 'description': []}

for session in tqdm(video_sessions):
    playing = session['video']
    selected = session['selected']
    suggested = session['upnext']

    title_similarity = _embedding_similarities(playing, selected, suggested, 'title_emb')
    description_similarity = _embedding_similarities(playing, selected, suggested, 'description_emb')

    similarities['description'].append(description_similarity)
    similarities['title'].append(title_similarity)

  0%|          | 0/8563 [00:00<?, ?it/s]

{'suggested': [0.52309793,
  0.41573066,
  0.5358752,
  0.4773757,
  0.21819827,
  0.3967138,
  0.31892043,
  0.0,
  0.46168518,
  0.60267174,
  0.44755438,
  0.16989785,
  0.0009108577,
  0.26928246,
  0.4076116,
  0.45672327,
  0.32553548,
  0.6058545,
  0.25709695],
 'selected': 0.5358752}