### User + Video Dataset (Binary)

#### Dataset

For each video:  
X = the user's watch history, the playing video, the suggested (up-next) videos, and the selected video  
Y = was selected (1) or not (0)  

In [1]:
import pandas as pd
import pickle as pkl
import numpy as np

import sys
sys.path.append('../')
from utils import *

In [4]:
# Load the raw log table; the next cell reads its 'userId' and 'search_query' columns.
# NOTE(review): `df` is rebound to a different pickle-backed frame in a later cell.
df = pd.read_parquet('../../data/logs.parquet')

In [26]:
# Build {userId: [search_query, ...]} from the log rows where both columns are present.
# Rows with a missing userId or search_query are dropped before grouping.
sq = (
    df[['userId', 'search_query']]
    .dropna()
    .groupby('userId')['search_query']
    .agg(list)
    .to_dict()
)

Unnamed: 0_level_0,search_query
userId,Unnamed: 1_level_1
AABA10807,"[dnald trump oan, dnald trump oan interview, d..."
AAYG93273,[Bar Rescue]
ABRL25174,"[billy joel official video, billy joel innocen..."
ADMD65322,"[uptown funk, uptown funk, uptown funk postmod..."
AFBZ98182,"[jazza, jazza shorts, news new york]"
...,...
ZXUF35757,"[ariana grande santa tell me, tasha scott coca..."
ZYLW85591,"[ears flushed out, a little sound and gray]"
ZYWO62277,"[mccarthy, mccarthy&sp=EgIIAQ%253D%253D]"
ZZDV35060,"[hidden true crime, cord cutter news, wop song]"


In [30]:
# Persist the per-user search-query mapping. The context manager makes sure the
# file handle is flushed and closed even if pickling raises part-way through.
with open('../../data/dataset/user_search_query.pkl', 'wb') as fh:
    pkl.dump(sq, fh)

{'AABA10807': ['dnald trump oan',
  'dnald trump oan interview',
  'dnald trump oan interview&sp=EgIIBA%253D%253D',
  'oan',
  'anitra gunn',
  'how to make lens fit in frame',
  'how to make frame fit lenses',
  'bannon trading cards',
  'bannon trading cards',
  'melanie lawson',
  'attorney solomon esq',
  'attorney solomon esq',
  'attorney solomon esq divorce court',
  'judge denise gentile',
  'nicholas revello vs karen sangalaza',
  'judge ellerbe',
  'trump'],
 'AAYG93273': ['Bar Rescue'],
 'ABRL25174': ['billy joel official video',
  'billy joel innocent man official video',
  'honesty billy joel',
  'don%27t ask me why billy joel',
  'The Longest Time billy joel',
  'This Is the Time billy joel'],
 'ADMD65322': ['uptown funk',
  'uptown funk',
  'uptown funk postmodern jukebox',
  'uptown funk postmodern jukebox'],
 'AFBZ98182': ['jazza', 'jazza shorts', 'news new york'],
 'AFME53670': ['sleepers',
  'cubs news today',
  'association',
  'beatles songs',
  'orpheus',
  'grass

In [4]:
# Load the pickled inputs used by the cells below.
# NOTE(review): `pd.read_pickle` unpickles arbitrary objects — only point it at trusted files.
# NOTE(review): in the visible cells `watches` is rebound to a fresh dict before it is ever
# read, and `videos` is rebound by the next cell; `df` here replaces the logs frame loaded
# from parquet earlier in the notebook.
watches = pd.read_pickle('../../data/user-level-data')
df = pd.read_pickle('../../data/datasets/raw-video-level-watches')
videos = pd.read_pickle('../../data/videos_raw_metadata')
# NOTE(review): none of the four embedding objects below is referenced by name in the
# visible cells (later cells use `embeddings`) — confirm which one is intended.
title_embeddings = pd.read_pickle('../../data/embeddings/title-autoencoded')
title_openai_embeddings = pd.read_pickle('../../data/embeddings/openai-title')
tag_embeddings = pd.read_pickle('../../data/embeddings/tag_embeddings.pkl')
topic_embeddings = pd.read_pickle('../../data/embeddings/topic_embeddings.pkl')

In [5]:
# NOTE(review): `meta_data` and `embeddings` are not assigned in any visible cell —
# presumably they come from `from utils import *` or from leftover kernel state.
# Confirm, or pass the metadata / title-embedding objects loaded above, so this cell
# survives Restart Kernel -> Run All.
videos = clean(meta_data, titles=embeddings)

In [6]:
users = list(set(df['userId']))

# data[user]: that user's rows of `df`, ordered by 'date'.
data = dict()

for user in tqdm(users):
    data[user] = df[df['userId'] == user].sort_values('date')

# sessions[user]: list of sessions. A session is a list of log records (dicts with
# 'date', 'url', 'is_video', 'upnext_content') that starts at a video and keeps every
# following video whose id appears in the up-next list of the previously kept video.
# Only sessions with more than one record are stored.
sessions = dict()
for user in tqdm(users):
    sessions[user] = []
    sess = data[user][['date', 'url', 'is_video', 'upnext_content']].to_dict('records')
    session = []
    session_active = False
    for watch in sess:
        # when waiting for first video in a session
        if not session_active and watch['is_video']:
            session = [watch]
            session_active = True

        # back to back videos
        elif session_active and watch['is_video']:
            upnexts = decode_upnext(session[-1]['upnext_content'])
            this_url = get_id(watch['url'])
            # A video that was not among the previous up-next suggestions is skipped;
            # the session stays open and keeps comparing against the last kept video.
            if this_url in upnexts:
                session.append(watch)

        # if next watch is not video when session active
        elif session_active and not watch['is_video']:
            session_active = False
            if len(session) > 1:
                sessions[user].append(session)
            session = []

    # The log can end while a session is still open (the last records are videos).
    # Without this flush that trailing session would be silently dropped.
    if session_active and len(session) > 1:
        sessions[user].append(session)

  0%|          | 0/2171 [00:00<?, ?it/s]

  0%|          | 0/2171 [00:00<?, ?it/s]

In [8]:
# Keep only the users for whom at least one session was recorded.
session_users = [user for user in sessions if len(sessions[user]) != 0]

In [11]:
# watches[user]: one entry per consecutive (current video, next video) pair inside each
# of the user's sessions, recording what was playing, what was suggested, and what the
# user went on to watch.
watches = {}
for user in tqdm(session_users):
    transitions = watches[user] = []
    for session in sessions[user]:
        # zip stops at the shorter argument, so this pairs every record with its successor.
        for current, following in zip(session, session[1:]):
            transitions.append({
                'playing': get_id(current['url']),
                'upnext': decode_upnext(current['upnext_content']),
                'selected': get_id(following['url']),
            })

  0%|          | 0/1289 [00:00<?, ?it/s]

In [26]:
def get_history_and_training(watch, history_fraction=0.2):
    """Split an ordered watch list into a leading history part and a training part.

    The first ``ceil(len(watch) * history_fraction)`` items become the history and
    the remaining items become the training set. With the default fraction this
    matches the previous index-by-index rule ``i < len(watch) * 0.2``.

    Args:
        watch: Sequence of watch records, in the order they should be split.
        history_fraction: Share of the sequence (from the start) used as history.
            Values below 0 behave like 0 and values above 1 behave like 1.

    Returns:
        A ``(history, training)`` tuple of two new lists.
    """
    import math

    total = len(watch)
    # For an integer index i, `i < x` is the same as `i < ceil(x)`, so a single split
    # point reproduces the per-index comparison; clamp it to the valid slice range.
    split = max(0, min(total, math.ceil(total * history_fraction)))

    history = list(watch[:split])
    training = list(watch[split:])
    return history, training

# Per-user split of the watch transitions: history[user] gets the leading part and
# training[user] the remainder, as decided by get_history_and_training.
history = {}
training = {}

for user, user_watches in tqdm(watches.items()):
    history[user], training[user] = get_history_and_training(user_watches)

  0%|          | 0/1289 [00:00<?, ?it/s]

array([-0.02052369, -0.00394123,  0.00676498, -0.02789925,  0.0364079 ,
        0.00423158,  0.01893815, -0.01024528, -0.03310304, -0.01286414,
       -0.00615279,  0.02214764, -0.03293549,  0.00024888,  0.00529397,
       -0.01657577, -0.02752809,  0.01507695, -0.00357913, -0.01581683,
       -0.0258556 ,  0.01929254,  0.02471216,  0.02040244,  0.00890152,
       -0.01546264,  0.00694602,  0.00397866,  0.00974122,  0.01126551,
       -0.00528934,  0.01407733, -0.01547308, -0.03903421,  0.01192805,
        0.00990158,  0.02335085, -0.01921366, -0.016178  ,  0.03032142,
       -0.01173447,  0.01126173, -0.01118469,  0.00057627, -0.02390211,
        0.00903398, -0.02650922, -0.00192399,  0.01496556, -0.01324484,
        0.00266217, -0.03945154,  0.00449591,  0.00193637, -0.01711477,
        0.02369141, -0.01885802, -0.00041714, -0.02460242, -0.01745125,
       -0.00019955,  0.00628997,  0.00455425,  0.00320548,  0.00558458,
       -0.00320261, -0.00017512,  0.00261015, -0.00067335, -0.01

In [43]:
# history_embeddings[user]: one vector per history entry, in order. Entries whose
# playing video id is not in `videos` get a 128-dimensional zero vector instead.
# NOTE(review): `embeddings` is not assigned in any visible cell — presumably it comes
# from `from utils import *` or leftover kernel state; confirm before a fresh run.
history_embeddings = {}

for user in tqdm(history):
    vectors = history_embeddings[user] = []
    for watch in history[user]:
        video_id = watch['playing']
        if video_id not in videos:
            vectors.append(np.zeros(128))
            continue
        title = videos[video_id]['title']
        # Only the first row of the stored embedding for this title is used.
        vectors.append(np.array(embeddings[title])[0])

  0%|          | 0/1289 [00:00<?, ?it/s]

In [44]:
# Persist the per-user history embeddings. The context manager makes sure the file
# handle is flushed and closed even if pickling raises part-way through.
with open('../../data/embeddings/history_embeddings-todo', 'wb') as fh:
    pkl.dump(history_embeddings, fh)