In [2]:
import pandas as pd
import numpy as np
import pickle as pkl

import sys
sys.path.append('../')
from utils import *

In [3]:
watches = pd.read_pickle('../../data/user-level-data')
videos = pd.read_pickle('../../data/videos_raw_metadata')

In [4]:
watches_df = dict()

def convert(watch):
    d = dict()
    d['playing'] = watch['playing']['id']
    suggested = []
    for s in watch['suggested']:
        if s != None:
            suggested.append(s['id'])
    d['suggested'] = suggested
    d['selected'] = watch['selected']
    return d

for user in watches:
    watches_df[user] = []
    for watch in watches[user]:
        if watch['playing'] == None:
            continue
        watch = convert(watch)
        watches_df[user].append(watch)

In [5]:
history = dict()
users = list(watches_df.keys())
data = dict()

for user in users:
    if len(watches_df[user]) < 10:
        continue
    history[user] = watches_df[user][:10]

    for watch in watches_df[user][10:]:
        if watch['selected'] != None:
            if user not in data:
                data[user] = []
            data[user].append(watch)

for user in list(history.keys()):
    if user not in data:
        history.pop(user)

In [6]:
from collections import Counter


all_topics = []
all_tags = []

for video in videos:
    video = videos[video]
    
    try:
        topics = video['topicDetails']['topicCategories']
        topics = [topic.split('/')[-1].lower() for topic in topics]
    except:
        topics = []

    all_topics += topics
    
    try:
        tags = video['snippet']['tags']
        tags = [tag.lower() for tag in tags]
    except:
        tags = []

    all_tags += tags

all_topics = list(set(all_topics))

atc = Counter(all_tags)
atc = pd.DataFrame.from_dict(atc, 'index', columns=['count'])
atc = atc.sort_values(['count'], ascending=False).head(5000)
all_tags = list(atc.index)

def get_one_hot_vector(topics, tags):
    oh_topics = np.zeros(len(all_topics))
    for topic in topics:
        oh_topics[all_topics.index(topic)] = 1
        
    oh_tags = np.zeros(len(all_tags))
    for tag in tags:
        if tag in all_tags:
            oh_tags[all_tags.index(tag)] = 1
        
    oh = np.concatenate([oh_topics , oh_tags])
    
    return oh


def get_topic_vector(video):
    video = videos[video]
    try:
        topics = video['topicDetails']['topicCategories']
        topics = [topic.split('/')[-1].lower() for topic in topics]
    except:
        topics = []

    try:
        tags = video['snippet']['tags']
        tags = [tag.lower() for tag in tags]
    except:
        tags = []

    return get_one_hot_vector(topics, tags)


In [7]:
for user in history:
    history[user] = [s['playing'] for s in history[user]]
    history[user] = [get_topic_vector(s) for s in history[user]]
    history[user] = np.array(history[user]).mean(axis=0)

In [8]:
X = []
Y = []

for user in tqdm(data):
    user_history = history[user]
    for watch in data[user]:
        playing = watch['playing']
        selected = watch['selected']
        upnext = watch['suggested'][:15]

        if playing not in videos or selected not in videos or any([s not in videos for s in watch['suggested']]):
            continue
        if len(watch['suggested']) < 15:
            continue


        playing = get_topic_vector(watch['playing'])

        suggesteds = []
        y = []
        for i, s in enumerate(upnext):
            selected = watch['selected']

            sv = get_topic_vector(s)
            iv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            iv[i] = 1
            sv = np.concatenate([sv, iv])
            suggesteds.append(sv)
            
            if s == selected:
                y = sv



        if y != []:
            X.append([playing] + suggesteds)
            Y.append(y)

  0%|          | 0/1266 [00:00<?, ?it/s]

  if y != []:


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [10]:
import tensorflow as tf

XX = X_train

px = []
sN = []

for i in range(15):
    sN.append([])

for x in XX:
    px.append(x[0])
    for i in range(15):
        sN[i].append(x[i+1])

px = np.array(px)
for i in range(15):
    sN[i] = np.array(sN[i])


y_train = np.array(y_train)

2023-08-03 15:01:02.244976: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
len(y_train), len(px), len(sN)

(11922, 11922, 15)

In [15]:
sN[0].shape

(11922, 5077)

In [61]:
import tensorflow as tf

XX = X_test

pxt = []
sNt = []

for i in range(15):
    sNt.append([])

for x in XX:
    pxt.append(x[0])
    for i in range(15):
        sNt[i].append(x[i+1])

pxt = np.array(pxt)
for i in range(15):
    sNt[i] = np.array(sNt[i])


y_test = np.array(y_test)

In [62]:
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD


playing_input = keras.layers.Input(shape=(5062))
suggested_inputs = []

for _ in range(15):
    suggested_layer = keras.layers.Input(shape=(5077))
    suggested_inputs.append(suggested_layer)

print('suggested inputs')

playing_layer = keras.layers.Dense(512, activation='relu')(playing_input)
suggested_layers = []
for sg in suggested_inputs:
    suggested_layers.append(keras.layers.Dense(512, activation='relu')(sg))

print('suggested layers')

inputs = [playing_input] + suggested_inputs

merged = keras.layers.Concatenate(axis=1)([playing_layer] + suggested_layers)
output = keras.layers.Dense(5077, activation='sigmoid')(merged)
# dense2 = keras.layers.Dense(6000, activation='sigmoid')(dense1)
# output = keras.layers.Dense(5063, activation='tanh')(dense2)
model = keras.models.Model(inputs=inputs, outputs=output)

suggested inputs
suggested layers


In [63]:
model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics='cosine_similarity')

In [65]:
tf.config.run_functions_eagerly(True)

In [66]:
model.fit([px] + sN, y_train,
            epochs=64,
            batch_size=8, 
            validation_split=0.2)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
 134/1193 [==>...........................] - ETA: 1:15 - loss: 2.2170e-04 - cosine_similarity: 0.9338

KeyboardInterrupt: 

In [67]:
model.evaluate([pxt] + sNt, y_test)



[0.001290591317228973, 0.48258230090141296]

In [68]:
y_pred = model.predict([pxt] + sNt)



In [69]:
yit = []

for y in y_train:
    y = np.argmax(y[-15:])
    yit.append(y)


yip = []

for y in y_pred:
    y = np.argmax(y[-15:])
    yip.append(y)

In [1]:
len(sNt

NameError: name 'sNt' is not defined

In [72]:
    yit

[0,
 1,
 0,
 0,
 1,
 6,
 1,
 0,
 13,
 0,
 8,
 6,
 6,
 3,
 1,
 1,
 9,
 3,
 0,
 1,
 3,
 11,
 13,
 9,
 2,
 2,
 13,
 0,
 2,
 11,
 2,
 0,
 1,
 7,
 1,
 1,
 2,
 0,
 12,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 8,
 1,
 6,
 1,
 2,
 2,
 14,
 0,
 0,
 4,
 0,
 3,
 0,
 0,
 0,
 1,
 1,
 0,
 3,
 0,
 1,
 5,
 2,
 6,
 1,
 8,
 3,
 2,
 0,
 1,
 4,
 4,
 0,
 12,
 10,
 0,
 0,
 1,
 6,
 0,
 0,
 0,
 0,
 1,
 0,
 9,
 0,
 12,
 0,
 2,
 6,
 9,
 13,
 0,
 0,
 1,
 11,
 9,
 0,
 2,
 2,
 8,
 0,
 0,
 0,
 12,
 5,
 0,
 14,
 2,
 13,
 7,
 0,
 10,
 10,
 1,
 5,
 5,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 9,
 2,
 0,
 7,
 0,
 1,
 4,
 3,
 0,
 0,
 2,
 9,
 4,
 14,
 2,
 0,
 0,
 14,
 10,
 10,
 0,
 6,
 8,
 0,
 1,
 4,
 0,
 9,
 6,
 12,
 0,
 0,
 5,
 2,
 3,
 5,
 0,
 6,
 2,
 1,
 9,
 1,
 0,
 2,
 8,
 0,
 13,
 2,
 9,
 0,
 2,
 2,
 0,
 1,
 0,
 0,
 0,
 11,
 0,
 0,
 11,
 0,
 9,
 1,
 6,
 0,
 0,
 3,
 1,
 14,
 9,
 9,
 0,
 10,
 0,
 7,
 0,
 5,
 0,
 10,
 0,
 0,
 1,
 3,
 12,
 12,
 10,
 0,
 5,
 2,
 0,
 2,
 3,
 2,
 0,
 6,
 4,
 2,
 11,
 4,
 3,
 6,
 5,
 6,
 7,
 2,
 0,
 0,
 1,
 2,
 3,


In [74]:
tp = 0
tt = 0

for p, t in zip(yip, yit):
    if p == t:
        tp+=1
    tt+=1

In [75]:
tp/tt

0.19758470311975848

In [49]:
SS = []

for i in range(15):
    SS.append([])

for s in sNt:
    for i in range(15):
        SS[i].append(s[i])

In [62]:
yip = []

for yp, ss in zip(y_test, SS):
    yip.append(np.argmax(CS([yp], SS[0])[0]))

In [63]:
yip

[10, 14, 14, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 0, 0]

In [61]:
y_

array([[ 0.,  0.,  0., ...,  0.,  0., 10.],
       [ 0.,  0.,  0., ...,  0.,  0.,  7.],
       [ 0.,  0.,  1., ...,  0.,  0.,  2.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  3.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])