In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import tensorflow as tf

import sys
sys.path.append('../')
from utils import *

2023-08-15 09:21:58.174379: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the two pickled inputs used by the rest of the notebook.
# From how later cells use them: `watches` is iterated by user and each entry is
# a sequence of watch records read through their 'playing', 'suggested' and
# 'selected' fields; `videos` is looked up by video id and read through its
# 'topicDetails' and 'snippet' fields.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a
# trusted source.
watches = pd.read_pickle('../../data/user-level-data')
videos = pd.read_pickle('../../data/videos_raw_metadata')

In [3]:
# Maps each user to a list of flattened watch records (see `convert`).
# Despite the `_df` suffix this is a plain dict, not a DataFrame.
watches_df = dict()

def convert(watch):
    """Flatten a raw watch record so that videos are referred to by id only.

    Args:
        watch: mapping with the keys 'playing' (a mapping carrying an 'id'),
            'suggested' (an iterable of mappings carrying an 'id', which may
            contain None placeholders) and 'selected'.

    Returns:
        A new dict with 'playing' (the playing video's id), 'suggested' (list
        of ids in the original order, None placeholders dropped) and
        'selected' (copied through unchanged).

    Raises:
        KeyError: if one of the expected keys is missing.
        TypeError: if watch['playing'] is None (callers filter this out).
    """
    return {
        'playing': watch['playing']['id'],
        # Identity test on purpose: `!= None` would defer to whatever `__ne__`
        # the entry defines, whereas only a literal None marks a missing slot.
        'suggested': [s['id'] for s in watch['suggested'] if s is not None],
        'selected': watch['selected'],
    }

# Flatten every user's raw watch records. Records whose 'playing' entry is None
# are skipped because `convert` needs a playing video to read the id from.
for user in watches:
    watches_df[user] = [
        convert(raw_watch)
        for raw_watch in watches[user]
        # Identity test: None is a singleton, and `== None` can be overridden.
        if raw_watch['playing'] is not None
    ]

In [4]:
# Number of leading watches per user that are set aside as that user's history;
# everything after them is a candidate training example.
HISTORY_LEN = 10

history = dict()
users = list(watches_df.keys())
data = dict()

for user in users:
    user_watches = watches_df[user]
    if len(user_watches) < HISTORY_LEN:
        continue

    # Only watches with a non-None 'selected' entry are kept as examples.
    examples = [
        watch for watch in user_watches[HISTORY_LEN:]
        if watch['selected'] is not None
    ]
    if not examples:
        # A user without any usable example gets no history entry either, so
        # `history` and `data` always share exactly the same keys.
        continue

    history[user] = user_watches[:HISTORY_LEN]
    data[user] = examples

In [6]:
# Collapse each user's history records into one vector: the element-wise mean
# of get_topic_vector over the records' 'playing' ids.
# NOTE(review): get_topic_vector is defined in the cell marked In [5], which
# sits below this cell in the file, so a top-to-bottom run reaches this loop
# first (unless `utils` also provides the function -- confirm).
# The loop overwrites `history` in place, so running it twice fails.
for user, records in history.items():
    playing_vectors = [get_topic_vector(record['playing']) for record in records]
    history[user] = np.array(playing_vectors).mean(axis=0)

In [5]:
from collections import Counter


# Upper bound on the tag vocabulary: only the most frequent tags are kept.
MAX_TAGS = 5000

all_topics = []
all_tags = []

for video_id in videos:
    metadata = videos[video_id]

    # A missing key, a None value or a non-string entry all mean "no usable
    # topics"; anything else is a genuine error and is allowed to surface.
    try:
        topics = metadata['topicDetails']['topicCategories']
        # Keep only the last path segment of each topic category, lower-cased.
        topics = [topic.split('/')[-1].lower() for topic in topics]
    except (KeyError, TypeError, AttributeError):
        topics = []

    all_topics += topics

    try:
        tags = metadata['snippet']['tags']
        tags = [tag.lower() for tag in tags]
    except (KeyError, TypeError, AttributeError):
        tags = []

    all_tags += tags

# Sorted so that the topic columns of the feature vectors come out in the same
# order in every kernel session; the iteration order of a set of strings is
# not guaranteed to be stable between interpreter runs.
all_topics = sorted(set(all_topics))

atc = Counter(all_tags)
atc = pd.DataFrame.from_dict(atc, 'index', columns=['count'])
# Stable sort: tags with equal counts keep their first-seen order, which makes
# the cut-off at MAX_TAGS deterministic.
atc = atc.sort_values(['count'], ascending=False, kind='mergesort').head(MAX_TAGS)
all_tags = list(atc.index)

def get_one_hot_vector(topics, tags):
    """Encode topics and tags as one concatenated multi-hot vector.

    The result has len(all_topics) topic slots followed by len(all_tags) tag
    slots; a slot is 1 when the corresponding vocabulary entry occurs in the
    input and 0 otherwise.

    Args:
        topics: iterable of topic names, matched against `all_topics`.
        tags: iterable of tag names, matched against `all_tags`.

    Returns:
        1-D float numpy array of length len(all_topics) + len(all_tags).

    Names that are not part of the vocabulary are ignored. This was already
    the case for tags; topics are now treated the same way instead of raising
    ValueError.
    """
    def encode(labels, vocabulary):
        encoded = np.zeros(len(vocabulary))
        labels = list(labels)
        if not labels:
            return encoded

        # One pass over the vocabulary replaces a linear `in` scan plus a
        # linear `.index` scan per label. setdefault keeps the first position
        # of a repeated entry, which is what list.index would return.
        position = {}
        for idx, entry in enumerate(vocabulary):
            position.setdefault(entry, idx)

        for label in labels:
            idx = position.get(label)
            if idx is not None:
                encoded[idx] = 1
        return encoded

    return np.concatenate([encode(topics, all_topics), encode(tags, all_tags)])


def get_topic_vector(video):
    """Return the topic/tag multi-hot vector for one video id.

    Reads the video's metadata from `videos`, normalises its topic categories
    (last path segment, lower-cased) and tags (lower-cased), and encodes both
    with get_one_hot_vector.

    Args:
        video: key into `videos`.

    Returns:
        Whatever get_one_hot_vector returns for the extracted topics and tags.

    Raises:
        KeyError: if `video` is not present in `videos`.
    """
    metadata = videos[video]

    # Missing keys, None values and non-string entries are treated as "no
    # topics" / "no tags". Other exceptions (including KeyboardInterrupt) are
    # no longer swallowed.
    try:
        topics = [
            topic.split('/')[-1].lower()
            for topic in metadata['topicDetails']['topicCategories']
        ]
    except (KeyError, TypeError, AttributeError):
        topics = []

    try:
        tags = [tag.lower() for tag in metadata['snippet']['tags']]
    except (KeyError, TypeError, AttributeError):
        tags = []

    return get_one_hot_vector(topics, tags)


In [7]:
# Triplet components, appended in lock-step: AX[k], PX[k] and NX[k] form one
# (anchor, positive, negative) triplet. Each component is a list of three
# vectors: [video vector, mean suggested vector, user history vector].
AX = []
PX = []
NX = []
Y = []  # not populated in this cell
# Cache of get_topic_vector results for suggested videos, keyed by video id.
topic_vectors = dict()

# NOTE(review): tqdm is not imported explicitly in this file; presumably it
# arrives through `from utils import *` -- confirm.
for user in tqdm(data):
    # The user's history vector gets a trailing 0 in the slot where suggested
    # videos carry their list position.
    user_history = np.concatenate([history[user], [0]])

    for watch in data[user]:
        candidates = watch['suggested']
        selected = watch['selected']
        upnext = candidates[:15]

        # Metadata must exist for the playing video, the selected video and
        # every suggested video, and at least 15 suggestions are required.
        all_known = (
            watch['playing'] in videos
            and selected in videos
            and all(s in videos for s in candidates)
        )
        if not all_known or len(candidates) < 15:
            continue

        playing = np.concatenate([get_topic_vector(watch['playing']), [0]])

        # One vector per up-next slot: cached topic vector + slot position.
        slot_vectors = []
        for i, s in enumerate(upnext):
            if s not in topic_vectors:
                topic_vectors[s] = get_topic_vector(s)
            slot_vectors.append(np.concatenate([topic_vectors[s], [i]]))

        suggesteds = np.mean(np.array(slot_vectors), axis=0)

        ax = [playing, suggesteds, user_history]
        px = []
        nxs = []
        for s, sv in zip(upnext, slot_vectors):
            candidate = [sv, suggesteds, user_history]
            if s == selected:
                # If the selected id occupies several slots, the last one wins.
                px = candidate
            else:
                nxs.append(candidate)

        # No positive: the selected video is not among the first 15 suggestions.
        if not px:
            continue

        # One triplet per non-selected slot, all sharing anchor and positive.
        for nx in nxs:
            AX.append(ax)
            PX.append(px)
            NX.append(nx)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [8]:
from sklearn.model_selection import train_test_split

# Split the three aligned triplet lists with one shared shuffle. The fixed
# random_state makes the split identical across kernel restarts.
# NOTE(review): all triplets built from one watch share the same anchor and
# positive, so a row-level split can place near-duplicates on both sides.
AX_train, AX_test, PX_train, PX_test, NX_train, NX_test = train_test_split(
    AX, PX, NX, random_state=42
)

In [9]:
from tensorflow import keras
from keras import Model, metrics, layers
from keras.models import Sequential
from keras.metrics import AUC, BinaryAccuracy
from keras.layers import Dense, Dropout, Flatten, Concatenate, Input, BatchNormalization
from tensorflow.keras.optimizers import SGD

In [10]:
# Width of every model input vector: one slot per topic, one per kept tag, plus
# the trailing position slot appended when the triplets are built. Derived from
# the vocabulary (it was hard-coded as 5063) so it cannot drift out of sync.
DIM = len(all_topics) + len(all_tags) + 1

In [11]:
class DistanceLayer(layers.Layer):
    """Layer returning the squared Euclidean distances of an anchor embedding
    to a positive and to a negative embedding."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, anchor, positive, negative):
        """Return (anchor-positive distance, anchor-negative distance).

        Each distance is the sum of squared differences over the last axis.
        """
        def squared_distance(other):
            return tf.reduce_sum(tf.square(anchor - other), -1)

        return (squared_distance(positive), squared_distance(negative))

# Dropout rate shared by every Dropout layer of the embedding network.
DP = 0.3

suggested_tensor = Input(shape=[DIM])
playing_tensor = Input(shape=[DIM])
history_tensor = Input(shape=[DIM])


def _encode_input(tensor):
    """Apply Dense(1024, relu) -> Dropout(DP) -> Flatten to one input tensor."""
    encoded = Dense(1024, activation='relu')(tensor)
    encoded = Dropout(DP)(encoded)
    return Flatten()(encoded)


# One branch with its own weights per input, created in the order
# suggested -> playing -> history.
sg = _encode_input(suggested_tensor)
pl = _encode_input(playing_tensor)
hl = _encode_input(history_tensor)

merged = Concatenate()([sg, pl, hl])

# Shared trunk: two Dense+Dropout stages ...
d = merged
for width in (2056, 1024):
    d = Dense(width, activation='relu')(d)
    d = Dropout(DP)(d)
flatten = Flatten()(d)

# ... followed by two Dense+Dropout+BatchNormalization stages ...
head = flatten
for width in (1024, 256):
    head = Dense(width, activation="relu")(head)
    head = Dropout(DP)(head)
    head = BatchNormalization()(head)

# ... and a linear 256-dimensional embedding output.
output = Dense(256)(head)

# Input order expected by callers: [playing, suggested, history].
embedding = Model(
    inputs=[playing_tensor, suggested_tensor, history_tensor],
    outputs=output,
)

2023-08-15 09:25:00.511556: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46692 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:01:00.0, compute capability: 8.6
2023-08-15 09:25:00.512415: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 46692 MB memory:  -> device: 1, name: NVIDIA RTX A6000, pci bus id: 0000:81:00.0, compute capability: 8.6


In [12]:
class SiameseModel(Model):
    """Keras model that trains a triplet network with a margin triplet loss.

    The wrapped network must map its inputs to a pair
    (anchor-positive distance, anchor-negative distance). The loss is
    max(ap_distance - an_distance + margin, 0).

    Args:
        siamese_network: the network producing the two distances.
        margin: minimum gap enforced between the two distances.
    """

    def __init__(self, siamese_network, margin=0.5):
        super().__init__()
        self.siamese_network = siamese_network
        self.margin = margin
        self.loss_tracker = metrics.Mean(name="loss")

    def call(self, inputs, training=None):
        """Return the wrapped network's distances for `inputs`."""
        return self.siamese_network(inputs, training=training)

    def train_step(self, data):
        """Run one optimisation step and return the running mean loss."""
        # GradientTape is a context manager that records every operation that
        # you do inside. We are using it here to compute the loss so we can get
        # the gradients and apply them using the optimizer specified in
        # `compile()`.
        with tf.GradientTape() as tape:
            # Layers such as Dropout and BatchNormalization behave differently
            # depending on the `training` flag, so it is passed explicitly
            # instead of being left for Keras to infer.
            loss = self._compute_loss(data, training=True)

        # Storing the gradients of the loss function with respect to the
        # weights/parameters.
        gradients = tape.gradient(loss, self.siamese_network.trainable_weights)

        # Applying the gradients on the model using the specified optimizer
        self.optimizer.apply_gradients(
            zip(gradients, self.siamese_network.trainable_weights)
        )

        # Let's update and return the training loss metric.
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def test_step(self, data):
        """Evaluate one batch in inference mode and return the mean loss."""
        loss = self._compute_loss(data, training=False)

        # Let's update and return the loss metric.
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def _compute_loss(self, data, training=False):
        """Return the per-example triplet loss for one batch.

        Args:
            data: batch forwarded unchanged to the wrapped network.
            training: forwarded to the wrapped network to select training or
                inference behaviour of its layers.
        """
        # The output of the network is a tuple containing the distances
        # between the anchor and the positive example, and the anchor and
        # the negative example.
        ap_distance, an_distance = self.siamese_network(data, training=training)

        # Computing the Triplet Loss by subtracting both distances and
        # making sure we don't get a negative value.
        loss = ap_distance - an_distance
        loss = tf.maximum(loss + self.margin, 0.0)
        return loss

    @property
    def metrics(self):
        # We need to list our metrics here so the `reset_states()` can be
        # called automatically.
        return [self.loss_tracker]

In [13]:
# Nine inputs: for each triplet role (anchor / positive / negative) a video
# vector, the mean suggested vector and the user history vector.
anchor_playing = Input(shape=[DIM], name='anchor_p')
anchor_suggested = Input(shape=[DIM], name='anchor_s')
anchor_history = Input(shape=[DIM], name='anchor_h')

positive_selected = Input(shape=[DIM], name='positive_p')
positive_suggested = Input(shape=[DIM], name='positive_s')
positive_history = Input(shape=[DIM], name='positive_h')

negative_selected = Input(shape=[DIM], name='negative_p')
negative_suggested = Input(shape=[DIM], name='negative_s')
negative_history = Input(shape=[DIM], name='negative_h')

# The same `embedding` model (shared weights) encodes all three roles.
distances = DistanceLayer()(
    embedding([anchor_playing, anchor_suggested, anchor_history]),
    embedding([positive_selected, positive_suggested, positive_history]),
    embedding([negative_selected, negative_suggested, negative_history]),
)

siamese_network = Model(
    inputs=[
        anchor_playing, anchor_suggested, anchor_history,
        positive_selected, positive_suggested, positive_history,
        negative_selected, negative_suggested, negative_history,
    ],
    outputs=distances,
)

siamese_model = SiameseModel(siamese_network)

# `lr=` and `decay=` are legacy optimizer arguments. The same schedule,
# lr / (1 + decay * step), is written here as an explicit InverseTimeDecay
# learning-rate schedule passed through `learning_rate=`.
learning_rate = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.005, decay_steps=1, decay_rate=1e-6
)
sgd = SGD(learning_rate=learning_rate, momentum=0.9, nesterov=True)

# SiameseModel computes its own triplet loss in train_step/test_step and only
# reports that loss, so `loss=` / `metrics=` arguments would never be used.
siamese_model.compile(optimizer=sgd)

In [14]:
# Split every training triplet into its nine component vectors. Naming: first
# letter is the triplet role (A/P/N), second letter is the component position
# (p = index 0, s = index 1, h = index 2).
triplets = list(zip(AX_train, PX_train, NX_train))

Ap, As, Ah = ([anchor[k] for anchor, _, _ in triplets] for k in range(3))
Pp, Ps, Ph = ([positive[k] for _, positive, _ in triplets] for k in range(3))
Np, Ns, Nh = ([negative[k] for _, _, negative in triplets] for k in range(3))

In [15]:
# Turn each of the nine component lists into a numpy array, keeping the names.
Ap, As, Ah, Pp, Ps, Ph, Np, Ns, Nh = (
    np.array(component)
    for component in (Ap, As, Ah, Pp, Ps, Ph, Np, Ns, Nh)
)

In [16]:
# Train on the nine aligned input arrays, in the order the siamese network
# declares its inputs (anchor, positive, negative; each as p, s, h).
triplet_inputs = [Ap, As, Ah, Pp, Ps, Ph, Np, Ns, Nh]
siamese_model.fit(
    triplet_inputs,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
)

Epoch 1/100


2023-08-15 09:25:50.699928: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-08-15 09:25:50.704855: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f6e0c49ead0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-08-15 09:25:50.704871: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6
2023-08-15 09:25:50.704877: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (1): NVIDIA RTX A6000, Compute Capability 8.6
2023-08-15 09:25:50.710153: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-08-15 09:25:50.818192: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2023-08-15 09:25:50.92036

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100