In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import tensorflow as tf

import sys
sys.path.append('../')
from utils import *

2023-08-17 12:22:34.375991: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_pickle('../../data/datasets/raw-video-level-watches')
videos = pd.read_pickle('../../data/videos_raw_metadata')
title_embeddings = pd.read_pickle('../../data/embeddings/title-autoencoded')

2023-08-17 12:22:56.919060: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46692 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:01:00.0, compute capability: 8.6
2023-08-17 12:22:56.919782: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 46692 MB memory:  -> device: 1, name: NVIDIA RTX A6000, pci bus id: 0000:81:00.0, compute capability: 8.6


In [3]:
def trim(vec, element, length=15):
    #  a vector to length and keep the element
    if len(vec) > length:
        vec = vec[:length]
    if element not in vec:
        vec = vec[:-1] + [element]

    return vec

In [61]:
anchor = []
positive = []
negative = []

for watch in df:
    playing = watch['playing']
    selected = watch['selected']
    suggesteds = watch['upnext']

    suggesteds = trim(suggesteds, selected)
    
    if playing not in videos or selected not in videos or any([s not in videos for s in suggesteds]):
        continue
    
    p = np.array(title_embeddings[videos[playing]['snippet']['title']])[0]
    sv = np.array(title_embeddings[videos[selected]['snippet']['title']])[0]
    sx = [title_embeddings[videos[s]['snippet']['title']] for s in suggesteds]
    smean = np.mean(np.array(sx), axis=0)[0]
    nxs = []
    px = []
    ax = [p, smean]

    for s, sxx in zip(suggesteds, sx):
        if s == selected:
            px = [sv, smean]
        else:
            sxx = np.array(sxx)[0]
            nx = [sxx, smean]
            nxs.append(nx)
        
    if px == []:
        continue

    for nx in nxs:
        anchor.append(ax)
        positive.append(px)
        negative.append(nx)

In [71]:
DIM = 128

In [70]:
from sklearn.model_selection import train_test_split

AX_train, AX_test, PX_train, PX_test, NX_train, NX_test = train_test_split(anchor, positive, negative)

In [72]:
from tensorflow import keras
from keras import Model, metrics, layers
from keras.models import Sequential
from keras.metrics import AUC, BinaryAccuracy
from keras.layers import Dense, Dropout, Flatten, Concatenate, Input, BatchNormalization
from tensorflow.keras.optimizers import SGD

In [125]:
class DistanceLayer(layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, anchor, positive, negative):
        ap_distance = tf.reduce_sum(tf.square(anchor - positive), -1)
        an_distance = tf.reduce_sum(tf.square(anchor - negative), -1)
        return (ap_distance, an_distance)

DP = 0.3

suggested_tensor = Input(shape=[DIM])
playing_tensor = Input(shape=[DIM])


sg_dense = Dense(128, activation='relu')(suggested_tensor)
sg_dense = Dropout(DP)(sg_dense)
sg = Flatten()(sg_dense)

pl_dense = Dense(128, activation='relu')(playing_tensor)
pl_dense = Dropout(DP)(pl_dense)
pl = Flatten()(pl_dense)

merged = Concatenate()([sg, pl])

d = Dense(128, activation='relu')(merged)
d = Dropout(DP)(d)
flatten = Flatten()(d)
dense1 = Dense(128, activation="relu")(flatten)
dense1 = Dropout(DP)(dense1)
dense1 = BatchNormalization()(dense1)
output = Dense(128, activation="relu")(dense1)

embedding = Model(inputs=[playing_tensor, suggested_tensor], outputs=output)

In [126]:
class SiameseModel(Model):
    def __init__(self, siamese_network, margin=0.5):
        super().__init__()
        self.siamese_network = siamese_network
        self.margin = margin
        self.loss_tracker = metrics.Mean(name="loss")

    def call(self, inputs):
        return self.siamese_network(inputs)

    def train_step(self, data):
        # GradientTape is a context manager that records every operation that
        # you do inside. We are using it here to compute the loss so we can get
        # the gradients and apply them using the optimizer specified in
        # `compile()`.
        with tf.GradientTape() as tape:
            loss = self._compute_loss(data)

        # Storing the gradients of the loss function with respect to the
        # weights/parameters.
        gradients = tape.gradient(loss, self.siamese_network.trainable_weights)

        # Applying the gradients on the model using the specified optimizer
        self.optimizer.apply_gradients(
            zip(gradients, self.siamese_network.trainable_weights)
        )

        # Let's update and return the training loss metric.
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def test_step(self, data):
        loss = self._compute_loss(data)

        # Let's update and return the loss metric.
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def _compute_loss(self, data):
        # The output of the network is a tuple containing the distances
        # between the anchor and the positive example, and the anchor and
        # the negative example.
        ap_distance, an_distance = self.siamese_network(data)

        # Computing the Triplet Loss by subtracting both distances and
        # making sure we don't get a negative value.
        loss = ap_distance - an_distance
        loss = tf.maximum(loss + self.margin, 0.0)
        return loss

    @property
    def metrics(self):
        # We need to list our metrics here so the `reset_states()` can be
        # called automatically.
        return [self.loss_tracker]

In [127]:
anchor_playing = Input(shape=[DIM], name='anchor_p')
anchor_suggested = Input(shape=[DIM], name='anchor_s')
anchor_history = Input(shape=[DIM], name='anchor_h')

positive_selected = Input(shape=[DIM], name='positive_p')
positive_suggested = Input(shape=[DIM], name='positive_s')
positive_history = Input(shape=[DIM], name='positive_h')

negative_selected = Input(shape=[DIM], name='negative_p')
negative_suggested = Input(shape=[DIM], name='negative_s')
negative_history = Input(shape=[DIM], name='negative_h')

distances = DistanceLayer()(
    embedding([anchor_playing, anchor_suggested]),
    embedding([positive_selected, positive_suggested]),
    embedding([negative_selected, negative_suggested]),
)


# distances = DistanceLayer()(
#     embedding(anchor),
#     embedding(positive),
#     embedding(negative),
# )

siamese_network = Model(
    inputs = [anchor_playing, anchor_suggested, positive_selected, positive_suggested, negative_selected, negative_suggested], outputs=distances
)

siamese_model = SiameseModel(siamese_network)
siamese_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [128]:
Ap, As, Pp, Ps, Np, Ns = [], [], [], [], [], []
for ax, px, nx in zip(AX_train, PX_train, NX_train):
    Ap.append(ax[0])
    As.append(ax[1])

    Pp.append(px[0])
    Ps.append(px[1])

    Np.append(nx[0])
    Ns.append(nx[1])

Ap = np.array(Ap)
As = np.array(As)

Pp = np.array(Pp)
Ps = np.array(Ps)

Np = np.array(Np)
Ns = np.array(Ns)

In [129]:
siamese_model.fit([Ap, As, Pp, Ps, Np, Ns], batch_size=32, epochs=32, validation_split=0.2)

Epoch 1/100

In [85]:
Ap_t, As_t, Pp_t, Ps_t, Np_t, Ns_t = [], [], [], [], [], []
for ax, px, nx in zip(AX_test, PX_test, NX_test):
    Ap_t.append(ax[0])
    As_t.append(ax[1])

    Pp_t.append(px[0])
    Ps_t.append(px[1])

    Np_t.append(nx[0])
    Ns_t.append(nx[1])

Ap_t = np.array(Ap_t)
As_t = np.array(As_t)

Pp_t = np.array(Pp_t)
Ps_t = np.array(Ps_t)

Np_t = np.array(Np_t)
Ns_t = np.array(Ns_t)

In [86]:
siamese_model.evaluate([Ap_t, As_t, Pp_t, Ps_t, Np_t, Ns_t])

  1/598 [..............................] - ETA: 12s - loss: 0.0857



0.11936970055103302

In [100]:
a = embedding([Ap_t, As_t]),
a = np.array(a)[0]

p = embedding([Pp_t, Ps_t]),
p = np.array(p)[0]

n = embedding([Np_t, Ns_t]),
n = np.array(n)[0]

In [118]:
from sklearn.metrics.pairwise import cosine_similarity

tp = 0
tn = 0
total = 0

for anc, pos, neg in zip(a, p, n):

    pv = cosine_similarity(anc.reshape(1, -1), pos.reshape(1, -1)).flatten()[0]
    nv = cosine_similarity(anc.reshape(1, -1), neg.reshape(1, -1)).flatten()[0]

    if pv > nv:
        tp += 1
    else:
        tn += 1

    total += 1

In [123]:
tp/total

0.9095140959255191

In [120]:
int(tn/total)

0

0.9969941