In [3]:
#imports
import string
import pandas as pd
import pickle as pkl
from collections import Counter
from matplotlib import pyplot as plt

import numpy as np
from random import sample

import fasttext
from sentence_transformers import SentenceTransformer
from gensim.models.fasttext import load_facebook_model

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm
import utils

# reload import utils
from importlib import reload
reload(utils)


<module 'utils' from '/home/data/hussam/sparta/youtube-sim/src/utils.py'>

In [4]:
watches = pd.read_pickle('../data/user-level-data')

In [5]:
# split dataset into history and present data
# use first 50 videos as history

In [6]:
history = dict()
users = list(watches.keys())
# use first 50 watches as history
data = dict()

for user in users:
    if len(watches[user]) < 50:
        continue
    history[user] = watches[user][:50]

    for watch in watches[user][50:]:
        if watch['selected'] != None:
            if user not in data:
                data[user] = []
            data[user].append(watch)


# remove users from history that are not in data

for user in list(history.keys()):
    if user not in data:
        history.pop(user)

In [7]:
# Create history embeddings
users = list(history.keys())

for user in users:
    embs = []
    for watch in history[user]:
        if watch['playing'] == None:
            embs.append([0]*1536)
        else:
            embs.append(watch['playing']['embeddings']['title'])
    history[user] = embs

In [None]:
pkl.dump(history, open('../data/user-history-50.pkl', 'wb'))

In [6]:
def make_dataset(user, playing, suggested, selected):
    if playing == None:
        return [], []

    if selected == None:
        return [], []

    if len(suggested) != 19:
        return [], []

    playing = playing['embeddings']['title']
    Y = []
    for s in suggested:
        if s == None:
            Y.append(0)
            continue
        id = s.get('id', None)
        if id == selected:
            Y.append(1)
        else:
            Y.append(0)

    sg = []
    for s in suggested:
        if s == None:
            sg.append([0]*1536)
            continue
        sg.append(s['embeddings']['title'])
    suggested = sg
    selected = Y
    user_history = history[user]

    Y = np.array(Y).flatten()
    playing = np.array(playing).flatten()
    suggested = np.array(suggested)
    user_history = np.array(user_history).flatten()
    X = [playing, suggested, user_history]

    if Y.sum() == 0:
        return [], []

    return X, Y


X, Y = [], []
for user in users:
    for watch in data[user]:
        playing = watch['playing']
        suggested = watch['suggested']
        selected = watch['selected']
        x, y = make_dataset(user, playing, suggested, selected)
        if len(x) > 0:
            X.append(x)
            Y.append(y)

### Binary Crossentropy

In [7]:
playingX = []
user_historyX = []
suggestedX = []
selectedY = []


for x, y in zip(X, Y):
    upnexts = x[1]
    for i, upnext in zip(y, upnexts):
        suggestedX.append(upnext)
        selectedY.append(i)
        playingX.append(x[0])
        user_historyX.append(x[2])

In [8]:
playingX = np.array(playingX)
user_historyX = np.array(user_historyX)
suggestedX = np.array(suggestedX)
selectedY = np.array(selectedY)

print(playingX.shape, user_historyX.shape, suggestedX.shape, selectedY.shape)

(40546, 1536) (40546, 76800) (40546, 1536) (40546,)


In [9]:
import tensorflow as tf
import tensorflow.keras as keras

playing_input = tf.keras.layers.Input(shape=(1536), name="playing")
recommendation_input = tf.keras.layers.Input(shape=(1536), name="recommendation")
history_input = tf.keras.layers.Input(shape=(76800), name="history_input")

playing_dense = tf.keras.layers.Dense(1024, activation="relu")(playing_input)
playing_dense = tf.keras.layers.Dropout(0.3)(playing_dense)
playing_dense = tf.keras.layers.Dense(512, activation="relu")(playing_dense)
playing_dense = tf.keras.layers.Flatten()(playing_dense)

recommendation_dense = tf.keras.layers.Dense(1024, activation="relu")(recommendation_input)
recommendation_dense = tf.keras.layers.Dropout(0.3)(recommendation_dense)
recommendation_dense = tf.keras.layers.Dense(512, activation="relu")(recommendation_dense)
recommendation_dense = tf.keras.layers.Flatten()(recommendation_dense)

history_dense = tf.keras.layers.Dense(1024, activation="relu")(history_input)
history_dense = tf.keras.layers.Dropout(0.3)(history_dense)
history_dense = tf.keras.layers.Dense(512, activation="relu")(history_dense)
history_dense = tf.keras.layers.Dropout(0.3)(history_dense)
history_dense = tf.keras.layers.Dense(256, activation="relu")(history_dense)
history_dense = tf.keras.layers.Flatten()(history_dense)

concat = tf.keras.layers.Concatenate()([playing_dense, history_input, recommendation_dense])
concat = tf.keras.layers.Dense(1024, activation="relu")(concat)
concat = tf.keras.layers.Dropout(0.3)(concat)
concat = tf.keras.layers.Dense(512, activation="relu")(concat)
concat = tf.keras.layers.Dropout(0.3)(concat)
concat = tf.keras.layers.Dense(256, activation="relu")(concat)
concat = tf.keras.layers.Dropout(0.3)(concat)
concat = tf.keras.layers.Dense(128, activation="relu")(concat)
concat = tf.keras.layers.Dropout(0.3)(concat)
concat = tf.keras.layers.Dense(64, activation="relu")(concat)
concat = tf.keras.layers.Dense(32, activation="relu")(concat)
concat = tf.keras.layers.Flatten()(concat)

output = tf.keras.layers.Dense(1, activation="sigmoid")(concat)

model = tf.keras.Model(inputs=[playing_input, recommendation_input, history_input], outputs=output)

METRICS = [keras.metrics.TruePositives(name='tp'),
            keras.metrics.FalsePositives(name='fp'), 
            keras.metrics.TrueNegatives(name='tn'),
            keras.metrics.FalseNegatives(name='fn'),
            keras.metrics.BinaryAccuracy(name='accuracy'),
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
            keras.metrics.AUC(name='auc'),       
            keras.metrics.AUC(name='prc', curve='PR')]
# import sgd
from tensorflow.keras.optimizers import SGD

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)
model.compile(loss='binary_crossentropy',
                optimizer=sgd,
                metrics=[METRICS])

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [None]:
model.fit([playingX, suggestedX, user_historyX], np.array(selectedY), epochs=100, batch_size=16, validation_split=0.2)

Epoch 1/100
Epoch 2/100

In [13]:
model.evaluate([playingX, suggestedX, user_historyX], selectedY)



[0.20913134515285492,
 0.0,
 0.0,
 38377.0,
 2169.0,
 0.9465051889419556,
 0.0,
 0.0,
 0.5,
 0.05349479615688324]

### Categorical crossentropy

In [106]:
playingX = []
suggestedX = []
user_historyX = []

for x in X:
    playingX.append(x[0])
    suggestedX.append(x[1])
    user_historyX.append(x[2])

playingX = np.array(playingX)
suggestedX = np.array(suggestedX)
user_historyX = np.array(user_historyX)


print(user_historyX.shape, playingX.shape, suggestedX.shape, np.array(Y).shape)


(2134, 76800)

In [119]:
import tensorflow as tf
import tensorflow.keras as keras

playing_input = tf.keras.layers.Input(shape=(1536), name="playing")
recommendation_input = tf.keras.layers.Input(shape=(29184), name="recommendation")
history_input = tf.keras.layers.Input(shape=(76800), name="history_input")

playing_dense = tf.keras.layers.Dense(1024, activation="relu")(playing_input)
playing_dense = tf.keras.layers.Dropout(0.3)(playing_dense)
playing_dense = tf.keras.layers.Dense(512, activation="relu")(playing_dense)
playing_dense = tf.keras.layers.Flatten()(playing_dense)

recommendation_dense = tf.keras.layers.Dense(1024, activation="relu")(recommendation_input)
recommendation_dense = tf.keras.layers.Dropout(0.3)(recommendation_dense)
recommendation_dense = tf.keras.layers.Dense(512, activation="relu")(recommendation_dense)
recommendation_dense = tf.keras.layers.Flatten()(recommendation_dense)


history_dense = tf.keras.layers.Dense(1024, activation="relu")(history_input)
history_dense = tf.keras.layers.Dropout(0.3)(history_dense)
history_dense = tf.keras.layers.Dense(512, activation="relu")(history_dense)
history_dense = tf.keras.layers.Flatten()(history_dense)

concat = tf.keras.layers.Concatenate()([playing_dense, history_input, recommendation_dense])
concat = tf.keras.layers.Dense(1024, activation="relu")(concat)
concat = tf.keras.layers.Dropout(0.3)(concat)
concat = tf.keras.layers.Dense(512, activation="relu")(concat)
concat = tf.keras.layers.Dropout(0.3)(concat)
concat = tf.keras.layers.Dense(256, activation="relu")(concat)
concat = tf.keras.layers.Dropout(0.3)(concat)
concat = tf.keras.layers.Dense(128, activation="relu")(concat)
concat = tf.keras.layers.Dropout(0.3)(concat)
concat = tf.keras.layers.Dense(64, activation="relu")(concat)
concat = tf.keras.layers.Dense(32, activation="relu")(concat)
concat = tf.keras.layers.Flatten()(concat)

output = tf.keras.layers.Dense(19, activation="softmax")(concat)

model = tf.keras.Model(inputs=[playing_input, recommendation_input, history_input], outputs=output)

METRICS = [keras.metrics.TruePositives(name='tp'),
            keras.metrics.FalsePositives(name='fp'), 
            keras.metrics.TrueNegatives(name='tn'),
            keras.metrics.FalseNegatives(name='fn'),
            keras.metrics.BinaryAccuracy(name='accuracy'),
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
            keras.metrics.AUC(name='auc'),       
            keras.metrics.AUC(name='prc', curve='PR')]
# import sgd
from tensorflow.keras.optimizers import SGD

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)
model.compile(loss='categorical_crossentropy',
                optimizer=sgd,
                metrics=[METRICS])

model.fit([playingX, suggestedX, user_historyX], np.array(Y), epochs=100, batch_size=16, validation_split=0.2)

  super(SGD, self).__init__(name, **kwargs)
