In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm.notebook import tqdm as tqdm
from matplotlib import pyplot as plt
from collections import Counter

In [2]:
empty_desc = np.zeros(768)
empty_title = np.zeros(768)

class Video:
    def __init__(self, uid): 
        self.uid = uid
        if uid not in video_db:
            self.title, self.tags, self.description, self.category, self.channel_title = None, None, None, None, None
            self.title_emb, self.tags_emb, self.description_emb = empty_title, [], empty_desc
            return

        vs = video_db[uid]['snippet']
        self.title = vs['title']
        self.tags = vs.get('tags', [])
        self.description = vs['description']
        self.category = vs['categoryId']
        self.channel_title = vs['channelTitle']
        
        self.title_emb = video_db[uid]['embeddings']['title']
        self.tags_emb = video_db[uid]['embeddings']['tags']
        self.description_emb = video_db[uid]['embeddings']['description']

    def __repr__(self):
        if self.title is None:
            return 'Video not found'
        return self.title
    
    def __str__(self):
        if self.title is None:
            return 'Video not found'
        return self.title
    
    def __hash__(self):
        return hash(self.uid)
    
def get_id(url):
    if len(url.split('v=')) <= 1:
        return None
    url = url.split('v=')[1]
    url = url.split('&')[0]
    return url

def decode_upnext(urls):
    urls = eval(urls.decode('utf-8'))
    ids = []
    for u in urls:
        ids.append(get_id(u))
    return ids

In [3]:
sessions = pd.read_pickle('all_sessions')
video_db = pd.read_pickle('video-dump-with-embeddings+transcript')

In [4]:
# Single Step Data (only access to this video)
ss_session = []
for session in tqdm(sessions):
    for video, next_video, in zip(session[:-1], session[1:]):
        video['selected'] = next_video['url']
        ss_session.append(video)


data = pd.DataFrame(ss_session)
data['url'] = data['url'].apply(get_id)
data['selected'] = data['selected'].apply(get_id)
data['upnext_content'] = data['upnext_content'].apply(decode_upnext)
data = data[['url', 'upnext_content', 'selected']]
data.columns = ['video', 'upnext', 'selected']
data.head()

  0%|          | 0/6762 [00:00<?, ?it/s]

Unnamed: 0,video,upnext,selected
0,489B7RNudAQ,"[ZOZyo6YOAu4, VtYi8AR11WM, Q32BGQmVLJ0, h1BsKI...",Q32BGQmVLJ0
1,jnXeE4TY2so,"[MeH-4wEuvZs, h1BsKIP4uYM, dqbyJIKLxok, VX4n8w...",HtyVbMZegn4
2,NQPNRwpGWXc,"[2PopspP_DbI, VPrrIGxjubI, 4s2ynUAJ5ZU, dEk8Hb...",VPrrIGxjubI
3,8tsnuvfMmtc,"[4iYl9oSjffQ, mTHedRdHJgk, aeWyp2vXxqA, 6S7VkI...",aeWyp2vXxqA
4,LcW4MCa5YCQ,"[-TgFz3qmE9U, 9scfWN6aXaU, O7VaXlMvAvk, h1BsKI...",z0Xpye7Ltlo


In [5]:
video_sessions = []
ss_sessions = data.to_dict('records')

for session in tqdm(ss_sessions):
    video_session = {}
    v = Video(session['video'])
    video_session['video'] = Video(session['video'])
    video_session['upnext'] = []
    for upnext in session['upnext']:
        video_session['upnext'].append(Video(upnext))
    video_session['selected'] = Video(session['selected'])
    video_sessions.append(video_session)

  0%|          | 0/8563 [00:00<?, ?it/s]

In [8]:
def trims(X, Y, l=15):
    X = np.array(X)
    Y = np.array(Y)
    indices = np.where(Y == 0)[0]
    np.random.shuffle(indices)

    indices = indices[:len(Y) - l]
    X = np.delete(X, indices, axis=0)
    Y = np.delete(Y, indices, axis=0)
    return X, Y

In [9]:
# multi label output
X = []
Y = []
ss = []
for session in video_sessions:
    x = []
    y = []
    if session['selected'].title == None:
        continue

    if len(session['upnext']) >= 15:
        x.append(session['video'].title_emb)

        ss.append(session)
        selected = session['selected'].title
        upnext = session['upnext']
        for video in session['upnext']:
            if video.title == selected:
                y.append(1)
            else:
                y.append(0)

            x.append(video.title_emb)
        x, y = trims(x, y)
        X.append(x)
        Y.append(y)

X = np.array(X)
Y = np.array(Y)

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [12]:
X_train[0].shape, y_train.shape

((16, 768), (5686, 15))

In [13]:
# train cnn model

from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, Dropout

model = Sequential()
model.add(Conv2D(128, kernel_size=(8, 8), activation='relu', input_shape=(16, 768, 1)))
model.add(Conv2D(64, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(15, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30)

2023-06-20 04:11:32.803284: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-06-20 04:11:32.803346: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-06-20 04:11:34.026038: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-06-20 04:11:34.026100: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-06-20 04:11:34.026146: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (toutatis): /proc/driver/nvidia/version does not exist
2023-06-20 04:11:34.026410: I tensorflow/core/platform/cpu_feature_g

NameError: name 'SGD' is not defined

In [273]:
# train neural network

from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD


model = Sequential()
model.add(Dense(6400, activation='relu', input_dim=12288))
model.add(Dropout(0.4))
model.add(Dense(3200, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(1600, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(800, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(400, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(200, activation='relu'))
model.add(Dense(15, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)
model.compile(loss='categorical_crossentropy',
                optimizer=sgd,
                metrics=['Accuracy'])

model.fit(X_train, y_train,
            epochs=50,
            batch_size=128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f73621f7a60>

In [274]:
model.evaluate(X_test, y_test, batch_size=128)



[2.2189486026763916, 0.3614627420902252]

In [255]:
y_test_max = np.argmax(y_test, axis=1)
y_pred = model.predict(X_test)
y_pred_max = np.argmax(y_pred, axis=1)

In [266]:
from sklearn.metrics import f1_score
print(f1_score(y_test_max, y_pred_max, average='micro'))

# # recall and precision
# from sklearn.metrics import recall_score, precision_score

# print(recall_score(y_test, y_pred))
# print(precision_score(y_test, y_pred))

0.29043600562587907


In [247]:

y_pred = model.predict(X_test)

from sklearn.metrics import roc_curve, auc

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 5))
plt.plot(fpr, tpr, color='darkorange',
        lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy',
        lw=2, linestyle='--')  
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('AUC')
plt.legend(loc="lower right")

ValueError: multilabel-indicator format is not supported

In [88]:
m.predict(X_test).sum()

3079

In [86]:
# calculate f1score
from sklearn.metrics import f1_score
f1_score(y_test, m.predict(X_test))

0.11928269192020956