In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm.notebook import tqdm as tqdm
from matplotlib import pyplot as plt
from collections import Counter

In [2]:
empty_desc = np.zeros(768)
empty_title = np.zeros(768)

class Video:
    def __init__(self, uid): 
        self.uid = uid
        if uid not in video_db:
            self.title, self.tags, self.description, self.category, self.channel_title = None, None, None, None, None
            self.title_emb, self.tags_emb, self.description_emb = empty_title, [], empty_desc
            return

        vs = video_db[uid]['snippet']
        self.title = vs['title']
        self.tags = vs.get('tags', [])
        self.description = vs['description']
        self.category = vs['categoryId']
        self.channel_title = vs['channelTitle']
        
        self.title_emb = video_db[uid]['embeddings']['title']
        self.tags_emb = video_db[uid]['embeddings']['tags']
        self.description_emb = video_db[uid]['embeddings']['description']

    def __repr__(self):
        if self.title is None:
            return 'Video not found'
        return self.title
    
    def __str__(self):
        if self.title is None:
            return 'Video not found'
        return self.title
    
    def __hash__(self):
        return hash(self.uid)
    
def get_id(url):
    if len(url.split('v=')) <= 1:
        return None
    url = url.split('v=')[1]
    url = url.split('&')[0]
    return url

def decode_upnext(urls):
    urls = eval(urls.decode('utf-8'))
    ids = []
    for u in urls:
        ids.append(get_id(u))
    return ids

In [1]:
import pandas as pd
sessions = pd.read_pickle('all_sessions')

[{'date': Timestamp('2022-12-16 14:25:28.128000'),
  'url': 'https://www.youtube.com/watch?v=489B7RNudAQ',
  'is_video': True,
  'upnext_content': b'["https://www.youtube.com/watch?v=ZOZyo6YOAu4","https://www.youtube.com/watch?v=VtYi8AR11WM","https://www.youtube.com/watch?v=Q32BGQmVLJ0","https://www.youtube.com/watch?v=h1BsKIP4uYM","https://www.youtube.com/watch?v=4u5I8GYB79Y","https://www.youtube.com/watch?v=pWAyV_GoJxA","https://www.youtube.com/watch?v=roSRpjgxvnQ","https://www.youtube.com/watch?v=oD_MQMqnvoE","https://www.youtube.com/watch?v=CuluhA3S--U","https://www.youtube.com/watch?v=CwVTvKVPRQ0","https://www.youtube.com/watch?v=uFKRr3XWgyA","https://www.youtube.com/watch?v=m4W_O63qRvs","https://www.youtube.com/watch?v=CMMYYajwLms","https://www.youtube.com/watch?v=ANZDDO9TKc4","https://www.youtube.com/watch?v=XYG8s9CSLWY","https://www.youtube.com/watch?v=VFpk6MV42zg","https://www.youtube.com/watch?v=XENqcS9Wj34","https://www.youtube.com/watch?v=FNS3l8yvvrE","https://www.youtube.c

In [3]:
video_db = pd.read_pickle('video-dump-with-embeddings+transcript')

In [8]:
# Single Step Data (only access to this video)
ss_session = []
for session in tqdm(sessions):
    for video, next_video, in zip(session[:-1], session[1:]):
        video['selected'] = next_video['url']
        ss_session.append(video)


data = pd.DataFrame(ss_session)
data['url'] = data['url'].apply(get_id)
data['selected'] = data['selected'].apply(get_id)
data['upnext_content'] = data['upnext_content'].apply(decode_upnext)
data = data[['url', 'upnext_content', 'selected']]
data.columns = ['video', 'upnext', 'selected']
data.head()

  0%|          | 0/6762 [00:00<?, ?it/s]

Unnamed: 0,video,upnext,selected
0,489B7RNudAQ,"[ZOZyo6YOAu4, VtYi8AR11WM, Q32BGQmVLJ0, h1BsKI...",Q32BGQmVLJ0
1,jnXeE4TY2so,"[MeH-4wEuvZs, h1BsKIP4uYM, dqbyJIKLxok, VX4n8w...",HtyVbMZegn4
2,NQPNRwpGWXc,"[2PopspP_DbI, VPrrIGxjubI, 4s2ynUAJ5ZU, dEk8Hb...",VPrrIGxjubI
3,8tsnuvfMmtc,"[4iYl9oSjffQ, mTHedRdHJgk, aeWyp2vXxqA, 6S7VkI...",aeWyp2vXxqA
4,LcW4MCa5YCQ,"[-TgFz3qmE9U, 9scfWN6aXaU, O7VaXlMvAvk, h1BsKI...",z0Xpye7Ltlo


In [9]:
video_sessions = []
ss_sessions = data.to_dict('records')

for session in tqdm(ss_sessions):
    video_session = {}
    v = Video(session['video'])
    video_session['video'] = Video(session['video'])
    video_session['upnext'] = []
    for upnext in session['upnext']:
        video_session['upnext'].append(Video(upnext))
    video_session['selected'] = Video(session['selected'])
    video_sessions.append(video_session)

  0%|          | 0/8563 [00:00<?, ?it/s]

In [6]:
def trims(X, Y, l=15):
    X = np.array(X)
    Y = np.array(Y)
    indices = np.where(Y == 0)[0]
    np.random.shuffle(indices)

    indices = indices[:len(Y) - l]
    X = np.delete(X, indices, axis=0)
    Y = np.delete(Y, indices, axis=0)
    return X, Y

In [11]:
pkl.dump(video_sessions, open('video_sessions.pkl', 'wb'))

In [4]:
video_sessions = pkl.load(open('video_sessions.pkl', 'rb'))

In [42]:
def shuffle(l1, l2):
    indices = np.arange(len(l1))
    np.random.shuffle(indices)
    l1 = np.array(l1)[indices]
    l2 = np.array(l2)[indices]
    return l1, l2

shuffle([1, 2, 3, 4, 5], [4, 3, 2, 1, 0])

(array([5, 1, 2, 4, 3]), array([0, 4, 3, 1, 2]))

In [83]:
# multi label output
X = []
Y = []
ss = []
for session in video_sessions:
    x = []
    y = []
    if session['selected'].title == None:
        continue

    if len(session['upnext']) >= 15:
        xx = session['video'].title_emb

        ss.append(session)
        selected = session['selected'].title
        upnext = session['upnext']
        for video in session['upnext']:
            if video.title == selected:
                y.append(1)
            else:
                y.append(0)

            x.append(video.title_emb)

        x, y = shuffle(x, y)       
        x = [xx] + list(x)
        x, y = trims(x, y)
        X.append(x)
        Y.append(y)

X = np.array(X)
Y = np.array(Y)

In [84]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [80]:
# pkl.dump([X, Y], open('XY.pkl', 'wb'))

In [92]:
i = x[0]
j = x[1]

np.matmul(i, j, axis=(1, 2))

TypeError: matmul: axis can only be used with a single shared core dimension, not with the 3 distinct ones implied by signature (n?,k),(k,m?)->(n?,m?).

In [97]:
(i*j).shape

(768,)

In [98]:
# X_train = X_train.reshape(-1, 16, 768)

pX = []
rX = []

for x in X_train:
  xx = x[0]
  pX.append(x[0])
  rxx = []
  for rx in x[1:]:
    rxx.append(rx*xx)
  rX.append(rxx)

pX = np.array(pX)
rX = np.array(rX)

In [99]:
X.shape, Y.shape

((7108, 16, 768), (7108, 15))

In [35]:
# train cnn model

from keras.models import Sequential
from tensorflow.keras.optimizers import SGD
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, Dropout

model = Sequential()
model.add(Conv2D(128, kernel_size=(8, 8), activation='relu', input_shape=(16, 768, 1)))
model.add(Conv2D(64, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(15, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)



METRICS = [keras.metrics.TruePositives(name='tp'),
            keras.metrics.FalsePositives(name='fp'), 
            keras.metrics.TrueNegatives(name='tn'),
            keras.metrics.FalseNegatives(name='fn'),
            keras.metrics.BinaryAccuracy(name='accuracy'),
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
            keras.metrics.AUC(name='auc'),       
            keras.metrics.AUC(name='prc', curve='PR')]
# import sgd


model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=[METRICS])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50)

  super(SGD, self).__init__(name, **kwargs)


NameError: name 'keras' is not defined

In [15]:
model.evaluate(X_test, y_test, batch_size=128)



[3.911512613296509, 0.1680731326341629]

In [100]:
X_train.shape, y_train.shape

((5686, 16, 768), (5686, 15))

In [50]:
# save data
pkl.dump([X_train, X_test, y_train, y_test], open('data.pkl', 'wb'))

In [49]:
# train neural network

from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD


X_train = X_train.reshape(X_train.shape[0], 12288)


model = Sequential()
model.add(Dense(6400, activation='relu', input_dim=12288))
model.add(Dropout(0.4))
model.add(Dense(3200, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(1600, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(800, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(400, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(200, activation='relu'))
model.add(Dense(15, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9)

# add early stopping

from keras.callbacks import EarlyStopping


model.compile(loss='categorical_crossentropy',
                optimizer=sgd,
                metrics=['accuracy'], 
                callbacks=[EarlyStopping(monitor='accuracy', patience=3)]
                )

model.fit(X_train, y_train,
            epochs=100,
            batch_size=16)

  super(SGD, self).__init__(name, **kwargs)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

KeyboardInterrupt: 

In [43]:
X_test = X_test.reshape(X_test.shape[0], 12288)
model.evaluate(X_test, y_test, batch_size=128)



[2.9538886547088623, 0.0555555559694767]

In [46]:
y_pred = model.predict(X_test)

In [64]:
# get the top 3 argmax from y_pred
y_pred = [1, 1, 1, 5, 4, 1, 1, 2]
y_pred = np.argsort(y_pred)
# y_pred = np.flip(y_pred)
y_pred[-3:]
# y_pred[3]
if np.argmax(y_pred) in y_pred:
    print('yes')

yes


In [75]:
ii=np.argmax(Y[0])
X[0][ii+1][0]

0.10375364869832993

In [69]:
for i in X[0]:
    print(i[0])

0.23706816136837006
0.6322957277297974
-0.24124228954315186
0.21707817912101746
0.021563613787293434
-0.22810393571853638
0.35623249411582947
0.2359631061553955
-0.31525903940200806
0.10375364869832993
-0.2930575907230377
-0.2260609120130539
-0.3247607350349426
-0.10608091205358505
-0.09520483762025833
0.9140424728393555


In [71]:
video_sessions[0]['video'].title_emb[0]

0.23706816

In [76]:
video_sessions[0]['selected'].title_emb[0]

0.10375365

In [59]:
correct = 0
total = 0

for yp, yt in zip(y_pred, y_test):
    yp = np.argsort(yp)[-3:]
    if np.argmax(yt) in yp:
        correct += 1
    
    total += 1

correct / total

array([0, 1, 2, 5, 6, 7, 4, 3])

In [53]:
y_pred

array([[13,  6,  3],
       [13,  6,  3],
       [13,  6,  3],
       ...,
       [13,  6,  3],
       [13,  6,  3],
       [13,  6,  3]])

In [274]:
model.evaluate(X_test, y_test, batch_size=128)



[2.2189486026763916, 0.3614627420902252]

In [255]:
y_test_max = np.argmax(y_test, axis=1)
y_pred = model.predict(X_test)
y_pred_max = np.argmax(y_pred, axis=1)

In [266]:
from sklearn.metrics import f1_score
print(f1_score(y_test_max, y_pred_max, average='micro'))

# # recall and precision
# from sklearn.metrics import recall_score, precision_score

# print(recall_score(y_test, y_pred))
# print(precision_score(y_test, y_pred))

0.29043600562587907


In [247]:

y_pred = model.predict(X_test)

from sklearn.metrics import roc_curve, auc

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 5))
plt.plot(fpr, tpr, color='darkorange',
        lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy',
        lw=2, linestyle='--')  
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('AUC')
plt.legend(loc="lower right")

ValueError: multilabel-indicator format is not supported

In [88]:
m.predict(X_test).sum()

3079

In [86]:
# calculate f1score
from sklearn.metrics import f1_score
f1_score(y_test, m.predict(X_test))

0.11928269192020956