In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import re
from collections import defaultdict
from time import sleep
import logging
import innertube
import os

In [None]:
# Directory with the search-stage output, kept with a trailing separator so
# file names can be appended to it directly.
SEARCH_DATA = os.sep.join([os.getcwd(), 'search-data', ''])
df = pd.read_csv(os.path.join(SEARCH_DATA, "search_results_metadata.csv"))
# Distinct values of the 'video_id' column.
seed_video_id = set(df['video_id'].tolist())

In [4]:
def getRecommendationInnerTube(data, max_results=4):
    """Extract the first recommended videos from a watch-next response dict.

    Looks up
    data['contents']['twoColumnWatchNextResults']['secondaryResults']
    ['secondaryResults']['results'] and keeps the entries that carry a
    'compactVideoRenderer' with a 'simpleText' title and a 'videoId'.
    Entries of any other shape are skipped.

    Args:
        data: Parsed response (the value returned by ``client.next(video_id)``
            in this notebook).
        max_results: Maximum number of recommendations to return. Defaults to
            4, the limit that used to be hard-coded.

    Returns:
        A list of ``(title, video_id)`` tuples in response order, at most
        ``max_results`` long, or ``None`` (after printing "None found") when
        the response does not contain the recommendation section.
    """
    try:
        watch_next = data['contents']['twoColumnWatchNextResults']
        recommendations = watch_next['secondaryResults']['secondaryResults']['results']
    except (KeyError, TypeError):
        # Any missing level of the nested structure (or a non-dict payload)
        # means there is nothing to extract; report it instead of raising so
        # a single odd response cannot abort a long crawl.
        print("None found")
        return None

    top_recommendations = []
    if max_results <= 0:
        return top_recommendations

    for rec in recommendations:
        try:
            video_info = rec['compactVideoRenderer']
            title = video_info['title']['simpleText']
            video_id = video_info['videoId']
        except (KeyError, TypeError):
            # Not a plain video entry (or an unexpected shape): skip it.
            continue
        top_recommendations.append((title, video_id))
        if len(top_recommendations) >= max_results:
            break

    return top_recommendations
    
def collect_seed_videos_from_recommendations(id_to_recommendations):
    """Gather the recommended video ids from a recommendations mapping.

    Args:
        id_to_recommendations: Mapping whose values are sequences of
            recommendation entries (element 1 of each entry is the video id).
            Empty or ``None`` values are ignored.

    Returns:
        The set of distinct recommended video ids. The total and distinct
        counts are printed as a side effect.
    """
    collected_ids = [
        entry[1]
        for entries in id_to_recommendations.values()
        if entries
        for entry in entries
    ]
    distinct_ids = set(collected_ids)

    print("Total recommendations collected: " + str(len(collected_ids)))
    print("Total unique videos: " + str(len(distinct_ids)))
    return distinct_ids

def collect_recommendations(seed_video_id):
    """Fetch the top recommendations for every video id in ``seed_video_id``.

    Uses the notebook-level ``client`` to request each video's watch-next
    data and ``getRecommendationInnerTube`` to parse it. Progress (the loop
    index) is printed every 20 videos.

    Args:
        seed_video_id: Iterable of video ids to query.

    Returns:
        A mapping from video id to the parsed recommendation list (``None``
        when nothing could be extracted for that video). If a request fails,
        the error is printed and the results gathered so far are returned
        immediately, so partial progress is not lost.
    """
    id_to_recommendations = defaultdict(list)
    for i, video_id in enumerate(seed_video_id):
        if i % 20 == 0:
            print(i)
        try:
            data = client.next(video_id)
        except Exception as e:
            # Request failed: stop here and hand back what has been
            # collected so the caller can still save it.
            print(e)
            return id_to_recommendations
        try:
            top_recommendations = getRecommendationInnerTube(data)
        except (KeyError, TypeError, AttributeError) as e:
            # One unexpectedly shaped response must not discard the whole
            # crawl; record "no recommendations" for it and carry on.
            print("Could not parse recommendations for " + str(video_id) + ": " + repr(e))
            top_recommendations = None
        id_to_recommendations[video_id] = top_recommendations
    return id_to_recommendations

In [30]:
# Shared InnerTube client, constructed with the "WEB" argument; it is read as
# a global by collect_recommendations() (client.next(video_id)).
client = innertube.InnerTube("WEB")

In [None]:
# manually run each level
# On a fresh kernel there is no previous level to expand yet, so the first run
# starts from the seed video ids loaded from the search results. Later runs
# use the set produced by collect_seed_videos_from_recommendations().
# NOTE(review): assumes level 1 was crawled from seed_video_id - confirm.
try:
    level_recommendations
except NameError:
    level_recommendations = seed_video_id
id_to_recommendations = collect_recommendations(level_recommendations)

In [33]:
# Flatten the recommendations just collected into a set of video ids; this set
# is what the crawl cell passes to collect_recommendations() on its next run.
level_recommendations = collect_seed_videos_from_recommendations(id_to_recommendations)

Total recommendations collected: 202799
Total unique videos: 126585


In [34]:
# Save the current id -> recommendations mapping as indented JSON in the
# working directory.
# NOTE(review): the level number in the file name is hard-coded; presumably it
# is edited by hand before each manual run - confirm, otherwise a re-run
# overwrites this level's file.
with open('top4-recommendations-level5.json', "w") as json_file:
    json.dump(id_to_recommendations, json_file, indent=4)

In [None]:
# Folder (relative to the working directory) that holds the per-level
# recommendation files.
RECOMMENDATION_DIR = ''.join([os.getcwd(), '/oud-audit/data-collection-pipeline/recommendation-data/'])

# Union of the recommended video ids found in the level 1 to 5 files.
unique_videos = set()
for i in range(1, 6):
    str_file = ''.join(('top4-recommendations-level', str(i), '.json'))
    with open(RECOMMENDATION_DIR + str_file, 'rb') as f:
        id_to_recommendations = json.load(f)
    set_recommendations = collect_seed_videos_from_recommendations(id_to_recommendations)
    unique_videos |= set_recommendations

Total recommendations collected: 6356
Total unique videos: 3107
Total recommendations collected: 12412
Total unique videos: 8489
Total recommendations collected: 33916
Total unique videos: 21849
Total recommendations collected: 87224
Total unique videos: 55248
Total recommendations collected: 202799
Total unique videos: 126585


In [8]:
# Number of distinct recommended video ids accumulated across the loaded levels.
len(unique_videos)

164085

In [41]:
# Write every distinct recommended video id to a single JSON array.
# sorted() is used instead of list(): the iteration order of a set of strings
# changes between kernel sessions, so sorting keeps the file identical from
# run to run (and diff-friendly). Assumes every id is a string - TODO confirm.
with open('recommendations-all-level-unique-video-id.json', "w") as json_file:
    json.dump(sorted(unique_videos), json_file, indent=4)