In [174]:
import os
import numpy as np
import pandas as pd
import requests
import tqdm.notebook
import json
from googleapiclient.discovery import build as build_api_client
from loaders import load_semi_supervised, load_categories, load_all_videos, load_csv, load_thumbnails

In [172]:
# YouTube Data API key used by every request in this notebook.
# Prefer the YOUTUBE_API_KEY environment variable so the real credential never has to be
# pasted into (and saved with) the notebook; the placeholder is kept as a fallback so the
# previous "edit this cell by hand" workflow still works.
key = os.environ.get('YOUTUBE_API_KEY', 'API_KEY_HERE')

In [129]:
def search_relevant_videos(region_code, video_id):
    """Return the ids of videos the search endpoint reports as related to ``video_id``.

    The query is restricted to videos published between 2017-11-15 and 2018-06-16,
    ordered by relevance, with at most 50 results for the given ``region_code``.

    Returns:
        list of video id strings; empty when the endpoint answers 404.

    Raises:
        Exception: carrying the HTTP status code for any status other than 200 or 404.
    """
    query = dict(
        publishedAfter='2017-11-15T00:00:00Z',
        publishedBefore='2018-06-16T00:00:00Z',
        order='relevance',
        part='id',
        maxResults=50,
        regionCode=region_code,
        type='video',
        relatedToVideoId=video_id,
        key=key,
    )

    reply = requests.get('https://www.googleapis.com/youtube/v3/search', params=query)
    status = reply.status_code

    # A 404 is treated as "nothing related", not as a failure.
    if status == 404:
        return list()
    if status != 200:
        raise Exception(status)

    return [entry['id']['videoId'] for entry in reply.json()['items']]

In [63]:
def get_videos_data(video_ids):
    """Fetch id, snippet and statistics for a batch of videos.

    Args:
        video_ids: iterable of video id strings (lists, numpy arrays and generators
            are all accepted); the ids are sent as one comma-separated ``id`` filter.

    Returns:
        dict mapping each returned video id to its raw API item. Empty when
        ``video_ids`` is empty or when the endpoint answers 404.

    Raises:
        Exception: carrying the HTTP status code for any status other than 200 or 404.
    """
    # Materialise first: callers pass numpy arrays (whose truthiness is ambiguous when
    # empty) and a generator could otherwise only be consumed once.
    video_ids = list(video_ids)

    # Nothing to look up. Previously this still issued a request with `id=''`, which
    # spends quota and is not a meaningful videos.list query.
    if not video_ids:
        return dict()

    payload = {
      'part': 'id,snippet,statistics',
      'id': ','.join(video_ids),
      'key': key
    }

    response = requests.get('https://www.googleapis.com/youtube/v3/videos', params=payload)

    # A 404 is treated as "no data", not as a failure.
    if response.status_code == 404:
        return dict()
    if response.status_code != 200:
        raise Exception(response.status_code)

    return {item['id']: item for item in response.json()['items']}

In [28]:
# Empty frame that fixes the column layout used for collected non-trending videos.
non_trending = pd.DataFrame(
    columns=[
        # identity
        'video_id', 'relevant_to',
        # snippet fields
        'title', 'channel_title', 'category_id', 'publish_time', 'tags',
        # statistics
        'views', 'likes', 'dislikes', 'comment_count',
        # text / image fields
        'thumbnail_link', 'description', 'image_url',
    ]
)

In [29]:
def filter_trending(videos_ids):
    """Return, in order, the ids from ``videos_ids`` that are absent from ``trending_videos_ids``.

    ``trending_videos_ids`` is a notebook-level global and must be defined before
    this function is called.
    """
    def is_not_trending(candidate):
        return candidate not in trending_videos_ids

    return list(filter(is_not_trending, videos_ids))

In [30]:
def load_into_non_trending_df(relevant_to, content_dict, non_trending):
    """Return ``non_trending`` extended with one row per video in ``content_dict``.

    Args:
        relevant_to: id of the trending video these items were found through;
            stored verbatim in the ``relevant_to`` column of every new row.
        content_dict: mapping of video id -> raw API item. Each item must carry
            ``id``, ``snippet`` (with ``publishedAt`` and ``thumbnails['default']``)
            and ``statistics``.
        non_trending: frame to extend. It is not modified.

    Returns:
        A new DataFrame with the existing rows followed by the new ones and a fresh
        0..n-1 index. When ``content_dict`` is empty, ``non_trending`` itself is returned.

    Raises:
        KeyError: if an item lacks one of the required keys listed above.

    Notes:
        * Rows are built by column *name* and added with a single ``pd.concat``.
          ``DataFrame.append`` no longer exists in pandas >= 2.0, and the former
          positional construction failed whenever ``non_trending`` carried an extra
          column (e.g. an ``Unnamed: 0`` left over from a CSV round trip).
        * ``publishedAt`` is parsed as ISO 8601 in UTC. The former explicit format
          string had no place for the trailing ``Z`` of the API timestamps.
        * Missing statistics default to ``-1``; present ones are kept as delivered
          by the API (no numeric conversion is applied here).
    """
    fields = ['video_id', 'relevant_to', 'title', 'channel_title', 'category_id',
              'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
              'thumbnail_link', 'description', 'image_url']
    # Largest-first preference for the image that is stored in `image_url`.
    preferred_sizes = ('standard', 'high', 'medium', 'default')

    records = []
    for item in content_dict.values():
        snippet = item['snippet']
        statistics = item['statistics']
        thumbnails = snippet['thumbnails']
        thumbnail_link = thumbnails['default']['url']

        # 'default' is always present at this point, so the fallback is only a safety net.
        image_url = next(
            (thumbnails[size]['url'] for size in preferred_sizes if size in thumbnails),
            thumbnail_link,
        )

        records.append({
            'video_id': item['id'],
            'relevant_to': relevant_to,
            'title': snippet.get('title', ''),
            'channel_title': snippet.get('channelTitle', ''),
            'category_id': int(snippet.get('categoryId', '0')),
            'publish_time': pd.to_datetime(snippet['publishedAt'], utc=True),
            'tags': snippet.get('tags', []),
            'views': statistics.get('viewCount', -1),
            'likes': statistics.get('likeCount', -1),
            'dislikes': statistics.get('dislikeCount', -1),
            'comment_count': statistics.get('commentCount', -1),
            'thumbnail_link': thumbnail_link,
            'description': snippet.get('description', ''),
            'image_url': image_url,
        })

    if not records:
        return non_trending

    new_rows = pd.DataFrame(records, columns=fields)

    if len(non_trending) == 0:
        # Nothing to concatenate with; just keep the column layout of the input frame.
        layout = list(non_trending.columns) + [c for c in fields if c not in non_trending.columns]
        return new_rows.reindex(columns=layout)

    return pd.concat([non_trending, new_rows], ignore_index=True)

In [31]:
def load_videos(relevant):
    """Fetch API data for the non-trending ids in ``relevant``.

    The ids are first passed through ``filter_trending`` and then split into three
    batches, each fetched with ``get_videos_data``.

    Args:
        relevant: iterable of candidate video ids.

    Returns:
        dict mapping video id -> raw API item, merged over all batches.
    """
    videos_data_dict = dict()
    candidates = filter_trending(relevant)

    for video_ids in np.array_split(candidates, 3):
        # With fewer than three candidates `array_split` still yields three chunks,
        # some of them empty; an empty chunk has nothing to request.
        if len(video_ids) == 0:
            continue
        videos_data_dict.update(get_videos_data(video_ids))

    return videos_data_dict

In [166]:
# Collect non-trending videos related to every known trending video, resuming from the
# checkpoint files written by earlier runs.
processed_df = load_all_videos()
trending_videos_ids = set(processed_df['video_id'])

# Checkpoint: ids of trending videos whose related videos were already stored.
with open('../youtube_data/loaded_relevant_videos_ids.json') as f:
    loaded_videos = json.load(f)

# Set mirror of the checkpoint list: membership is tested once per trending id, and a
# linear scan of the list for each of them grows quadratically with the checkpoint size.
# The list itself is kept because it is what gets serialised back to JSON.
already_loaded = set(loaded_videos)

non_trending = pd.read_csv('../youtube_data/non_trending.csv', header=0)

try:
    for relevant_to in trending_videos_ids:
        if relevant_to in already_loaded:
            continue

        relevant_videos_ids = search_relevant_videos('US', relevant_to)
        relevant_videos_ids = filter_trending(relevant_videos_ids)
        print('------------------------ ----------------------')
        print(relevant_to)
        print(relevant_videos_ids)

        # Ids that yield no related videos are not checkpointed, so a later run
        # queries them again.
        if len(relevant_videos_ids) > 0:
            content = load_videos(relevant_videos_ids)
            non_trending = load_into_non_trending_df(relevant_to, content, non_trending)
            # Persist after every processed id so an interruption loses at most one batch.
            non_trending.to_csv('../youtube_data/non_trending.csv', index=False)
            loaded_videos.append(relevant_to)
            already_loaded.add(relevant_to)
except Exception as e:
    # Best effort: report the failure (e.g. a non-200 API status) and fall through to
    # the checkpoint write below instead of aborting the cell.
    print('ERROR:')
    print(e)
finally:
    # Always save the checkpoint, whether the loop finished or failed part-way.
    with open('../youtube_data/loaded_relevant_videos_ids.json', 'w') as f:
        json.dump(loaded_videos, f)

------------------------ ----------------------
NBkC83-vh9A
[]
------------------------ ----------------------
gvuvxLzUB-w
[]
------------------------ ----------------------
76XBys_HXq4
[]
------------------------ ----------------------
4MtAl7F6xio
[]
------------------------ ----------------------
abSc5wLOqBM
[]
------------------------ ----------------------
J3IbiFCR-Xs
[]
------------------------ ----------------------
aAm6AHr14zY
[]
------------------------ ----------------------
VoUEz9yQJ8w
[]
------------------------ ----------------------
JVapiOu7Hkc
[]
------------------------ ----------------------
g7IfZX91qV0
[]
------------------------ ----------------------
erybgSeUHus
[]
------------------------ ----------------------
2omJBSQcm5g
[]
------------------------ ----------------------
IE5TzELzvmU
[]
------------------------ ----------------------
QaASZecxQww
[]
------------------------ ----------------------
WF82ABLw8s4
[]
------------------------ ----------------------
Lx_rI8A

------------------------ ----------------------
tEmodtG8NV0
['XiATXPJdIkw', 'AFVkIVdukog', 'lwa4Ov_Olms', 'dfvZ7WgVdQI', 'kIQS6xCeDcM', '-yzi4b2ZC_8', 't5kEMpTUWko', 'TwmN2K4InKs', '_TQqBcFQrIY', 'leZHuQLipJE', 'AXgGL5n98AI', '59r1cY7dzJ4', 'QL56YrRNfBk', 'PlQ_sIaFh1I', 'QDF48Hm0nao', 'dIVkzl1FrR0', '0ldaddX6G3s', 'Gq1tLdC4Syk', 'CkpOW8RIRtc', 'jQrdTvblSWY', 'FoL5nyuxJcQ', 'r048unNasnY', 'qidW2UssmUg', 'Ycm9hOQlO-g', 'vS-Jt15zoYk', '5R7oFp9rrc0', 'wySmP_EHCQQ', 'ZErZI1UPL9s', 'KWzsFgsKYK8', 'OdN82UbU62k', 'MaSUTSrU1WU', 'xjM5cPLaWZ4', 'DP48tzb-aRU', 'VUyv6Es6N7U', 'k00oTlt9Hks', 'LKGvk4bBvPY', '5m296w04it4', '974XDN9CQhk', 'y_FPY87idFc', 'd3zFnKwt4lQ', 'HUKhXjOhisA', 'uJotDcBu9Mo', 'yE9I_Wxwnek', '6vri36Y37PQ', '4gn6np-kNCA', 'n67Cgn3BtvI']
------------------------ ----------------------
E5yFcdPAGv0
['1V_xRb0x9aw', 'WXR-bCF5dbM', 'E2Q52cVx7Bo', 'HyHNuVaZJ-k', '5qJp6xlKEug', 'cLnkQAeMbIM', 'CKti7QixnJI', 'C-yP9f0gadU', 'PiNdcBg3xC8', 'nhPaWIeULKk', 'Tq7Ovshz1UI', 'hji4gBuOvIQ', 'lRlmM88

In [171]:
# Number of collected rows, followed by a preview of the first five of them.
print(non_trending.shape[0])
non_trending.head(5)

8798


Unnamed: 0,video_id,relevant_to,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,description,image_url
0,JKmXamRF9wM,817P8W8-mGE,Ed Sheeran ‒ Perfect Duet (Lyrics) ft. Beyoncé,SyrebralVibes,10,2017-12-01 04:04:01+00:00,"['syrebralvibes', 'syrebral vibes', 'electroni...",5434852,40379,2014,1347,https://i.ytimg.com/vi/JKmXamRF9wM/default.jpg,» Download Ed Sheeran ‒ Perfect Duet (Lyrics) ...,https://i.ytimg.com/vi/JKmXamRF9wM/sddefault.jpg
1,lp-EO5I60KA,817P8W8-mGE,Ed Sheeran - Thinking Out Loud [Official Video],Ed Sheeran,10,2014-10-07 13:57:37+00:00,"['edsheeran', 'ed sheeran', 'acoustic', 'live'...",3014436913,11361560,499596,315492,https://i.ytimg.com/vi/lp-EO5I60KA/default.jpg,Tickets for the Divide tour here - http://www....,https://i.ytimg.com/vi/lp-EO5I60KA/sddefault.jpg
2,bIB8EWqCPrQ,817P8W8-mGE,Ed Sheeran - Supermarket Flowers [Official Audio],Ed Sheeran,10,2017-03-03 07:11:55+00:00,"['edsheeran', 'ed sheeran', 'acoustic', 'live'...",127235566,651588,17037,26463,https://i.ytimg.com/vi/bIB8EWqCPrQ/default.jpg,÷. Out Now: https://atlanti.cr/yt-album\n\nSub...,https://i.ytimg.com/vi/bIB8EWqCPrQ/sddefault.jpg
3,1BR3ycghvTU,jCVkaZChKmc,WET N WILD Photo Focus Foundation Review & Wea...,Casey Holmes,26,2017-01-18 02:47:23+00:00,"['wet n wild', 'drugstore makeup', 'wear test'...",1456075,30469,684,2151,https://i.ytimg.com/vi/1BR3ycghvTU/default.jpg,Don't forget to subscribe here-- http://bit.ly...,https://i.ytimg.com/vi/1BR3ycghvTU/sddefault.jpg
4,NCJUpiLA8Y0,jCVkaZChKmc,DRUGSTORE First Impression 😍💦 Wet n Wild Found...,Shaaanxo,26,2018-01-16 20:14:25+00:00,"['first impression', 'wet n wild photo focus',...",256849,11272,110,953,https://i.ytimg.com/vi/NCJUpiLA8Y0/default.jpg,Today I'm testing the Wet n Wild Photo Focus F...,https://i.ytimg.com/vi/NCJUpiLA8Y0/sddefault.jpg


In [175]:
# Run the project's thumbnail loader over the collected frame. `load_thumbnails` is
# imported from the local `loaders` module and its implementation is not shown here;
# judging by the recorded output it displays a progress bar and prints counts of
# downloaded, broken, already-present and skipped links.
# NOTE(review): presumably it reads one of the URL columns of `non_trending` — confirm in loaders.
load_thumbnails(non_trending)

100%|██████████| 8600/8600 [05:47<00:00, 24.75it/s]

Downloaded: 8600
Broken links: 0
Already present: 0
Skipped: 0



