In [6]:
#!pip install google-api-python-client
import googleapiclient.discovery
import traceback
from tools import load_api_cfg

# Load the API access attributes from the local config file, then build
# the YouTube Data API service handle used by every function below.
# NOTE(review): an "apicfg is not None" guard was previously commented out
# here, so load_api_cfg may be able to return None - confirm.
apicfg = load_api_cfg('cfg.json')
youtube = googleapiclient.discovery.build(
  apicfg['service_name'],
  apicfg['version'],
  developerKey=apicfg['key'],
)




In [7]:
def get_channel_info(channel_id):
  '''
  Fetches the channel identified by the provided channel ID.

  Input: Channel ID
  Returns: Dictionary representing channel information, or None when the
           request fails or the response cannot be processed (a message
           and the traceback are printed in that case).
  '''
  try:
    # Prepare the API request
    request = youtube.channels().list(
      part="snippet,contentDetails,statistics,status",
      id=channel_id)
    # Execute the API request
    response = request.execute()
    item = response["items"][0]
    snippet = item["snippet"]
    statistics = item["statistics"]

    # Extract from the response, the channel attributes that we are interested in
    data = dict()
    data["id"] = item["id"]
    data["name"] = snippet["title"]
    data["type"] = item["kind"]
    data["description"] = snippet["description"]
    data["playlistId"] = item["contentDetails"]["relatedPlaylists"]["uploads"]
    data["views"] = statistics["viewCount"]
    data["hidden_subscribers"] = statistics["hiddenSubscriberCount"]
    # The subscriber count is only read when the channel does not hide it
    data["subscriber_count"] = None
    if not data["hidden_subscribers"]:
      data["subscriber_count"] = statistics["subscriberCount"]
    data["status"] = item["status"]["privacyStatus"]
    data["video_count"] = statistics["videoCount"]
    return data
  except Exception:
    # Best effort: report the failure and hand back None instead of raising.
    # print_exc() writes the traceback itself, so it must not be wrapped
    # in print() (that would additionally print "None").
    print('Failed to fetch or process channel info from Youtube Data API v3')
    traceback.print_exc()
    return None


In [8]:
def get_playlist_info(channel_id):
  '''
  Fetches the playlists for given channel.

  Only a single request is made, asking for at most 10 playlists.

  Input: Channel ID
  Returns: Dictionary keyed by playlist ID, each value a dictionary with
           the playlist's channelId, name and id; or None when the request
           fails or the response cannot be processed (a message and the
           traceback are printed in that case).
  '''
  try:
    # Prepare the API request
    request = youtube.playlists().list(part="snippet", channelId=channel_id, maxResults=10)

    # Execute the API request
    response = request.execute()

    # Extract from the response, the playlist attributes that we are interested in
    play_lists = dict()
    for obj in response["items"]:
      play_lists[obj["id"]] = {
        "channelId": obj["snippet"]["channelId"],
        "name": obj["snippet"]["title"],
        "id": obj["id"],
      }
    return play_lists
  except Exception:
    # Best effort: report the failure and hand back None instead of raising.
    # print_exc() writes the traceback itself, so it is not wrapped in print().
    print('Failed to fetch or process playlist info from Youtube Data API v3')
    traceback.print_exc()
    return None


In [9]:
def get_comments_info(channel_id):
  '''
  Fetches comments for given channel.

  Only a single request is made, asking for at most 20 comment threads.

  Input: Channel ID
  Returns: Dictionary of dictionaries keyed by thread ID, each representing
           a comment thread of the given channel; or None when the request
           fails or the response cannot be processed (a message and the
           traceback are printed in that case).
  '''
  try:
    # Prepare the API request
    request = youtube.commentThreads().list(part="snippet", allThreadsRelatedToChannelId=channel_id, maxResults=20)

    # Execute the API request
    response = request.execute()

    # Extract from the response, the comment attributes that we are interested in
    comment_threads = dict()
    for comment in response["items"]:
      # Text, author and date come from the thread's top-level comment
      top_level = comment["snippet"]["topLevelComment"]["snippet"]
      comment_threads[comment["id"]] = {
        "id": comment["id"],
        "video_id": comment["snippet"]["videoId"],
        "text": top_level["textDisplay"],
        "author": top_level["authorDisplayName"],
        "published_date": top_level["publishedAt"],
      }
    return comment_threads
  except Exception:
    # Best effort: report the failure and hand back None instead of raising.
    # print_exc() writes the traceback itself, so it is not wrapped in print().
    print('Failed to fetch or process comment threads info from Youtube Data API v3')
    traceback.print_exc()
    return None



In [95]:
'''
  Fetches video information for every video uploaded to a channel
  Input: Channel dictionary (its 'id' and 'playlistId' entries are read)
  Returns: Dictionary of dictionaries, each representing video
           info for every video for a channel
'''
import re
import time

# Duration layout handled: P[#D][T[#H][#M][#S]] - every component is optional,
# so values such as PT1H, PT5M, PT1H30S or P1D are recognised as well.
_DURATION_PATTERN = re.compile(
  r'P(?:(?P<days>\d+)D)?'
  r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?'
)

# Helper to convert API duration string to duration in seconds
def to_seconds(dur_str):
  '''
  Converts a duration string such as 'P1DT2H3M4S', 'PT2H3M4S', 'PT3M4S',
  'PT4S', 'PT1H' or 'PT5M' into a number of seconds.

  Input: Duration string
  Returns: Duration in seconds as an int; 0 when the string does not start
           with a recognised duration (components that are absent count as 0)
  '''
  m = _DURATION_PATTERN.match(dur_str)
  if m is None:
    return 0
  # Absent components come back as None from the match and count as zero
  days, hours, minutes, seconds = (
    int(value) if value else 0
    for value in m.group('days', 'hours', 'minutes', 'seconds')
  )
  return (days*86400) + (hours*3600) + (minutes*60) + seconds

def get_uploaded_video_ids(channel):
  '''
  Helper to fetch video Ids for this channel
  using the playlistItems API call.

  Pages through the playlist named by channel['playlistId'], 50 entries per
  request, until the API stops returning a next-page token.

  Input: Channel dictionary (only its 'playlistId' entry is read)
  Returns: List of video Ids found in the playlist
  '''
  page_size = 50
  video_ids = list()
  req_ct = 0
  next_page = None
  while True:
    req_ct += 1
    # Build the request; the page token is only sent from the second page on
    params = dict(part='snippet', playlistId=channel['playlistId'], maxResults=page_size)
    if next_page:
      params['pageToken'] = next_page

    # Execute API Request
    response = youtube.playlistItems().list(**params).execute()

    # Collect the Video Ids (entries that are not videos are skipped)
    for item in response['items']:
      resource = item['snippet']['resourceId']
      if resource['kind'] == 'youtube#video':
        video_ids.append(resource['videoId'])

    # A missing or empty token means this was the last page
    next_page = response.get('nextPageToken')
    if not next_page:
      break

  print (f'''Total {len(video_ids)} video Ids fetched via {req_ct} requests.''')
  return video_ids

def get_video_info(channel):
  '''
  Fetches video information for every video in the channel's upload playlist.

  The video Ids are obtained through get_uploaded_video_ids() and then
  requested in batches of 50 Ids per API call.

  Input: Channel dictionary (its 'id' and 'playlistId' entries are read;
         its 'video_count' entry is overwritten with the number of Ids found)
  Returns: Dictionary of dictionaries keyed by video Id, each representing
           one video; or None when fetching or processing fails (a message
           and the traceback are printed in that case).
  '''
  try:
    # Get video ids for the channel's upload playlist
    video_ids = get_uploaded_video_ids(channel)
    channel['video_count'] = len(video_ids)

    page_size = 50
    req_ct = 0
    video_data = dict()

    # Walk the Id list one batch at a time
    for batch_start in range(0, len(video_ids), page_size):
      vids = ",".join(map(str, video_ids[batch_start:batch_start + page_size]))
      # Execute the API request
      request = youtube.videos().list(part="snippet,contentDetails,statistics", id=vids, maxResults=page_size)
      response = request.execute()
      req_ct += 1

      # Extract from the response, the video attributes that we are interested in
      for video in response["items"]:
        snippet = video["snippet"]
        statistics = video["statistics"]
        content = video["contentDetails"]

        data = dict()
        data["id"] = video["id"]
        data["playlist_id"] = channel['playlistId']
        data["channel_id"] = channel['id']

        data["name"] = snippet["title"]
        data["description"] = snippet["description"]
        data["published_date"] = snippet["publishedAt"]

        # Counters missing from the response default to 0
        data["view_count"] = statistics.get("viewCount", 0)
        data["like_count"] = statistics.get("likeCount", 0)
        data["dislike_count"] = 0 # Private information that cannot be obtained - setting to 0
        data["favorite_count"] = statistics.get("favoriteCount", 0)
        data["comment_count"] = statistics.get("commentCount", 0)

        data["duration"] = to_seconds(content["duration"])

        # Thumbnail processing: keep only the URL of each thumbnail variant
        data["thumbnails"] = {key: thumb["url"] for key, thumb in snippet["thumbnails"].items()}

        data["caption_status"] = content["caption"]
        video_data[data["id"]] = data

    print(f"Total {req_ct} API requests fetched {len(video_data)} videos.")
    return video_data
  except Exception:
    # Best effort: report the failure and hand back None instead of raising.
    # print_exc() writes the traceback itself, so it is not wrapped in print().
    print('Failed to fetch or process video info from Youtube Data API v3')
    traceback.print_exc()
    return None



In [11]:
def get_all_info(channel_id):
  '''
  Fetches all the YouTube stats in oneshot - channel, playlist, comment threads and video.

  Input: Channel ID
  Returns: Dictionary with the keys "channel", "playlists", "comments" and
           "videos", each holding the result of the corresponding fetch
           function
  '''
  stats = dict()
  # Fetch channel information
  stats["channel"] = get_channel_info(channel_id)

  # Fetch playlist information for the above channel
  stats["playlists"] = get_playlist_info(channel_id)

  # Fetch comment threads for the above channel
  stats["comments"] = get_comments_info(channel_id)

  # Fetch videos uploaded for this channel; the video lookup works from the
  # channel dictionary stored under "channel" above
  stats["videos"] = get_video_info(stats["channel"])
  return stats

In [101]:
'''
  Test Program that triggers all the 4 APIs
'''
# Test channels (display name -> channel ID); going by the variable name,
# these are the ones with a high video count
channel_ids_high_video_count = {
  "Tamil Pokkisham": "UCS84kz7Fs8bzRs6xcPY9lQQ",
  "CNN": "UCupvZG-5ko_eiXAupbDfxWw",
  "TEDx Talks": "UCsT0YIqwnpJCM-mx7-gSA4Q",
  "WION": "UC_gUM8rL-Lrg6O3adPW9K1g",
  "The Indian Mystics": "UCcnBJHMugWpuy6Y7-SlI1ew",
  "Future Technology": "UCoIPNbr4UXhhZXDBZaUNHwA",
  "SonyMusicSouthVEVO": "UCTNtRdBAiZtHP9w7JinzfUg",
  "NBA": "UCWJ2lWNubArHWmf3FIHbfcQ",
  "moneycontrol": "UChftTVI0QJmyXkajQYt2tiQ",
  "GUVI": "UCduIoIMfD8tT3KoU0-zBRgQ",
  "Neural networks": "UCYO_jab_esuFRV4b17AJtAw",
}

# Second set of test channels (display name -> channel ID)
channel_ids = {
  "GUVI": "UCduIoIMfD8tT3KoU0-zBRgQ",
  "Top Vegan": "UCRllpsGcCAd974xKkPL7KyQ",
  "Hebbar's Kitchen": "UCPPIsrNlEkaFQBk-4uNkOaw",
  "Informit": "UC7s0i5C8EeQ7ckWCf-SXOgQ",
  "Simply Chess": "UCHU81qoYaZinuhdLoruBQ9g",
  "KG SPORTS": "UChf6U5RGIeZ165MQgJbKrKA",
  "Sky Editz": "UCNMYiqPGEJTtcImSyaEq62w",
  "MegaBuilds": "UC6TY36Ys_J6UPKd75TpRuUw",
  "The B1M": "UC6n8I1UDTKP1IWjQMg6_TwA",
  "Pat Kay": "UCeMvA8xJIGgvEjO0kgGFOpg",
  "NASA Climate Change": "UCP_hZt43bbGGf9ah6ATOvEg",
}

# Pick one of the channel IDs from the test channels above
channel_id = channel_ids["GUVI"]
# Show the ID that was actually selected
print(f'channel_id={channel_id}')

# Fetch & print channel info
channel = get_channel_info(channel_id)
print(f'''channel={channel}''')

# Fetch & print playlist info
playlists = get_playlist_info(channel_id)
print(f'''playlists={playlists}''')

# Fetch & print comment threads info
comments = get_comments_info(channel_id)
print(f'''comments={comments}''')

# Fetch & print video info for channel's videos
# (get_video_info works from the channel dictionary fetched above)
print(f'''videos={get_video_info(channel)}''')

channel_id=test_channels["GUVI"]
channel={'id': 'UCduIoIMfD8tT3KoU0-zBRgQ', 'name': 'GUVI', 'type': 'youtube#channel', 'description': 'GUVI is an IIT-M & IIM-A incubated edu-tech company that offers Wide range of online Tech courses in your Native language.\nWith 100% placement support, Globally recognized certifications, Mentors from top global product companies  and Platforms improve your Programming and IT skills GUVI does it all. With over 18 Lakh+ learners and 9+ years of experience we provide apex learning through our Edu-tech services, GUVI shares its most loved Tech-videos on YouTube!\n\nWhat’s on our YouTube Channel?\nCareer guidance videos that give an idea for beginners & early professionals to upgrade their careers.\nNew initiatives & offers that unlock your flexible path for learning the latest technologies.\nWalkthrough videos of our practice platforms that shape up your coding & web development skills.\n\nKnow more by visiting our official website - www.guvi.in\n\n', 'pl

In [12]:
'''
  Simple test for ID batching for video requests,
  without having to exhaust API quota
'''
import random
import string

# Function to generate list of random IDs with given length
def generate_random_strings(str_ct, str_len):
  '''
  Builds str_ct random strings, each str_len characters long, drawn from
  upper-case letters, digits and lower-case letters.
  '''
  alphabet = string.ascii_uppercase + string.digits + string.ascii_lowercase
  return [''.join(random.choices(alphabet, k=str_len)) for _ in range(0, str_ct)]

# Test code: verify that the IDs are split into batches of at most
# page_size entries, printing the Ids that each request would carry
page_size = 50
video_ids = generate_random_strings(101, 10)
req_ct = 0
batch_start, batch_end = 0, page_size
while batch_start < len(video_ids):
  req_ct += 1
  batch = video_ids[batch_start:batch_end]
  vids = ",".join(str(video_id) for video_id in batch)
  print(f'''request[{req_ct}]: vids:{vids}''')

  # Set the next batch: it starts where this one ended and never runs
  # past the end of the Id list
  batch_start = batch_end
  batch_end = batch_start + min(page_size, (len(video_ids)-batch_start))



request[1]: vids:anuGljJXuq,UlYjWameeC,5yqPpYSwks,93xVB4EmL5,532UJX6MGg,KWv94QnJFY,6ZrMynVMyi,zE0KqqCP47,1RLKYC5fdX,vVK9NeBngq,qXhVQ8aEBk,MhztdBoLgR,P9riQ48aLP,2grQ5xnJzg,gCtUDjauqF,oLDN1qZlXk,9qODAFxSGe,FlCe0aPPxE,B3RJOoqk7R,gcu3Dsdlh4,bBfvnVualh,KgJ1WrZexm,vh6Wl8grKF,CAJr18oYhL,MNiFc69QGq,a25LK5qUer,zuEE1fuPC5,pzaYqB9RxQ,zi0A64CicQ,9hNZxw0rZ1,6akjJW0uNX,Co8hYYvL0d,LJdC95yC1W,XIBPDvLl7R,sW6FU2nEtn,SU7K45o0O8,RTayrjdNTi,ij6GIUmlBZ,4H4PELr9L3,xn5DZN2bMw,3vgdzIlDB7,fORqFWVY4h,hTfpA18iv0,aDwb94irST,YacwZNUvwP,22iGNggRG1,0mAC2iTwdg,i20D90OrVa,Zl97J9e6tj,5BYsHUZkF9
request[2]: vids:YHqf7H0UTH,gA3cwv8qqb,FqUE4IA88C,JsrFBjLg2u,BWVwVdrN5j,FoU01UuSr9,MyDdEgwcca,RVv2msxMyl,QUsQtwy0p7,urlJoGuqY2,aBEqlUeIP1,RZxBq5wt2A,HCtpGK4Ghn,HXytKvZNr0,tCFji83trC,akuK9JZfZO,d0jNkpyJuu,4O3Mw6SGWw,I2fzDHBdl9,j5eqIPcKud,O1xVa8jB32,Kqt5kRmszv,ktyZQ0viEy,BmhnbF2gQr,u9RXR4bHcY,kgE3lqoeJ8,7TtN6rtMCk,cMzxFTBQdy,4cnUSYXm0X,dwSkojbOfn,9A8Jqd4A6q,2XLV4ETgs5,yLSwxccw0A,0JsCOh4EPv,gOSAGaUMKi,Tm6rMM6FYh,J5DMDa5ZyG,afRDPu8wW