# Data Scrapping using YouTube API for Most Popular Youtube Channels

## Importing Libraries

In [57]:
import pandas as pd
import numpy as np

# Google API
from googleapiclient.discovery import build

## Data Creation with YouTube API

In [58]:
api_key = "##############5wLLlYnzcjlEeR_##########"

   #summary:
   # Here as per the requirements we collected channel ids from their respective Channels.
   # Krish Naik: https://www.youtube.com/user/krishnaik06
   # Hitesh Choudhary: https://www.youtube.com/c/HiteshChoudharydotcom
   # Naveen Reddy(Telusko): https://www.youtube.com/c/Telusko
   # Saurabh Shukla(My SirG.com): https://www.youtube.com/user/saurabhexponent1"""


channel_ids = ['UCNU_lfiiWBdtULKOw6X0Dig', # Krish Naik
               'UCXgGY0wkgOzynnHvSEVmE3A', # Hitesh Choudhary
               'UC59K-uG2A5ogwIrHw4bmlEg', # Telusko
               'UCkGS_3D0HEzfflFnG0bD24A', # My SirG.com
]

youtube = build('youtube', 'v3', developerKey=api_key)

### Channel Statistics

In [39]:
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    channels_ids: list of channel IDs
    
    Returns:
    Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist
    
    """
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute() 
    
    for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    description = response['items'][i]['snippet']['description'],
                    publishdate = response['items'][i]['snippet']['publishedAt'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'],
                    thumbnail =  response['items'][i]['snippet']['thumbnails']['high']['url']
                    )
        all_data.append(data)
    
    return pd.DataFrame(all_data)

In [40]:
channel_data = get_channel_stats(youtube, channel_ids)
#channel_data

In [41]:
channel_data.dtypes

channelName    object
description    object
publishdate    object
subscribers    object
views          object
totalVideos    object
playlistId     object
thumbnail      object
dtype: object

I noticed that the numeric columns are in string format, so I will convert them into numeric so that we can visualize and do numeric operations on them.

In [42]:
# Convert count columns to numeric columns
numeric_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

## Get Video Statistics for all the Channels

In [59]:
def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel
    
    Returns:
    List of video IDs of all videos in the playlist
    
    """
    
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()
    
    video_ids = []
    
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    more_pages = True
    
    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()
    
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])
            
            next_page_token = response.get('nextPageToken')
        
    return video_ids[:50] # we only want 50 video from each channel

In [60]:
def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with statistics of videos, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    """
        
    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
    return all_video_info

In [61]:

video_df = pd.DataFrame()
#comments_df = pd.DataFrame()

for c in channel_data['channelName'].unique():
    
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
        
    # get video data
    video_data = get_video_details(youtube, video_ids)
    # get comment data
    #comments_data = get_comments_in_videos(youtube, video_ids)

    # append video data together and comment data toghether
   
    video_df = video_df.append(video_data)
    #comments_df = comments_df.append(comments_data, ignore_index=True)
       
   

Getting video information from channel: Krish Naik


  video_df = video_df.append(video_data)


Getting video information from channel: Hitesh Choudhary


  video_df = video_df.append(video_data)


Getting video information from channel: MySirG.com


  video_df = video_df.append(video_data)


Getting video information from channel: Telusko


  video_df = video_df.append(video_data)


In [92]:
video_df.dtypes

video_id          object
channelTitle      object
title             object
description       object
tags              object
publishedAt       object
viewCount         object
likeCount         object
favouriteCount    object
commentCount      object
duration          object
definition        object
caption           object
dtype: object

In [94]:
cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[cols] = video_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

## Get Comment Data

In [76]:
def get_comments_in_videos(youtube, video_ids):
    all_comments = []

    for video_id in video_ids:
        data = youtube.commentThreads().list(
            part='snippet,replies', 
            videoId=video_id, 
            maxResults='100', 
            textFormat="plainText").execute()

        for i in data["items"]:
            commentsData = dict(

            name = i["snippet"]['topLevelComment']["snippet"]["authorDisplayName"],
            comment = i["snippet"]['topLevelComment']["snippet"]["textDisplay"],
            published_at = i["snippet"]['topLevelComment']["snippet"]['publishedAt'],
            likes = i["snippet"]['topLevelComment']["snippet"]['likeCount'],
            replies = i["snippet"]['totalReplyCount'])

            all_comments.append(commentsData)

            totalReplyCount = i["snippet"]['totalReplyCount']

            if totalReplyCount > 0:

                parent = i["snippet"]['topLevelComment']["id"]

                data2 = youtube.comments().list(part='snippet', maxResults='100', parentId=parent,
                                                textFormat="plainText").execute()

                for i in data2["items"]:
                    commentsData = dict(
                    name = i["snippet"]["authorDisplayName"],
                    comment = i["snippet"]["textDisplay"],
                    published_at = i["snippet"]['publishedAt'],
                    likes = i["snippet"]['likeCount'],
                    replies = "")

                    all_comments.append(commentsData)

        while ("nextPageToken" in data):

            data = youtube.commentThreads().list(part='snippet', videoId=video_id, pageToken=data["nextPageToken"],
                                                maxResults='100', textFormat="plainText").execute()

            for i in data["items"]:
                commentsData = dict(
                name = i["snippet"]['topLevelComment']["snippet"]["authorDisplayName"],
                comment = i["snippet"]['topLevelComment']["snippet"]["textDisplay"],
                published_at = i["snippet"]['topLevelComment']["snippet"]['publishedAt'],
                likes = i["snippet"]['topLevelComment']["snippet"]['likeCount'],
                replies = i["snippet"]['totalReplyCount'])

                all_comments.append(commentsData)

                totalReplyCount = i["snippet"]['totalReplyCount']

                if totalReplyCount > 0:

                    parent = i["snippet"]['topLevelComment']["id"]

                    data2 = youtube.comments().list(part='snippet', maxResults='100', parentId=parent,
                                                    textFormat="plainText").execute()

                    for i in data2["items"]:
                        commentsData = dict(
                        name = i["snippet"]["authorDisplayName"],
                        comment = i["snippet"]["textDisplay"],
                        published_at = i["snippet"]['publishedAt'],
                        likes = i["snippet"]['likeCount'],
                        replies = '')

                        all_comments.append(commentsData)

    

    return all_comments

In [77]:


comments_df = pd.DataFrame()

for c in channel_data['channelName'].unique():
    
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
        
    # get video data
    
    # get comment data
    comments_data = get_comments_in_videos(youtube, video_ids)

    # append video data together and comment data toghether
   
    
    comments_df = comments_df.append(comments_data, ignore_index=True)

Getting video information from channel: Krish Naik


  comments_df = comments_df.append(comments_data, ignore_index=True)


Getting video information from channel: Hitesh Choudhary


  comments_df = comments_df.append(comments_data, ignore_index=True)


Getting video information from channel: MySirG.com


  comments_df = comments_df.append(comments_data, ignore_index=True)


Getting video information from channel: Telusko


  comments_df = comments_df.append(comments_data, ignore_index=True)


In [78]:
comments_df

Unnamed: 0,name,comment,published_at,likes,replies
0,Krish Naik,Happy Teacher's Day. On the occasion of this a...,2022-09-08T11:22:40Z,4,3
1,San Malli,Hello sir i want to learn ai from your channel...,2022-09-08T13:18:26Z,0,
2,Aditya kumar,But teachers day to 3 din pahle tha na🤔,2022-09-08T11:44:57Z,0,
3,Aditya kumar,Ji sir thank you so much 😊,2022-09-08T11:44:19Z,0,
4,Mayank Shukla,Guys the name Data Science and people think th...,2022-09-09T07:34:21Z,0,0
...,...,...,...,...,...
8944,Telusko,"""Just a note that SubQuery are now using IPFS ...",2022-06-30T04:51:12Z,0,0
8945,AKHIL KILLER 500M,Sir please reply to this comment i have this d...,2022-06-23T12:51:34Z,0,0
8946,avinash software solutions,First view,2022-06-23T12:24:39Z,0,0
8947,Hack with Programming,first one to comment,2022-06-23T12:24:32Z,0,0


## Get thumbnail data

In [79]:
def get_video_thumbnails(youtube, video_ids):
    all_video_thumbnails = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
                    part='snippet,statistics',
                    id=','.join(video_ids))
        response = request.execute()
        
        for video in response['items']:
            video_thumbnails = dict(
                            thumbnails = video['snippet']['thumbnails']['high']['url']
                               )
            all_video_thumbnails.append(video_thumbnails)
    
    return all_video_thumbnails

In [80]:
thumbnails = get_video_thumbnails(youtube, video_ids)

In [81]:
thumbnails_df = pd.DataFrame()
#comments_df = pd.DataFrame()

for c in channel_data['channelName'].unique():
    
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
        
    # get video data
    thumbnails = get_video_thumbnails(youtube, video_ids)
    # get comment data
    #comments_data = get_comments_in_videos(youtube, video_ids)

    # append video data together and comment data toghether
   
    thumbnails_df = thumbnails_df.append(thumbnails)
    #comments_df = comments_df.append(comments_data, ignore_index=True)

Getting video information from channel: Krish Naik


  thumbnails_df = thumbnails_df.append(thumbnails)


Getting video information from channel: Hitesh Choudhary


  thumbnails_df = thumbnails_df.append(thumbnails)


Getting video information from channel: MySirG.com


  thumbnails_df = thumbnails_df.append(thumbnails)


Getting video information from channel: Telusko


  thumbnails_df = thumbnails_df.append(thumbnails)


In [82]:
thumbnails_df

Unnamed: 0,thumbnails
0,https://i.ytimg.com/vi/pZakG-kIt_o/hqdefault.jpg
1,https://i.ytimg.com/vi/4VVhwfVf1k8/hqdefault.jpg
2,https://i.ytimg.com/vi/ngBPXUaScCw/hqdefault.jpg
3,https://i.ytimg.com/vi/dPARXQO8dkw/hqdefault.jpg
4,https://i.ytimg.com/vi/tpXu2DnK9HA/hqdefault.jpg
...,...
45,https://i.ytimg.com/vi/CHxpplpcIWE/hqdefault.jpg
46,https://i.ytimg.com/vi/TxKnV7_WnVE/hqdefault.jpg
47,https://i.ytimg.com/vi/GDdigOQ4qOY/hqdefault.jpg
48,https://i.ytimg.com/vi/s72FvmjrTw4/hqdefault.jpg


In [86]:
urls = thumbnails_df['thumbnails'].to_list()

In [84]:
from selenium.webdriver.common.by import By
import requests
import io
from PIL import Image
import time



In [89]:
def download_images(download_path, url, file_name):
    try:
        image_content = requests.get(url).content
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file)
        file_path = download_path + file_name

        with open(file_path, "wb") as f:
            image.save(f, "JPEG")

            #print("Success")
    except Exception as e:
        print('Failed', e)

In [90]:
for i, url in enumerate(urls):
	download_images("D:/Projects/YouTube_Scrapping/data/imgs/", url, str(i) + ".jpg")

## Saving Data into CSV file

### Channel Details

In [91]:
channel_data.to_csv('D:/Projects/YouTube_Scrapping/data/channalStats.csv', index=False, header=True)

### Video Details

In [95]:
video_df.to_csv('D:/Projects/YouTube_Scrapping/data/videoDetails.csv', index=False, header=True)

### All Comments

In [96]:
comments_df.to_csv('D:/Projects/YouTube_Scrapping/data/allComments.csv', index=False, header=True)

### All Thumbnails

In [97]:
thumbnails_df.to_csv('D:/Projects/YouTube_Scrapping/data/allThumbnails.csv', index=False, header=True)