In [None]:
from googleapiclient.discovery import build
import json
import os
from dotenv import load_dotenv
import json
import pandas as pd

In [None]:
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# API_KEY is handed to the YouTube client as its developer key; DATA_PATH is the
# root directory under which the json/ and xlsx/ output folders are created.
# Both are None when the corresponding variable is not set.
api_key = os.environ.get('API_KEY')
data_path = os.environ.get('DATA_PATH')

In [None]:
# Client for the YouTube Data API v3, authenticated with the developer key
# read from the environment.
youtube = build(
    'youtube',
    'v3',
    developerKey=api_key,
)

In [None]:
def get_top_videos_using_channel_id(youtube, channelId, maxResults=10):
    """Return the IDs of a channel's most-viewed videos.

    Args:
        youtube: YouTube Data API client exposing ``search().list(...)``.
        channelId: ID of the channel whose videos are searched.
        maxResults: Maximum number of video IDs to return. Values above the
            API's per-request cap of 50 are served by following
            ``nextPageToken`` across several requests.

    Returns:
        list[str]: Video IDs ordered by view count (highest first). Empty
        when the channel has no matching videos or the response carries no
        ``items``.
    """
    page_size_cap = 50  # search.list accepts at most 50 results per request
    videoIdList = []
    pageToken = None

    while len(videoIdList) < maxResults:
        params = dict(
            channelId=channelId,
            maxResults=min(page_size_cap, maxResults - len(videoIdList)),
            order="viewCount",
            part="snippet",
            # Restrict the search to videos so channel/playlist hits do not
            # consume result slots and shrink the returned list.
            type="video",
        )
        if pageToken:
            params["pageToken"] = pageToken

        response = youtube.search().list(**params).execute() or {}
        items = response.get("items") or []

        for item in items:
            item_id = item.get("id") or {}
            # Keep the kind check as a safety net for unexpected result types.
            if item_id.get("kind") == "youtube#video" and "videoId" in item_id:
                videoIdList.append(item_id["videoId"])

        pageToken = response.get("nextPageToken")
        # Stop on the last page, or on an empty page, to avoid spending
        # quota on requests that cannot add results.
        if not items or not pageToken:
            break

    return videoIdList[:maxResults]

In [None]:
def get_replies_of_comments(youtube, commentId):
    """Fetch every reply to a top-level comment, following pagination.

    Args:
        youtube: YouTube Data API client exposing ``comments().list(...)``.
        commentId: ID of the parent (top-level) comment.

    Returns:
        list[dict]: One dict per reply, in the order the API returns them,
        with keys ``id``, ``textOriginal``, ``textDisplay``,
        ``authorDisplayName``, ``authorId``, ``parentId``, ``publishedAt``,
        ``updatedAt`` and ``likeCount``. ``authorId`` is ``None`` when the
        reply carries no ``authorChannelId``.
    """
    replies = []
    pageToken = None

    while True:
        # Build the request once per page; the token is only sent after the
        # first page.
        params = dict(part='snippet', parentId=commentId, maxResults=100)
        if pageToken:
            params['pageToken'] = pageToken

        response = youtube.comments().list(**params).execute()

        for item in response.get('items', []):
            snippet = item['snippet']
            replies.append(dict(
                id=item['id'],
                textOriginal=snippet['textOriginal'],
                textDisplay=snippet['textDisplay'],
                authorDisplayName=snippet['authorDisplayName'],
                # The author channel may be absent from the response; one
                # such reply must not abort the whole fetch.
                authorId=(snippet.get('authorChannelId') or {}).get('value'),
                parentId=snippet['parentId'],
                publishedAt=snippet['publishedAt'],
                updatedAt=snippet['updatedAt'],
                likeCount=snippet['likeCount'],
            ))

        pageToken = response.get('nextPageToken')
        if not pageToken:
            break

    return replies

In [None]:
def _thread_reply_to_dict(reply):
    """Flatten one reply resource embedded in a comment thread into a plain dict."""
    snippet = reply['snippet']
    return dict(
        id=reply['id'],
        textOriginal=snippet['textOriginal'],
        textDisplay=snippet['textDisplay'],
        authorDisplayName=snippet['authorDisplayName'],
        # The author channel may be absent from the response.
        authorId=(snippet.get('authorChannelId') or {}).get('value'),
        parentId=snippet['parentId'],
        publishedAt=snippet['publishedAt'],
        updatedAt=snippet['updatedAt'],
        likeCount=snippet['likeCount'],
    )


def get_video_comments_optimized(youtube, video_id):
    """Fetch the first page (up to 100) of a video's comment threads by relevance.

    Replies embedded in the thread response are used when they cover the
    thread's ``totalReplyCount``; otherwise the full reply list is fetched
    with ``get_replies_of_comments``, so the extra request is only made for
    threads whose embedded replies are incomplete.

    Args:
        youtube: YouTube Data API client exposing ``commentThreads().list(...)``.
        video_id: ID of the video whose comment threads are requested.

    Returns:
        list[dict]: One dict per thread with keys ``id``, ``textOriginal``,
        ``textDisplay``, ``authorDisplayName``, ``authorId``, ``replyCount``,
        ``publishedTime``, ``updateTime``, ``likeCount`` and ``replies``
        (a list of reply dicts). ``authorId`` is ``None`` when the comment
        has no ``authorChannelId``. Returns ``[]`` when the request or the
        parsing fails; the failure is printed rather than raised.
    """
    request = youtube.commentThreads().list(
        part="snippet,replies",
        videoId=video_id,
        order="relevance",
        maxResults=100
    )
    try:
        response = request.execute()

        comments = []
        for item in response.get('items', []):
            thread = item['snippet']
            top_level = thread['topLevelComment']['snippet']

            replies = []
            if 'replies' in item:
                embedded = item['replies'].get('comments', [])
                if thread['totalReplyCount'] <= len(embedded):
                    replies = [_thread_reply_to_dict(reply) for reply in embedded]
                else:
                    replies = get_replies_of_comments(youtube, item['id'])

            comments.append(dict(
                id=item['id'],
                textOriginal=top_level['textOriginal'],
                textDisplay=top_level['textDisplay'],
                authorDisplayName=top_level['authorDisplayName'],
                # One comment without an author channel must not discard
                # every comment of the video.
                authorId=(top_level.get('authorChannelId') or {}).get('value'),
                replyCount=thread['totalReplyCount'],
                publishedTime=top_level['publishedAt'],
                updateTime=top_level['updatedAt'],
                likeCount=top_level['likeCount'],
                replies=replies,
            ))

        return comments
    except Exception as error:
        # Best-effort: a failing video is skipped so the caller can continue
        # with the next one, but the reason is reported. KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        print(f'Could not fetch comments for video {video_id}: {error}')
        return []

In [None]:
def extract_comments_from_channels(youtube, channel_list, videos_per_channel=10):
    """Download comment threads for each channel's top videos into JSON files.

    For every channel in ``channel_list`` the most-viewed videos are looked
    up, their comment threads fetched, and the result written to
    ``{data_path}/json/{name}.json`` as a JSON array with one object per
    video (``channelId``, ``channelName``, ``videoId``, ``comments``).

    An existing file for the channel is overwritten, so re-running the cell
    always leaves a single valid JSON array. A channel with no videos
    produces ``[]``.

    Args:
        youtube: YouTube Data API client.
        channel_list: Iterable of dicts with ``channelId`` and ``name`` keys.
        videos_per_channel: Number of top videos to process per channel.
    """
    json_dir = f'{data_path}/json'
    os.makedirs(json_dir, exist_ok=True)

    # Iterate the argument, not the notebook-global `channels`.
    for channel in channel_list:
        videoIds = get_top_videos_using_channel_id(youtube, channel["channelId"], videos_per_channel)
        print(videoIds)

        # "w" (not "a") so a re-run replaces the file instead of appending a
        # second array after the first one.
        with open(f'{json_dir}/{channel["name"]}.json', "w", encoding="utf-8") as f:
            f.write('[')

            for index, videoId in enumerate(videoIds):
                comments = get_video_comments_optimized(youtube, videoId)
                videoData = dict(
                    channelId=channel["channelId"],
                    channelName=channel["name"],
                    videoId=videoId,
                    comments=comments,
                )

                # Separator goes before every element but the first, so no
                # trailing comma has to be trimmed afterwards.
                if index:
                    f.write(',')
                json.dump(videoData, fp=f, indent=2)
                # Push each video's data to disk as soon as it is fetched.
                f.flush()

            f.write(']')



In [None]:
# Channels to scrape: each entry pairs a YouTube channel ID with the short
# name used for that channel's output files. Uncomment entries to include them.
channels = [
    {"channelId": "UCsBjURrPoezykLs9EqgamOA", "name": "fireship"},
    # {"channelId": "UC8CX0LD98EDXl4UYX1MDCXg", "name": "Valorant"},
    # {"channelId": "UCXIJgqnII2ZOINSWNOGFThA", "name": "FoxNews"},
    # {"channelId": "UCUsN5ZwHx2kILm84-jPDeXw", "name": "ComedyCentral"},
]

channels

In [None]:
# Fetch comment threads for the top videos of every configured channel and
# store them as one JSON file per channel.
extract_comments_from_channels(
    youtube,
    channel_list=channels,
    videos_per_channel=10,
)

In [None]:
def json_to_excel(channel_list):
    """Convert each channel's JSON comment dump into a labelling spreadsheet.

    Reads ``{data_path}/json/{name}.json`` for every channel and writes
    ``{data_path}/xlsx/{name}_threads.xlsx``. Only threads with at least two
    replies are kept. Each kept thread contributes its parent comment
    followed by its replies in reversed stored order; the ``s.no.`` column
    restarts at 1 for every thread. The label column is left empty.

    Args:
        channel_list: Iterable of dicts with a ``name`` key identifying the
            channel's JSON file.
    """
    os.makedirs(f'{data_path}/xlsx', exist_ok=True)

    for channel in channel_list:
        print(f'Extracting comment threads for {channel["name"]}')
        with open(f'{data_path}/json/{channel["name"]}.json') as f:
            videos = json.load(f)

        rows = []
        for video in videos:
            for thread in video['comments']:
                # Threads with fewer than two replies are skipped.
                if thread['replyCount'] < 2:
                    continue

                # Parent comment first, then the replies in reversed order.
                entries = [(True, thread)]
                entries.extend((False, answer) for answer in thread['replies'][::-1])

                for serial, (is_parent, entry) in enumerate(entries, start=1):
                    rows.append({
                        's.no.': serial,
                        'id': entry['id'],
                        'isParent': is_parent,
                        'authorName': entry['authorDisplayName'],
                        'text': entry['textOriginal'],
                        'likeCount': entry['likeCount'],
                        'label(CyberBullying,Normal)': '',
                    })

        frame = pd.DataFrame(rows)
        frame.to_excel(f'{data_path}/xlsx/{channel["name"]}_threads.xlsx')
        print(f'Extracted comment threads for {channel["name"]}\n\n')

In [None]:
# Build one Excel workbook of comment threads per configured channel.
json_to_excel(channel_list=channels)