# In [None]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import csv
import os

# Accumulates one record per video across every fetch_videos_by_query() call;
# write_to_csv() is later handed this list.
final_data = []

# NOTE(security): an API key committed to source should be treated as leaked --
# rotate it and remove the literal fallback below.  Until then, a key supplied
# via the YOUTUBE_API_KEY environment variable takes precedence, so the script
# can run with a private key without editing this file.
youtube = build(
    'youtube',
    'v3',
    developerKey=os.environ.get('YOUTUBE_API_KEY')
    or 'AIzaSyDO0CqgeMm9pfPegdwYGpUaBP0ud5ZlXuw',
)

def fetch_videos_by_query(query, region_code=None, category_id=None, is_live=False):
    """Search YouTube for ``query`` and collect stats and comments per video.

    Every page of search results is walked.  For each video hit the
    statistics (``fetch_video_details``) and comment rows (``fetch_comments``)
    are fetched, and one record per video is appended to the module-level
    ``final_data`` list.  The function itself returns nothing.

    Args:
        query: Search term, sent as the ``q`` parameter.
        region_code: Optional value for ``regionCode``; a falsy value means
            no region restriction is sent.
        category_id: Optional value for ``videoCategoryId``; a falsy value
            means no category restriction is sent.
        is_live: When true, restrict the search to live broadcasts by
            sending ``eventType='live'``.  Previously this flag was accepted
            but had no effect.

    An ``HttpError`` raised while searching is printed and ends pagination;
    records gathered before the failure are still added to ``final_data``.
    """
    # Fixed publication window applied to every search.
    published_after = '2022-05-01T00:00:00Z'
    published_before = '2025-05-10T00:00:00Z'

    videos_data = []
    next_page_token = None
    try:
        while True:
            # Optional filters are passed as None when unset, exactly like
            # pageToken on the first page.
            search_response = youtube.search().list(
                q=query,
                part='snippet',
                maxResults=50,
                type='video',
                publishedAfter=published_after,
                publishedBefore=published_before,
                regionCode=region_code or None,
                videoCategoryId=category_id or None,
                eventType='live' if is_live else None,
                order='date',
                pageToken=next_page_token,
            ).execute()

            for item in search_response.get('items', []):
                video_id = item['id']['videoId']
                video_title = item['snippet']['title']
                videos_data.append({
                    'video_id': video_id,
                    'video_title': video_title,
                    'video_details': fetch_video_details(video_id),
                    'comments': fetch_comments(video_title, video_id),
                })
                print(f"videos_fetched so far: {len(videos_data)}")

            next_page_token = search_response.get('nextPageToken')
            if not next_page_token:
                break

    except HttpError as e:
        print(f"Failed to retrieve videos: {e}")

    # Keep whatever was collected, even if the search failed part-way.
    final_data.extend(videos_data)

def fetch_video_details(video_id):
    """Fetch the view, like and comment counters for a single video.

    Returns a list with one dict per item in the API response, each holding
    ``view_count``, ``like_count`` and ``comment_count`` (``'N/A'`` when the
    statistics entry lacks that counter).  If the request raises
    ``HttpError`` the error is printed and ``None`` is returned instead.
    """
    # Output key -> name of the counter inside the item's 'statistics' dict.
    counter_names = (
        ('view_count', 'viewCount'),
        ('like_count', 'likeCount'),
        ('comment_count', 'commentCount'),
    )
    try:
        response = youtube.videos().list(
            part='snippet,contentDetails,statistics',
            id=video_id,
        ).execute()
        return [
            {
                out_key: entry['statistics'].get(api_key, 'N/A')
                for out_key, api_key in counter_names
            }
            for entry in response.get('items', [])
        ]
    except HttpError as e:
        print(f"Failed to retrieve video details for video ID '{video_id}': {e}")
        return None

def fetch_comments(video_title, video_id, min_comments=2000, min_replies=1000):
    """Collect top-level comments and their replies for one video.

    Comment threads are requested page by page (100 threads per request).
    Each top-level comment and each reply included in the thread becomes one
    row of the form::

        [video_title, author, text, published_at, 'Comment' | 'Reply',
         parent_author_or_empty, video_id]

    Stopping rules, as implemented:
      * no request is made at all when both thresholds are already met
        (e.g. both are 0);
      * after each page, paging ends if there is no next page or the number
        of top-level comments has reached ``min_comments``;
      * within a thread, replies stop being read as soon as the running
        reply count is at or above ``min_replies`` (checked after each
        appended reply).

    An ``HttpError`` is printed (with a dedicated message for disabled
    comments) and the rows gathered so far are returned.
    """
    rows = []
    n_comments = 0
    n_replies = 0
    page_token = None
    try:
        while not (n_comments >= min_comments and n_replies >= min_replies):
            page = youtube.commentThreads().list(
                part='snippet,replies',
                videoId=video_id,
                maxResults=100,
                pageToken=page_token,
            ).execute()

            for thread in page.get('items', []):
                top_snippet = thread['snippet']['topLevelComment']['snippet']
                text = top_snippet['textDisplay']
                author = top_snippet['authorDisplayName']
                posted_at = top_snippet['publishedAt']

                # Top-level comments have an empty parent column.
                rows.append([video_title, author, text, posted_at, 'Comment', '', video_id])
                n_comments += 1

                if 'replies' not in thread:
                    continue

                for reply in thread['replies']['comments']:
                    reply_snippet = reply['snippet']
                    reply_text = reply_snippet['textDisplay']
                    reply_author = reply_snippet['authorDisplayName']
                    reply_posted_at = reply_snippet['publishedAt']
                    # A reply's parent column is the top-level commenter's name.
                    rows.append([
                        video_title, reply_author, reply_text, reply_posted_at,
                        'Reply', author, video_id,
                    ])
                    n_replies += 1
                    if n_replies >= min_replies:
                        break

            page_token = page.get('nextPageToken')
            if n_comments >= min_comments or not page_token:
                break

    except HttpError as e:
        comments_disabled = e.resp.status == 403 and 'commentsDisabled' in str(e)
        if comments_disabled:
            print(f"Comments are disabled for video title '{video_title}' (ID: {video_id}). Skipping...")
        else:
            print(f"Failed to retrieve comments for video title '{video_title}': {e}")

    return rows

def _stat_columns(video):
    """Return [view, like, comment] counts for a record, or 'N/A' placeholders."""
    if not video['video_details']:
        return ['N/A', 'N/A', 'N/A']
    first = video['video_details'][0]
    return [first['view_count'], first['like_count'], first['comment_count']]


def write_to_csv(data):
    """Write one CSV row per collected comment/reply to 'youtube_data_data.csv'.

    ``data`` is a list of video records with the keys ``video_id``,
    ``video_title``, ``video_details`` and ``comments``.  Each comment row
    contributes its elements 1..5 (commenter name, text, timestamp, type,
    parent); the video's counters come from the first ``video_details``
    entry, or ``'N/A'`` when that value is empty or ``None``.  The file is
    overwritten and its absolute path is printed afterwards.
    """
    file_name = 'youtube_data_data.csv'
    header = [
        'Video ID', 'Video Title', 'View Count', 'Like Count', 'Comment Count',
        'Commenter Name', 'Text', 'Timestamp', 'Type', 'Parent',
    ]

    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        # Rows are produced lazily: video columns first, then comment[1..5].
        writer.writerows(
            [
                video['video_id'],
                video['video_title'],
                *_stat_columns(video),
                *(comment[idx] for idx in range(1, 6)),
            ]
            for video in data
            for comment in video['comments']
        )

    print(f"CSV file saved at: {os.path.abspath(file_name)}")

# Search terms to harvest; each one is run as its own YouTube search.
# NOTE(review): "Comparision" is kept exactly as written -- it is sent to the
# API verbatim, so correcting the spelling would change the results.
queries = [
    "Vline vs Metro",
    "Comparision between Metro and Vline Train",
    "Regional vs Metropolitan Train",
    "Melbourne Trains",
]

# Gather videos for every term (region 'AU'), then export everything that
# accumulated in final_data.
for q in queries:
    fetch_videos_by_query(query=q, region_code='AU')

write_to_csv(final_data)
