# In [3]:
import os
import random

import pandas as pd
import requests

def get_channel_details(api_key, channel_id, timeout=10):
    """Fetch the subscriber count of a YouTube channel.

    Queries the YouTube Data API v3 ``channels`` endpoint with
    ``part=statistics``.

    Args:
        api_key: YouTube Data API key.
        channel_id: ID of the channel to look up.
        timeout: Seconds to wait for the HTTP response; without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        A dict ``{'subscriber_count': <str>}``. The value is the
        ``subscriberCount`` string from the response, or ``'0'`` when the
        response contains no items or the field is absent.
    """
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/channels",
        # Let requests URL-encode the values instead of interpolating them
        # into the URL by hand.
        params={'part': 'statistics', 'id': channel_id, 'key': api_key},
        timeout=timeout,
    ).json()

    items = response.get('items') or []
    if not items:
        return {'subscriber_count': '0'}

    stats = items[0]['statistics']
    return {'subscriber_count': stats.get('subscriberCount', '0')}

def get_video_details(api_key, video_id, timeout=10):
    """Fetch statistics, snippet and content details for one video.

    Queries the YouTube Data API v3 ``videos`` endpoint with
    ``part=statistics,snippet,contentDetails``.

    Args:
        api_key: YouTube Data API key.
        video_id: ID of the video to look up.
        timeout: Seconds to wait for the HTTP response; without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        An empty dict when the response contains no items. Otherwise a dict
        with the keys ``description``, ``channel_id``, ``likes``, ``views``,
        ``comments``, ``stages`` (the snippet's ``liveBroadcastContent``
        value), ``category`` (the snippet's ``categoryId``),
        ``licensed_content``, ``duration`` (minutes, as returned by
        ``parse_duration``) and ``comments_enabled``.
    """
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        # Let requests URL-encode the values instead of interpolating them
        # into the URL by hand.
        params={
            'part': 'statistics,snippet,contentDetails',
            'id': video_id,
            'key': api_key,
        },
        timeout=timeout,
    ).json()

    items = response.get('items') or []
    if not items:
        return {}

    video = items[0]
    stats = video['statistics']
    snippet = video['snippet']
    content_details = video['contentDetails']

    return {
        'description': snippet.get('description', ''),
        'channel_id': snippet['channelId'],
        'likes': stats.get('likeCount', '0'),
        'views': stats.get('viewCount', '0'),
        'comments': stats.get('commentCount', '0'),
        'stages': snippet.get('liveBroadcastContent', ''),
        'category': snippet.get('categoryId', ''),
        'licensed_content': content_details.get('licensedContent', False),
        # A missing duration is treated as zero minutes.
        'duration': parse_duration(content_details.get('duration', 'PT0M')),
        # Flagged as enabled only when the statistics carry a comment count.
        'comments_enabled': 'commentCount' in stats,
    }

def parse_duration(duration):
    """Convert an ISO 8601 duration string such as ``"PT4M13S"`` to minutes.

    Uses the third-party ``isodate`` package when it is installed. If it is
    not, falls back to a small standard-library parser that understands the
    week/day/hour/minute/second form (``P#W``, ``P#DT#H#M#S``, ``PT#M#S``),
    so a missing optional package does not abort a collection run.

    Args:
        duration: ISO 8601 duration string.

    Returns:
        The duration in minutes as a float.

    Raises:
        ValueError: In the fallback parser, when the string does not match
            the supported form.
    """
    try:
        import isodate
    except ImportError:
        isodate = None

    if isodate is not None:
        return isodate.parse_duration(duration).total_seconds() / 60

    import re
    from datetime import timedelta

    number = r"\d+(?:[.,]\d+)?"
    match = re.fullmatch(
        rf"P(?:(?P<weeks>{number})W)?"
        rf"(?:(?P<days>{number})D)?"
        rf"(?:T(?:(?P<hours>{number})H)?"
        rf"(?:(?P<minutes>{number})M)?"
        rf"(?:(?P<seconds>{number})S)?)?",
        duration,
    )
    parts = {} if match is None else {
        unit: float(value.replace(',', '.'))
        for unit, value in match.groupdict().items()
        if value is not None
    }
    # A bare "P" or "PT" carries no component and is not a valid duration.
    if not parts:
        raise ValueError(f"Unsupported ISO 8601 duration: {duration!r}")

    return timedelta(**parts).total_seconds() / 60  # Convert to minutes

def get_top_comments(api_key, video_id, max_results=3, timeout=10):
    """Fetch the text of a video's most relevant top-level comments.

    Queries the YouTube Data API v3 ``commentThreads`` endpoint with
    ``order=relevance``.

    Args:
        api_key: YouTube Data API key.
        video_id: ID of the video whose comments are requested.
        max_results: Value sent as ``maxResults``; defaults to 3.
        timeout: Seconds to wait for the HTTP response; without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        A list with the ``textDisplay`` string of each returned thread's
        top-level comment. The list is empty when the response has no
        ``items`` key (for instance an error payload).
    """
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/commentThreads",
        # Let requests URL-encode the values instead of interpolating them
        # into the URL by hand.
        params={
            'part': 'snippet',
            'videoId': video_id,
            'key': api_key,
            'order': 'relevance',
            'maxResults': max_results,
        },
        timeout=timeout,
    ).json()

    return [
        item['snippet']['topLevelComment']['snippet']['textDisplay']
        for item in response.get('items', [])
    ]

def collect_videos(api_key, search_query, order, max_videos,
                   min_minutes=4, max_minutes=20, timeout=10):
    """Search YouTube and collect metadata for videos within a duration range.

    Pages through the YouTube Data API v3 ``search`` endpoint. For every
    result it fetches the video details and keeps the video only when its
    duration lies in ``[min_minutes, max_minutes]``. Kept videos are enriched
    with the channel's subscriber count and the top comments.

    Args:
        api_key: YouTube Data API key.
        search_query: Free-text search query.
        order: Value sent as the search ``order`` parameter
            (the script uses ``'relevance'`` and ``'date'``).
        max_videos: Stop once this many videos have been collected.
        min_minutes: Shortest accepted duration in minutes (inclusive).
        max_minutes: Longest accepted duration in minutes (inclusive).
        timeout: Seconds to wait for each search response; without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        A list of dicts, one per collected video, at most ``max_videos``
        long. It may be shorter when the search results run out or a
        response has no ``items`` key.
    """
    collected_data = []
    # Several results often come from the same channel; look each one up once.
    channel_cache = {}
    next_page_token = None

    while len(collected_data) < max_videos:
        # Passing params lets requests URL-encode the query, so characters
        # such as '&' or '#' in search_query cannot corrupt the request.
        params = {
            "key": api_key,
            "q": search_query,
            "part": "snippet",
            "type": "video",
            "maxResults": min(50, max_videos - len(collected_data)),
            "order": order,
        }
        if next_page_token:
            params["pageToken"] = next_page_token

        response = requests.get(
            "https://www.googleapis.com/youtube/v3/search",
            params=params,
            timeout=timeout,
        ).json()
        if "items" not in response:
            break

        for item in response["items"]:
            video_id = item["id"]["videoId"]
            video_details = get_video_details(api_key, video_id)
            # An empty dict means the details lookup returned nothing.
            if not video_details:
                continue
            if not min_minutes <= video_details['duration'] <= max_minutes:
                continue

            channel_id = video_details['channel_id']
            if channel_id not in channel_cache:
                channel_cache[channel_id] = get_channel_details(api_key, channel_id)
            channel_details = channel_cache[channel_id]
            top_comments = get_top_comments(api_key, video_id)

            collected_data.append({
                "search_query": search_query,
                "video_id": video_id,
                "title": item["snippet"]["title"],
                "video_url": f"https://www.youtube.com/watch?v={video_id}",
                "description": video_details['description'],
                "channel_id": channel_id,
                "subscriber_count": channel_details['subscriber_count'],
                "likes": video_details['likes'],
                "views": video_details['views'],
                "comments": video_details['comments'],
                "top_comments": top_comments,
                "stages": video_details['stages'],
                "category": video_details['category'],
                "licensed_content": video_details['licensed_content'],
                "duration": video_details['duration'],
                "comments_enabled": video_details['comments_enabled'],
                "order": order,
            })

            if len(collected_data) >= max_videos:
                break

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return collected_data

# Set a seed for reproducibility
random.seed(42)

# Read the YouTube Data API key from the environment. A key written into the
# source is exposed to everyone who can read the file or its history.
api_key = os.environ.get("YOUTUBE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API key."
    )

# Define a list of search queries
search_queries = ["black business", "black entrepreneurship"]
videos_per_query = 50  # Total videos per category
# Half of each query's videos come from the relevance ordering and half from
# the date ordering.
videos_per_type = videos_per_query // 2

# Placeholder for collected video data
video_data = []

for search_query in search_queries:
    # Collect relevance videos
    video_data += collect_videos(api_key, search_query, 'relevance', videos_per_type)
    # Collect recent videos
    video_data += collect_videos(api_key, search_query, 'date', videos_per_type)

# Shuffle the collected data to ensure randomness
random.shuffle(video_data)

# Save video data to a CSV file
df = pd.DataFrame(video_data)
df.to_csv('black_business_videos.csv', index=False)
