In [14]:
import csv
import json
import datetime
import os
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from dotenv import load_dotenv

# Read settings from a local .env file so the YouTube API key stays out of the notebook.
load_dotenv()
api_key = os.environ.get("YOUTUBE_API_KEY")

# Shared YouTube Data API v3 client; the scraping functions below use it.
youtube = build("youtube", "v3", developerKey=api_key)

In [15]:
def get_video_comments(youtube, video_id):
    """Collect the top-level comments of a single video.

    Walks every page of the ``commentThreads`` listing for ``video_id`` and
    returns a list of dicts with the keys ``youtube_vid``, ``comment`` and
    ``author``.

    Error handling:
      * quota / rate-limit ``HttpError``: a message is printed and the error
        is re-raised so the caller can stop.
      * 403 with "commentsDisabled": a message is printed, paging stops and
        whatever was gathered so far is returned.
      * any other ``HttpError`` propagates unchanged.
    """
    collected = []
    page_request = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        textFormat="plainText",
        maxResults=100,  # largest page size the endpoint accepts
    )
    while page_request:
        try:
            page = page_request.execute()
        except HttpError as err:
            details = str(err)
            hit_limit = "quotaExceeded" in details or "rateLimitExceeded" in details
            if hit_limit:
                print(f"Rate limit/quota exceeded error encountered while fetching comments for video {video_id}. Stopping process.")
                raise  # let the caller halt the whole run
            if err.resp.status == 403 and "commentsDisabled" in details:
                print(f"Comments are disabled for video {video_id}. Skipping.")
                break
            raise

        # Each thread wraps its top-level comment two levels deep.
        top_level_snippets = (
            thread["snippet"]["topLevelComment"]["snippet"]
            for thread in page.get("items", [])
        )
        collected.extend(
            {
                "youtube_vid": video_id,
                "comment": top_level.get("textOriginal", ""),
                "author": top_level.get("authorDisplayName", ""),
            }
            for top_level in top_level_snippets
        )

        # list_next returns None once the last page has been consumed.
        page_request = youtube.commentThreads().list_next(page_request, page)
    return collected

In [16]:
def scrape_youtube_comments(search_query, max_results=10):
    """Search YouTube for ``search_query`` and gather comments from the hits.

    Args:
        search_query: Free-text query passed to the YouTube search endpoint.
        max_results: Number of videos to request from the search (default 10,
            the value previously hard-coded).

    Returns:
        A flat list of comment dicts (as produced by ``get_video_comments``)
        for all videos whose comments could be fetched.

    Raises:
        HttpError: if the search request itself fails, or if a quota /
            rate-limit error occurs while fetching comments.
    """
    search_response = youtube.search().list(
        q=search_query,
        part="id,snippet",
        type="video",
        maxResults=max_results,
    ).execute()

    video_ids = [item["id"]["videoId"] for item in search_response.get("items", [])]
    all_comments = []
    for vid in video_ids:
        print(f"Retrieving comments for video ID: {vid}")
        try:
            all_comments.extend(get_video_comments(youtube, vid))
        except HttpError as e:
            error_message = str(e)
            # Quota problems affect every later request, so they must reach the caller.
            if "quotaExceeded" in error_message or "rateLimitExceeded" in error_message:
                raise
            # Any other failure concerns this video only; keep what the
            # other videos already yielded instead of losing the whole batch.
            print(f"Error while fetching comments for video {vid}: {e}. Skipping.")
    return all_comments

In [17]:
def save_campaign(campaign, output_json):
    """Append ``campaign`` to the JSON list stored in ``output_json``.

    The existing file (if any) is read, the new campaign is appended, and the
    result is written to ``<output_json>.tmp`` before being moved over the
    original with ``os.replace`` so readers never see a half-written file.

    Args:
        campaign: JSON-serialisable object describing one campaign.
        output_json: Path of the JSON file holding the list of campaigns.

    Notes:
        * A file that is empty, is not valid JSON, or whose top-level value is
          not a list is treated as unusable: a warning is printed and the
          list starts fresh.
        * If writing fails, the temporary file is removed and the error is
          re-raised; the original file is left untouched.
    """
    campaigns = []
    if os.path.exists(output_json):
        try:
            with open(output_json, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except json.JSONDecodeError:
            print("Warning: Output file is corrupted or empty. Starting fresh.")
        else:
            if isinstance(loaded, list):
                campaigns = loaded
            else:
                # Valid JSON but not a list (e.g. {} or null): .append would fail.
                print("Warning: Output file does not contain a JSON list. Starting fresh.")

    # Append the new campaign data
    campaigns.append(campaign)

    # Write to a temporary file and then replace the original file atomically
    tmp_filename = output_json + ".tmp"
    try:
        with open(tmp_filename, "w", encoding="utf-8") as f:
            json.dump(campaigns, f, indent=4, ensure_ascii=False)
        os.replace(tmp_filename, output_json)
    except BaseException:
        # Covers serialisation errors and interrupts alike: do not leave a
        # partial temp file behind, then let the original error surface.
        try:
            os.remove(tmp_filename)
        except OSError:
            pass  # temp file was never created or is already gone
        raise


In [18]:
def _load_processed_names(output_json):
    """Return the names of campaigns already stored in ``output_json``."""
    if not os.path.exists(output_json):
        return set()
    try:
        with open(output_json, "r", encoding="utf-8") as f:
            existing_campaigns = json.load(f)
    except json.JSONDecodeError:
        print("Output file is corrupted. Starting fresh.")
        return set()
    names = {campaign["name"] for campaign in existing_campaigns}
    print(f"Resuming. Found {len(names)} campaigns already processed.")
    return names


def _extract_campaign_link(row):
    """Return the project URL from the row's JSON ``urls`` column, or ""."""
    try:
        urls = json.loads(row.get("urls", "{}"))
        return urls.get("web", {}).get("project", "")
    except (TypeError, ValueError, AttributeError):
        # Missing column (None), malformed JSON, or an unexpected JSON shape.
        return ""


def _format_deadline(row):
    """Return the row's Unix-timestamp deadline as 'YYYY-MM-DD HH:MM:SS'.

    ``datetime.fromtimestamp`` is called without a timezone, so the result is
    in the local time of the machine running the notebook. When the value
    cannot be converted, the raw column value is returned unchanged.
    """
    try:
        deadline_epoch = int(row.get("deadline", "0"))
        return datetime.datetime.fromtimestamp(deadline_epoch).strftime('%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError, OverflowError, OSError):
        return row.get("deadline", "")


def process_csv_and_scrape(csv_filename, output_json):
    """Scrape YouTube comments for every finished campaign in a CSV export.

    For each row whose ``state`` is "successful" or "failed", the campaign
    name is used as a YouTube search query, the resulting comments are
    combined with selected columns of the row, and the record is appended to
    ``output_json`` via ``save_campaign``.

    Campaign names already present in ``output_json`` are skipped, so the
    function can be re-run to resume an interrupted scrape.

    Args:
        csv_filename: Path of the UTF-8 encoded CSV file to read.
        output_json: Path of the JSON file that accumulates the results.

    Returns:
        None. Processing stops early (without raising) when a quota or
        rate-limit error is reported by the YouTube API; any other
        ``HttpError`` is logged and the campaign is saved with no comments.
    """
    processed_campaigns = _load_processed_names(output_json)

    # Columns that get their own field in the record; every other column is
    # kept verbatim under "others".
    keys_to_remove = {
        "name", "urls", "state", "backers_count", "blurb",
        "converted_pledged_amount", "goal", "deadline",
        "percent_funded", "state_changed_at", "usd_pledged"
    }

    # newline="" is what the csv module expects: it then handles line endings
    # itself, including newlines embedded inside quoted fields.
    with open(csv_filename, mode="r", encoding="utf-8", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # DictReader stores None for columns missing from a short row, so
            # the .get default alone would not protect the .lower() call.
            state = (row.get("state") or "").lower()
            if state not in ("successful", "failed"):
                continue

            campaign_name = row.get("name", "")
            if campaign_name in processed_campaigns:
                print(f"Campaign '{campaign_name}' already processed. Skipping.")
                continue

            print(f"Processing campaign: {campaign_name}")

            campaign_link = _extract_campaign_link(row)
            deadline_str = _format_deadline(row)

            # Use campaign name as the YouTube search query and scrape comments
            try:
                comments = scrape_youtube_comments(campaign_name)
            except HttpError as e:
                error_message = str(e)
                if "quotaExceeded" in error_message or "rateLimitExceeded" in error_message:
                    print(f"Rate limit exceeded encountered while processing campaign '{campaign_name}'. Stopping processing.")
                    return  # Stop processing further campaigns.
                print(f"Error while scraping YouTube comments for '{campaign_name}': {e}")
                comments = []

            others = {k: v for k, v in row.items() if k not in keys_to_remove}

            campaign_data = {
                "name": campaign_name,
                "link": campaign_link,
                "comments": comments,
                "success": state,  # either "successful" or "failed"
                "backers_count": row.get("backers_count"),
                "blurb": row.get("blurb"),
                "converted_amount_pledged": row.get("converted_pledged_amount"),
                "goal": row.get("goal"),
                "deadline": deadline_str,
                "percent_funded": row.get("percent_funded"),
                "state_changed_at": row.get("state_changed_at"),
                "usd_pledged": row.get("usd_pledged"),
                "others": others
            }

            # Persist immediately so an interruption loses at most one campaign.
            save_campaign(campaign_data, output_json)
            processed_campaigns.add(campaign_name)
            print(f"Saved campaign '{campaign_name}' to {output_json}")

    print("Processing complete.")


In [19]:
# Run the scraper on the Kickstarter001.csv export. Campaign names already
# present in output.json are skipped, so this cell can be re-run to resume.
process_csv_and_scrape("../Kickstarter_2025-02-12T07_48_27_293Z/Kickstarter001.csv", "output.json")

Processing campaign: The Mermaid’s Purse oracle deck
Retrieving comments for video ID: 7khMEISoqUo
Retrieving comments for video ID: C3STcZj_ySE
Retrieving comments for video ID: YQXC24_K7y8
Retrieving comments for video ID: d81AKuuV_DI
Retrieving comments for video ID: CgdO_MFyKg8
Retrieving comments for video ID: fqXebyAaVEA
Retrieving comments for video ID: ItfwmqN2Tt8
Retrieving comments for video ID: UXEbu4jhA38
Retrieving comments for video ID: YiCnfWlUT10
Retrieving comments for video ID: 5b__VQWKlrc
Saved campaign 'The Mermaid’s Purse oracle deck' to output.json
Processing campaign: The 11:11 Oracle
Retrieving comments for video ID: lpmPDmkHBrE
Retrieving comments for video ID: TKtYHf4YTFw
Retrieving comments for video ID: G9dLcV8uosY
Retrieving comments for video ID: QfGsz9YHGWQ
Retrieving comments for video ID: 8r2C15ETSyA
Retrieving comments for video ID: cDO9vMtH63I
Retrieving comments for video ID: 9GftWIAPxHA
Comments are disabled for video 9GftWIAPxHA. Skipping.
Retrievi

In [20]:
import json

# Sanity check: reload the results file and report how many campaigns it holds.
try:
    with open("output.json", encoding="utf-8") as f:
        campaigns = json.load(f)
except json.JSONDecodeError as e:
    print("Error loading JSON file:", e)
else:
    print("Number of campaigns scraped:", len(campaigns))

Number of campaigns scraped: 35
