In [34]:
import csv
import json
import datetime
import os
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from dotenv import load_dotenv

# Load environment variables from a .env file (if one exists) and read the
# YouTube Data API key from the environment.  The key must never be written
# into the notebook itself: anything committed here is readable by everyone
# with access to the file, and a key that was committed should be revoked
# and replaced.
load_dotenv()
api_key = os.getenv("YOUTUBE_API_KEY")
if not api_key:
    # Fail here with a clear message instead of on the first API request.
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set. Define it in the environment or in a .env file."
    )
youtube = build("youtube", "v3", developerKey=api_key)

In [2]:
def get_video_comments(youtube, video_id):
    """Collect the top-level comments of one YouTube video.

    Pages through the ``commentThreads`` listing, 100 threads per request,
    until ``list_next`` no longer returns a follow-up request.

    Args:
        youtube: API client exposing ``commentThreads().list`` and
            ``commentThreads().list_next``.
        video_id: ID of the video whose comment threads are listed.

    Returns:
        A list with one dict per top-level comment, keyed by
        ``"youtube_vid"``, ``"comment"`` and ``"author"``.

    Raises:
        HttpError: Re-raised for quota/rate-limit failures and for every
            other API error except "comments disabled".  When comments are
            disabled (HTTP 403) paging stops and the comments gathered so
            far are returned.
    """
    collected = []
    page_request = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        textFormat="plainText",
        maxResults=100,  # maximum allowed per request
    )
    while page_request:
        try:
            page = page_request.execute()
        except HttpError as err:
            details = str(err)
            if "quotaExceeded" in details or "rateLimitExceeded" in details:
                print(
                    f"Rate limit/quota exceeded error encountered while fetching comments for video {video_id}. Stopping process."
                )
                raise  # Let the caller halt all further processing.
            if err.resp.status != 403 or "commentsDisabled" not in details:
                raise
            print(f"Comments are disabled for video {video_id}. Skipping.")
            break

        # Flatten each thread down to the text and author of its top-level comment.
        top_level_snippets = [
            thread["snippet"]["topLevelComment"]["snippet"]
            for thread in page.get("items", [])
        ]
        collected += [
            {
                "youtube_vid": video_id,
                "comment": top_level.get("textOriginal", ""),
                "author": top_level.get("authorDisplayName", ""),
            }
            for top_level in top_level_snippets
        ]
        page_request = youtube.commentThreads().list_next(page_request, page)
    return collected

In [10]:
def scrape_youtube_comments(search_query, max_results=10):
    """Search YouTube for ``search_query`` and gather the comments of each hit.

    Uses the module-level ``youtube`` client for the search and
    ``get_video_comments`` for every video found.

    Args:
        search_query: Text passed to the YouTube search as ``q``.
        max_results: Number of videos requested from the search.  Defaults to
            10, the value that used to be hard-coded.

    Returns:
        One flat list holding the comment dicts of all videos, in search order.

    Raises:
        HttpError: If the search request itself fails, or if a quota /
            rate-limit error occurs while fetching comments.  Any other
            per-video error is reported and that video is skipped, so the
            comments already collected from other videos are not lost.
    """
    search_response = youtube.search().list(
        q=search_query,
        part="id,snippet",
        type="video",
        maxResults=max_results,
    ).execute()

    # Ignore any result that carries no video ID instead of raising KeyError.
    video_ids = [
        item["id"]["videoId"]
        for item in search_response.get("items", [])
        if item.get("id", {}).get("videoId")
    ]

    all_comments = []
    for vid in video_ids:
        print(f"Retrieving comments for video ID: {vid}")
        try:
            all_comments.extend(get_video_comments(youtube, vid))
        except HttpError as e:
            error_message = str(e)
            if "quotaExceeded" in error_message or "rateLimitExceeded" in error_message:
                raise  # Callers rely on this to stop the whole run.
            # A single unavailable video must not throw away the other videos' comments.
            print(f"Could not retrieve comments for video {vid}: {e}. Skipping.")
    return all_comments

In [4]:
def save_campaign(campaign, output_json):
    """Append one campaign record to the JSON list stored in ``output_json``.

    The existing list is read, the new record is appended, and the result is
    written to ``output_json + ".tmp"`` before ``os.replace`` swaps it over the
    original, so a reader never sees a half-written file.

    Args:
        campaign: JSON-serialisable record to append.
        output_json: Path of the JSON file holding the list of campaigns.  The
            file and its parent directory are created when missing.

    Raises:
        ValueError: If the file holds valid JSON that is not a list; the file
            is left untouched rather than overwritten.
        TypeError: If ``campaign`` cannot be serialised by ``json.dump``.
    """
    campaigns = []
    if os.path.exists(output_json):
        try:
            with open(output_json, "r", encoding="utf-8") as f:
                campaigns = json.load(f)
        except json.JSONDecodeError:
            print("Warning: Output file is corrupted or empty. Starting fresh.")
            campaigns = []
        if not isinstance(campaigns, list):
            raise ValueError(
                f"Expected a JSON list in {output_json!r}, "
                f"found {type(campaigns).__name__}."
            )
    else:
        # First write: make sure the target directory exists.
        target_dir = os.path.dirname(output_json)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    # Append the new campaign data
    campaigns.append(campaign)

    # Write to a temporary file and then replace the original file atomically
    tmp_filename = output_json + ".tmp"
    try:
        with open(tmp_filename, "w", encoding="utf-8") as f:
            json.dump(campaigns, f, indent=4, ensure_ascii=False)
        os.replace(tmp_filename, output_json)
    finally:
        # After a successful replace the temp file is gone; it only remains
        # when writing failed, in which case it must not be left behind.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)


In [None]:
def process_csv_and_scrape(csv_filename, output_json):
    """Scrape YouTube comments for the Technology campaigns listed in a CSV.

    Every row whose ``state`` is "successful" or "failed" and whose category
    (or parent category) is named "technology" is looked up on YouTube by
    campaign name, and the resulting record is appended to ``output_json``
    through ``save_campaign``.  Campaign names already present in
    ``output_json`` are skipped, so an interrupted run can be resumed.

    Args:
        csv_filename: Path of the campaign CSV.  The columns ``category`` and
            ``urls`` are parsed as JSON strings.
        output_json: Path of the JSON file that accumulates the results.

    Returns:
        None.  Returns early, without printing "Processing complete.", when a
        quota / rate-limit error is reported by the API.
    """
    processed_campaigns = set()

    # Load previously processed campaigns to avoid duplicates.
    if os.path.exists(output_json):
        try:
            with open(output_json, "r", encoding="utf-8") as f:
                existing_campaigns = json.load(f)
        except json.JSONDecodeError:
            print("Output file is corrupted. Starting fresh.")
        else:
            # Tolerate unexpected content: only dict records with a name count.
            records = existing_campaigns if isinstance(existing_campaigns, list) else []
            processed_campaigns = {
                campaign["name"]
                for campaign in records
                if isinstance(campaign, dict) and "name" in campaign
            }
            print(f"Resuming. Found {len(processed_campaigns)} campaigns already processed.")

    # newline="" lets the csv module interpret line endings itself, so line
    # breaks inside quoted fields are preserved exactly.
    with open(csv_filename, mode="r", encoding="utf-8", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # DictReader fills the missing columns of a short row with None.
            state = (row.get("state") or "").lower()
            if state not in ("successful", "failed"):
                continue

            campaign_name = row.get("name") or ""
            if campaign_name in processed_campaigns:
                print(f"Campaign '{campaign_name}' already processed. Skipping.")
                continue

            # --- Category Filter: Only process Technology campaigns ---
            try:
                category_obj = json.loads(row.get("category") or "{}")
            except (TypeError, ValueError):
                category_obj = {}
            if not isinstance(category_obj, dict):
                # Valid JSON that is not an object carries no category names.
                category_obj = {}

            category_name = str(category_obj.get("name") or "").lower()
            parent_name = str(category_obj.get("parent_name") or "").lower()
            # Only process if the category or its parent is 'technology'
            if category_name != "technology" and parent_name != "technology":
                print(f"Skipping campaign '{campaign_name}' as it is not in the Technology category.")
                continue
            # -------------------------------------------------------------

            print(f"Processing campaign: {campaign_name}")

            # Extract campaign URL from the "urls" column (stored as a JSON string)
            try:
                urls = json.loads(row.get("urls") or "{}")
                campaign_link = urls.get("web", {}).get("project", "")
            except (TypeError, ValueError, AttributeError):
                campaign_link = ""

            # Convert the deadline (Unix timestamp) to a readable string in the
            # machine's local time zone; keep the raw value if it is not a
            # usable timestamp.
            try:
                deadline_epoch = int(row.get("deadline", "0"))
                deadline_str = datetime.datetime.fromtimestamp(deadline_epoch).strftime('%Y-%m-%d %H:%M:%S')
            except (TypeError, ValueError, OverflowError, OSError):
                deadline_str = row.get("deadline", "")

            # Use campaign name as the YouTube search query and scrape comments
            try:
                comments = scrape_youtube_comments(campaign_name)
            except HttpError as e:
                error_message = str(e)
                if "quotaExceeded" in error_message or "rateLimitExceeded" in error_message:
                    print(f"Rate limit exceeded encountered while processing campaign '{campaign_name}'. Stopping processing.")
                    return  # Stop processing further campaigns.
                print(f"Error while scraping YouTube comments for '{campaign_name}': {e}")
                comments = []

            # Build an "others" dict with all fields not explicitly extracted
            keys_to_remove = {
                "name", "urls", "state", "backers_count", "blurb",
                "converted_pledged_amount", "goal", "deadline",
                "percent_funded", "state_changed_at", "usd_pledged", "category"
            }
            others = {k: v for k, v in row.items() if k not in keys_to_remove}

            campaign_data = {
                "name": campaign_name,
                "link": campaign_link,
                "comments": comments,
                "success": state,  # either "successful" or "failed"
                "backers_count": row.get("backers_count"),
                "blurb": row.get("blurb"),
                "converted_amount_pledged": row.get("converted_pledged_amount"),
                "goal": row.get("goal"),
                "deadline": deadline_str,
                "percent_funded": row.get("percent_funded"),
                "state_changed_at": row.get("state_changed_at"),
                "usd_pledged": row.get("usd_pledged"),
                "others": others,
                # Optionally, save the parsed category for reference
                "category": category_obj
            }

            # Save the campaign by reading the old file, adding the new campaign, and writing it back.
            save_campaign(campaign_data, output_json)
            processed_campaigns.add(campaign_name)
            print(f"Saved campaign '{campaign_name}' to {output_json}")

    print("Processing complete.")

In [36]:
# Run the Technology-only scrape and write its results to the tech-only JSON file.
process_csv_and_scrape(
    csv_filename="../../../Kickstarter_2025-02-12T07_48_27_293Z/Kickstarter001.csv",
    output_json="./data/output_only_tech.json",
)

Resuming. Found 117 campaigns already processed.
Skipping campaign 'The Mermaid’s Purse oracle deck' as it is not in the Technology category.
Skipping campaign 'The 11:11 Oracle' as it is not in the Technology category.
Skipping campaign 'Rustic Fortune Tarot Deck' as it is not in the Technology category.
Skipping campaign 'Tu ne passeras pas à la casserole ce soir' as it is not in the Technology category.
Skipping campaign 'Pram Snatcher - A Short Film' as it is not in the Technology category.
Skipping campaign 'Knitcircus Studio: Gradient Yarns and Craft Community' as it is not in the Technology category.
Skipping campaign 'Aloha Themed Watches Designed in Hawaii' as it is not in the Technology category.
Campaign 'Atlas Hand: Tool Delivery System' already processed. Skipping.
Skipping campaign 'Blue Apple Tarot™' as it is not in the Technology category.
Skipping campaign 'UNDERCITY TALES #1-2: A Sci-Fi/Noir Series' as it is not in the Technology category.
Skipping campaign 'No Man's 

In [37]:
import json

# Report how many campaigns the tech-only output file holds, or why it could
# not be parsed.
try:
    with open("./data/output_only_tech.json", "r", encoding="utf-8") as f:
        campaigns = json.load(f)
except json.JSONDecodeError as e:
    print("Error loading JSON file:", e)
else:
    print("Number of campaigns scraped:", len(campaigns))

Number of campaigns scraped: 124


In [38]:
import json

with open("./data/output_only_tech.json", "r", encoding="utf-8") as f:
    campaigns = json.load(f)

# Tally the campaigns by the outcome recorded in their "success" field.
outcomes = [campaign.get("success") for campaign in campaigns]
successful = outcomes.count("successful")
failed = outcomes.count("failed")

print("Successful campaigns:", successful)
print("Failed campaigns:", failed)

Successful campaigns: 94
Failed campaigns: 30


In [22]:
def process_csv_and_scrape_only_failed_campaigns(csv_filename, output_json):
    """Scrape YouTube comments for every failed campaign listed in a CSV.

    Every row whose ``state`` is "failed" (any category) is looked up on
    YouTube by campaign name, and the resulting record is appended to
    ``output_json`` through ``save_campaign``.  Campaign names already present
    in ``output_json`` are skipped, so an interrupted run can be resumed.

    The raw ``category`` column is not parsed here; it is stored with the
    other unextracted columns under ``"others"``.

    Args:
        csv_filename: Path of the campaign CSV.  The ``urls`` column is parsed
            as a JSON string.
        output_json: Path of the JSON file that accumulates the results.

    Returns:
        None.  Returns early, without printing "Processing complete.", when a
        quota / rate-limit error is reported by the API.
    """
    processed_campaigns = set()

    # Load previously processed campaigns to avoid duplicates.
    if os.path.exists(output_json):
        try:
            with open(output_json, "r", encoding="utf-8") as f:
                existing_campaigns = json.load(f)
        except json.JSONDecodeError:
            print("Output file is corrupted. Starting fresh.")
        else:
            # Tolerate unexpected content: only dict records with a name count.
            records = existing_campaigns if isinstance(existing_campaigns, list) else []
            processed_campaigns = {
                campaign["name"]
                for campaign in records
                if isinstance(campaign, dict) and "name" in campaign
            }
            print(f"Resuming. Found {len(processed_campaigns)} campaigns already processed.")

    # newline="" lets the csv module interpret line endings itself, so line
    # breaks inside quoted fields are preserved exactly.
    with open(csv_filename, mode="r", encoding="utf-8", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # DictReader fills the missing columns of a short row with None.
            state = (row.get("state") or "").lower()
            # Only process campaigns that are 'failed'
            if state != "failed":
                continue

            campaign_name = row.get("name") or ""
            if campaign_name in processed_campaigns:
                print(f"Campaign '{campaign_name}' already processed. Skipping.")
                continue

            print(f"Processing campaign: {campaign_name}")

            # Extract campaign URL from the "urls" column (stored as a JSON string)
            try:
                urls = json.loads(row.get("urls") or "{}")
                campaign_link = urls.get("web", {}).get("project", "")
            except (TypeError, ValueError, AttributeError):
                campaign_link = ""

            # Convert the deadline (Unix timestamp) to a readable string in the
            # machine's local time zone; keep the raw value if it is not a
            # usable timestamp.
            try:
                deadline_epoch = int(row.get("deadline", "0"))
                deadline_str = datetime.datetime.fromtimestamp(deadline_epoch).strftime('%Y-%m-%d %H:%M:%S')
            except (TypeError, ValueError, OverflowError, OSError):
                deadline_str = row.get("deadline", "")

            # Use campaign name as the YouTube search query and scrape comments
            try:
                comments = scrape_youtube_comments(campaign_name)
            except HttpError as e:
                error_message = str(e)
                if "quotaExceeded" in error_message or "rateLimitExceeded" in error_message:
                    print(f"Rate limit exceeded encountered while processing campaign '{campaign_name}'. Stopping processing.")
                    return  # Stop processing further campaigns.
                print(f"Error while scraping YouTube comments for '{campaign_name}': {e}")
                comments = []

            # Build an "others" dict with all fields not explicitly extracted
            keys_to_remove = {
                "name", "urls", "state", "backers_count", "blurb",
                "converted_pledged_amount", "goal", "deadline",
                "percent_funded", "state_changed_at", "usd_pledged"
            }
            others = {k: v for k, v in row.items() if k not in keys_to_remove}

            campaign_data = {
                "name": campaign_name,
                "link": campaign_link,
                "comments": comments,
                "success": state,  # this will be 'failed'
                "backers_count": row.get("backers_count"),
                "blurb": row.get("blurb"),
                "converted_amount_pledged": row.get("converted_pledged_amount"),
                "goal": row.get("goal"),
                "deadline": deadline_str,
                "percent_funded": row.get("percent_funded"),
                "state_changed_at": row.get("state_changed_at"),
                "usd_pledged": row.get("usd_pledged"),
                "others": others
            }

            # Save the campaign by reading the old file, adding the new campaign, and writing it back.
            save_campaign(campaign_data, output_json)
            processed_campaigns.add(campaign_name)
            print(f"Saved campaign '{campaign_name}' to {output_json}")

    print("Processing complete.")

In [29]:
# Run the failed-campaign scrape and write its results to the general output file.
process_csv_and_scrape_only_failed_campaigns(
    csv_filename="../../../Kickstarter_2025-02-12T07_48_27_293Z/Kickstarter001.csv",
    output_json="./data/output.json",
)

Resuming. Found 144 campaigns already processed.
Campaign 'Atlas Hand: Tool Delivery System' already processed. Skipping.
Campaign 'The Dystopic Dimension Chronology' already processed. Skipping.
Campaign 'Monstrous Divinity' already processed. Skipping.
Campaign 'Dog Eyes - a book for life, save the dogs' already processed. Skipping.
Campaign 'Not Perfect Just Broke Podcast' already processed. Skipping.
Campaign 'One Weekend at Horror Land' already processed. Skipping.
Campaign 'The Vintage Modern Days Studio' already processed. Skipping.
Campaign 'Model of European Union - MEU Spain 2022' already processed. Skipping.
Campaign 'Stanthorpe Little Theatre's Foyer Extension' already processed. Skipping.
Campaign 'Totality Capsule : Scenes of Earth' already processed. Skipping.
Campaign 'A Mark of Disgrace: Living with HIV in Yemen.' already processed. Skipping.
Campaign 'Haiti From My Camera Lens' already processed. Skipping.
Campaign 'Catori Multi-Purpose Arts Center = Theater, Concerts

In [25]:
import json

# Load the output JSON data.
with open("./data/output.json", "r", encoding="utf-8") as f:
    campaigns = json.load(f)

unique_categories = set()

for campaign in campaigns:
    # Assuming the category field is stored in the "others" dict.
    others = campaign.get("others", {})
    cat_field = others.get("category")
    if not cat_field:
        continue

    # The category field is normally a JSON string describing an object.
    try:
        cat_obj = json.loads(cat_field)
    except (json.JSONDecodeError, TypeError):
        # If it's not a JSON string, add it directly.
        unique_categories.add(str(cat_field))
        continue

    # Only a JSON object can carry a "name"; anything else is shown as-is.
    category_name = cat_obj.get("name") if isinstance(cat_obj, dict) else None
    unique_categories.add(str(category_name) if category_name else str(cat_obj))

print("Unique categories found in output.json:")
# Sets have no stable order; sort so the listing is identical on every run.
for cat in sorted(unique_categories):
    print("-", cat)

Unique categories found in output.json:
- Knitting
- R&B
- Jewelry
- Literary Spaces
- Spaces
- Sound
- Shorts
- Playing Cards
- Webcomics
- Cookbooks
- Anthologies
- Photo
- Comic Books
- Fabrication Tools
- Comedy


In [32]:
import json

# Load the processed campaigns from output.json.
with open("./data/output.json", "r", encoding="utf-8") as f:
    campaigns = json.load(f)

# Number of comments stored per campaign; a missing "comments" key counts as 0.
comments_per_campaign = [len(campaign.get("comments", [])) for campaign in campaigns]

total_campaigns = len(campaigns)
# Campaigns whose comment list is empty or absent.
no_comment_count = comments_per_campaign.count(0)
# Total number of comments across all campaigns.
total_comments = sum(comments_per_campaign)

# Average number of comments per campaign (0 when there are no campaigns).
if total_campaigns > 0:
    average_comments = total_comments / total_campaigns
else:
    average_comments = 0

print("Total campaigns processed:", total_campaigns)
print("Campaigns with no comments:", no_comment_count)
print("Average number of comments per campaign:", average_comments)

Total campaigns processed: 144
Campaigns with no comments: 21
Average number of comments per campaign: 14910.4375
