In [None]:
#youtube

In [None]:
import html
import os

import pandas as pd
from googleapiclient.discovery import build
from tqdm import tqdm
# Set up API credentials.
# The key is read from the environment so it never ends up in the notebook,
# its outputs, or version control. Any key that was previously committed in
# this cell should be treated as leaked and rotated.
API_KEY = os.environ.get('YOUTUBE_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key "
        "before running this cell."
    )
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

# Initialize the YouTube API client (shared by get_video_comments below)
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)

def get_video_comments(video_id, max_comments=500):
    """
    Retrieve up to `max_comments` top-level comments for one YouTube video.

    Pages through `commentThreads().list` on the module-level `youtube` client
    (100 threads per request, `order='relevance'`) until either `max_comments`
    comments have been collected or the response carries no `nextPageToken`.

    Args:
        video_id: ID of the video whose comment threads are requested.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        A list of dicts with the keys 'author', 'text', 'published_at' and
        'like_count', in the order the API returned them.

    Raises:
        Any exception raised by `execute()` propagates to the caller; the
        progress bar is closed before it does.
    """
    comments = []
    next_page_token = None

    # Progress bar for visualization
    pbar = tqdm(total=max_comments, desc="Fetching comments")
    try:
        while len(comments) < max_comments:
            response = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=100,
                order='relevance',
                pageToken=next_page_token,
            ).execute()

            # A page without an 'items' key is treated as an empty page.
            for item in response.get('items', []):
                snippet = item['snippet']['topLevelComment']['snippet']
                comments.append({
                    'author': snippet['authorDisplayName'],
                    'text': snippet['textDisplay'],
                    'published_at': snippet['publishedAt'],
                    'like_count': snippet['likeCount'],
                })
                pbar.update(1)
                if len(comments) >= max_comments:
                    break

            # No token means this was the last page.
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
    finally:
        # Always release the bar, even when a request fails mid-crawl.
        pbar.close()

    return comments

In [None]:
# Loop for scraping comments from multiple videos
def crawl_multiple_videos(video_ids, max_comments_per_video=300):
    """
    Collect comments for several YouTube videos into one flat list.

    Each ID in `video_ids` is passed to get_video_comments() with
    `max_comments_per_video` as the per-video cap. Every returned comment dict
    is tagged in place with a 'video_id' key so its source video stays known.

    Returns the concatenated list of comment dicts, in the order of `video_ids`.
    """
    collected = []
    for video in video_ids:
        print(f"Fetching comments for video {video} ...")
        batch = get_video_comments(video, max_comments=max_comments_per_video)
        # Tag each record with the video it came from
        for record in batch:
            record['video_id'] = video
        collected += batch
    return collected

# Videos to crawl (GPT-related), one ID per line
video_ids = [
    'jPhJbKBuNnA',
    'GiaNp0u_swU',
    'xswXcoh0UXs',
    'MmFLDvOFLW0',
    'kopoLzvh5jY',
    'oc6RV5c1yd0',
    'boJG84Jcf-4',
    '_8yVOC4ciXc',
    '_x9AwxfjxvE',
    'PqbB07n_uQ4',
    'MirzFk_DSiI',
    '_nSmkyDNulk',
    'vgYi3Wr7v_g',
    'lEcg6AJ6DVY',
    '50W4YeQdnSg',
    'iBfQTnA2n2s',
]

# Fetch at most 200 comments per video
comments = crawl_multiple_videos(
    video_ids,
    max_comments_per_video=200,
)

# Persist the raw crawl as CSV for the downstream cells
pd.DataFrame(comments).to_csv(
    'youtube_multi_videos_comments.csv',
    index=False,
)

# Read the CSV back so later steps (cleaning, sentiment analysis, ...) start from the file
df = pd.read_csv(
    'youtube_multi_videos_comments.csv',
    encoding='utf-8',
)

In [None]:
# Data cleaning
import re

def clean_text(text):
    """
    Normalise one raw comment into plain, single-line text.

    Steps, in order:
      1. Convert the input to `str` so non-string cells are handled uniformly.
      2. Turn `<br>` / `<br/>` tags into a space, so the words on either side
         of a line break do not fuse together.
      3. Strip any remaining HTML tags.
      4. Decode HTML entities in a single pass with `html.unescape`
         (e.g. `&amp;`, `&quot;`, `&#39;`, `&lt;`).
      5. Collapse every run of whitespace (including newlines and carriage
         returns) into one space and trim both ends.

    Returns the cleaned string.
    """
    text = str(text)
    # Line-break tags separate words; replace them with a space before stripping tags
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    # Remove HTML tags
    text = re.sub('<.*?>', '', text)
    # Decode entities after tag removal, so an escaped "&lt;b&gt;" stays literal text.
    # A single pass also avoids double-decoding inputs such as "&amp;quot;".
    text = html.unescape(text)
    # Collapse whitespace runs (spaces, tabs, \n, \r) and trim
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Read the raw crawl and run the row filters as one pipeline:
#   1. clean the 'text' column with clean_text
#   2. keep rows whose text is longer than 5 characters
#   3. drop rows made up only of symbols / whitespace
#   4. drop rows whose text duplicates an earlier row
#   5. drop rows containing advertising phrases
df = (
    pd.read_csv('youtube_multi_videos_comments.csv', encoding='utf-8')
    .assign(text=lambda frame: frame['text'].astype(str).apply(clean_text))
    .loc[lambda frame: frame['text'].str.len() > 5]
    .loc[lambda frame: ~frame['text'].str.match(r'^[<>\|\/\\\-_=\.\s]+$')]
    .drop_duplicates(subset=['text'])
    .loc[lambda frame: ~frame['text'].str.contains('buy now|check my channel', case=False)]
)

# Retain only English-language comments using langdetect
from langdetect import DetectorFactory, detect

# langdetect's detection is randomised; fixing the seed makes repeated runs
# classify the same text the same way.
DetectorFactory.seed = 0


def is_english(text):
    """Return True when langdetect classifies `text` as English ('en')."""
    try:
        return detect(text) == 'en'
    except Exception:
        # If detect() fails on a text, treat it as non-English rather than aborting the run.
        return False


df_en = df[df['text'].apply(is_english)]

# Carry the English-only rows forward as `df`, which is what the following
# cells read. Previously `df_en` was computed and then ignored, so the language
# filter had no effect on the date features or the plot. The copy keeps `df_en`
# itself untouched by the column assignments below.
df = df_en.copy()

# Format published date column as datetime; extract 'year_month' feature for aggregation
df['published_at'] = pd.to_datetime(df['published_at'], errors='coerce')
df['year_month'] = df['published_at'].dt.strftime('%Y-%m')

# NOTE(review): unlike the Bilibili/Weibo cleaning cells, this cell does not write
# the cleaned frame back to disk, so the later weighting cell re-reads the raw
# YouTube CSV. Confirm whether that is intended.

In [None]:
import matplotlib.pyplot as plt
# Number of comments per 'year_month', drawn as a bar chart
comment_counts = df.groupby('year_month').size().reset_index(name='count')

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(comment_counts['year_month'], comment_counts['count'], color='steelblue')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Number of Comments')
ax.set_title('YouTube Comments Count by Month')
# Tilt the month labels so they do not overlap
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
fig.tight_layout()
plt.show()

In [None]:
# bilibili

In [None]:
from datetime import datetime
import requests

# Simulate a browser visit for anti-scraping purposes; note: cookies have a limited validity period and must be updated regularly.
headers = {
    # The HTTP request header that carries cookies is named "Cookie" (header names
    # are case-insensitive). A header called "cookies" is just an unknown header
    # and is not treated as the session cookie by the server.
    "cookie": "header_theme_version=CLOSE; enable_web_push=DISABLE; ...",  # (Use valid, updated cookies here)
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36 Edg/142.0.0.0",
}

# Comment endpoint for Bilibili API
url = "https://api.bilibili.com/x/v2/reply/wbi/main"

# For each video, you need to update 'oid', 'w_rid', and 'wts' by inspecting the network traffic in the developer console.
# How-to: Open Developer Tools > Network > Filter for 'main?oid', scroll the webpage to load multiple pages of comments.
# Each page contains 20 comments. The 'pagination_str' for the first page is '{"offset":""}', 
# for subsequent pages use '{"offset":"xxxxxxxx"}', and paste it into the function parameters, wrapped in single quotes.
# Each page requires unique values for 'w_rid' and 'wts'.

"""
Example usage:
w_rid = ['e29839f3c8ce192cbe0fc7a103c8811f', 'a0f76273c229dbb4fbd94c3b5ca17947', '8ee30604c03902d1c174bbd26ce72b0d']
wts = ['1763656706', '1763656711', '1763656713']
all_comments = get_bili_comments(442080192, '{"offset":"CAESEDE4MDU5ODQ0NjY5Njk5NDQiAggB"}', wts, w_rid)
"""

def _parse_bili_replies(replies, oid):
    """Flatten the raw `replies` list of one API page into comment dicts (None -> [])."""
    return [
        {
            'author': reply['member']['uname'],
            # ctime is converted with datetime.fromtimestamp, i.e. to local time
            'published_at': str(datetime.fromtimestamp(reply['ctime'])),
            'text': reply['content']['message'],
            'like_count': reply['like'],
            'video_id': oid,
        }
        for reply in replies or []
    ]


def get_bili_comments(oid, offset, wts, w_rid, timeout=10):
    """
    Scrape Bilibili comments for a specific video (oid).

    One request is sent per entry of `wts`; request i uses `wts[i]` and
    `w_rid[i]`. The first request always uses the pagination string
    '{"offset":""}'; later requests use `offset`.

    Args:
        oid: Video identifier, sent as the 'oid' parameter and stored as
            'video_id' on every returned comment.
        offset: Pagination string(s) for pages after the first. Either a single
            string, reused for every later page (the original behaviour), or a
            list/tuple with one string per later page.
            NOTE(review): the tokens are presumably tied to the exact
            pagination string they were captured with, so pass one offset per
            page when fetching more than two pages. Confirm against the
            captured requests.
        wts: Sequence of 'wts' values, one per page.
        w_rid: Sequence of 'w_rid' values, one per page.
        timeout: Seconds passed to `requests.get` so a stalled connection
            cannot hang the notebook.

    Returns:
        A list of dicts with 'author', 'published_at', 'text', 'like_count'
        and 'video_id'. Pages that fail (non-zero 'code' or a non-JSON body)
        are reported with a message and skipped.

    Raises:
        ValueError: if `offset` is a sequence with fewer entries than there
            are pages after the first.
    """
    page_count = len(wts)

    # Normalise `offset` to one pagination string per page after the first.
    if isinstance(offset, str):
        later_offsets = [offset] * max(page_count - 1, 0)
    else:
        later_offsets = list(offset)
        if len(later_offsets) < page_count - 1:
            raise ValueError(
                f"offset has {len(later_offsets)} entries but {page_count - 1} "
                "pages after the first were requested"
            )

    all_comments = []
    for i in range(page_count):
        first_page = i == 0
        params = {
            'oid': oid,
            'type': '1',
            'mode': '3',
            'pagination_str': '{"offset":""}' if first_page else later_offsets[i - 1],
            'plat': '1',
        }
        if first_page:
            # Only the first request carries an (empty) seek_rpid
            params['seek_rpid'] = ''
        params['web_location'] = '1315875'
        params['w_rid'] = w_rid[i]
        params['wts'] = wts[i]

        response = requests.get(url=url, params=params, headers=headers, timeout=timeout)
        try:
            json_data = response.json()
        except ValueError:
            # Body was not JSON (e.g. an error page); handled as a failed page below
            json_data = None
        if not isinstance(json_data, dict):
            json_data = {}

        if json_data.get('code') == 0:
            print(f"Page {i+1} complete:", str(json_data)[:20], '...')
        else:
            # A failed page has no usable 'data'; skip it instead of crashing
            print(f"Failed to fetch page {i+1}")
            continue

        data = json_data.get('data') or {}
        all_comments.extend(_parse_bili_replies(data.get('replies'), oid))

    return all_comments

In [None]:
# web crawler with json
import json
import glob
import os
from datetime import datetime
import pandas as pd
from tqdm import tqdm

# Locate all JSON files containing Bilibili comment data.
# glob returns matches in arbitrary order; sorting keeps the row order of the
# output CSV (and so which duplicate survives later de-duplication) reproducible.
files = sorted(glob.glob(os.path.join('bilijson', 'bilijson*.txt')))

# Column layout of the output, fixed up front so that the CSV still gets a
# header row (and stays readable by pd.read_csv) when no comments were found.
BILI_COLUMNS = ['author', 'published_at', 'text', 'like_count']

all_comments = []

# Aggregate all comments from the parsed JSON files
for fname in tqdm(files, desc="Processing JSON files"):
    with open(fname, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Each file is iterated as a sequence of reply objects; a null payload counts as empty
    for index in data or []:
        ctime = index.get('ctime')
        date = str(datetime.fromtimestamp(ctime)) if ctime else ""
        # 'member' / 'content' may be absent or null; fall back to empty values
        member = index.get('member') or {}
        content = index.get('content') or {}
        all_comments.append({
            'author': member.get('uname', ""),
            'published_at': date,
            'text': content.get('message', ""),
            'like_count': index.get('like', 0),
        })

# Save all parsed comments to a single CSV file for subsequent analysis
df_all = pd.DataFrame(all_comments, columns=BILI_COLUMNS)
df_all.to_csv('bili_videos_comments.csv', index=False, encoding='utf-8')

In [None]:
# Data cleaning
import re

# Read the raw Bilibili comments and clean them in one pipeline:
#   1. clean the 'text' column with clean_text
#   2. keep rows whose text is longer than 6 characters
#   3. drop rows made up only of symbols / whitespace
#   4. drop rows whose text duplicates an earlier row
#   5. drop rows containing the "已三连" ("triple support") phrase
#   6. parse 'published_at' as datetime (unparseable values become NaT)
#   7. derive 'year_month' (YYYY-MM) for time-based aggregation
df = (
    pd.read_csv('bili_videos_comments.csv', encoding='utf-8')
    .assign(text=lambda frame: frame['text'].astype(str).apply(clean_text))
    .loc[lambda frame: frame['text'].str.len() > 6]
    .loc[lambda frame: ~frame['text'].str.match(r'^[<>\|\/\\\-_=\.\s]+$')]
    .drop_duplicates(subset=['text'])
    .loc[lambda frame: ~frame['text'].str.contains('已三连', case=False)]
    .assign(published_at=lambda frame: pd.to_datetime(frame['published_at'], errors='coerce'))
    .assign(year_month=lambda frame: frame['published_at'].dt.strftime('%Y-%m'))
)

# Write the cleaned rows back over the same file for the later cells
df.to_csv('bili_videos_comments.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
# Number of comments per 'year_month', drawn as a bar chart
comment_counts = df.groupby('year_month').size().reset_index(name='count')

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(comment_counts['year_month'], comment_counts['count'], color='steelblue')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Number of Comments')
ax.set_title('Bilibili Comments Count by Month')
# Tilt the month labels so they do not overlap
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
fig.tight_layout()
plt.show()

In [None]:
# Weibo

In [None]:
# Weibo crawler (sourced from GitHub)
# Run the commands below in Anaconda Prompt (not in this notebook) to install
# the dependencies and start the application.
# Do NOT execute this cell directly.

# Clone the repository and move into it
git clone https://github.com/zhouyi207/WeiBoCrawler.git
cd WeiBoCrawler

# Install required Python packages (requirements.txt is inside the cloned repository)
pip install -r requirements.txt

# Launch the Streamlit web crawler interface
streamlit run web/main.py

In [None]:
# Data cleaning
import re

# Read the raw Weibo data and clean it in one pipeline:
#   1. clean the 'text' column with clean_text
#   2. keep rows whose text is longer than 6 characters
#   3. drop rows made up only of symbols / whitespace
#   4. drop rows whose text duplicates an earlier row
#   5. drop rows containing "已三连" (same phrase filter as the Bilibili cleaning cell)
#   6. parse 'published_at' as datetime (unparseable values become NaT)
#   7. derive 'year_month' (YYYY-MM) for time-based aggregation
df = (
    pd.read_csv('weibo_searchs.csv', encoding='utf-8')
    .assign(text=lambda frame: frame['text'].astype(str).apply(clean_text))
    .loc[lambda frame: frame['text'].str.len() > 6]
    .loc[lambda frame: ~frame['text'].str.match(r'^[<>\|\/\\\-_=\.\s]+$')]
    .drop_duplicates(subset=['text'])
    .loc[lambda frame: ~frame['text'].str.contains('已三连', case=False)]
    .assign(published_at=lambda frame: pd.to_datetime(frame['published_at'], errors='coerce'))
    .assign(year_month=lambda frame: frame['published_at'].dt.strftime('%Y-%m'))
)

# Write the cleaned rows back over the same file for the later cells
df.to_csv('weibo_searchs.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
# Number of Weibo posts/comments per 'year_month', drawn as a bar chart.
# `df` here is the cleaned Weibo frame, so the title names Weibo; it previously
# carried the copy-pasted Bilibili title.
comment_counts = df.groupby('year_month').size().reset_index(name='count')

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(comment_counts['year_month'], comment_counts['count'], color='steelblue')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Number of Comments')
ax.set_title('Weibo Comments Count by Month')
# Tilt the month labels so they do not overlap
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
fig.tight_layout()
plt.show()

In [None]:
# calculate weight by weight = log(like_counts+1)
import numpy as np
# Add weight = log(1 + like_count) to each platform's CSV, rewriting the file in place.
# Order: weibo, bili, youtube (so `df` ends up holding the YouTube frame).
for csv_path in (
    'weibo_searchs.csv',
    'bili_videos_comments.csv',
    'youtube_multi_videos_comments.csv',
):
    df = pd.read_csv(csv_path, encoding='utf-8')
    df['weight'] = np.log1p(df['like_count'])
    df.to_csv(csv_path, index=False)