In [3]:
import pandas as pd
import numpy as np
import matplotlib

# Load the YouTube comments table (the "with_topics" processed file).
comments_df = pd.read_csv('../../datasets/processed/youtube_comments_with_topics.csv')

# Load the attendee (celebrity) table.
celeb_df = pd.read_csv("../../datasets/attendees.csv")

# Sanity-check both loads. A notebook cell only renders its *last* expression,
# so the first preview has to be shown explicitly with display(); otherwise
# comments_df.head(5) is evaluated and silently discarded.
display(comments_df.head(5))
celeb_df.head(5)

Unnamed: 0,Name,Year,Gender
0,Billie Eilish,2021,Unknown
1,A$AP Rocky,2021,Male
2,Rihanna,2021,Female
3,Jennifer Lopez,2021,Female
4,Lil Nas X,2021,Male


In [4]:
import json
import pandas as pd
import re
import networkx as nx
from itertools import combinations

# --- Step 1: Build one matcher for all attendee names ---
# Names are normalised to stripped lower-case; comment text is lower-cased
# before matching, so the match is effectively case-insensitive.
_normalized_names = celeb_df['Name'].dropna().str.strip().str.lower()
# Drop empty names: an empty alternative would match the empty string at
# every position and show up as a bogus '' "celebrity".
celeb_names = _normalized_names[_normalized_names != ''].unique()

# Longest names first: regex alternation takes the first alternative that
# matches, so a short name must not shadow a longer name it is a prefix of.
celeb_patterns = [
    re.escape(name) for name in sorted(celeb_names, key=len, reverse=True)
]

if celeb_patterns:
    # (?<!\w) and (?!\w) behave like \b for names that start and end with a
    # word character, but unlike \b they also work for names that begin or
    # end with punctuation (where \b could never match).
    celeb_regex = re.compile(r'(?<!\w)(?:' + '|'.join(celeb_patterns) + r')(?!\w)')
else:
    # No names at all: use a pattern that can never match instead of an
    # empty group that would match everywhere.
    celeb_regex = re.compile(r'(?!)')

# --- Step 2: Report the comments data (already loaded into comments_df) ---
print(f"Loaded {len(comments_df)} comments")

# --- Step 3: Initialize Graph ---
# Undirected graph: nodes are celebrity names, edge weight = co-mention count.
G = nx.Graph()

# --- Step 4: Scan for co-mentions ---
def extract_mentions(text):
    """Return the distinct celebrity names found in ``text``.

    The text is lower-cased and scanned with the module-level ``celeb_regex``.

    Args:
        text: Comment text. ``None``/NaN and the empty string yield no
            mentions. Any other non-string value is converted with ``str``
            before matching instead of raising on ``.lower()``.

    Returns:
        A sorted list of the unique matched names (lower-case, because the
        match runs on the lower-cased text). Sorting makes the result
        deterministic; a bare ``list(set(...))`` has no guaranteed order.
    """
    if not isinstance(text, str):
        # pd.isna covers None and float NaN (and pandas' own NA markers).
        if pd.isna(text):
            return []
        text = str(text)
    if text == "":
        return []
    return sorted(set(celeb_regex.findall(text.lower())))

# Process each comment
mention_count = 0

# Only the 'text' column is needed, so iterate over it directly instead of
# building a full row Series per comment with iterrows(). A missing column
# behaves like "no text anywhere", as row.get('text', '') did before.
comment_texts = comments_df['text'] if 'text' in comments_df.columns else []

for raw_text in comment_texts:
    # Treat missing text as empty. A plain str() would turn NaN into the
    # literal text "nan" and bypass the missing-value guard downstream.
    comment_text = "" if pd.isna(raw_text) else str(raw_text)

    # Find mentioned celebrities
    mentioned = extract_mentions(comment_text)

    if len(mentioned) > 1:
        mention_count += 1
        # Create edges for all pairs of mentioned celebrities; sorting gives
        # every pair a canonical (alphabetical) orientation.
        for celeb1, celeb2 in combinations(sorted(mentioned), 2):
            if G.has_edge(celeb1, celeb2):
                G[celeb1][celeb2]['weight'] += 1
            else:
                G.add_edge(celeb1, celeb2, weight=1)

# --- Step 5: Graph Summary ---
print("\n=== Co-mention Graph Summary ===")
print(f"Total comments processed: {len(comments_df)}")
print(f"Comments with multiple celebrity mentions: {mention_count}")
print(f"Nodes (Celebrities): {G.number_of_nodes()}")
print(f"Edges (Co-mentions): {G.number_of_edges()}")

# --- Step 6: Top co-mentions ---
if G.number_of_edges() > 0:
    print("\n=== Top 10 Celebrity Co-mentions ===")
    # Sort edges by weight (number of comments mentioning both names)
    edges_with_weights = [(u, v, data['weight']) for u, v, data in G.edges(data=True)]
    edges_with_weights.sort(key=lambda x: x[2], reverse=True)

    for i, (celeb1, celeb2, weight) in enumerate(edges_with_weights[:10]):
        print(f"{i+1}. {celeb1.title()} & {celeb2.title()}: {weight} co-mentions")

# --- Step 7: Most connected celebrities ---
# Note: this ranks by degree (how many *different* celebrities a name is
# co-mentioned with), not by how often the name is mentioned.
print("\n=== Most Connected Celebrities ===")
if G.number_of_nodes() > 0:
    degrees = dict(G.degree())
    sorted_degrees = sorted(degrees.items(), key=lambda x: x[1], reverse=True)

    for i, (celeb, degree) in enumerate(sorted_degrees[:10]):
        print(f"{i+1}. {celeb.title()}: {degree} connections")

# --- Optional: Save graph for further analysis ---
# Written to the notebook's current working directory.
nx.write_graphml(G, "youtube_celeb_co_mentions.graphml")

print("\nGraph analysis complete!")

Loaded 69780 comments

=== Co-mention Graph Summary ===
Total comments processed: 69780
Comments with multiple celebrity mentions: 117
Nodes (Celebrities): 25
Edges (Co-mentions): 58

=== Top 10 Celebrity Co-mentions ===
1. Rihanna & Zendaya: 33 co-mentions
2. Rihanna & Iman: 14 co-mentions
3. Iman & Normani: 14 co-mentions
4. Ciara & Iman: 13 co-mentions
5. Rihanna & Ciara: 10 co-mentions
6. Rihanna & Normani: 10 co-mentions
7. Rihanna & Maluma: 7 co-mentions
8. Maluma & Ciara: 7 co-mentions
9. Rihanna & Lizzo: 6 co-mentions
10. Zendaya & Lizzo: 6 co-mentions

=== Most Connected Celebrities ===
1. Rihanna: 14 connections
2. Zendaya: 14 connections
3. Lizzo: 8 connections
4. Maluma: 8 connections
5. Emma Chamberlain: 8 connections
6. Shakira: 7 connections
7. Iman: 6 connections
8. Normani: 6 connections
9. Usher: 5 connections
10. Rosalía: 5 connections

Graph analysis complete!


In [9]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
import os

# --- Step 1: Report the comments data (comments_df is loaded in an earlier cell) ---
print(f"Loaded {len(comments_df)} comments")

# --- Step 2: Create output directory for graphs ---
output_dir = "reply_graphs"
# exist_ok=True replaces the exists()-then-makedirs() pair: it has no
# check-then-create race and is safe when the cell is re-run.
os.makedirs(output_dir, exist_ok=True)

# --- Step 3: Function to create directed graph ---
def create_directed_graph(df, year_filter=None):
    """Build a weighted directed graph with one edge per (author, video) pair.

    Every usable comment row contributes to an edge ``author -> video node``,
    where the video node is named ``"{video_id}_{met_gala_year}"``. The edge
    attribute ``weight`` counts how many rows share that author and video node.

    Args:
        df: DataFrame read for the columns ``video_id``, ``author`` and
            ``met_gala_year``. When ``year_filter`` is ``None``, an absent
            column is treated as an empty string on every row.
        year_filter: When not ``None``, only rows whose ``met_gala_year``
            equals this value are used. The comparison is against ``None``
            so a falsy year value such as ``0`` still filters.

    Returns:
        Tuple ``(G, video_nodes, user_nodes, video_info, graph_name)``:
        the ``nx.DiGraph``; the set of video node names; the set of author
        names; a dict mapping each video node to ``(video_id, year)`` as
        strings; and the label ``"Met Gala <year>"`` or ``"All Years"``.
    """
    G = nx.DiGraph()  # Directed graph

    # Filter by year if specified
    if year_filter is not None:
        df = df[df['met_gala_year'] == year_filter]
        graph_name = f"Met Gala {year_filter}"
    else:
        graph_name = "All Years"

    # Track node types
    video_nodes = set()
    user_nodes = set()
    video_info = {}

    print(f"\n=== Processing {graph_name} ===")
    print(f"Processing {len(df)} comments...")

    def _column(name):
        """Return the column's values, or empty strings if the column is absent."""
        return df[name] if name in df.columns else [''] * len(df)

    def _is_missing(value):
        """True for None/NaN and for values whose string form is empty."""
        return pd.isna(value) or str(value) == ''

    # Iterate the three needed columns directly: cheaper than iterrows(), and
    # values keep their column dtype instead of being coerced per row.
    rows = zip(_column('video_id'), _column('author'), _column('met_gala_year'))
    for raw_video_id, raw_author, raw_year in rows:
        # Skip if essential data is missing. This must be tested on the raw
        # values: after str() a NaN is the text "nan" and would never be
        # detected, which used to create bogus "nan" nodes.
        if _is_missing(raw_video_id) or _is_missing(raw_author):
            continue

        video_id = str(raw_video_id)
        author = str(raw_author)
        met_gala_year = str(raw_year)

        # Video node name combines the video id and the year.
        video_node = f"{video_id}_{met_gala_year}"
        video_info[video_node] = (video_id, met_gala_year)

        # Add nodes to respective sets
        video_nodes.add(video_node)
        user_nodes.add(author)

        # Add directed edge: user -> video (user comments on video)
        if G.has_edge(author, video_node):
            G[author][video_node]['weight'] += 1
        else:
            G.add_edge(author, video_node, weight=1)

    return G, video_nodes, user_nodes, video_info, graph_name

# --- Step 4: Function to analyze graph ---
def analyze_graph(G, video_nodes, user_nodes, video_info, graph_name):
    """Print a summary and three top-10 rankings for a user -> video graph.

    Args:
        G: Directed graph whose edges carry a ``weight`` attribute.
        video_nodes: Collection of node names that are videos.
        user_nodes: Collection of node names that are users.
        video_info: Mapping from video node name to ``(video_id, year)``.
        graph_name: Label used in the printed headings.

    Returns:
        None. All results are printed. If the graph has no nodes, only the
        summary and a "no data" notice are printed.
    """
    top_n = 10

    def ranked(entries, position):
        """Largest-first by the value at ``position``; ties keep input order."""
        return sorted(entries, key=lambda entry: entry[position], reverse=True)[:top_n]

    node_total = G.number_of_nodes()

    print(f"\n=== {graph_name} Graph Summary ===")
    print(f"Unique videos: {len(video_nodes)}")
    print(f"Unique users: {len(user_nodes)}")
    print(f"Total nodes: {node_total}")
    print(f"Total edges (user → video): {G.number_of_edges()}")

    if node_total == 0:
        print("No data to analyze for this period.")
        return

    # Users ranked by out-degree, i.e. the number of distinct videos they commented on.
    print(f"\n--- Most Active Users ({graph_name}) ---")
    user_reach = [(node, G.out_degree(node)) for node in G.nodes() if node in user_nodes]
    for rank, (user, degree) in enumerate(ranked(user_reach, 1), start=1):
        print(f"{rank}. {user}: commented on {degree} videos")

    # Videos ranked by in-degree, i.e. the number of distinct users commenting on them.
    print(f"\n--- Most Popular Videos ({graph_name}) ---")
    video_audience = [(node, G.in_degree(node)) for node in G.nodes() if node in video_nodes]
    for rank, (video_node, degree) in enumerate(ranked(video_audience, 1), start=1):
        video_id, year = video_info[video_node]
        print(f"{rank}. Video {video_id} ({year}): {degree} unique commenters")

    # Individual user -> video edges ranked by weight (comments on that one video).
    print(f"\n--- Highest Comment Volumes ({graph_name}) ---")
    volumes = []
    for source, target, attrs in G.edges(data=True):
        if source not in user_nodes:
            continue
        video_id, year = video_info[target]
        volumes.append((source, video_id, year, attrs['weight']))

    for rank, (user, video_id, year, weight) in enumerate(ranked(volumes, 3), start=1):
        print(f"{rank}. {user} → Video {video_id} ({year}): {weight} comments")

# --- Step 5: Function to save graph ---
def save_graph(G, video_nodes, user_nodes, video_info, graph_name, year_filter=None):
    """Write ``G`` as a GraphML file inside the module-level ``output_dir``.

    Args:
        G: Graph to serialise with ``nx.write_graphml``.
        video_nodes: Unused here; kept so the call signature matches the
            other graph helpers.
        user_nodes: Unused here (see ``video_nodes``).
        video_info: Unused here (see ``video_nodes``).
        graph_name: Label used only in the confirmation message.
        year_filter: When not ``None``, the file is named
            ``reply_graph_<year_filter>.graphml``; otherwise
            ``reply_graph_all_years.graphml``.

    Returns:
        None. The file path is printed after writing.
    """
    # Compare against None rather than truthiness: a falsy year value such
    # as 0 must get its own file, not overwrite the all-years graph.
    if year_filter is not None:
        filename_base = f"reply_graph_{year_filter}"
    else:
        filename_base = "reply_graph_all_years"

    # Save as GraphML
    graphml_path = os.path.join(output_dir, f"{filename_base}.graphml")
    nx.write_graphml(G, graphml_path)

    print(f"Saved {graph_name} graph: {graphml_path}")

# --- Step 6: Create and analyze all-years graph ---
G_all, video_nodes_all, user_nodes_all, video_info_all, name_all = create_directed_graph(comments_df)
analyze_graph(G_all, video_nodes_all, user_nodes_all, video_info_all, name_all)
save_graph(G_all, video_nodes_all, user_nodes_all, video_info_all, name_all)

# --- Step 7: Create and analyze individual year graphs ---
unique_years = sorted(comments_df['met_gala_year'].dropna().unique())
print(f"\n=== Found {len(unique_years)} unique years: {unique_years} ===")

# Maps year -> graph, filled in ascending year order (dicts keep insertion order).
year_graphs = {}
for year in unique_years:
    G_year, video_nodes_year, user_nodes_year, video_info_year, name_year = create_directed_graph(comments_df, year)

    if G_year.number_of_nodes() > 0:  # Only analyze if graph has data
        analyze_graph(G_year, video_nodes_year, user_nodes_year, video_info_year, name_year)
        save_graph(G_year, video_nodes_year, user_nodes_year, video_info_year, name_year, year)
        year_graphs[year] = G_year
    else:
        print(f"No data found for {year}")

# --- Step 8: Cross-year comparison ---
print("\n=== Cross-Year Comparison ===")
# Use a dedicated loop variable: rebinding the global name `G` here would
# silently replace the co-mention graph built in the earlier cell.
for year, year_graph in year_graphs.items():
    print(f"{year}: {year_graph.number_of_nodes()} nodes, {year_graph.number_of_edges()} edges")

# --- Step 9: Users active across multiple years ---
print("\n=== Users Active Across Multiple Years ===")
user_years = comments_df.groupby('author')['met_gala_year'].nunique().sort_values(ascending=False)
multi_year_users = user_years[user_years > 1]

if len(multi_year_users) > 0:
    for user, year_count in multi_year_users.head(10).items():
        # Filter this author's rows once and reuse them for both statistics.
        user_comments = comments_df[comments_df['author'] == user]
        years = sorted(user_comments['met_gala_year'].unique())
        total_comments = len(user_comments)
        print(f"- {user}: active in {year_count} years ({', '.join(map(str, years))}) - {total_comments} total comments")
else:
    print("No users found commenting across multiple years")

print("\n=== Analysis Complete! ===")
print(f"GraphML files saved in '{output_dir}' directory:")
print("- All-years graph: reply_graph_all_years.graphml")
for year in year_graphs:
    print(f"- {year} graph: reply_graph_{year}.graphml")

Loaded 69780 comments

=== Processing All Years ===
Processing 69780 comments...

=== All Years Graph Summary ===
Unique videos: 22
Unique users: 50795
Total nodes: 50817
Total edges (user → video): 54805

--- Most Active Users (All Years) ---
1. @levjerraz6983: commented on 9 videos
2. @pollycipher: commented on 7 videos
3. @krishniarumugam: commented on 7 videos
4. @babybluecheeks: commented on 7 videos
5. @johnpearson1258: commented on 6 videos
6. @hautelemode: commented on 6 videos
7. @duchessedeberne3909: commented on 6 videos
8. @CARATMom: commented on 6 videos
9. @charlottemccaig578: commented on 6 videos
10. @dakotac180: commented on 6 videos

--- Most Popular Videos (All Years) ---
1. Video NW2oiPiqByk (2025): 8152 unique commenters
2. Video ZMrgtotgThk (2021): 7114 unique commenters
3. Video PbRZcvVnF0w (2022): 5241 unique commenters
4. Video P71sr0kZY7o (2024): 4509 unique commenters
5. Video AyFzKATCiv0 (2025): 3694 unique commenters
6. Video jlR-T42I18E (2024): 2948 unique