In [3]:
import csv

import matplotlib
import numpy as np
import pandas as pd

# Load the YouTube comments CSV (this is the comments file, not the celebrity list).
comments_df = pd.read_csv('../../datasets/processed/youtube_comments_with_topics.csv')

# Load the attendee (celebrity) CSV.
celeb_df = pd.read_csv("../../datasets/attendees.csv")

# Sanity-check the imports.
# NOTE: a notebook cell only renders its final expression, so only
# celeb_df.head(5) is displayed; the comments_df.head(5) result is computed
# and then discarded.
comments_df.head(5)
celeb_df.head(5)

Unnamed: 0,Name,Year,Gender
0,Billie Eilish,2021,Unknown
1,A$AP Rocky,2021,Male
2,Rihanna,2021,Female
3,Jennifer Lopez,2021,Female
4,Lil Nas X,2021,Male


In [4]:
import json
import pandas as pd
import re
import networkx as nx
from itertools import combinations

# --- Step 1: Build a matcher for attendee names ---
# Normalise names (trimmed, lower-cased) and drop blanks so the alternation
# below never contains an empty branch.
celeb_names = celeb_df['Name'].dropna().astype(str).str.strip().str.lower()
celeb_names = celeb_names[celeb_names != ''].unique()

# Longest names first: regex alternation takes the first branch that matches,
# so a short name that is a prefix of a longer one would otherwise shadow it.
celeb_patterns = [
    re.escape(name) for name in sorted(celeb_names, key=len, reverse=True)
]

if celeb_patterns:
    # (?<!\w) / (?!\w) instead of \b: \b requires a word character on one side
    # of the boundary, so it can never match names that start or end with
    # punctuation. For names that start and end with word characters the two
    # forms are equivalent.
    celeb_regex = re.compile(
        r'(?<!\w)(?:' + '|'.join(celeb_patterns) + r')(?!\w)'
    )
else:
    # With no names, an empty alternation would match the empty string at
    # every position; (?!) is a pattern that never matches anything.
    celeb_regex = re.compile(r'(?!)')

# --- Step 2: Report the size of the YouTube comments data ---
# Nothing is read from disk here; comments_df must already exist in the kernel.
print(f"Loaded {len(comments_df)} comments")

# --- Step 3: Initialize Graph ---
# Undirected graph. The loop further down adds one node per mentioned
# celebrity and stores the co-mention count in each edge's 'weight' attribute.
G = nx.Graph()

# --- Step 4: Scan for co-mentions ---
def extract_mentions(text):
    """Return the distinct celebrity names mentioned in ``text``.

    Matching is case-insensitive: ``text`` is lower-cased and then searched
    with the module-level ``celeb_regex``.

    Args:
        text: The comment body. Anything that is not a non-empty string
            (``None``, ``NaN``, numbers, ``""``) is treated as containing
            no mentions instead of raising.

    Returns:
        list[str]: The unique matches (lower-cased, because the text is
        lower-cased before searching), sorted alphabetically so the result
        is reproducible between runs rather than following set order.
    """
    if not isinstance(text, str) or not text:
        return []
    return sorted(set(celeb_regex.findall(text.lower())))

# Process each comment
# Read the 'text' column directly instead of iterrows(): iterrows() builds a
# Series per row (slow on tens of thousands of rows), and str() on a missing
# value produces the literal text "nan", which would then be searched.
# A missing 'text' column yields no comments, matching the previous
# row.get('text', '') fallback.
text_column = comments_df.get('text')
comment_texts = [] if text_column is None else text_column.dropna().astype(str)

mention_count = 0
for comment_text in comment_texts:
    # Find mentioned celebrities
    mentioned = extract_mentions(comment_text)

    # A co-mention needs at least two distinct celebrities in one comment.
    if len(mentioned) < 2:
        continue
    mention_count += 1

    # Sorting gives every pair a canonical (alphabetical) orientation.
    for celeb1, celeb2 in combinations(sorted(mentioned), 2):
        if G.has_edge(celeb1, celeb2):
            G[celeb1][celeb2]['weight'] += 1
        else:
            G.add_edge(celeb1, celeb2, weight=1)

# --- Step 5: Graph Summary ---
node_total = G.number_of_nodes()
edge_total = G.number_of_edges()

print(f"\n=== Co-mention Graph Summary ===")
print(f"Total comments processed: {len(comments_df)}")
print(f"Comments with multiple celebrity mentions: {mention_count}")
print(f"Nodes (Celebrities): {node_total}")
print(f"Edges (Co-mentions): {edge_total}")

# --- Step 6: Top co-mentions ---
if edge_total > 0:
    print(f"\n=== Top 10 Celebrity Co-mentions ===")
    # (celeb, celeb, weight) triples, heaviest first; the stable sort keeps
    # equal-weight pairs in graph order.
    edges_with_weights = sorted(
        ((u, v, data['weight']) for u, v, data in G.edges(data=True)),
        key=lambda triple: triple[2],
        reverse=True,
    )

    for rank, (celeb1, celeb2, weight) in enumerate(edges_with_weights[:10], start=1):
        print(f"{rank}. {celeb1.title()} & {celeb2.title()}: {weight} co-mentions")

# --- Step 7: Most connected celebrities ---
# "Connections" is the node degree: the number of distinct celebrities a
# name was co-mentioned with, not the number of comments.
print(f"\n=== Most Connected Celebrities ===")
if G.number_of_nodes() > 0:
    degrees = dict(G.degree())
    # Highest degree first; ties stay in graph order because the sort is stable.
    sorted_degrees = sorted(
        degrees.items(),
        key=lambda item: item[1],
        reverse=True,
    )

    for rank, (celeb, degree) in enumerate(sorted_degrees[:10], start=1):
        print(f"{rank}. {celeb.title()}: {degree} connections")

# --- Optional: Save graph for further analysis ---
nx.write_graphml(G, "youtube_celeb_co_mentions.graphml")

print(f"\nGraph analysis complete!")

Loaded 69780 comments

=== Co-mention Graph Summary ===
Total comments processed: 69780
Comments with multiple celebrity mentions: 117
Nodes (Celebrities): 25
Edges (Co-mentions): 58

=== Top 10 Celebrity Co-mentions ===
1. Rihanna & Zendaya: 33 co-mentions
2. Rihanna & Iman: 14 co-mentions
3. Iman & Normani: 14 co-mentions
4. Ciara & Iman: 13 co-mentions
5. Rihanna & Ciara: 10 co-mentions
6. Rihanna & Normani: 10 co-mentions
7. Rihanna & Maluma: 7 co-mentions
8. Maluma & Ciara: 7 co-mentions
9. Rihanna & Lizzo: 6 co-mentions
10. Zendaya & Lizzo: 6 co-mentions

=== Most Connected Celebrities ===
1. Rihanna: 14 connections
2. Zendaya: 14 connections
3. Lizzo: 8 connections
4. Maluma: 8 connections
5. Emma Chamberlain: 8 connections
6. Shakira: 7 connections
7. Iman: 6 connections
8. Normani: 6 connections
9. Usher: 5 connections
10. Rosalía: 5 connections

Graph analysis complete!


In [5]:
# --- Step 2: Create a bipartite graph (Authors <-> Videos) ---
# NOTE: this rebinds G. Any graph previously held under this name is no longer
# reachable through G after this line.
G = nx.Graph()

# Track which node labels are videos and which are authors; the graph itself
# stores no node-type attribute, so these sets are the only way to tell.
video_nodes = set()
author_nodes = set()

# --- Step 3: Process each comment to create connections ---
video_info = {}  # Maps video node label ("<video_id>_<met_gala_year>") -> (video_id, met_gala_year)

# Iterate the three needed columns directly rather than using iterrows():
# it is faster and hands back the raw cell values.
for raw_video_id, raw_author, raw_year in zip(
    comments_df['video_id'], comments_df['author'], comments_df['met_gala_year']
):
    # Skip if essential data is missing.
    # The NaN test must run BEFORE str(): str(NaN) is the non-empty text "nan",
    # so testing afterwards can never detect a missing field and would create
    # a bogus "nan" author/video node instead.
    if pd.isna(raw_video_id) or pd.isna(raw_author):
        continue

    video_id = str(raw_video_id)
    author = str(raw_author)
    if video_id == '' or author == '':
        continue

    met_gala_year = str(raw_year)

    # Create video node label combining video_id and met_gala_year
    video_node = f"{video_id}_{met_gala_year}"
    video_info[video_node] = (video_id, met_gala_year)

    # Add nodes to respective sets
    video_nodes.add(video_node)
    author_nodes.add(author)

    # Add edge between author and video (or increase weight if exists);
    # 'weight' therefore counts this author's comments on this video.
    if G.has_edge(author, video_node):
        G[author][video_node]['weight'] += 1
    else:
        G.add_edge(author, video_node, weight=1)

# --- Step 4: Graph Summary ---
total_nodes = G.number_of_nodes()

print(f"\n=== Reply Graph Summary ===")
print(f"Total comments processed: {len(comments_df)}")
print(f"Unique videos: {len(video_nodes)}")
print(f"Unique authors: {len(author_nodes)}")
print(f"Total nodes: {total_nodes}")
print(f"Total edges (author-video connections): {G.number_of_edges()}")

# --- Step 5: Top active authors (most videos commented on) ---
if total_nodes > 0:
    print(f"\n=== Most Active Authors ===")
    # Degree of an author node = number of distinct videos commented on.
    # Iterating G.degree() keeps the graph's node order, so ties sort the
    # same way as a loop over G.nodes().
    author_degrees = {
        node: node_degree
        for node, node_degree in G.degree()
        if node in author_nodes
    }

    sorted_authors = sorted(
        author_degrees.items(), key=lambda item: item[1], reverse=True
    )
    for rank, (author, degree) in enumerate(sorted_authors[:10], start=1):
        print(f"{rank}. {author}: commented on {degree} videos")

# --- Step 6: Most commented videos ---
print(f"\n=== Most Commented Videos ===")
# Degree of a video node = number of distinct authors who commented on it.
video_degrees = {
    node: node_degree
    for node, node_degree in G.degree()
    if node in video_nodes
}

sorted_videos = sorted(
    video_degrees.items(), key=lambda item: item[1], reverse=True
)
for rank, (video_node, degree) in enumerate(sorted_videos[:10], start=1):
    # Recover the original id/year pair from the combined node label.
    video_id, year = video_info[video_node]
    print(f"{rank}. Video {video_id} ({year}): {degree} unique commenters")

# --- Step 7: Comments per video analysis ---
print(f"\n=== Comment Volume per Video ===")
# One row per (video_id, met_gala_year) with the raw number of comments.
comment_counts = comments_df.groupby(['video_id', 'met_gala_year']).size().reset_index(name='comment_count')
comment_counts_sorted = comment_counts.sort_values('comment_count', ascending=False)

# Number the rows with enumerate(): the index carried by the sorted frame is
# the row's position BEFORE sorting, so using it as the rank prints arbitrary
# numbers instead of 1..10.
top_videos = comment_counts_sorted.head(10)
for rank, row in enumerate(top_videos.itertuples(index=False), start=1):
    print(f"{rank}. Video {row.video_id} ({row.met_gala_year}): {row.comment_count} total comments")

# --- Step 8: Author activity analysis ---
print(f"\n=== Author Comment Volume ===")
# value_counts() is already ordered from most to fewest comments.
author_comment_counts = comments_df['author'].value_counts()
for rank, (author, count) in enumerate(author_comment_counts.head(10).items(), start=1):
    print(f"{rank}. {author}: {count} total comments")

# --- Step 9: Cross-year author activity ---
print(f"\n=== Authors Active Across Multiple Years ===")
# Number of distinct Met Gala years each author commented in, largest first.
author_years = comments_df.groupby('author')['met_gala_year'].nunique().sort_values(ascending=False)
multi_year_authors = author_years[author_years > 1]

if len(multi_year_authors) == 0:
    print("No authors found commenting across multiple years")
else:
    for author, year_count in multi_year_authors.head(10).items():
        # Look up the actual years for this author to list them in order.
        author_mask = comments_df['author'] == author
        years = sorted(comments_df[author_mask]['met_gala_year'].unique())
        year_list = ', '.join(map(str, years))
        print(f"- {author}: active in {year_count} years ({year_list})")

# --- Optional: Save graph for further analysis ---
# Save as GraphML (can be opened in Gephi, NetworkX, etc.)
nx.write_graphml(G, "author_video_reply_graph.graphml")

# Save edge list with weights.
# - csv.writer quotes fields that contain commas/quotes/newlines (author names
#   are free text); lineterminator="\n" keeps the previous line endings.
# - encoding="utf-8" so non-ASCII author names do not depend on the platform
#   default encoding.
with open("author_video_edges.txt", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["author", "video_node", "video_id", "met_gala_year", "comment_count"])
    for node_a, node_b, data in G.edges(data=True):
        # G is undirected: an edge is reported starting from whichever endpoint
        # comes first in node order, so it can arrive as (author, video) OR
        # (video, author). Normalise the orientation instead of dropping the
        # (video, author) ones, which would omit most edges from the file.
        if node_b in video_nodes:
            author, video_node = node_a, node_b
        else:
            author, video_node = node_b, node_a
        video_id, year = video_info[video_node]
        writer.writerow([author, video_node, video_id, year, data['weight']])

print(f"\nGraph analysis complete!")
print(f"Files saved: 'author_video_reply_graph.graphml' and 'author_video_edges.txt'")


=== Reply Graph Summary ===
Total comments processed: 69780
Unique videos: 22
Unique authors: 50795
Total nodes: 50817
Total edges (author-video connections): 54805

=== Most Active Authors ===
1. @levjerraz6983: commented on 9 videos
2. @pollycipher: commented on 7 videos
3. @krishniarumugam: commented on 7 videos
4. @babybluecheeks: commented on 7 videos
5. @johnpearson1258: commented on 6 videos
6. @hautelemode: commented on 6 videos
7. @duchessedeberne3909: commented on 6 videos
8. @CARATMom: commented on 6 videos
9. @charlottemccaig578: commented on 6 videos
10. @dakotac180: commented on 6 videos

=== Most Commented Videos ===
1. Video NW2oiPiqByk (2025): 8152 unique commenters
2. Video ZMrgtotgThk (2021): 7114 unique commenters
3. Video PbRZcvVnF0w (2022): 5241 unique commenters
4. Video P71sr0kZY7o (2024): 4509 unique commenters
5. Video AyFzKATCiv0 (2025): 3694 unique commenters
6. Video jlR-T42I18E (2024): 2948 unique commenters
7. Video iWS3oVeyjL4 (2025): 2827 unique commen