In [1]:
import pandas as pd
import numpy as np
import matplotlib

# import the celeb csv file.
comments_df = pd.read_csv('../../datasets/raw/youtube_comments.csv')

# import celeb csv file.
celeb_df = pd.read_csv("../../datasets/attendees.csv")

# check is imported.
comments_df.head(5)
celeb_df.head(5)

Unnamed: 0,Name,Year,Gender
0,Billie Eilish,2021,Unknown
1,A$AP Rocky,2021,Male
2,Rihanna,2021,Female
3,Jennifer Lopez,2021,Female
4,Lil Nas X,2021,Male


In [None]:
import json
import pandas as pd
import re
import networkx as nx
from itertools import combinations
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter

print(f"Attendees CSV columns: {list(celeb_df.columns)}")
print(f"Comments CSV columns: {list(comments_df.columns)}")

# --- Step 2: Text preprocessing function (for celebrity names only) ---
def preprocess_text(text):
    """Preprocess celebrity names for better matching with already-preprocessed comments"""
    if pd.isna(text) or text == "":
        return ""
    
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove extra whitespace and normalize
    text = re.sub(r'\s+', ' ', text)
    
    # Remove common punctuation that might interfere with name matching
    text = re.sub(r'[^\w\s\'-]', ' ', text)
    
    # Handle common variations
    text = re.sub(r'\bkim\s*k\b', 'kim kardashian', text)
    text = re.sub(r'\bkardashian\b', 'kim kardashian', text)  
    text = re.sub(r'\bzendaya\b', 'zendaya', text)
    text = re.sub(r'\btaylor\s*swift\b', 'taylor swift', text)
    text = re.sub(r'\bt\s*swift\b', 'taylor swift', text)
    text = re.sub(r'\bbeyonce\b', 'beyonce', text)
    text = re.sub(r'\bbey\b', 'beyonce', text)
    text = re.sub(r'\bjlo\b', 'jennifer lopez', text)
    text = re.sub(r'\bj\s*lo\b', 'jennifer lopez', text)
    text = re.sub(r'\bariana\b', 'ariana grande', text)
    text = re.sub(r'\bari\b', 'ariana grande', text)
    text = re.sub(r'\bselena\b', 'selena gomez', text)
    text = re.sub(r'\briri\b', 'rihanna', text)
    text = re.sub(r'\bgaga\b', 'lady gaga', text)
    text = re.sub(r'\blady\s*gaga\b', 'lady gaga', text)
    text = re.sub(r'\bdua\b', 'dua lipa', text)
    text = re.sub(r'\bbillie\b', 'billie eilish', text)
    text = re.sub(r'\bthe\s*weeknd\b', 'the weeknd', text)
    text = re.sub(r'\bweeknd\b', 'the weeknd', text)
    
    return text.strip()

# Create mapping of celebrity -> years they attended
# Only preprocess celebrity names, comments are already preprocessed
celeb_year_mapping = {}
for index, row in celeb_df.iterrows():
    name = row.get('Name')
    year = row.get('Year')  # Attendees CSV uses 'Year' column (capital Y)
    
    if pd.notna(name) and pd.notna(year):
        processed_name = preprocess_text(name)  # Preprocess celebrity name to match preprocessed comments
        if processed_name:
            if processed_name not in celeb_year_mapping:
                celeb_year_mapping[processed_name] = set()
            celeb_year_mapping[processed_name].add(year)

print(f"\nLoaded celebrity attendance data for {len(celeb_year_mapping)} celebrities:")
for celeb, years in list(celeb_year_mapping.items())[:10]:  # Show first 10 as example
    print(f"  {celeb}: {sorted(years)}")
if len(celeb_year_mapping) > 10:
    print(f"  ... and {len(celeb_year_mapping) - 10} more")

# --- Step 3: Setup celebrity patterns (full names only) ---
raw_celeb_names = celeb_df['Name'].dropna().unique()

# Create mapping system for full names only
celeb_name_mapping = {}
all_patterns = []

for original_name in raw_celeb_names:
    processed_name = preprocess_text(original_name)
    if not processed_name:
        continue
    
    # Store the canonical name (full name only)
    canonical_name = processed_name
    
    # Add full name pattern only
    celeb_name_mapping[processed_name] = canonical_name
    all_patterns.append((processed_name, canonical_name, len(processed_name)))

# Sort patterns by length (longest first) for better matching
all_patterns.sort(key=lambda x: x[2], reverse=True)
pattern_strings = [re.escape(pattern[0]) for pattern in all_patterns]

celeb_regex = re.compile(r'\b(?:' + '|'.join(pattern_strings) + r')\b', re.IGNORECASE)

print(f"Loaded {len(raw_celeb_names)} celebrity names")
print(f"Created {len(pattern_strings)} search patterns (full names only)")
print(f"Loaded {len(comments_df)} comments")

# --- Step 4: Create output directory and data collection structures ---
output_dir = "comention_graphs"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Data collection structures for plotting
celebrity_mentions_by_year = defaultdict(lambda: defaultdict(int))  # {year: {celebrity: count}}
celebrity_mentions_overall = defaultdict(int)  # {celebrity: total_count}
yearly_totals = defaultdict(int)  # {year: total_mentions}

# --- Step 5: Function to extract celebrity mentions with year filtering ---
def extract_mentions(text, comment_year):
    """Extract celebrity mentions from text (text is already preprocessed)"""
    if pd.isna(text) or text == "":
        return []
    
    # Text is already preprocessed, use as-is
    processed_text = str(text).lower()
    
    # Find all matches with their positions
    matches = []
    for match in celeb_regex.finditer(processed_text):
        match_text = match.group().lower()
        if match_text in celeb_name_mapping:
            celebrity_name = celeb_name_mapping[match_text]
            
            # Only include if celebrity attended in this comment's year
            if celebrity_name in celeb_year_mapping and comment_year in celeb_year_mapping[celebrity_name]:
                matches.append((match.start(), match.end(), match_text, celebrity_name))
    
    # Remove overlapping matches (prefer longer matches)
    matches.sort(key=lambda x: x[0])
    
    non_overlapping = []
    for i, (start, end, match_text, canonical) in enumerate(matches):
        # Check if this match overlaps with any previously accepted match
        overlaps = False
        for prev_start, prev_end, _, _ in non_overlapping:
            if not (end <= prev_start or start >= prev_end):  # They overlap
                overlaps = True
                break
        
        if not overlaps:
            non_overlapping.append((start, end, match_text, canonical))
    
    # Extract unique celebrity names (remove duplicates where same person mentioned multiple times)
    celebrity_mentions = set()
    for _, _, _, canonical in non_overlapping:
        celebrity_mentions.add(canonical)
    
    return list(celebrity_mentions)

# --- Step 6: Function to create co-mention graph and collect mention data ---
def create_comention_graph(df, year_filter=None, collect_data=False):
    """Create co-mention graph for celebrities mentioned together and optionally collect mention data"""
    G = nx.Graph()
    
    # Filter by year if specified
    if year_filter:
        df = df[df['met_gala_year'] == year_filter]
        graph_name = f"Met Gala {year_filter}"
    else:
        graph_name = "All Years"
    
    print(f"\n=== Processing {graph_name} ===")
    print(f"Processing {len(df)} comments...")
    
    mention_count = 0
    total_mentions = 0
    
    # Process each comment
    for index, row in df.iterrows():
        # Extract text and year from the comment
        comment_text = str(row.get('text', ''))
        comment_year = row.get('met_gala_year')  # Extract the comment year!
        
        if pd.isna(comment_year):
            continue
        
        # Find mentioned celebrities (filtered by year they attended)
        mentioned = extract_mentions(comment_text, comment_year)
        total_mentions += len(mentioned)
        
        # Collect data for plotting if requested
        if collect_data:
            for celebrity in mentioned:
                celebrity_mentions_by_year[comment_year][celebrity] += 1
                celebrity_mentions_overall[celebrity] += 1
                yearly_totals[comment_year] += 1
        
        if len(mentioned) > 1:
            mention_count += 1
            # Create edges for all pairs of mentioned celebrities
            for celeb1, celeb2 in combinations(sorted(mentioned), 2):
                if G.has_edge(celeb1, celeb2):
                    G[celeb1][celeb2]['weight'] += 1
                else:
                    G.add_edge(celeb1, celeb2, weight=1)
    
    return G, mention_count, total_mentions, graph_name

# --- Step 7: Function to analyze graph ---
def analyze_comention_graph(G, mention_count, total_mentions, graph_name, df_size):
    """Analyze and print co-mention graph statistics"""
    print(f"\n=== {graph_name} Co-mention Graph Summary ===")
    print(f"Total comments processed: {df_size}")
    print(f"Total celebrity mentions: {total_mentions}")
    print(f"Comments with multiple celebrity mentions: {mention_count}")
    print(f"Nodes (Celebrities): {G.number_of_nodes()}")
    print(f"Edges (Co-mentions): {G.number_of_edges()}")
    
    if G.number_of_edges() == 0:
        print("No co-mentions found for this period.")
        return
    
    # Top co-mentions
    print(f"\n--- Top 10 Celebrity Co-mentions ({graph_name}) ---")
    edges_with_weights = [(u, v, data['weight']) for u, v, data in G.edges(data=True)]
    edges_with_weights.sort(key=lambda x: x[2], reverse=True)
    
    for i, (celeb1, celeb2, weight) in enumerate(edges_with_weights[:10]):
        print(f"{i+1}. {celeb1.title()} & {celeb2.title()}: {weight} co-mentions")
    
    # Most connected celebrities
    print(f"\n--- Most Connected Celebrities ({graph_name}) ---")
    if G.number_of_nodes() > 0:
        degrees = dict(G.degree())
        sorted_degrees = sorted(degrees.items(), key=lambda x: x[1], reverse=True)
        
        for i, (celeb, degree) in enumerate(sorted_degrees[:10]):
            print(f"{i+1}. {celeb.title()}: {degree} connections")
    
    # Additional statistics
    if G.number_of_nodes() > 0:
        print(f"\n--- Network Statistics ({graph_name}) ---")
        print(f"Average degree: {sum(dict(G.degree()).values()) / G.number_of_nodes():.2f}")
        
        # Most frequently co-mentioned pairs
        total_weight = sum([data['weight'] for u, v, data in G.edges(data=True)])
        print(f"Total co-mention instances: {total_weight}")
        
        if G.number_of_edges() > 0:
            avg_weight = total_weight / G.number_of_edges()
            print(f"Average co-mentions per pair: {avg_weight:.2f}")

# --- Step 8: Function to save graph ---
def save_comention_graph(G, graph_name, year_filter=None):
    """Save co-mention graph as GraphML file"""
    if year_filter:
        filename = f"celeb_comentions_{year_filter}.graphml"
    else:
        filename = "celeb_comentions_all_years.graphml"
    
    filepath = os.path.join(output_dir, filename)
    nx.write_graphml(G, filepath)
    print(f"Saved {graph_name} graph: {filepath}")

# --- Step 9: Create and analyze all-years graph (with data collection) ---
G_all, mentions_all, total_mentions_all, name_all = create_comention_graph(comments_df, collect_data=True)
analyze_comention_graph(G_all, mentions_all, total_mentions_all, name_all, len(comments_df))
save_comention_graph(G_all, name_all)

# --- Step 10: Create and analyze individual year graphs ---
unique_years = sorted(comments_df['met_gala_year'].dropna().unique())
print(f"\n=== Found {len(unique_years)} unique years: {unique_years} ===")

year_graphs = {}
for year in unique_years:
    year_df = comments_df[comments_df['met_gala_year'] == year]
    G_year, mentions_year, total_mentions_year, name_year = create_comention_graph(comments_df, year)
    
    if G_year.number_of_nodes() > 0:  # Only analyze if graph has data
        analyze_comention_graph(G_year, mentions_year, total_mentions_year, name_year, len(year_df))
        save_comention_graph(G_year, name_year, year)
        year_graphs[year] = G_year
    else:
        print(f"No co-mentions found for {year}")

# --- Step 11: Cross-year comparison ---
print(f"\n=== Cross-Year Co-mention Comparison ===")
for year in unique_years:
    if year in year_graphs:
        G = year_graphs[year]
        total_weight = sum([data['weight'] for u, v, data in G.edges(data=True)])
        print(f"{year}: {G.number_of_nodes()} celebrities, {G.number_of_edges()} pairs, {total_weight} total co-mentions")

# --- Step 12: Most mentioned celebrities across all data (with year filtering) ---
print(f"\n=== Most Mentioned Celebrities (All Years, Year-Filtered) ===")
all_celebrity_mentions = []
for index, row in comments_df.iterrows():
    comment_text = str(row.get('text', ''))
    comment_year = row.get('met_gala_year')
    
    if pd.notna(comment_year):
        mentioned = extract_mentions(comment_text, comment_year)
        all_celebrity_mentions.extend(mentioned)

if all_celebrity_mentions:
    mention_counts = Counter(all_celebrity_mentions)
    
    for i, (celeb, count) in enumerate(mention_counts.most_common(15)):
        # Show which years this celebrity attended
        years_attended = sorted(celeb_year_mapping.get(celeb, []))
        print(f"{i+1}. {celeb.title()}: mentioned {count} times (attended: {years_attended})")
else:
    print("No valid celebrity mentions found after year filtering")

# --- Step 13: Year-specific mention analysis ---
print(f"\n=== Celebrity Mentions by Year ===")
for year in unique_years:
    year_comments = comments_df[comments_df['met_gala_year'] == year]
    year_mentions = []
    
    for index, row in year_comments.iterrows():
        comment_text = str(row.get('text', ''))
        comment_year = row.get('met_gala_year')
        if pd.notna(comment_year):
            mentioned = extract_mentions(comment_text, comment_year)
            year_mentions.extend(mentioned)
    
    if year_mentions:
        year_counts = Counter(year_mentions)
        print(f"\n{year} - Top mentioned celebrities:")
        for i, (celeb, count) in enumerate(year_counts.most_common(5)):
            print(f"  {i+1}. {celeb.title()}: {count} mentions")
    else:
        print(f"\n{year} - No valid celebrity mentions found")

# --- Step 14: Create simplified blue bar chart visualizations ---
def create_celebrity_mention_plots():
    """Create blue bar charts showing top celebrity mentions for each year"""
    print(f"\n=== Creating Celebrity Mention Bar Charts ===")
    
    # Set up the plotting style
    plt.style.use('default')
    
    # Years to analyze
    years_to_plot = [2021, 2022, 2023, 2024, 2025]
    
    # Create individual year plots
    for year in years_to_plot:
        if year in celebrity_mentions_by_year and len(celebrity_mentions_by_year[year]) > 0:
            # Get top 15 celebrities for this year
            year_data = dict(celebrity_mentions_by_year[year])
            top_celebs = dict(Counter(year_data).most_common(15))
            
            if len(top_celebs) > 0:
                # Create figure
                fig, ax = plt.subplots(figsize=(12, 8))
                
                celeb_names = [name.title() for name in top_celebs.keys()]
                mention_counts = list(top_celebs.values())
                
                # Create horizontal bar chart in blue
                bars = ax.barh(celeb_names, mention_counts, color='#1f77b4', alpha=0.8)
                
                ax.set_xlabel('Number of Mentions', fontsize=12, fontweight='bold')
                ax.set_ylabel('Celebrity', fontsize=12, fontweight='bold')
                ax.set_title(f'Top Celebrity Mentions - Met Gala {year}', fontsize=16, fontweight='bold')
                ax.grid(axis='x', alpha=0.3)
                
                # Add value labels on bars
                for bar, count in zip(bars, mention_counts):
                    ax.text(bar.get_width() + max(mention_counts) * 0.01, 
                           bar.get_y() + bar.get_height()/2, 
                           str(count), ha='left', va='center', fontweight='bold')
                
                # Adjust layout
                plt.tight_layout()
                
                # Save the plot
                plot_path = os.path.join(output_dir, f'celebrity_mentions_{year}.png')
                plt.savefig(plot_path, dpi=300, bbox_inches='tight')
                print(f"Saved {year} celebrity mentions chart: {plot_path}")
                
                plt.show()
                plt.close()
            else:
                print(f"No celebrity mention data found for {year}")
        else:
            print(f"No data available for {year}")
    
    # Create overall chart (all years combined)
    if len(celebrity_mentions_overall) > 0:
        # Get top 15 celebrities overall
        top_celebs_overall = dict(Counter(celebrity_mentions_overall).most_common(15))
        
        # Create figure
        fig, ax = plt.subplots(figsize=(12, 8))
        
        celeb_names = [name.title() for name in top_celebs_overall.keys()]
        mention_counts = list(top_celebs_overall.values())
        
        # Create horizontal bar chart in blue
        bars = ax.barh(celeb_names, mention_counts, color='#1f77b4', alpha=0.8)
        
        ax.set_xlabel('Number of Mentions', fontsize=12, fontweight='bold')
        ax.set_ylabel('Celebrity', fontsize=12, fontweight='bold')
        ax.set_title('Top Celebrity Mentions - All Years (2021-2025)', fontsize=16, fontweight='bold')
        ax.grid(axis='x', alpha=0.3)
        
        # Add value labels on bars
        for bar, count in zip(bars, mention_counts):
            ax.text(bar.get_width() + max(mention_counts) * 0.01, 
                   bar.get_y() + bar.get_height()/2, 
                   str(count), ha='left', va='center', fontweight='bold')
        
        # Adjust layout
        plt.tight_layout()
        
        # Save the plot
        plot_path = os.path.join(output_dir, 'celebrity_mentions_all_years.png')
        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        print(f"Saved all years celebrity mentions chart: {plot_path}")
        
        plt.show()
        plt.close()
    
    # Show summary
    print(f"\n--- Summary ---")
    print(f"Created blue bar charts for celebrity mentions:")
    for year in years_to_plot:
        if year in celebrity_mentions_by_year and len(celebrity_mentions_by_year[year]) > 0:
            total_mentions = sum(celebrity_mentions_by_year[year].values())
            unique_celebs = len(celebrity_mentions_by_year[year])
            print(f"  {year}: {total_mentions} total mentions, {unique_celebs} unique celebrities")
    
    total_overall = sum(celebrity_mentions_overall.values())
    unique_overall = len(celebrity_mentions_overall)
    print(f"  All Years: {total_overall} total mentions, {unique_overall} unique celebrities")

# Create the visualizations
create_celebrity_mention_plots()

print(f"\n=== Analysis Complete! ===")
print(f"Year filtering applied: Celebrities only counted in years they attended")
print(f"GraphML files saved in '{output_dir}' directory:")
print(f"- All-years graph: celeb_comentions_all_years.graphml")
for year in unique_years:
    if year in year_graphs:
        print(f"- {year} graph: celeb_comentions_{year}.graphml")

FileNotFoundError: [Errno 2] No such file or directory: 'attendees.csv'

In [None]:
import json
import pandas as pd
import re
import networkx as nx
from itertools import combinations
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter

# --- Step 1: Load data and create celebrity-year mapping ---
celeb_df = pd.read_csv("attendees.csv")  # Replace with your actual filename
comments_df = pd.read_csv("your_youtube_comments.csv")  # Replace with your actual filename

print(f"Attendees CSV columns: {list(celeb_df.columns)}")
print(f"Comments CSV columns: {list(comments_df.columns)}")

# --- Step 2: Text preprocessing function (for celebrity names only) ---
def preprocess_text(text):
    """Preprocess celebrity names for better matching with already-preprocessed comments"""
    if pd.isna(text) or text == "":
        return ""
    
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove extra whitespace and normalize
    text = re.sub(r'\s+', ' ', text)
    
    # Remove common punctuation that might interfere with name matching
    text = re.sub(r'[^\w\s\'-]', ' ', text)
    
    # Handle common variations
    text = re.sub(r'\bkim\s*k\b', 'kim kardashian', text)
    text = re.sub(r'\bkardashian\b', 'kim kardashian', text)  
    text = re.sub(r'\bzendaya\b', 'zendaya', text)
    text = re.sub(r'\btaylor\s*swift\b', 'taylor swift', text)
    text = re.sub(r'\bt\s*swift\b', 'taylor swift', text)
    text = re.sub(r'\bbeyonce\b', 'beyonce', text)
    text = re.sub(r'\bbey\b', 'beyonce', text)
    text = re.sub(r'\bjlo\b', 'jennifer lopez', text)
    text = re.sub(r'\bj\s*lo\b', 'jennifer lopez', text)
    text = re.sub(r'\bariana\b', 'ariana grande', text)
    text = re.sub(r'\bari\b', 'ariana grande', text)
    text = re.sub(r'\bselena\b', 'selena gomez', text)
    text = re.sub(r'\briri\b', 'rihanna', text)
    text = re.sub(r'\bgaga\b', 'lady gaga', text)
    text = re.sub(r'\blady\s*gaga\b', 'lady gaga', text)
    text = re.sub(r'\bdua\b', 'dua lipa', text)
    text = re.sub(r'\bbillie\b', 'billie eilish', text)
    text = re.sub(r'\bthe\s*weeknd\b', 'the weeknd', text)
    text = re.sub(r'\bweeknd\b', 'the weeknd', text)
    
    return text.strip()

# Create mapping of celebrity -> years they attended
# Only preprocess celebrity names, comments are already preprocessed
celeb_year_mapping = {}
for index, row in celeb_df.iterrows():
    name = row.get('Name')
    year = row.get('Year')  # Attendees CSV uses 'Year' column (capital Y)
    
    if pd.notna(name) and pd.notna(year):
        processed_name = preprocess_text(name)  # Preprocess celebrity name to match preprocessed comments
        if processed_name:
            if processed_name not in celeb_year_mapping:
                celeb_year_mapping[processed_name] = set()
            celeb_year_mapping[processed_name].add(year)

print(f"\nLoaded celebrity attendance data for {len(celeb_year_mapping)} celebrities:")
for celeb, years in list(celeb_year_mapping.items())[:10]:  # Show first 10 as example
    print(f"  {celeb}: {sorted(years)}")
if len(celeb_year_mapping) > 10:
    print(f"  ... and {len(celeb_year_mapping) - 10} more")

# --- Step 3: Setup celebrity patterns (full names only) ---
raw_celeb_names = celeb_df['Name'].dropna().unique()

# Create mapping system for full names only
celeb_name_mapping = {}
all_patterns = []

for original_name in raw_celeb_names:
    processed_name = preprocess_text(original_name)
    if not processed_name:
        continue
    
    # Store the canonical name (full name only)
    canonical_name = processed_name
    
    # Add full name pattern only
    celeb_name_mapping[processed_name] = canonical_name
    all_patterns.append((processed_name, canonical_name, len(processed_name)))

# Sort patterns by length (longest first) for better matching
all_patterns.sort(key=lambda x: x[2], reverse=True)
pattern_strings = [re.escape(pattern[0]) for pattern in all_patterns]

celeb_regex = re.compile(r'\b(?:' + '|'.join(pattern_strings) + r')\b', re.IGNORECASE)

print(f"Loaded {len(raw_celeb_names)} celebrity names")
print(f"Created {len(pattern_strings)} search patterns (full names only)")
print(f"Loaded {len(comments_df)} comments")

# --- Step 4: Create output directory and data collection structures ---
output_dir = "comention_graphs"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Data collection structures for plotting
celebrity_mentions_by_year = defaultdict(lambda: defaultdict(int))  # {year: {celebrity: count}}
celebrity_mentions_overall = defaultdict(int)  # {celebrity: total_count}
yearly_totals = defaultdict(int)  # {year: total_mentions}

# --- Step 5: Function to extract celebrity mentions with year filtering ---
def extract_mentions(text, comment_year):
    """Extract celebrity mentions from text (text is already preprocessed)"""
    if pd.isna(text) or text == "":
        return []
    
    # Text is already preprocessed, use as-is
    processed_text = str(text).lower()
    
    # Find all matches with their positions
    matches = []
    for match in celeb_regex.finditer(processed_text):
        match_text = match.group().lower()
        if match_text in celeb_name_mapping:
            celebrity_name = celeb_name_mapping[match_text]
            
            # Only include if celebrity attended in this comment's year
            if celebrity_name in celeb_year_mapping and comment_year in celeb_year_mapping[celebrity_name]:
                matches.append((match.start(), match.end(), match_text, celebrity_name))
    
    # Remove overlapping matches (prefer longer matches)
    matches.sort(key=lambda x: x[0])
    
    non_overlapping = []
    for i, (start, end, match_text, canonical) in enumerate(matches):
        # Check if this match overlaps with any previously accepted match
        overlaps = False
        for prev_start, prev_end, _, _ in non_overlapping:
            if not (end <= prev_start or start >= prev_end):  # They overlap
                overlaps = True
                break
        
        if not overlaps:
            non_overlapping.append((start, end, match_text, canonical))
    
    # Extract unique celebrity names (remove duplicates where same person mentioned multiple times)
    celebrity_mentions = set()
    for _, _, _, canonical in non_overlapping:
        celebrity_mentions.add(canonical)
    
    return list(celebrity_mentions)

# --- Step 6: Function to create co-mention graph and collect mention data ---
def create_comention_graph(df, year_filter=None, collect_data=False):
    """Create co-mention graph for celebrities mentioned together and optionally collect mention data"""
    G = nx.Graph()
    
    # Filter by year if specified
    if year_filter:
        df = df[df['met_gala_year'] == year_filter]
        graph_name = f"Met Gala {year_filter}"
    else:
        graph_name = "All Years"
    
    print(f"\n=== Processing {graph_name} ===")
    print(f"Processing {len(df)} comments...")
    
    mention_count = 0
    total_mentions = 0
    
    # Process each comment
    for index, row in df.iterrows():
        # Extract text and year from the comment
        comment_text = str(row.get('text', ''))
        comment_year = row.get('met_gala_year')  # Extract the comment year!
        
        if pd.isna(comment_year):
            continue
        
        # Find mentioned celebrities (filtered by year they attended)
        mentioned = extract_mentions(comment_text, comment_year)
        total_mentions += len(mentioned)
        
        # Collect data for plotting if requested
        if collect_data:
            for celebrity in mentioned:
                celebrity_mentions_by_year[comment_year][celebrity] += 1
                celebrity_mentions_overall[celebrity] += 1
                yearly_totals[comment_year] += 1
        
        if len(mentioned) > 1:
            mention_count += 1
            # Create edges for all pairs of mentioned celebrities
            for celeb1, celeb2 in combinations(sorted(mentioned), 2):
                if G.has_edge(celeb1, celeb2):
                    G[celeb1][celeb2]['weight'] += 1
                else:
                    G.add_edge(celeb1, celeb2, weight=1)
    
    return G, mention_count, total_mentions, graph_name

# --- Step 7: Function to analyze graph ---
def analyze_comention_graph(G, mention_count, total_mentions, graph_name, df_size):
    """Analyze and print co-mention graph statistics"""
    print(f"\n=== {graph_name} Co-mention Graph Summary ===")
    print(f"Total comments processed: {df_size}")
    print(f"Total celebrity mentions: {total_mentions}")
    print(f"Comments with multiple celebrity mentions: {mention_count}")
    print(f"Nodes (Celebrities): {G.number_of_nodes()}")
    print(f"Edges (Co-mentions): {G.number_of_edges()}")
    
    if G.number_of_edges() == 0:
        print("No co-mentions found for this period.")
        return
    
    # Top co-mentions
    print(f"\n--- Top 10 Celebrity Co-mentions ({graph_name}) ---")
    edges_with_weights = [(u, v, data['weight']) for u, v, data in G.edges(data=True)]
    edges_with_weights.sort(key=lambda x: x[2], reverse=True)
    
    for i, (celeb1, celeb2, weight) in enumerate(edges_with_weights[:10]):
        print(f"{i+1}. {celeb1.title()} & {celeb2.title()}: {weight} co-mentions")
    
    # Most connected celebrities
    print(f"\n--- Most Connected Celebrities ({graph_name}) ---")
    if G.number_of_nodes() > 0:
        degrees = dict(G.degree())
        sorted_degrees = sorted(degrees.items(), key=lambda x: x[1], reverse=True)
        
        for i, (celeb, degree) in enumerate(sorted_degrees[:10]):
            print(f"{i+1}. {celeb.title()}: {degree} connections")
    
    # Additional statistics
    if G.number_of_nodes() > 0:
        print(f"\n--- Network Statistics ({graph_name}) ---")
        print(f"Average degree: {sum(dict(G.degree()).values()) / G.number_of_nodes():.2f}")
        
        # Most frequently co-mentioned pairs
        total_weight = sum([data['weight'] for u, v, data in G.edges(data=True)])
        print(f"Total co-mention instances: {total_weight}")
        
        if G.number_of_edges() > 0:
            avg_weight = total_weight / G.number_of_edges()
            print(f"Average co-mentions per pair: {avg_weight:.2f}")

# --- Step 8: Function to save graph ---
def save_comention_graph(G, graph_name, year_filter=None):
    """Save co-mention graph as GraphML file"""
    if year_filter:
        filename = f"celeb_comentions_{year_filter}.graphml"
    else:
        filename = "celeb_comentions_all_years.graphml"
    
    filepath = os.path.join(output_dir, filename)
    nx.write_graphml(G, filepath)
    print(f"Saved {graph_name} graph: {filepath}")

# --- Step 9: Create and analyze all-years graph (with data collection) ---
G_all, mentions_all, total_mentions_all, name_all = create_comention_graph(comments_df, collect_data=True)
analyze_comention_graph(G_all, mentions_all, total_mentions_all, name_all, len(comments_df))
save_comention_graph(G_all, name_all)

# --- Step 10: Create and analyze individual year graphs ---
unique_years = sorted(comments_df['met_gala_year'].dropna().unique())
print(f"\n=== Found {len(unique_years)} unique years: {unique_years} ===")

year_graphs = {}
for year in unique_years:
    year_df = comments_df[comments_df['met_gala_year'] == year]
    G_year, mentions_year, total_mentions_year, name_year = create_comention_graph(comments_df, year)
    
    if G_year.number_of_nodes() > 0:  # Only analyze if graph has data
        analyze_comention_graph(G_year, mentions_year, total_mentions_year, name_year, len(year_df))
        save_comention_graph(G_year, name_year, year)
        year_graphs[year] = G_year
    else:
        print(f"No co-mentions found for {year}")

# --- Step 11: Cross-year comparison ---
print(f"\n=== Cross-Year Co-mention Comparison ===")
for year in unique_years:
    if year in year_graphs:
        G = year_graphs[year]
        total_weight = sum([data['weight'] for u, v, data in G.edges(data=True)])
        print(f"{year}: {G.number_of_nodes()} celebrities, {G.number_of_edges()} pairs, {total_weight} total co-mentions")

# --- Step 12: Most mentioned celebrities across all data (with year filtering) ---
print(f"\n=== Most Mentioned Celebrities (All Years, Year-Filtered) ===")
all_celebrity_mentions = []
for index, row in comments_df.iterrows():
    comment_text = str(row.get('text', ''))
    comment_year = row.get('met_gala_year')
    
    if pd.notna(comment_year):
        mentioned = extract_mentions(comment_text, comment_year)
        all_celebrity_mentions.extend(mentioned)

if all_celebrity_mentions:
    mention_counts = Counter(all_celebrity_mentions)
    
    for i, (celeb, count) in enumerate(mention_counts.most_common(15)):
        # Show which years this celebrity attended
        years_attended = sorted(celeb_year_mapping.get(celeb, []))
        print(f"{i+1}. {celeb.title()}: mentioned {count} times (attended: {years_attended})")
else:
    print("No valid celebrity mentions found after year filtering")

# --- Step 13: Year-specific mention analysis ---
print(f"\n=== Celebrity Mentions by Year ===")
for year in unique_years:
    year_comments = comments_df[comments_df['met_gala_year'] == year]
    year_mentions = []
    
    for index, row in year_comments.iterrows():
        comment_text = str(row.get('text', ''))
        comment_year = row.get('met_gala_year')
        if pd.notna(comment_year):
            mentioned = extract_mentions(comment_text, comment_year)
            year_mentions.extend(mentioned)
    
    if year_mentions:
        year_counts = Counter(year_mentions)
        print(f"\n{year} - Top mentioned celebrities:")
        for i, (celeb, count) in enumerate(year_counts.most_common(5)):
            print(f"  {i+1}. {celeb.title()}: {count} mentions")
    else:
        print(f"\n{year} - No valid celebrity mentions found")

# --- Step 14: Create visualizations ---
def create_celebrity_mention_plots():
    """Create matplotlib plots showing celebrity mention patterns"""
    print(f"\n=== Creating Celebrity Mention Visualizations ===")
    
    # Set up the plotting style
    plt.style.use('default')
    sns.set_palette("husl")
    
    # Create figure with subplots
    fig = plt.figure(figsize=(20, 15))
    
    # 1. Overall top celebrities bar chart
    ax1 = plt.subplot(2, 3, 1)
    top_celebs = dict(Counter(celebrity_mentions_overall).most_common(15))
    
    celeb_names = [name.title() for name in top_celebs.keys()]
    mention_counts = list(top_celebs.values())
    
    bars = ax1.barh(celeb_names, mention_counts, color=sns.color_palette("viridis", len(celeb_names)))
    ax1.set_xlabel('Total Mentions')
    ax1.set_title('Top 15 Most Mentioned Celebrities (All Years)', fontsize=14, fontweight='bold')
    ax1.grid(axis='x', alpha=0.3)
    
    # Add value labels on bars
    for i, (bar, count) in enumerate(zip(bars, mention_counts)):
        ax1.text(bar.get_width() + max(mention_counts) * 0.01, bar.get_y() + bar.get_height()/2, 
                str(count), ha='left', va='center', fontweight='bold')
    
    # 2. Mentions per year line plot
    ax2 = plt.subplot(2, 3, 2)
    years = sorted(yearly_totals.keys())
    total_mentions_per_year = [yearly_totals[year] for year in years]
    
    ax2.plot(years, total_mentions_per_year, marker='o', linewidth=3, markersize=8, color='red')
    ax2.set_xlabel('Year')
    ax2.set_ylabel('Total Celebrity Mentions')
    ax2.set_title('Celebrity Mentions Over Time', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3)
    ax2.set_xticks(years)
    
    # Add value labels on points
    for year, count in zip(years, total_mentions_per_year):
        ax2.annotate(str(count), (year, count), textcoords="offset points", 
                    xytext=(0,10), ha='center', fontweight='bold')
    
    # 3. Top celebrities by year heatmap
    ax3 = plt.subplot(2, 3, 3)
    
    # Get top 10 overall celebrities for heatmap
    top_10_celebs = [name for name, _ in Counter(celebrity_mentions_overall).most_common(10)]
    
    # Create matrix for heatmap
    heatmap_data = []
    for celeb in top_10_celebs:
        celeb_by_year = [celebrity_mentions_by_year[year][celeb] for year in years]
        heatmap_data.append(celeb_by_year)
    
    # Create heatmap
    sns.heatmap(heatmap_data, 
                xticklabels=years,
                yticklabels=[name.title() for name in top_10_celebs],
                annot=True, 
                fmt='d', 
                cmap='YlOrRd',
                ax=ax3,
                cbar_kws={'label': 'Mentions'})
    
    ax3.set_title('Top 10 Celebrities Mentions by Year', fontsize=14, fontweight='bold')
    ax3.set_xlabel('Year')
    ax3.set_ylabel('Celebrity')
    
    # 4. Year-over-year growth
    ax4 = plt.subplot(2, 3, 4)
    if len(years) > 1:
        growth_rates = []
        growth_years = []
        for i in range(1, len(years)):
            prev_count = yearly_totals[years[i-1]]
            curr_count = yearly_totals[years[i]]
            if prev_count > 0:
                growth_rate = ((curr_count - prev_count) / prev_count) * 100
                growth_rates.append(growth_rate)
                growth_years.append(f"{years[i-1]}-{years[i]}")
        
        colors = ['green' if rate >= 0 else 'red' for rate in growth_rates]
        bars = ax4.bar(growth_years, growth_rates, color=colors, alpha=0.7)
        ax4.set_xlabel('Year Transition')
        ax4.set_ylabel('Growth Rate (%)')
        ax4.set_title('Year-over-Year Growth in Celebrity Mentions', fontsize=14, fontweight='bold')
        ax4.grid(axis='y', alpha=0.3)
        ax4.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        
        # Add value labels
        for bar, rate in zip(bars, growth_rates):
            height = bar.get_height()
            ax4.text(bar.get_x() + bar.get_width()/2., height + (1 if height >= 0 else -3),
                    f'{rate:.1f}%', ha='center', va='bottom' if height >= 0 else 'top', fontweight='bold')
    
    # 5. Celebrity diversity per year (number of unique celebrities mentioned)
    ax5 = plt.subplot(2, 3, 5)
    unique_celebs_per_year = [len(celebrity_mentions_by_year[year]) for year in years]
    
    ax5.bar(years, unique_celebs_per_year, color='purple', alpha=0.7)
    ax5.set_xlabel('Year')
    ax5.set_ylabel('Number of Unique Celebrities')
    ax5.set_title('Celebrity Diversity by Year', fontsize=14, fontweight='bold')
    ax5.grid(axis='y', alpha=0.3)
    ax5.set_xticks(years)
    
    # Add value labels
    for year, count in zip(years, unique_celebs_per_year):
        ax5.text(year, count + max(unique_celebs_per_year) * 0.01, str(count), 
                ha='center', va='bottom', fontweight='bold')
    
    # 6. Distribution of mentions (histogram)
    ax6 = plt.subplot(2, 3, 6)
    mention_counts_list = list(celebrity_mentions_overall.values())
    
    ax6.hist(mention_counts_list, bins=20, color='orange', alpha=0.7, edgecolor='black')
    ax6.set_xlabel('Number of Mentions')
    ax6.set_ylabel('Number of Celebrities')
    ax6.set_title('Distribution of Celebrity Mention Counts', fontsize=14, fontweight='bold')
    ax6.grid(axis='y', alpha=0.3)
    
    # Add statistics text
    mean_mentions = sum(mention_counts_list) / len(mention_counts_list)
    median_mentions = sorted(mention_counts_list)[len(mention_counts_list)//2]
    ax6.axvline(mean_mentions, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_mentions:.1f}')
    ax6.axvline(median_mentions, color='blue', linestyle='--', linewidth=2, label=f'Median: {median_mentions}')
    ax6.legend()
    
    plt.tight_layout()
    
    # Save the plot
    plot_path = os.path.join(output_dir, 'celebrity_mentions_analysis.png')
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"Saved celebrity mentions visualization: {plot_path}")
    
    # Show summary statistics
    print(f"\n--- Celebrity Mention Statistics ---")
    print(f"Total unique celebrities mentioned: {len(celebrity_mentions_overall)}")
    print(f"Total mentions across all years: {sum(celebrity_mentions_overall.values())}")
    print(f"Average mentions per celebrity: {mean_mentions:.2f}")
    print(f"Most mentioned celebrity: {max(celebrity_mentions_overall, key=celebrity_mentions_overall.get).title()} ({max(celebrity_mentions_overall.values())} mentions)")
    print(f"Years analyzed: {min(years)} - {max(years)}")
    
    plt.show()

# Create the visualizations
create_celebrity_mention_plots()

print(f"\n=== Analysis Complete! ===")
print(f"Year filtering applied: Celebrities only counted in years they attended")
print(f"GraphML files saved in '{output_dir}' directory:")
print(f"- All-years graph: celeb_comentions_all_years.graphml")
for year in unique_years:
    if year in year_graphs:
        print(f"- {year} graph: celeb_comentions_{year}.graphml")