In [None]:
from collections import Counter
from itertools import combinations

import matplotlib
import numpy as np
import pandas as pd

# Read the processed YouTube comments CSV into a DataFrame.
comments_csv_path = '../../datasets/processed/youtube_comments_with_topics.csv'
youtube_df = pd.read_csv(comments_csv_path)

# Display the first five rows as a quick check that the load succeeded.
youtube_df.head(n=5)

Unnamed: 0,video_id,text,author,likes,date,time,year,met_gala_year,sentiment,dominant_topic
0,iWS3oVeyjL4,upppp ice ate monocl,@HippiesHealingApothecary,0,2025-05-15,12:00:00,2025,2025,0.6124,4
1,iWS3oVeyjL4,spoken conect scream well deep perhap everi qu...,@lotsofinterests,0,2025-05-15,06:00:00,2025,2025,0.8689,13
2,iWS3oVeyjL4,feel dear cours harri hair assign wouldv becom...,@SeventhGate008,0,2025-05-15,03:00:00,2025,2025,0.9274,5
3,iWS3oVeyjL4,photographi excel hat kojo highlight der remin...,@shortourt14,0,2025-05-15,01:00:00,2025,2025,0.6597,5
4,iWS3oVeyjL4,american anyway histori narrat comedi unlik af...,@KindnessKillsNONDO,0,2025-05-14,17:00:00,2025,2025,0.7845,14


In [None]:
import networkx as nx

# create a directed graph.
def create_directed_graph(df: pd.DataFrame) -> nx.DiGraph:
    """Create the directed graph for the comment rows in ``df``.

    Node/edge construction is not implemented yet, so the graph is returned
    empty. It is returned explicitly so the result matches the declared
    ``nx.DiGraph`` return type and callers never receive ``None``.

    Args:
        df: Comment rows the graph should be built from (currently unused).

    Returns:
        A new, empty ``nx.DiGraph``.
    """
    graph = nx.DiGraph()

    # TODO: populate the graph from the rows of df, e.g.
    # for data in df.iterrows():

    return graph

# Values of the 'author' column that do not identify a real commenter.
INVALID_AUTHORS = ('ExternalUserError', '')


def _build_commenter_graph(year_data: pd.DataFrame):
    """Build the undirected co-commenting network for one year's comments.

    Every valid author becomes a node carrying:
      * ``commentNum``        - number of comment rows by that author,
      * ``videosCommentedOn`` - sorted list of video ids they commented on,
      * ``uniqueVideosCount`` - length of that list.
    Two authors are linked when they commented on the same video; the edge
    carries ``sharedVideos`` (count) and ``videoList`` (the video ids).

    Args:
        year_data: Comment rows with at least 'video_id' and 'author' columns.

    Returns:
        ``(graph, video_commenters)`` where ``video_commenters`` maps each
        video id to a ``{author: author}`` dict of its valid commenters.
    """
    graph = nx.Graph()
    video_commenters = {}

    for video_id in year_data['video_id'].unique():
        video_comments = year_data[year_data['video_id'] == video_id]
        print(f"Processing video {video_id}: {len(video_comments)} comments")

        # Count comments per valid author in one pass over the column.
        # Counter keeps first-seen order, so nodes are inserted in the same
        # order a row-by-row scan would produce.
        comment_counts = Counter(
            author
            for author in video_comments['author']
            if not (pd.isna(author) or author in INVALID_AUTHORS)
        )

        for commenter, n_comments in comment_counts.items():
            if commenter in graph:
                graph.nodes[commenter]['commentNum'] += n_comments
            else:
                graph.add_node(commenter, commentNum=n_comments, videosCommentedOn=set())
            graph.nodes[commenter]['videosCommentedOn'].add(video_id)

        video_commenters[video_id] = {commenter: commenter for commenter in comment_counts}

        # Link every unordered pair of commenters on this video exactly once.
        # Note: the number of pairs grows quadratically with the number of
        # commenters on a video, so popular videos dominate time and memory.
        for commenter1, commenter2 in combinations(video_commenters[video_id], 2):
            if graph.has_edge(commenter1, commenter2):
                edge_data = graph[commenter1][commenter2]
                edge_data['sharedVideos'] += 1
                edge_data['videoList'].append(video_id)
            else:
                # Both endpoints were added as nodes above, so no node setup is needed here.
                graph.add_edge(commenter1, commenter2, sharedVideos=1, videoList=[video_id])

    # Replace the working sets with a count plus a list (lists are JSON-serializable);
    # sorting makes the stored order deterministic across runs.
    for _, node_data in graph.nodes(data=True):
        videos = node_data['videosCommentedOn']
        node_data['uniqueVideosCount'] = len(videos)
        node_data['videosCommentedOn'] = sorted(videos)

    return graph, video_commenters


def _print_network_summary(year, graph) -> None:
    """Print node/edge counts and simple averages for one year's graph."""
    n_nodes = graph.number_of_nodes()
    n_edges = graph.number_of_edges()

    print(f"\nYear {year} Network Summary:")
    print(f"  Nodes (commenters): {n_nodes:,}")
    print(f"  Edges (commenter connections): {n_edges:,}")

    if n_nodes == 0:
        return

    avg_comments = sum(data['commentNum'] for _, data in graph.nodes(data=True)) / n_nodes
    print(f"  Average comments per commenter: {avg_comments:.2f}")

    avg_videos = sum(data['uniqueVideosCount'] for _, data in graph.nodes(data=True)) / n_nodes
    print(f"  Average videos per commenter: {avg_videos:.2f}")

    if n_edges == 0:
        return

    avg_shared = sum(data['sharedVideos'] for _, _, data in graph.edges(data=True)) / n_edges
    print(f"  Average shared videos per connection: {avg_shared:.2f}")

    degrees = dict(graph.degree())
    most_connected = max(degrees, key=degrees.get)
    print(f"  Most connected commenter: {most_connected} ({degrees[most_connected]} connections)")


yearly_graphs = {}  # met_gala_year -> co-commenting graph
yearly_video_commenter_data = {}  # met_gala_year -> {video_id: {author: author}}

# Process the Met Gala years in ascending order.
years = sorted(youtube_df['met_gala_year'].unique())

for year in years:
    print(f"\n{'='*50}")
    print(f"Processing Met Gala Year: {year}")
    print(f"{'='*50}")

    # Restrict to the comment rows belonging to this year.
    year_data = youtube_df[youtube_df['met_gala_year'] == year].copy()

    print(f"Total comments in {year}: {len(year_data):,}")
    print(f"Unique videos in {year}: {year_data['video_id'].nunique():,}")
    print(f"Unique commenters in {year}: {year_data['author'].nunique():,}")

    commenterGraph, dVideoCommenters = _build_commenter_graph(year_data)

    yearly_graphs[year] = commenterGraph
    yearly_video_commenter_data[year] = dVideoCommenters

    _print_network_summary(year, commenterGraph)

print(f"\n{'='*60}")
print("OVERALL SUMMARY:")
print(f"{'='*60}")
for year in years:
    graph = yearly_graphs[year]
    print(f"Year {year}: {graph.number_of_nodes():,} commenters, {graph.number_of_edges():,} connections")


Processing Met Gala Year: 2021
Total comments in 2021: 13,370
Unique videos in 2021: 4
Unique commenters in 2021: 10,331
Processing video ZMrgtotgThk: 9092 comments
Processing video mrFfGptVzrI: 2167 comments
Processing video qEvQa6xayYE: 2087 comments
Processing video qKYhgn1TiV4: 24 comments

Year 2021 Network Summary:
  Nodes (commenters): 10,331
  Edges (commenter connections): 28,047,904
  Average comments per commenter: 1.29
  Average videos per commenter: 1.01
  Average shared videos per connection: 1.00
  Most connected commenter: @dianemoonstone4715 (10312 connections)

Processing Met Gala Year: 2022
Total comments in 2022: 13,220
Unique videos in 2022: 4
Unique commenters in 2022: 10,569
Processing video PbRZcvVnF0w: 6749 comments
Processing video ItZ4SlxpOiI: 2153 comments
Processing video lyJqXb8Nj-I: 1935 comments
Processing video cpFc1RPOF7s: 2383 comments

Year 2022 Network Summary:
  Nodes (commenters): 10,569
  Edges (commenter connections): 19,027,186
  Average comme

In [None]:
# Maximum number of comment rows kept per year while developing.
# Set to None to process every comment when running for real results.
SAMPLE_SIZE = 5000

# Build one directed community graph per Met Gala year found in the CSV.
years = sorted(youtube_df['met_gala_year'].unique())
results = {}  # met_gala_year -> value returned by create_directed_graph

for year in years:
    print(f"\n{'='*40}")
    print(f"Processing Met Gala Year: {year}")
    print(f"{'='*40}")

    # Select all rows for the current year; each row is one comment.
    year_data = youtube_df[youtube_df['met_gala_year'] == year].copy()

    n_comments = len(year_data)
    n_authors = year_data['author'].nunique()
    print(f"Total comments in {year}: {n_comments:,}")  # comma formatting for large numbers.
    print(f"Unique authors in {year}: {n_authors:,}")
    # nunique() ignores missing authors, so guard against a year with none.
    if n_authors > 0:
        print(f"Average comments per author: {n_comments / n_authors:.1f}")

    # For very large datasets, work on a reproducible sample to check the code runs.
    if SAMPLE_SIZE is not None and n_comments > SAMPLE_SIZE:
        year_data = year_data.sample(n=SAMPLE_SIZE, random_state=42)

    year_directed_graph = create_directed_graph(year_data)
    # Keep each year's graph instead of overwriting it on the next iteration.
    results[year] = year_directed_graph



Processing Met Gala Year: 2021
Total videos in 2021: 13,370
Unique authors in 2021: 10,331
Average videos per commenter: 1.3
Large dataset detected. Sampling 5000 videos for faster processing...
Using 5000 sampled videos
          video_id                                               text  \
58429  ZMrgtotgThk  work rich task pretend american class wear wel...   
66098  mrFfGptVzrI                              talkin nobodi madison   
61635  ZMrgtotgThk  either sort @josephinegodin9079 statu there' l...   
60002  ZMrgtotgThk  river never child fearless blackwel luke opini...   
63232  ZMrgtotgThk  research hautelemod interview articl lol guess...   

                  author  likes        date      time  year  met_gala_year  \
58429   @aquariusjewelry      0  2022-05-15  17:00:00  2022           2021   
66098    @madslivtxt3685      0  2022-05-15  17:00:00  2022           2021   
61635      @ElizabethT45      0  2022-05-15  17:00:00  2022           2021   
60002  @danielclaeys7598   

In [9]:
# Build the reply graph from the 20 hottest submissions.
# NOTE(review): `subreddit`, `replyGraph` and `dSubCommentId` are not defined in
# the visible cells of this notebook (the recorded run stops with a NameError on
# `subreddit`); they must be created in an earlier cell before this loop can run.
for submission in subreddit.hot(limit=20):

    submissionId = submission.name
    # Maps every post id seen in this thread (the submission itself and its
    # comments) to the name of its author.
    dSubCommentId[submissionId] = {}

    # Guard the submission author the same way comment authors are guarded
    # below: without an author there is no node to credit, and replies to the
    # submission are then skipped like replies to any other missing parent.
    if submission.author is not None:
        submitter = submission.author.name

        # Known author: one more submission. New author: node with 1 submission.
        if submitter in replyGraph:
            replyGraph.nodes[submitter]['subNum'] += 1
        else:
            replyGraph.add_node(submitter, subNum=1)

        dSubCommentId[submissionId][submissionId] = submitter

    thread_authors = dSubCommentId[submissionId]

    # For the current submission, retrieve the associated comments.
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():

        # Only comments that still have a usable author contribute links.
        if comment.author is None or comment.author.name == 'ExternalUserError':
            continue

        commenter = comment.author.name
        thread_authors[comment.name] = commenter

        # A parent we have not recorded means the parent post was skipped/deleted.
        if comment.parent_id not in thread_authors:
            continue
        parent_author = thread_authors[comment.parent_id]

        # If the edge exists, increment replyNum; otherwise add a new edge.
        if replyGraph.has_edge(commenter, parent_author):
            replyGraph[commenter][parent_author]['replyNum'] += 1
        else:
            # Authors first seen through a reply have made no submissions yet,
            # so create their nodes with subNum=0 before linking them.
            if commenter not in replyGraph:
                replyGraph.add_node(commenter, subNum=0)

            if parent_author not in replyGraph:
                replyGraph.add_node(parent_author, subNum=0)

            replyGraph.add_edge(commenter, parent_author, replyNum=1)


NameError: name 'subreddit' is not defined