In [1]:
import os
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK stopwords if not already present
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# --- Preprocessing Function ---
def preprocess_text(text, do_stem=True, remove_stopwords=True):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'u\/\w+|@\w+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    if remove_stopwords:
        tokens = [word for word in tokens if word not in stop_words]
    if do_stem:
        tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

# --- Process All JSON Files in Folder ---
def preprocess_folder(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            full_path = os.path.join(folder_path, filename)

            with open(full_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Preprocess each post and its comments
            for post in data:
                post["clean_title"] = preprocess_text(post.get("title", ""))
                post["clean_selftext"] = preprocess_text(post.get("selftext", ""))
                for comment in post.get("comments", []):
                    comment["clean_body"] = preprocess_text(comment.get("body", ""))

            # Save to new file
            new_filename = filename.replace(".json", "_cleaned.json")
            new_path = os.path.join(folder_path, new_filename)
            with open(new_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

            print(f" Cleaned file saved: {new_filename}")

# --- Run it ---
folder_path = "metgala_data" 
preprocess_folder(folder_path)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/navyachugh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


 Cleaned file saved: metgala_Fauxmoi_2021_cleaned.json
 Cleaned file saved: metgala_Fauxmoi_2021_cleaned_cleaned.json
 Cleaned file saved: metgala_Fauxmoi_2024_cleaned.json
 Cleaned file saved: metgala_popculturechat_2024_cleaned.json
 Cleaned file saved: metgala_Fauxmoi_2025_cleaned.json
 Cleaned file saved: metgala_popculturechat_2025_cleaned.json
 Cleaned file saved: metgala_Fauxmoi_2022_cleaned.json
 Cleaned file saved: metgala_popculturechat_2022_cleaned.json
 Cleaned file saved: metgala_Fauxmoi_2023_cleaned.json
 Cleaned file saved: metgala_popculturechat_2023_cleaned.json


In [2]:
import pandas as pd

# Load your CSV of attendees
celeb_df = pd.read_csv("attendees.csv")
celeb_names = celeb_df['Name'].dropna().unique()


In [3]:
import re

# Normalize names
celeb_patterns = [re.escape(name.lower()) for name in celeb_names]


In [4]:
def find_mentioned_celebs(text, celeb_list):
    text = text.lower()
    mentioned = []
    for celeb in celeb_list:
        if celeb in text:
            mentioned.append(celeb)
    return mentioned


In [5]:
import json

# Load your Reddit data (adjust filename as needed)
with open("metgala_data/metgala_popculturechat_2025.json", "r", encoding="utf-8") as f:
    reddit_data = json.load(f)

# Scan all posts + comments for celeb mentions
mention_summary = {}

for post in reddit_data:
    post_text = post['title'] + " " + post.get('selftext', '')
    comments = post.get("comments", [])

    # Check post itself
    mentioned = find_mentioned_celebs(post_text, celeb_patterns)
    for name in mentioned:
        mention_summary[name] = mention_summary.get(name, 0) + 1

    # Check comments
    for comment in comments:
        comment_text = comment.get('body', '')
        mentioned = find_mentioned_celebs(comment_text, celeb_patterns)
        for name in mentioned:
            mention_summary[name] = mention_summary.get(name, 0) + 1


In [6]:
# Sort by frequency
sorted_mentions = sorted(mention_summary.items(), key=lambda x: x[1], reverse=True)

for celeb, count in sorted_mentions:  # top 20
    print(f"{celeb}: {count} mentions")


zendaya: 192 mentions
doechii: 88 mentions
rihanna: 61 mentions
usher: 33 mentions
future: 20 mentions
shakira: 15 mentions
lorde: 15 mentions
madonna: 12 mentions
ciara: 10 mentions
maluma: 10 mentions
babyface: 5 mentions
lizzo: 3 mentions
rosé: 2 mentions
sza: 2 mentions
iman: 2 mentions
finneas: 1 mentions


In [7]:
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [8]:
import re

def clean_text(text, abbreviations):
    # Lowercase
    text = text.lower()

    # Replace abbreviations/slang
    for abbr, full_form in abbreviations.items():
        pattern = r'\b' + re.escape(abbr) + r'\b'
        text = re.sub(pattern, full_form, text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)

    # Remove non-alphanumeric characters (except spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [9]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Setup
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Emoji pattern
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags
    u"\U00002700-\U000027BF"
    u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE
)

def preprocess_text(text, slang_dict, use_stemming=False, use_lemmatization=True):
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r"http\S+|www.\S+", "", text)
    
    # Remove emojis
    text = emoji_pattern.sub(r'', text)
    
    # Replace abbreviations/slang
    words = text.split()
    words = [slang_dict.get(word, word) for word in words]

    # Remove punctuation
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in words]
    
    # Remove stopwords
    words = [word for word in words if word and word not in stop_words]

    # Stemming or Lemmatization
    if use_stemming:
        words = [stemmer.stem(word) for word in words]
    elif use_lemmatization:
        words = [lemmatizer.lemmatize(word) for word in words]

    # Join words
    return " ".join(words)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/navyachugh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/navyachugh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/navyachugh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
import os
import json

# Define your folder paths
input_folder = "metgala_data"
output_folder = "metgala_data_cleaned"

# Make sure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Loop through all JSON files in the folder
for filename in os.listdir(input_folder):
    if filename.endswith(".json"):
        print(f" Preprocessing {filename}...")

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        # Load file
        with open(input_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Preprocess each post and comment
        for post in data:
            post['title'] = preprocess_text(post.get('title', ''), abbreviations)
            post['selftext'] = preprocess_text(post.get('selftext', ''), abbreviations)
            
            for comment in post.get('comments', []):
                comment['body'] = preprocess_text(comment.get('body', ''), abbreviations)

        # Save preprocessed data
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

print(" All files preprocessed!")


 Preprocessing metgala_Fauxmoi_2021_cleaned_cleaned.json...
 Preprocessing metgala_Fauxmoi_2021.json...
 Preprocessing metgala_Fauxmoi_2025_cleaned.json...
 Preprocessing metgala_Fauxmoi_2022_cleaned.json...
 Preprocessing metgala_Fauxmoi_2023_cleaned.json...
 Preprocessing metgala_Fauxmoi_2024_cleaned.json...
 Preprocessing metgala_Fauxmoi_2021_cleaned.json...
 Preprocessing metgala_Fauxmoi_2024.json...
 Preprocessing metgala_popculturechat_2024.json...
 Preprocessing metgala_Fauxmoi_2025.json...
 Preprocessing metgala_popculturechat_2025.json...
 Preprocessing metgala_Fauxmoi_2022.json...
 Preprocessing metgala_popculturechat_2022.json...
 Preprocessing metgala_popculturechat_2023_cleaned.json...
 Preprocessing metgala_popculturechat_2024_cleaned.json...
 Preprocessing metgala_Fauxmoi_2023.json...
 Preprocessing metgala_popculturechat_2023.json...
 Preprocessing metgala_popculturechat_2025_cleaned.json...
 Preprocessing metgala_popculturechat_2022_cleaned.json...
 All files preproces

In [11]:
import json
import pandas as pd
import re

# Load attendees list (make sure it's cleaned & lowercase)
celeb_df = pd.read_csv("attendees.csv")
celeb_names = celeb_df['Name'].dropna().unique()
celeb_names = [name.lower() for name in celeb_names]

# Optional: escape special characters in names
escaped_names = [re.escape(name) for name in celeb_names]

# Create a pattern to match any celeb name as a word
celeb_pattern = re.compile(r'\b(' + '|'.join(escaped_names) + r')\b', re.IGNORECASE)

# Load the preprocessed Reddit file
with open("metgala_data_cleaned/metgala_Fauxmoi_2025.json", "r", encoding="utf-8") as f:
    reddit_data = json.load(f)

# Track mentions
mention_counts = {}

# Check mentions in post + comments
for post in reddit_data:
    text = f"{post.get('title', '')} {post.get('selftext', '')}"
    for match in celeb_pattern.findall(text):
        match = match.lower()
        mention_counts[match] = mention_counts.get(match, 0) + 1

    for comment in post.get("comments", []):
        text = comment.get("body", "")
        for match in celeb_pattern.findall(text):
            match = match.lower()
            mention_counts[match] = mention_counts.get(match, 0) + 1

# Sort and display results
sorted_mentions = sorted(mention_counts.items(), key=lambda x: x[1], reverse=True)

print(" Celebrity Mentions:")
for name, count in sorted_mentions:
    print(f"{name}: {count} mentions")


 Celebrity Mentions:
zendaya: 172 mentions
rihanna: 36 mentions
doechii: 34 mentions
diana ross: 33 mentions
anna wintour: 27 mentions
walton goggins: 25 mentions
usher: 23 mentions
colman domingo: 21 mentions
lewis hamilton: 18 mentions
thom browne: 16 mentions
zac posen: 16 mentions
halle berry: 15 mentions
janelle monae: 15 mentions
sabrina carpenter: 15 mentions
cardi b: 12 mentions
anna sawai: 11 mentions
teyana taylor: 11 mentions
doja cat: 10 mentions
emma chamberlain: 10 mentions
ciara: 10 mentions
lauryn hill: 10 mentions
megan thee stallion: 9 mentions
future: 9 mentions
madonna: 9 mentions
lorde: 9 mentions
alton mason: 9 mentions
anne hathaway: 9 mentions
laura harrier: 8 mentions
sarah snook: 8 mentions
bad bunny: 7 mentions
james corden: 7 mentions
sydney sweeney: 7 mentions
tessa thompson: 7 mentions
coco jones: 7 mentions
ayo edebiri: 6 mentions
hunter schafer: 6 mentions
karlie kloss: 6 mentions
andrew scott: 6 mentions
maya hawke: 6 mentions
janelle monáe: 6 mentions


In [12]:
import json
import networkx as nx
import os

# --- Step 1: Load JSON files ---
def load_reddit_json(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Update these paths to your actual file locations
data1 = load_reddit_json('metgala_data_cleaned/metgala_popculturechat_2025.json')
data2 = load_reddit_json('metgala_data_cleaned/metgala_Fauxmoi_2025.json')

# --- Step 2: Combine datasets ---
combined_data = data1 + data2

# --- Step 3: Build User Interaction Graph ---
G = nx.Graph()

for post in combined_data:
    post_author = post.get("author", "N/A")
    commenters = [comment["author"] for comment in post.get("comments", []) if comment["author"] != "N/A"]

    # Add edges between post author and each commenter
    for commenter in commenters:
        if post_author != "N/A":
            G.add_edge(post_author, commenter)
    
    # Optionally: connect commenters to each other if they reply under same post
    for i in range(len(commenters)):
        for j in range(i + 1, len(commenters)):
            G.add_edge(commenters[i], commenters[j])

# --- Step 4: Analyze Graph ---
print(f"Total Nodes (users): {G.number_of_nodes()}")
print(f"Total Edges (interactions): {G.number_of_edges()}")

# Degree centrality (most connected users)
centrality = nx.degree_centrality(G)
top_users = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:10]

print("\n Top 10 Central Users:")
for user, score in top_users:
    print(f"{user}: {score:.4f}")


Total Nodes (users): 9688
Total Edges (interactions): 1039230

 Top 10 Central Users:
AutoModerator: 0.4371
Cultural-Party1876: 0.2486
Rude_Lifeguard: 0.2347
Educational-Help-126: 0.2323
InitiativeSad1021: 0.2205
Luna_Soma: 0.2183
Ester_LoverGirl: 0.2051
trulyremarkablegirl: 0.2028
lavabread23: 0.1953
PsychologicalClue6: 0.1900


In [13]:
import json
import networkx as nx
import pandas as pd
import re

# Step 1: Load Celebs
celeb_df = pd.read_csv("attendees.csv")
celeb_names = celeb_df['Name'].dropna().unique()
celeb_names_normalized = [name.lower() for name in celeb_names]

# Optional: compile regex pattern for speed
celeb_patterns = [re.escape(name.lower()) for name in celeb_names_normalized]
celeb_pattern = re.compile(r'\b(?:' + '|'.join(celeb_patterns) + r')\b')

# Step 2: Build Graph
G = nx.Graph()

for post in combined_data:
    comments = post.get("comments", [])
    
    for comment in comments:
        author = comment.get("author", "N/A")
        if author == "N/A":
            continue
        
        body = comment.get("body", "").lower()
        mentioned_celebs = celeb_pattern.findall(body)

        for celeb in mentioned_celebs:
            celeb = celeb.strip().lower()
            G.add_node(author, type="user")
            G.add_node(celeb, type="celeb")
            G.add_edge(author, celeb)

# Step 3: Print Summary
print(f" Total Nodes: {G.number_of_nodes()}")
print(f" Total Edges: {G.number_of_edges()}")

# Optional: Most mentioned celebrities
celeb_mentions = {}
for node in G.nodes(data=True):
    if node[1].get("type") == "celeb":
        celeb_mentions[node[0]] = G.degree(node[0])

sorted_mentions = sorted(celeb_mentions.items(), key=lambda x: x[1], reverse=True)[:10]
print("\n Top Mentioned Celebrities:")
for name, count in sorted_mentions:
    print(f"{name.title()}: {count} mentions")


 Total Nodes: 1298
 Total Edges: 1648

 Top Mentioned Celebrities:
Zendaya: 267 mentions
Doechii: 90 mentions
Rihanna: 71 mentions
Diana Ross: 56 mentions
Usher: 45 mentions
Anna Wintour: 41 mentions
Thom Browne: 39 mentions
Cardi B: 35 mentions
Lewis Hamilton: 35 mentions
Sabrina Carpenter: 30 mentions


In [14]:
# Save user interaction graph
nx.write_gexf(G, "user_interaction_graph_2025.gexf")
print(" User interaction graph saved as 'user_interaction_graph_2025.gexf'")


 User interaction graph saved as 'user_interaction_graph_2025.gexf'


In [15]:
# Save celebrity mention graph
nx.write_gexf(G, "celebrity_mention_graph_2025.gexf")
print("Celebrity mention graph saved as 'celebrity_mention_graph_2025.gexf'")


Celebrity mention graph saved as 'celebrity_mention_graph_2025.gexf'


In [16]:
for post in combined_data:
    subreddit = post.get("subreddit", "unknown")  # Add this if your data has subreddit info
    post_author = post.get("author", "N/A")
    
    if post_author != "N/A":
        G.add_node(post_author, subreddit=subreddit)

    for comment in post.get("comments", []):
        author = comment.get("author", "N/A")
        if author != "N/A":
            G.add_node(author, subreddit=subreddit)
            G.add_edge(post_author, author)


In [3]:
import json
import os
import networkx as nx
import pandas as pd
import community as community_louvain  # python-louvain

# -------- Step 1: Load and label data --------
def load_data(path, subreddit_name):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    for post in data:
        post['subreddit'] = subreddit_name
    return data

data_fauxmoi = load_data('metgala_data_cleaned/metgala_Fauxmoi_2023.json', 'Fauxmoi')
data_popculture = load_data('metgala_data_cleaned/metgala_popculturechat_2023.json', 'PopCultureChat')
combined_data = data_fauxmoi + data_popculture

# -------- Step 2: Build User Interaction Graph --------
G = nx.Graph()

for post in combined_data:
    subreddit = post.get("subreddit", "Unknown")
    post_author = post.get("author", "N/A")
    commenters = [c.get("author", "N/A") for c in post.get("comments", []) if c.get("author", "N/A") != "N/A"]

    # Add post author
    if post_author != "N/A":
        G.add_node(post_author, subreddit=subreddit)

    # Add commenters and edges
    for commenter in commenters:
        G.add_node(commenter, subreddit=subreddit)
        if post_author != "N/A":
            G.add_edge(post_author, commenter)
    
    # Optionally connect commenters to each other
    for i in range(len(commenters)):
        for j in range(i + 1, len(commenters)):
            G.add_edge(commenters[i], commenters[j])

print(f"Total nodes: {G.number_of_nodes()}")
print(f"Total edges: {G.number_of_edges()}")

# -------- Step 3: Run Louvain Community Detection --------
partition = community_louvain.best_partition(G)
nx.set_node_attributes(G, partition, 'community')

# -------- Step 4: Export for Gephi --------
# Export as GraphML (supports node attributes like 'subreddit' and 'community')
nx.write_graphml(G, "community_detection_graph.graphml")
print("Graph exported as 'community_detection_graph.graphml' for Gephi.")


Total nodes: 5188
Total edges: 550239
Graph exported as 'community_detection_graph.graphml' for Gephi.


In [6]:
# Filter to active users
active_users = [n for n, d in G.degree() if d >= 10]
G_active = G.subgraph(active_users).copy()

print(f"Filtered graph: {G_active.number_of_nodes()} nodes, {G_active.number_of_edges()} edges")
nx.write_graphml(G_active, "community_detection_2023_active.graphml")


Filtered graph: 5176 nodes, 550178 edges


In [7]:
# Step 1: Apply Louvain
import community as community_louvain
partition = community_louvain.best_partition(G)
nx.set_node_attributes(G, partition, 'community')

# Step 2: Detect overlap
overlap_users = set()

for u, v in G.edges():
    comm_u = G.nodes[u]['community']
    comm_v = G.nodes[v]['community']
    if comm_u != comm_v:
        overlap_users.add(u)
        overlap_users.add(v)

print(f"Users bridging multiple communities: {len(overlap_users)}")


Users bridging multiple communities: 4905


In [8]:
# Mark all bridge users with an attribute
for node in G.nodes():
    G.nodes[node]['is_bridge'] = int(node in overlap_users)

# Export updated graph
nx.write_graphml(G, "bridge_users_graph.graphml")


In [10]:
import json
import networkx as nx
import community as community_louvain  # Louvain module
import os

# --- Load cleaned Reddit data ---
def load_reddit_json(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Update to match your actual filenames
data1 = load_reddit_json('metgala_data_cleaned/metgala_popculturechat_2023.json')
data2 = load_reddit_json('metgala_data_cleaned/metgala_Fauxmoi_2023.json')

# Combine data
combined_data = data1 + data2

# --- Build user interaction graph ---
G = nx.Graph()

for post in combined_data:
    post_author = post.get("author", "N/A")
    commenters = [comment["author"] for comment in post.get("comments", []) if comment["author"] != "N/A"]

    # Add edges between post author and commenters
    for commenter in commenters:
        if post_author != "N/A":
            G.add_edge(post_author, commenter)

    # Optionally connect commenters with each other
    for i in range(len(commenters)):
        for j in range(i + 1, len(commenters)):
            G.add_edge(commenters[i], commenters[j])

# --- Louvain Community Detection ---
print(f"🔗 Total Nodes: {G.number_of_nodes()}")
print(f"🔗 Total Edges: {G.number_of_edges()}")

partition = community_louvain.best_partition(G)
nx.set_node_attributes(G, partition, 'community')



🔗 Total Nodes: 5188
🔗 Total Edges: 550239


In [13]:
# Step 2: Keep largest connected component (removes isolated groups)
G_largest = max(nx.connected_components(G_filtered), key=len)
G_final = G_filtered.subgraph(G_largest).copy()

print(f"Final Nodes: {len(G_final.nodes())}")
print(f"Final Edges: {len(G_final.edges())}")


 Final Nodes: 5188
Final Edges: 550239


In [15]:
import json
import pandas as pd
import re
import networkx as nx
from itertools import combinations

# --- Step 1: Load attendee names ---
celeb_df = pd.read_csv("attendees.csv")
celeb_names = celeb_df['Name'].dropna().str.lower().unique()
celeb_patterns = [re.escape(name) for name in celeb_names]
celeb_regex = re.compile(r'\b(?:' + '|'.join(celeb_patterns) + r')\b')

# --- Step 2: Load Reddit data ---
with open("metgala_data_cleaned/metgala_popculturechat_2025.json", "r", encoding="utf-8") as f1:
    data1 = json.load(f1)
with open("metgala_data_cleaned/metgala_Fauxmoi_2025.json", "r", encoding="utf-8") as f2:
    data2 = json.load(f2)

combined_data = data1 + data2

# --- Step 3: Initialize Graph ---
G = nx.Graph()

# --- Step 4: Scan for co-mentions ---
def extract_mentions(text):
    return list(set(celeb_regex.findall(text.lower())))

for post in combined_data:
    texts = [post.get("title", "") + " " + post.get("selftext", "")]
    texts += [comment.get("body", "") for comment in post.get("comments", [])]

    for text in texts:
        mentioned = extract_mentions(text)
        if len(mentioned) > 1:
            for celeb1, celeb2 in combinations(sorted(mentioned), 2):
                if G.has_edge(celeb1, celeb2):
                    G[celeb1][celeb2]['weight'] += 1
                else:
                    G.add_edge(celeb1, celeb2, weight=1)

# --- Step 5: Graph Summary ---
print(f" Nodes (Celebrities): {G.number_of_nodes()}")
print(f" Edges (Co-mentions): {G.number_of_edges()}")

🧑‍🤝‍🧑 Nodes (Celebrities): 115
🔗 Edges (Co-mentions): 333


In [16]:
# --- Step 6: Export to Gephi ---
nx.write_graphml(G, "celeb_co_mentions.graphml")
print("📤 Exported as 'celeb_co_mentions.graphml'")

📤 Exported as 'celeb_co_mentions.graphml'


In [17]:
import json
import networkx as nx
from community import community_louvain

# ---------- CONFIG ----------
file_path = "metgala_data_cleaned/metgala_Fauxmoi_2025.json"
output_path = "fauxmoi_2025_community.graphml"
# ----------------------------

# Step 1: Load data
with open(file_path, "r", encoding="utf-8") as f:
    posts = json.load(f)

# Step 2: Build interaction graph
G = nx.Graph()
for post in posts:
    author = post.get("author", "N/A")
    commenters = [c["author"] for c in post.get("comments", []) if c.get("author") != "N/A"]

    for commenter in commenters:
        if author != "N/A":
            G.add_edge(author, commenter)

    for i in range(len(commenters)):
        for j in range(i + 1, len(commenters)):
            G.add_edge(commenters[i], commenters[j])

print(f" Nodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}")

# Step 3: Community detection
if G.number_of_nodes() > 1:
    partition = community_louvain.best_partition(G)
    nx.set_node_attributes(G, partition, 'community')
    
    # Step 4: Export to Gephi-readable file
    nx.write_graphml(G, output_path)
    print(f"Graph exported to '{output_path}'")
else:
    print(" Graph too small to detect communities.")


 Nodes: 4553, Edges: 370085
Graph exported to 'fauxmoi_2025_community.graphml'
