# INSY 670: Social Media Analytics - Group Project

In [None]:
import logging
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm
import gensim
import gensim.corpora as corpora
import nltk
import pyLDAvis
import pyLDAvis.gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from rich import print
import warnings
from utils import (
    assign_topic,
    get_topic_submissions,
    create_bipartite_graph,
    create_topic_network,
    get_top_users,
    get_communities,
    calculate_centralities,
    get_filtered_topic_graph,
    get_largest_component,
    get_community_dataframe,
    plot_community_graph,
    project_bipartite_graph,
)

# %load_ext cudf.pandas
%load_ext rich
%load_ext autoreload
%autoreload 2

# Send log records at INFO level and above to the root handler and create the
# logger object named "main" for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")

# Download the NLTK stop-word corpus that the title-cleaning cell reads.
nltk.download("stopwords")
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# NOTE(review): this silences *all* warnings for the session, including
# pandas and gensim warnings that could point at real problems.
warnings.filterwarnings("ignore")
pyLDAvis.enable_notebook()
# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()

# Make sure the input and output folders exist; "./graphs" receives the saved
# figures.
os.makedirs("./data", exist_ok=True)
os.makedirs("./graphs", exist_ok=True)

## Load data

In [None]:
# Subreddit to analyse; it is also the file-name prefix of the CSV dumps that
# the loading cells read.
SUBREDDIT_NAME = "FortNiteBR"
# The loading cells read dump files from the highest id downwards and stop
# after the first file whose earliest row was created at or before this time.
TIME_THRESHOLD = pd.Timestamp('2023-06-01 00:00:00')

### Submissions

In [None]:
SUBMISSIONS_DIR = "./data/reddit/processed/submissions/"
SUBMISSIONS_FILES = os.listdir(SUBMISSIONS_DIR)


def _submission_file_id(file_name):
    """Return the numeric id in "<SUBREDDIT_NAME>_<id>_submissions.csv", else None.

    Directory entries that do not follow the dump naming scheme (for example
    ".ipynb_checkpoints", ".DS_Store" or dumps of another subreddit) yield
    None so they are skipped instead of breaking the id parsing or pointing
    the loader at a file that does not exist.
    """
    prefix = f"{SUBREDDIT_NAME}_"
    suffix = "_submissions.csv"
    if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
        return None
    id_text = file_name[len(prefix):-len(suffix)]
    if id_text.isascii() and id_text.isdigit():
        return int(id_text)
    return None


# Ids of the available dump files, in ascending order.
SUBMISSIONS_IDS = sorted(
    file_id
    for file_id in map(_submission_file_id, SUBMISSIONS_FILES)
    if file_id is not None
)

submission_chunks = []

# Read the files from the highest id downwards (presumably newest first) and
# stop after the first file that reaches back to TIME_THRESHOLD.
# NOTE(review): that last file is kept whole, so rows older than
# TIME_THRESHOLD can remain in the result - confirm this is intended.
for file_id in reversed(SUBMISSIONS_IDS):
    chunk = pd.read_csv(f"{SUBMISSIONS_DIR}{SUBREDDIT_NAME}_{file_id}_submissions.csv")
    earliest_date = pd.to_datetime(chunk["created_utc"], unit="s").min()

    submission_chunks.append(chunk)

    if earliest_date <= TIME_THRESHOLD:
        print(f"Threshold reached at {file_id}")
        break

if not submission_chunks:
    # Same exception type pd.concat would raise on an empty list, but with a
    # message that names the actual problem.
    raise ValueError(
        f"No '{SUBREDDIT_NAME}_<id>_submissions.csv' files found in {SUBMISSIONS_DIR}"
    )

# NOTE(review): the per-file row indexes are kept, so index labels repeat
# across files.
submissions = pd.concat(submission_chunks)

In [None]:
submissions

### Comments

In [None]:
COMMENTS_DIR = "./data/reddit/processed/comments/"
COMMENTS_FILES = os.listdir(COMMENTS_DIR)


def _comment_file_id(file_name):
    """Return the numeric id in "<SUBREDDIT_NAME>_<id>_comments.csv", else None.

    Directory entries that do not follow the dump naming scheme (for example
    ".ipynb_checkpoints", ".DS_Store" or dumps of another subreddit) yield
    None so they are skipped instead of breaking the id parsing or pointing
    the loader at a file that does not exist.
    """
    prefix = f"{SUBREDDIT_NAME}_"
    suffix = "_comments.csv"
    if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
        return None
    id_text = file_name[len(prefix):-len(suffix)]
    if id_text.isascii() and id_text.isdigit():
        return int(id_text)
    return None


# Ids of the available dump files, in ascending order.
COMMENTS_IDS = sorted(
    file_id
    for file_id in map(_comment_file_id, COMMENTS_FILES)
    if file_id is not None
)

comment_chunks = []

# Read the files from the highest id downwards (presumably newest first) and
# stop after the first file that reaches back to TIME_THRESHOLD.
# NOTE(review): that last file is kept whole, so rows older than
# TIME_THRESHOLD can remain in the result - confirm this is intended.
for file_id in reversed(COMMENTS_IDS):
    chunk = pd.read_csv(f"{COMMENTS_DIR}{SUBREDDIT_NAME}_{file_id}_comments.csv")
    earliest_date = pd.to_datetime(chunk["created_utc"], unit="s").min()

    comment_chunks.append(chunk)

    if earliest_date <= TIME_THRESHOLD:
        print(f"Threshold reached at {file_id}")
        break

if not comment_chunks:
    # Same exception type pd.concat would raise on an empty list, but with a
    # message that names the actual problem.
    raise ValueError(
        f"No '{SUBREDDIT_NAME}_<id>_comments.csv' files found in {COMMENTS_DIR}"
    )

# NOTE(review): the per-file row indexes are kept, so index labels repeat
# across files.
comments = pd.concat(comment_chunks)

In [None]:
# Drop rows whose author field is one of the "[deleted]" / "[removed]"
# placeholders. The explicit .copy() makes each result an independent frame,
# so the column assignments that follow (here and in later cells) write to
# the frame itself rather than to a slice of the unfiltered data.
_PLACEHOLDER_AUTHORS = ["[deleted]", "[removed]"]
submissions = submissions[~submissions["author"].isin(_PLACEHOLDER_AUTHORS)].copy()
comments = comments[~comments["author"].isin(_PLACEHOLDER_AUTHORS)].copy()

# Keep the part of link_id that follows "t3_" (presumably the id of the
# parent submission - confirm against the dump schema). A link_id without
# that marker yields NaN.
comments["submission_id"] = comments["link_id"].str.split("t3_").str[1]

In [None]:
comments

## Topic modeling

In [None]:
# Calendar year of each submission, derived from the epoch-seconds
# `created_utc` column.
submissions['year'] = pd.to_datetime(submissions['created_utc'], unit='s').dt.year

In [None]:
# Clean submission title
stop_words = stopwords.words("english")
stop_words.extend(["google"])

# Set view of `stop_words` for constant-time membership tests. It is a
# snapshot: re-run this cell after changing `stop_words`.
_STOP_WORD_SET = frozenset(stop_words)


def clean_post_title(post):
    """Return a cleaned, space-joined version of a submission title.

    The title is tokenised with gensim's ``simple_preprocess``; tokens that
    are stop words or have two characters or fewer are dropped.

    Args:
        post: Raw title text. Anything that is not a string (for example a
            missing title that pandas parsed as NaN) is treated as empty.

    Returns:
        The kept tokens joined by single spaces; "" when nothing is kept.
    """
    if not isinstance(post, str):
        return ""
    return " ".join(
        word
        for word in simple_preprocess(post)
        if len(word) > 2 and word not in _STOP_WORD_SET
    )

In [None]:
# Preview the cleaning step on ten raw titles drawn with a fixed seed, so the
# same titles are shown on every run.
title_sample = submissions["title"].sample(n=10, random_state=42)
sample = title_sample.tolist()
print(sample)

# The same titles after cleaning, in the same order.
sample_cleaned = list(map(clean_post_title, sample))
print(sample_cleaned)

In [None]:
# Store the cleaned version of every title next to the raw one.
submissions["clean_title"] = submissions["title"].apply(clean_post_title)

In [None]:
submissions

In [None]:
# Split every cleaned title on whitespace into a token list, then build the
# gensim dictionary (token <-> integer id) from those token lists.
titles = submissions["clean_title"].map(str.split)
id2word = corpora.Dictionary(documents=titles)

In [None]:
# Bag-of-words representation of every tokenised title, in row order.
corpus = list(map(id2word.doc2bow, titles))

In [None]:
# number of topics
num_topics = 5

# Build LDA model
# NOTE(review): training runs in 3 worker processes. Confirm that
# random_state=42 alone keeps the topic ids stable between runs, because the
# hand-written topic labels in this notebook are keyed by topic id.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    workers=3,
    passes=10,
    iterations=200,
    random_state=42
)

In [None]:
# Show the topics found by the model.
print(lda_model.print_topics())
# Apply the model to the corpus. `doc_lda` is not referenced again in the
# cells shown here.
doc_lda = lda_model[corpus]


In [None]:
# Top 10 words of every topic; formatted=False asks for structured output
# instead of one pre-formatted string per topic.
topic_words = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)

print(topic_words)

### Topic Summaries

- Topic 0: Skins and Cosmetic Items
    - Keywords: "skin", "skins", "back", "concept", "shop"
    - Summary: This topic focuses on discussions around different skins, back blings, and other cosmetic items available in the Fortnite item shop. Conversations revolve around cosmetic customization options.

- Topic 1: Fortnite Seasons and Map Changes  
    - Keywords: "season", "chapter", "map", "new"
    - Summary: This topic covers discussions related to the different seasons, chapters, and map updates in Fortnite. It indicates conversations about the evolving content and changes to the game world over time.

- Topic 2: Gameplay and Game Modes
    - Keywords: "ranked", "build", "play", "mode", "crew"
    - Summary: This topic encompasses discussions about the core gameplay aspects of Fortnite, including building mechanics, ranked mode, and other game modes. It suggests conversations around different ways to experience the game.

- Topic 3: Player Support and Account Issues
    - Keywords: "help", "account", "need", "epic"  
    - Summary: This topic relates to players seeking help or support, potentially with issues related to their Fortnite accounts or in-game problems. It indicates discussions around troubleshooting and assistance from the developer, Epic Games.

- Topic 4: Bugs, Glitches, and In-Game Issues
    - Keywords: "bug", "glitch", "quest", "creative", "emote"
    - Summary: This topic revolves around discussions of bugs, glitches, and other issues encountered within the game. It suggests conversations about reporting and addressing various in-game problems or unintended behaviors.

In [None]:
# Human-readable label for every LDA topic id. These labels are used as data
# further down (topic selection and the saved figure's file name).
# NOTE(review): the labels for topics 2, 3 and 4 do not match the "Topic
# Summaries" text in this notebook (e.g. topic 2 is summarised there as
# "Gameplay and Game Modes"); confirm which wording reflects the trained model.
topic_names = {
    0: "Skins, Cosmetics, and In-Game Shop",
    1: "New Seasons, Updates, and Map Changes",
    2: "General Questions and Discussions",
    3: "Player Support, Account Issues, and Quests",
    4: "Bugs, Glitches, and Gameplay Experiences",
}

In [None]:
# Attach a topic to every submission.
# NOTE(review): assign_topic comes from utils and is not visible here;
# presumably it returns one label from `topic_names` per corpus document, in
# corpus order - confirm, including how the result aligns with the frame index.
submissions["topic"] = assign_topic(lda_model, corpus, topic_names)

submissions

In [None]:
# Share of submissions per topic (fractions rather than counts), largest first.
submissions["topic"].value_counts(normalize=True, ascending=False)

## Network analysis

In [None]:
# Topic analysed in the network section: the label of LDA topic id 0.
SELECTED_TOPIC = topic_names[0]
print(f'Selected topic: {SELECTED_TOPIC}')

In [None]:
# Submissions and comments belonging to the selected topic.
# NOTE(review): only the topic label is passed in; confirm in utils where
# get_topic_submissions obtains the submissions/comments frames it filters.
submissions_t, comments_t = get_topic_submissions(SELECTED_TOPIC)

In [None]:
# Convert 'created_utc' (epoch seconds) to datetime and split the topic's
# submissions into the periods before and from the cutoff date.
# NOTE(review): the "_3m" suffix suggests a three-month window; confirm that
# the cutoff date matches the intended period.

submissions_t["post_time"] = pd.to_datetime(submissions_t["created_utc"], unit="s")
cutoff_date = pd.Timestamp("2023-10-01 00:00:00")
submissions_t_before_3m = submissions_t[submissions_t["post_time"] < cutoff_date]
# ">=" (not ">") so that a submission stamped exactly at the cutoff lands in
# the "after" period instead of being dropped from both.
submissions_t_after_3m = submissions_t[submissions_t["post_time"] >= cutoff_date]

# Filter comments based on the filtered submissions: a comment belongs to the
# period of the submission it was posted under.
comments_t_before_3m = comments_t[
    comments_t["submission_id"].isin(submissions_t_before_3m["id"])
]
comments_t_after_3m = comments_t[
    comments_t["submission_id"].isin(submissions_t_after_3m["id"])
]

comments_t_before_3m.shape, comments_t_after_3m.shape


In [None]:
# Build the filtered graph for the selected topic from the after-cutoff data.
# NOTE(review): get_filtered_topic_graph comes from utils, so its filtering
# rules are not visible here; `B_t` is presumably the underlying bipartite
# graph - confirm.
G_filtered, filtered_node_data, B_t = get_filtered_topic_graph(
    SELECTED_TOPIC,
    submissions_t_after_3m,
    comments_t_after_3m,
)

In [None]:
# Same construction for the before-cutoff data.
# NOTE(review): largest_component=False is passed only here, while the
# after-cutoff call relies on the helper's default; confirm the two graphs are
# meant to be built differently.
G_filtered_before3m, filtered_node_data_before3m, B_t_before3m = get_filtered_topic_graph(
    SELECTED_TOPIC,
    submissions_t_before_3m,
    comments_t_before_3m,
    largest_component=False
)

### Community detection

In [None]:
# Detect communities on the after-cutoff graph. The helper also returns a
# graph, which replaces `G_filtered` from here on.
# NOTE(review): resolution=1.5 is handed to the utils helper; presumably it is
# the resolution of the community-detection algorithm - confirm in utils.
G_filtered, communities, community_lens = get_communities(G_filtered, resolution=1.5)

print(f"Number of communities: {len(communities)}")
print(f"Community size: {community_lens}")

In [None]:
# Position in `communities` of the community inspected in the following cells.
# NOTE(review): hard-coded; it needs at least 15 detected communities and only
# points at the same community while detection returns them in the same
# order - re-check after re-running community detection.
SELECTED_COMMUNITY_IDX = 14

In [None]:
# Draw the selected community twice, side by side: restricted to the
# before-cutoff graph on the left axis and to the after-cutoff graph on the
# right axis. The member list comes from `communities` in both cases.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 15))

community_nodes = communities[SELECTED_COMMUNITY_IDX]

# Left panel: the community's nodes as they appear in the "before" graph.
before_subgraph = G_filtered_before3m.subgraph(community_nodes)
G_community_before3m = plot_community_graph(
    before_subgraph, community_lens, SELECTED_COMMUNITY_IDX, SELECTED_TOPIC, ax=ax1
)

# Right panel: the same nodes in the "after" graph.
after_subgraph = G_filtered.subgraph(community_nodes)
G_community = plot_community_graph(
    after_subgraph, community_lens, SELECTED_COMMUNITY_IDX, SELECTED_TOPIC, ax=ax2
)

plt.tight_layout()

# Save the figure before showing it.
figure_path = f"./graphs/{SELECTED_TOPIC}_community_{SELECTED_COMMUNITY_IDX}_over_time.png"
plt.savefig(figure_path, dpi=300)

plt.show()


In [None]:
# Tabular view of the two community graphs returned by the plotting cell. The
# later cells read the "author" and "influencer_score" columns from these.
G_community_df = get_community_dataframe(G_community)
G_community_before3m_df = get_community_dataframe(G_community_before3m)

### Posts and comments frequency

In [None]:
def _unique_count_per_author(frame, id_column, result_column):
    """Count the distinct `id_column` values of every author in `frame`."""
    return frame.groupby("author").agg(**{result_column: (id_column, "nunique")})


def _author_metrics(community_df, post_counts, commented_counts):
    """Add both per-author counts to a community table, ranked by influencer score."""
    by_author = community_df.set_index("author")
    with_posts = by_author.merge(
        post_counts, left_index=True, right_index=True, how="left"
    )
    with_both = with_posts.merge(
        commented_counts, left_index=True, right_index=True, how="left"
    )
    ranked = with_both.sort_values(by="influencer_score", ascending=False)
    # Authors missing from a count table get 0 instead of NaN.
    return ranked.fillna(0).reset_index()


# Distinct submissions written by each author, per period.
num_posts_t = _unique_count_per_author(submissions_t_after_3m, "id", "num_posts")
num_posts_t_before_3m = _unique_count_per_author(
    submissions_t_before_3m, "id", "num_posts"
)

# Distinct submissions each author commented on, per period.
num_posts_commented_on_t = _unique_count_per_author(
    comments_t_after_3m, "submission_id", "num_posts_commented_on"
)
num_posts_commented_on_t_before_3m = _unique_count_per_author(
    comments_t_before_3m, "submission_id", "num_posts_commented_on"
)

# Community members enriched with their activity counts, per period.
author_metrics_t = _author_metrics(
    G_community_df, num_posts_t, num_posts_commented_on_t
)
author_metrics_t_before_3m = _author_metrics(
    G_community_before3m_df,
    num_posts_t_before_3m,
    num_posts_commented_on_t_before_3m,
)

# Stack both periods into one long table, tagged by a "time" column.
author_metrics_overall_t = pd.concat(
    [
        author_metrics_t.assign(time="after"),
        author_metrics_t_before_3m.assign(time="before"),
    ]
)

author_metrics_overall_t

In [None]:
# One row per author with every metric shown for both periods side by side,
# ordered by the after-period influencer score (highest first).
_compared_metrics = [
    "num_comments",
    "num_posts",
    "num_posts_commented_on",
    "influencer_score",
]
_metrics_by_period = author_metrics_overall_t.pivot_table(
    index="author", columns="time", values=_compared_metrics, aggfunc="mean"
)
_metrics_by_period.sort_values(by=("influencer_score", "after"), ascending=False)