## Topic Modelling on Reddit Posts

In this section, we will perform topic modelling using the `reddit_ubisort_internal_posts.csv` dataset. This dataset was generated from scraping 7 subreddits based on 11 keywords. Initially, the dataset contained 2348 posts. After filtering the posts to include only those with 10 or more comments as an engagement factor, we have 1416 posts remaining for topic modelling.

In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models
import spacy
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import os
from collections import Counter
from gensim.models import CoherenceModel

target_column = 'body'

# Download NLTK data
nltk.download('stopwords')

# Load data
df = pd.read_csv("reddit_ubisoft_internal_posts.csv")
texts = df[target_column].astype(str).tolist()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\school\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Initialize spaCy
nlp = spacy.load('en_core_web_sm')
# Cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

cleaned_texts = [clean_text(text) for text in texts]

custom_stopwords = {'ubisoft', 'game', 'go', 'get', 'I'}
stop_words = set(stopwords.words('english'))
stop_words.update(custom_stopwords)

# Tokenization and Lemmatization
def tokenize_lemmatize(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if token.is_alpha and token.lemma_ not in stop_words]

tokenized_texts = [tokenize_lemmatize(text) for text in cleaned_texts]

# Assuming 'tokenized_texts' is your list of tokenized tweets
all_words = [word for text in tokenized_texts for word in text]
word_freq = Counter(all_words)
print(word_freq.most_common(20))

# Create Dictionary and Corpus
dictionary = corpora.Dictionary(tokenized_texts)
dictionary.filter_extremes(no_below=15, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

[('make', 4301), ('like', 4096), ('player', 3827), ('one', 3604), ('issue', 3592), ('would', 3575), ('assassin', 3383), ('new', 3207), ('time', 3178), ('see', 2894), ('use', 2891), ('play', 2806), ('also', 2617), ('well', 2352), ('creed', 2277), ('people', 2194), ('even', 2144), ('know', 2117), ('take', 2088), ('good', 2049)]


## Method 1: Using gensim models

In [6]:

from gensim.models import CoherenceModel
custom_stopwords = {'ubisoft', 'game', 'go', 'get', 'I'}

stop_words = set(stopwords.words('english'))
stop_words.update(custom_stopwords)

# Create Dictionary and Corpus
dictionary = corpora.Dictionary(tokenized_texts)
dictionary.filter_extremes(no_below=15, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

# Build LDA Model
max_topics = 100
step = 10
for num_topics in range(10, max_topics+step, step):
    print(f"Number of Topics: {num_topics}")
    num_topics = 10
    passes = 15
    random_state = 42

    lda_model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
        update_every=1,
        chunksize=100,
        passes=passes,
        alpha='auto',
        per_word_topics=True
    )
    

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print(f'Coherence Score: {coherence_score}')
    
    perplexity = lda_model.log_perplexity(corpus)
    print(f'Perplexity: {perplexity}')

    # Print Topics
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topic: {idx}\nWords: {topic}\n")

    # Visualize Topics
    lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
    # Uncomment the line below to display the visualization in the notebook
    # pyLDAvis.display(data=lda_vis)
    pyLDAvis.save_html(lda_vis, f'lda_visualization_topic_{num_topics}.html')

Number of Topics: 10
Coherence Score: 0.43656609932668067
Perplexity: -7.256860358781346
Topic: 0
Words: 0.042*"issue" + 0.035*"player" + 0.017*"address" + 0.012*"fix" + 0.011*"cause" + 0.010*"quest" + 0.010*"complete" + 0.009*"could" + 0.009*"mode" + 0.008*"prevent"

Topic: 1
Words: 0.028*"cheat" + 0.020*"support" + 0.018*"account" + 0.016*"ban" + 0.011*"cheater" + 0.011*"use" + 0.009*"report" + 0.009*"datum" + 0.008*"post" + 0.008*"help"

Topic: 2
Words: 0.011*"would" + 0.009*"war" + 0.009*"also" + 0.009*"ac" + 0.008*"city" + 0.007*"templar" + 0.007*"real" + 0.006*"story" + 0.006*"character" + 0.006*"historical"

Topic: 3
Words: 0.032*"ability" + 0.026*"fix" + 0.023*"team" + 0.021*"drone" + 0.021*"recoil" + 0.018*"attacker" + 0.017*"rainbow" + 0.015*"bug" + 0.014*"competitive" + 0.014*"server"

Topic: 4
Words: 0.015*"would" + 0.013*"use" + 0.010*"fc" + 0.010*"weapon" + 0.009*"kill" + 0.009*"gun" + 0.008*"enemy" + 0.008*"mission" + 0.007*"also" + 0.006*"system"

Topic: 5
Words: 0.132*

In [7]:
from bertopic import BERTopic

# Initialize BERTopic
topic_model = BERTopic()
topics, probabilities = topic_model.fit_transform(texts)

# Explore topics
topic_model.get_topic_info()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,344,-1_the_and_to_of,"[the, and, to, of, in, is, it, that, for, this]",[Played on PS5\n\nHard (Berserker) Difficulty...
1,0,161,0_account_my_email_ubisoft,"[account, my, email, ubisoft, support, to, me,...",[#HUGE NEWS! After 4 months (!!) of active pes...
2,1,86,1_cry_far_you_the,"[cry, far, you, the, it, and, to, is, with, of]",[Hello to all the Far Cry enjoyers.\n\nThis is...
3,2,72,2_nan___,"[nan, , , , , , , , , ]","[nan, nan, nan]"
4,3,62,3_the_and_to_of,"[the, and, to, of, that, in, it, is, was, you]","[***Quick warning, the upcoming wall of text i..."
5,4,42,4_ban_me_banned_my,"[ban, me, banned, my, account, to, this, for, ...","[On January 19th, my Siege account was banned ..."
6,5,41,5_engagement_of_the_about,"[engagement, of, the, about, for, that, dlc, a...",[Based on statistics shared by Ubisoft and its...
7,6,33,6_addressed_issue_an_prevented,"[addressed, issue, an, prevented, players, cau...",[&#x200B;\n\n[Assassin's Creed Valhalla - Titl...
8,7,33,7_the_of_and_assassins,"[the, of, and, assassins, is, to, that, in, te...",[Assassin’s Creed now has 11 mainline titles r...
9,8,30,8_the_of_and_it,"[the, of, and, it, to, that, but, in, its, game]",[This post was originally a Comment in Respons...


## Method 2: Using GPT 4o model

In [9]:
import openai
from openai import RateLimitError
from bertopic import BERTopic
from bertopic.representation import OpenAI
import os
import time
from dotenv import load_dotenv

load_dotenv()

OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY")
client = openai.OpenAI(api_key=OPEN_AI_KEY)
representation_model = OpenAI(client, model="gpt-4o-mini", chat=True)
topic_model = BERTopic(representation_model=representation_model)

def rate_limited_request():
    # while True:
    #     try:
    #         # Fit and transform the model
    #         topics, probabilities = topic_model.fit_transform(texts)
    #         break  # Exit loop if successful
    #     except RateLimitError:
    #         print("Rate limit exceeded. Retrying in 60 seconds...")
    #         time.sleep(60)  # Wait before retrying

    max_retries = 5
    retry_delay = 60  # seconds
    retries = 0

    while retries < max_retries:
        try:
            # Fit and transform the model
            topics, probabilities = topic_model.fit_transform(texts)
            break  # Exit loop if successful
        except RateLimitError:
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(retry_delay)  # Wait before retrying
        except OpenAIError as e:
            print(f"OpenAI error: {e}. Retrying in 60 seconds...")
            time.sleep(retry_delay)  # Wait before retrying
        except Exception as e:
            print(f"Unexpected error: {e}. Retrying in 60 seconds...")
            time.sleep(retry_delay)  # Wait before retrying
        retries += 1

    if retries == max_retries:
        print("Max retries reached. Exiting.")
        return None


    

# Call the rate-limited request function
rate_limited_request()

# Explore topics
topic_df = topic_model.get_topic_info()
topic_df.to_csv("topics_info.csv", index=False)

Rate limit exceeded. Retrying in 60 seconds...
Rate limit exceeded. Retrying in 60 seconds...
Rate limit exceeded. Retrying in 60 seconds...
Rate limit exceeded. Retrying in 60 seconds...
Rate limit exceeded. Retrying in 60 seconds...
Max retries reached. Exiting.


AttributeError: 'NoneType' object has no attribute 'items'

In [None]:
fig = topic_model.visualize_topics()
fig.write_html("bertopic_reddit_internal_visualization.html")