In [29]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models
import spacy
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import os
from collections import Counter
from gensim.models import CoherenceModel

# Name of the review column in the CSV that the rest of the notebook analyses.
target_column = 'Pros'

# Download NLTK data (the stop-word corpus is read further down via stopwords.words).
nltk.download('stopwords')

# Load data; the file is decoded as ISO-8859-1.
df = pd.read_csv("ubisoft_reviews.csv", encoding='iso-8859-1')

# Missing reviews are read as NaN, and a bare astype(str) would turn them into
# the literal string "nan", which then appears as a bogus token/document in the
# frequency counts and topic models. Replace them with empty strings first so
# that `texts` keeps one entry per row of `df`.
texts = df[target_column].fillna('').astype(str).tolist()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Initialize spaCy: load the 'en_core_web_sm' pipeline once at module level.
# `nlp` is the callable that tokenize_lemmatize() applies to each cleaned text.
nlp = spacy.load('en_core_web_sm')
# Cleaning function
def clean_text(text):
    """Normalise one raw review string for tokenisation.

    Steps, in order: lower-case; drop URLs (anything starting with ``http``
    or ``www.``); drop ``@mentions`` and ``#hashtags``; replace ASCII
    punctuation with spaces; drop digit runs; collapse whitespace and trim.

    Args:
        text: The raw text (must be a ``str``).

    Returns:
        The cleaned, lower-cased string; may be empty.
    """
    text = text.lower()
    # The dot after "www" must be escaped: an unescaped "." matches any
    # character, so ordinary words containing "www" (e.g. "awwwesome") were
    # being deleted as if they were URLs.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    # Punctuation becomes a space (not '') so "work/life" splits into two words.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run every raw review through clean_text, preserving order.
cleaned_texts = list(map(clean_text, texts))

# Domain-specific words to drop in addition to NLTK's English list.
# NOTE(review): 'I' is capitalised although the text is lower-cased; the filter
# in tokenize_lemmatize compares lemmas, so this presumably targets a lemma form.
custom_stopwords = {'ubisoft', 'game', 'go', 'get', 'I'}
stop_words = set(stopwords.words('english')) | custom_stopwords
# Tokenization and Lemmatization
def tokenize_lemmatize(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only if ``is_alpha`` is true and its lemma is not in the
    module-level ``stop_words`` set; the lemma (not the surface form) is what
    gets returned.
    """
    lemmas = []
    for tok in nlp(text):
        if not tok.is_alpha:
            continue
        lemma = tok.lemma_
        if lemma in stop_words:
            continue
        lemmas.append(lemma)
    return lemmas

# One list of lemmas per cleaned review.
tokenized_texts = list(map(tokenize_lemmatize, cleaned_texts))

# Flatten the per-review token lists and print the 20 most frequent lemmas
# as a quick sanity check of the preprocessing.
all_words = []
for tokens in tokenized_texts:
    all_words.extend(tokens)
word_freq = Counter(all_words)
print(word_freq.most_common(20))

# Create Dictionary and Corpus: build the vocabulary, prune it with
# filter_extremes (no_below=15, no_above=0.5), then convert each review to
# its bag-of-words representation.
dictionary = corpora.Dictionary(tokenized_texts)
dictionary.filter_extremes(no_below=15, no_above=0.5)
corpus = list(map(dictionary.doc2bow, tokenized_texts))

[('work', 605), ('good', 502), ('great', 295), ('people', 241), ('life', 184), ('balance', 177), ('environment', 164), ('team', 138), ('company', 133), ('culture', 131), ('nice', 110), ('project', 104), ('benefit', 97), ('lot', 93), ('friendly', 90), ('place', 73), ('fun', 71), ('opportunity', 71), ('learn', 68), ('studio', 60)]


In [31]:
import openai
from bertopic import BERTopic
from bertopic.representation import OpenAI
from hdbscan import HDBSCAN
import os
from dotenv import load_dotenv

load_dotenv()

# The API key is looked up in the process environment after load_dotenv() has
# run; .get() yields None when the variable is absent.
OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY")
client = openai.OpenAI(api_key=OPEN_AI_KEY)
representation_model = OpenAI(client, model="gpt-4o-mini", chat=True)

# Alternative clustering configuration: min_cluster_size=5 allows smaller
# clusters; cluster_selection_method may be 'eom' or 'leaf'.
# NOTE(review): this object is built but NOT handed to BERTopic below. Pass
# hdbscan_model=hdbscan_model to the constructor to actually use it.
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='leaf')

topic_model = BERTopic(representation_model=representation_model)

# Fit on the raw (uncleaned) review strings in `texts`.
topics, probabilities = topic_model.fit_transform(texts)

# Explore topics: export the per-topic summary table.
topic_df = topic_model.get_topic_info()
topic_df.to_csv("glassdoor_ubisoft_pros_topics_info.csv", index=False)

In [32]:
# Build the topic visualisation from the fitted model and save it as an HTML
# file in the working directory.
fig = topic_model.visualize_topics()
fig.write_html("glassdoor_ubisoft_pros_bertopic_visualization.html")