In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models
import spacy
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import os
from collections import Counter
from gensim.models import CoherenceModel

# --- Configuration ---------------------------------------------------------
# Column of the CSV that holds the tweet text analysed by the later cells.
target_column = 'rawContent'
# Prefix for the artifacts (CSV / HTML / model) written by this notebook.
dataset_name = "twitter_ubisoft"
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_path = "C:/Users/Admin/Documents/GitHub/is434-ubisoft/data-collection/twitter/data/combined.csv"

# Download NLTK data (the stop-word lists are read via `stopwords.words`).
nltk.download('stopwords')

# Load data
# NOTE(review): the file is decoded as iso-8859-1; confirm this matches how the
# CSV was written, otherwise non-ASCII characters will be garbled.
raw_df = pd.read_csv(data_path, encoding='iso-8859-1')

# Keep English tweets only, and drop rows with no text: `astype(str)` would
# otherwise turn a missing value into the literal document "nan".
is_english = raw_df["lang"] == "en"
has_text = raw_df[target_column].notna()
df = raw_df[is_english & has_text]
texts = df[target_column].astype(str).tolist()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Initialize spaCy
# Only lemmas and `is_alpha` are read from the parsed tokens in this notebook,
# so the dependency parser and the named-entity recognizer are switched off:
# they do not feed the lemmatizer and skipping them makes `nlp(text)` cheaper
# on every tweet.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Cleaning function
def clean_text(text):
    """Normalise a raw tweet into lowercase, space-separated words.

    Steps, in order: lowercase; remove URLs (``http...`` and ``www. ...``);
    remove @mentions and #hashtags; replace punctuation with spaces; remove
    digits; collapse runs of whitespace and trim the ends.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # The dot after "www" is escaped so only a literal "www." prefix is treated
    # as a URL; unescaped it matched any character and therefore also deleted
    # ordinary words such as "awwwesome".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    # Punctuation becomes a space (not an empty string) so that words joined by
    # punctuation are not glued together.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the cleaning function over every tweet once, up front.
cleaned_texts = list(map(clean_text, texts))

# NLTK's English stop words combined with a few extra entries of our own.
custom_stopwords = {'go', 'get', 'I'}
stop_words = set(stopwords.words('english')) | custom_stopwords
# Tokenization and lemmatization
def tokenize_lemmatize(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline; a token is kept
    when ``is_alpha`` is true and its lemma is not in ``stop_words``.

    Args:
        text: The (already cleaned) tweet text.

    Returns:
        A list of lemma strings, in token order.
    """
    lemmas = []
    for tok in nlp(text):
        if not tok.is_alpha:
            continue
        lemma = tok.lemma_
        # The stop-word check is applied to the lemma, not the surface form.
        if lemma in stop_words:
            continue
        lemmas.append(lemma)
    return lemmas

# One list of lemmas per cleaned tweet.
tokenized_texts = list(map(tokenize_lemmatize, cleaned_texts))

# Flatten the per-tweet token lists into a single list, then count how often
# each lemma occurs and show the 20 most frequent ones.
all_words = []
for tokens in tokenized_texts:
    all_words.extend(tokens)
word_freq = Counter(all_words)
print(word_freq.most_common(20))

# Build the gensim vocabulary from the tokenized tweets, then prune it with the
# document-frequency thresholds below (absolute minimum / fractional maximum).
pruning_thresholds = {"no_below": 15, "no_above": 0.5}
dictionary = corpora.Dictionary(tokenized_texts)
dictionary.filter_extremes(**pruning_thresholds)

# Bag-of-words encoding of every tweet against the pruned vocabulary.
corpus = list(map(dictionary.doc2bow, tokenized_texts))

[('ubisoft', 81134), ('game', 42867), ('thank', 13291), ('creed', 12941), ('play', 11435), ('assassin', 11206), ('please', 10882), ('new', 10523), ('like', 10361), ('make', 9822), ('one', 8428), ('good', 7507), ('issue', 7487), ('look', 6733), ('see', 6620), ('know', 6519), ('platform', 6279), ('release', 6215), ('xbox', 6031), ('time', 5943)]


In [4]:
import openai
from bertopic import BERTopic
from bertopic.representation import OpenAI
from hdbscan import HDBSCAN
import os
from dotenv import load_dotenv

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# The API key comes from the environment rather than the notebook; it is None
# when the variable is not set.
OPEN_AI_KEY = os.getenv("OPEN_AI_KEY")
client = openai.OpenAI(api_key=OPEN_AI_KEY)

# BERTopic representation model backed by the OpenAI client above.
representation_options = {"model": "gpt-4o-mini", "chat": True}
representation_model = OpenAI(client, **representation_options)

# Clustering settings handed to HDBSCAN.
hdbscan_options = {
    "min_cluster_size": 130,  # minimum cluster size passed to HDBSCAN
    "metric": 'euclidean',
    "cluster_selection_method": 'eom',  # 'eom' or 'leaf'
}
hdbscan_model = HDBSCAN(**hdbscan_options)

# Topic model wired up with the custom clustering and representation models.
bertopic_options = {
    "representation_model": representation_model,
    "hdbscan_model": hdbscan_model,
}
topic_model = BERTopic(**bertopic_options)

# Fit the topic model on the raw tweet texts (not the cleaned ones).
topics, probabilities = topic_model.fit_transform(texts)

# Explore topics: export the per-topic summary table.
topics_info_path = f"{dataset_name}_topics_info.csv"
topic_df = topic_model.get_topic_info()
topic_df.to_csv(topics_info_path, index=False)

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


In [5]:
# Write the topic visualization to a standalone HTML file.
visualization_path = f"{dataset_name}_visualization.html"
fig = topic_model.visualize_topics()
fig.write_html(visualization_path)

In [7]:
# Persist the fitted model (safetensors serialization, c-TF-IDF included).
model_path = f"{dataset_name}_model"
topic_model.save(model_path, serialization="safetensors", save_ctfidf=True)

In [8]:
# Export the per-document topic information.
docs_info_path = f"{dataset_name}_docs_info.csv"
topics_df = topic_model.get_document_info(texts)
topics_df.to_csv(docs_info_path, index=False)

In [9]:
# Ask BERTopic to reduce the model to 35 topics, then export the new summary.
# NOTE(review): the return value is ignored, so this relies on `reduce_topics`
# updating `topic_model` in place - confirm against the BERTopic version used.
reduced_topics_info_path = f"{dataset_name}_topics_info_35.csv"
topic_model.reduce_topics(texts, nr_topics=35)
topic_df = topic_model.get_topic_info()
topic_df.to_csv(reduced_topics_info_path, index=False)

In [10]:
# Write the topic visualization for the current model state to its own HTML file.
reduced_visualization_path = f"{dataset_name}_visualization_35.html"
fig = topic_model.visualize_topics()
fig.write_html(reduced_visualization_path)

In [11]:
# Export the per-document topic information for the current model state.
reduced_docs_info_path = f"{dataset_name}_docs_info_35.csv"
topics_df2 = topic_model.get_document_info(texts)
topics_df2.to_csv(reduced_docs_info_path, index=False)