In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models
import spacy
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import os
from collections import Counter
from gensim.models import CoherenceModel

# Name of the CSV column that holds the tweet text to analyse
target_column = 'rawContent'

# Download NLTK data
nltk.download('stopwords')

# Load data
# The CSV location can be overridden with the TWEETS_CSV environment variable so
# the notebook is not tied to one machine; the original path stays the default.
DATA_PATH = os.environ.get(
    "TWEETS_CSV",
    "C:/Users/Admin/Documents/GitHub/is434-ubisoft/data-collection/twitter/data/combined.csv",
)
df = pd.read_csv(DATA_PATH)
# Keep English-language rows only
df = df[df["lang"] == "en"]
# Drop rows without text: astype(str) would otherwise turn a missing value into
# the literal string "nan" and feed it to the topic models as a real document.
# Filtering df itself keeps df and texts row-aligned.
df = df.dropna(subset=[target_column])
texts = df[target_column].astype(str).tolist()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Initialize spaCy: load the pipeline once into the module-level `nlp` object
SPACY_MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL_NAME)
# Cleaning function
def clean_text(text):
    """Normalise a raw tweet into lowercase, whitespace-separated words.

    Steps, applied in order:
      1. lowercase the text;
      2. remove URLs (anything starting with ``http`` or ``www.``);
      3. remove @mentions and #hashtags;
      4. replace every ASCII punctuation character with a space;
      5. remove digits;
      6. collapse runs of whitespace and strip both ends.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # The dot after "www" must be escaped: an unescaped "." matches any character,
    # so words such as "awww" followed by a space swallowed the next word too.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    # Punctuation becomes a space (not '') so "good,bad" does not fuse into one word
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run every raw tweet through clean_text
cleaned_texts = list(map(clean_text, texts))

# Stop-word set: NLTK's English list combined with project-specific terms
# NOTE(review): 'I' is capitalised although clean_text lowercases its input —
# presumably it targets the lemma spaCy emits for the pronoun; confirm.
custom_stopwords = {'ubisoft', 'game', 'go', 'get', 'I'}
stop_words = set(stopwords.words('english')) | custom_stopwords
# Tokenization and Lemmatization
def tokenize_lemmatize(text):
    """Return the lemmas of the tokens in ``text`` that survive filtering.

    The text is processed with the module-level spaCy pipeline ``nlp``. A token
    is kept only when it is purely alphabetic (``token.is_alpha``) and its lemma
    is not in the module-level ``stop_words`` set.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lemma strings, in token order.
    """
    lemmas = []
    for token in nlp(text):
        if not token.is_alpha:
            continue
        lemma = token.lemma_
        if lemma not in stop_words:
            lemmas.append(lemma)
    return lemmas

# Lemmatize each cleaned tweet into its list of kept tokens
tokenized_texts = list(map(tokenize_lemmatize, cleaned_texts))

# Flatten the per-tweet token lists and print the 20 most frequent lemmas
all_words = []
for tokens in tokenized_texts:
    all_words.extend(tokens)
word_freq = Counter(all_words)
print(word_freq.most_common(20))

# Create Dictionary and Corpus
dictionary = corpora.Dictionary(tokenized_texts)
# Prune the vocabulary with document-frequency thresholds:
# no_below is an absolute document count, no_above a fraction of the corpus
dictionary.filter_extremes(no_below=15, no_above=0.5)
# Bag-of-words representation of every tweet against the pruned dictionary
corpus = list(map(dictionary.doc2bow, tokenized_texts))

[('thank', 13320), ('assassin', 13157), ('creed', 13076), ('play', 11454), ('please', 10893), ('new', 10576), ('like', 10376), ('make', 9849), ('one', 8454), ('good', 7521), ('issue', 7492), ('look', 6737), ('see', 6628), ('know', 6528), ('platform', 6338), ('release', 6228), ('would', 6073), ('xbox', 6045), ('time', 5961), ('hey', 5910)]


In [3]:
import openai
from bertopic import BERTopic
from bertopic.representation import OpenAI
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment before the key is read
load_dotenv()

# OpenAI API key taken from the OPEN_AI_KEY environment variable (None when unset)
OPEN_AI_KEY = os.getenv("OPEN_AI_KEY")
client = openai.OpenAI(api_key=OPEN_AI_KEY)

# BERTopic model whose topic representations are produced through the OpenAI client
representation_model = OpenAI(
    client,
    model="gpt-4o-mini",
    chat=True,
)
topic_model = BERTopic(representation_model=representation_model)

# Fit on the raw tweet strings in `texts` (not on `cleaned_texts`)
topics, probabilities = topic_model.fit_transform(texts)

# Explore topics: export the per-topic summary table to CSV
topic_df = topic_model.get_topic_info()
topic_df.to_csv("topics_info.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


In [4]:
# Build the BERTopic topic visualisation and save it as an HTML file
visualization_path = "bertopic_visualization.html"
fig = topic_model.visualize_topics()
fig.write_html(visualization_path)