Create Embeddings

In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Input CSV and the .npy file the embeddings are written to
data_path = 'rpg/rpg_small_processed.csv'
embeddings_save_path = 'rpg/rpg_small_processed_embeddings.npy'

# Read only the 'text' column; each row is treated as one document
print("Step 1: Loading the data...")
df = pd.read_csv(data_path, low_memory=False, usecols=['text'])

print("Step 2: Preparing the documents...")
docs = df['text'].tolist()

# Encode every document with the sentence-transformer model
model = SentenceTransformer('thenlper/gte-small')
embeddings = model.encode(docs, show_progress_bar=True)

# Persist the embeddings so they can be reloaded without re-encoding
with open(embeddings_save_path, 'wb') as f:
    np.save(f, embeddings)

Run BERTopic

In [None]:
import pickle
from umap import UMAP
from hdbscan import HDBSCAN
import numpy as np
from nltk.tokenize import sent_tokenize
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sentence_transformers import SentenceTransformer
import collections
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import KeyBERTInspired
from tqdm import tqdm
from bertopic.cluster import BaseCluster

# Define file paths
data_path = 'attachment/attach_processed_length10.csv'
embeddings_save_path = 'attachment/doc/models/attach_doc_embeddings.npy'
model_save_path = "attachment/doc/models/attach_doc6_model.pkl"
data_save_path = "attachment/doc/models/attach_doc6_data.pkl"

print("Step 1: Loading the data...")
# Load the data (only the 'text' column is needed here)
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)

print("Step 2: Preparing the documents...")
# Specify what the 'docs' are: one document per CSV row
docs = df['text'].tolist()

# Load the pre-computed embeddings
with open(embeddings_save_path, 'rb') as f:
    embeddings = np.load(f)

# The embeddings come from a separate file, so fail fast if they do not
# line up one-to-one with the documents loaded above. Everything downstream
# (UMAP, HDBSCAN, BERTopic) pairs embeddings[i] with docs[i].
if len(embeddings) != len(docs):
    raise ValueError(
        f"Embeddings/documents mismatch: {len(embeddings)} embeddings in "
        f"{embeddings_save_path!r} but {len(docs)} documents in {data_path!r}."
    )

###### Extract vocab to be used in BERTopic
# Count tokens with the vectorizer's *analyzer* rather than its bare tokenizer.
# `build_tokenizer()` only splits text: it ignores `ngram_range` and does not
# lowercase, so "Dragon" and "dragon" were counted separately against the
# frequency threshold, and mixed-case entries could never match once
# CountVectorizer (lowercase=True by default) analyses the documents.
# `stop_words="english"` mirrors the vectorizer_model built later in this cell,
# so the vocabulary only holds tokens that vectorizer can actually count.
vocab = collections.Counter()
tokenizer = CountVectorizer(stop_words="english").build_analyzer()
for doc in tqdm(docs):
    vocab.update(tokenizer(doc))

# Keep only tokens that occur at least 200 times across the corpus
vocab = [word for word, frequency in vocab.items() if frequency >= 200]
print(f"Vocabulary size: {len(vocab)}")

# Reduce the dimensionality of the embeddings before clustering

umap_model = UMAP(
        metric="cosine",  # have to pick something
        n_neighbors=30,   # higher favours a more global structure
        n_components=5,   # has a wild, hard-to-predict impact
        min_dist=0.0,     # lower value means more dense packing
        n_jobs=-1,        # speed
        # NOTE(review): umap-learn is believed to force n_jobs to 1 whenever
        # random_state is set - confirm against the installed version.
        random_state=42,  # reproducibility
        )
reduced_embeddings = umap_model.fit_transform(embeddings)

# Cluster the reduced embeddings into groups of semantically similar documents
hdbscan_model = HDBSCAN(
            cluster_selection_method='leaf',  # eom is normal - leaf might get more homogeneous clusters
            cluster_selection_epsilon=0.0,    # default - merges clusters below threshold
            min_cluster_size=100,             # smallest size group considered
            min_samples=50,                   # larger is more conservative - more noise
            leaf_size=40,                     # number of points per leaf node in the tree - default 40
            prediction_data=True,             # generates extra cached data of prediction labels for new data or reuse
            gen_min_span_tree=False,          # True creates minimum spanning trees - increasing RAM
            core_dist_n_jobs=-1,              # for speed
            )
hdbscan_model.fit(reduced_embeddings)
clusters = hdbscan_model.labels_

class Dimensionality:
    """Stand-in dimensionality reducer that serves pre-calculated reduced embeddings.

    It exposes the `fit` / `transform` pair of a dimensionality-reduction model
    but performs no computation: `fit` is a no-op and `transform` always returns
    the array given at construction time, regardless of its argument.

    Parameters:
    - reduced_embeddings: The already-reduced embeddings to hand back from `transform`.
    """
    def __init__(self, reduced_embeddings):
        self.reduced_embeddings = reduced_embeddings

    def fit(self, X, y=None):
        """No-op fit; returns self.

        `y` is accepted (and ignored) to follow the scikit-learn `fit(X, y=None)`
        convention, so a caller that passes labels does not hit a TypeError.
        """
        return self

    def transform(self, X):
        """Return the stored reduced embeddings; `X` is ignored."""
        return self.reduced_embeddings

# Topic representations: one main representation plus one extra "aspect"
main_representation = KeyBERTInspired()
aspect_model = [
    KeyBERTInspired(top_n_words=10),
    MaximalMarginalRelevance(diversity=.3),
]
representation_model = dict(Main=main_representation, Aspect1=aspect_model)

# Sub-models handed to BERTopic. The dimensionality step serves the
# pre-calculated reduced embeddings and the cluster step is a BaseCluster
# placeholder, so BERTopic does not redo either computation itself.
vectorizer_model = CountVectorizer(stop_words="english", vocabulary=vocab)
embedding_model = SentenceTransformer('thenlper/gte-small')
hdbscan_model = BaseCluster()
umap_model = Dimensionality(reduced_embeddings)

# Assemble BERTopic without actually performing any clustering
topic_model = BERTopic(
    verbose=True,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

# Fit BERTopic on the documents, passing the pre-computed embeddings and the
# HDBSCAN cluster labels as `y`
topics, _ = topic_model.fit_transform(docs, embeddings=embeddings, y=clusters)

# Re-read the CSV with all of its columns and attach each row's topic.
# pandas raises if `topics` does not have exactly one entry per row.
full_data = pd.read_csv(data_path, low_memory=False)
full_data['topics'] = topics

# Pickle the fitted model first, then the topic-annotated dataset
for save_path, payload in ((model_save_path, topic_model), (data_save_path, full_data)):
    with open(save_path, "wb") as out_file:
        pickle.dump(payload, out_file)

print("Data and model saved successfully.")

# Analysis

## Text output

In [None]:
from bertopic import BERTopic

# Extract the results
# NOTE: this rebinds `topics` to the dict returned by get_topics()
# (topic id -> list of (term, score) pairs).
topics = topic_model.get_topics()
topic_freq = topic_model.get_topic_freq()
topic_info = topic_model.get_topic_info()
representative_docs = topic_model.get_representative_docs()

# Hardcoded save locations for the text report and the CSV
txt_file_path = 'rpg/analysis/rpg_small_processed2_analysis.txt'
csv_file_path = 'rpg/analysis/rpg_small_processed2_analysis.csv'

# Remove the 'Representative_Docs' column from topic_info DataFrame
topic_info = topic_info.drop(columns=['Representative_Docs'])

# Save the results in a more structured and readable manner.
# UTF-8 is set explicitly so documents containing non-ASCII characters do not
# fail on platforms whose default encoding cannot represent them.
with open(txt_file_path, 'w', encoding='utf-8') as f:
    # Topics
    f.write("TOPICS:\n")
    for topic_num, terms in topics.items():
        terms_str = ', '.join([term[0] for term in terms])
        f.write(f"Topic {topic_num}: {terms_str}\n")
    f.write("\n")

    # Topic Frequency
    f.write("TOPIC FREQUENCY:\n")
    for index, row in topic_freq.iterrows():
        f.write(f"Topic {row['Topic']}: {row['Count']} entries\n")
    f.write("\n")

    # Topic Info
    f.write("TOPIC INFO:\n")
    for index, row in topic_info.iterrows():
        f.write(f"Topic {row['Topic']}\n")
        f.write(f" - Name: {row['Name']}\n")
        f.write(" - Representation:\n")
        for term in row['Representation']:
            f.write(f"   * {term}\n")
        f.write("\n")

    # Representative Docs
    # The loop variables are deliberately not named `docs` / `doc`: notebook
    # cells share one namespace, and rebinding `docs` here would replace the
    # full document list that later cells pass to the model.
    f.write("REPRESENTATIVE DOCS:\n")
    for topic_num, rep_docs in representative_docs.items():
        f.write(f"Topic {topic_num} representative docs:\n")
        for rep_doc in rep_docs:
            f.write(f" - {rep_doc}\n")
        f.write("\n")

# Convert 'topic_info' DataFrame directly to CSV
topic_info.to_csv(csv_file_path, index=False)

## Visualizations

In [None]:
# Display BERTopic's built-in topic visualization for the model fitted above
topic_model.visualize_topics()

In [None]:
# Document datamap drawn with the original embeddings (not the reduced ones).
# NOTE(review): `docs` must be the same full document list, in the same order,
# that `embeddings` was computed from - confirm no earlier cell has rebound it.
topic_model.visualize_document_datamap(docs, embeddings=embeddings)


## Searching for topics

In [None]:
import pandas as pd

def explore_relevant_topics_to_file(topic_model, search_terms, txt_filename, csv_filename, top_n=5):
    """
    Find and save topics related to a list of search terms to a .txt file and a .csv file,
    along with representative documents for the topics.

    The .txt report has one section per search term (matched topics with their
    similarity scores), followed by the terms and representative documents of
    every matched topic in ascending topic order. The .csv has one row per
    matched topic with the columns 'Topic', 'Representation' and 'Search Term',
    where 'Search Term' is the comma-separated list of distinct terms that
    matched the topic, in first-seen order.

    Parameters:
    - topic_model: The trained BERTopic model (only `find_topics`, `get_topic`
                   and `get_representative_docs` are called on it).
    - search_terms: A list of search terms/phrases related to the desired topics.
                    May be empty, in which case an empty report and a
                    header-only CSV are written.
    - txt_filename: Name of the .txt file to save the results.
    - csv_filename: Name of the .csv file to save the topic information.
    - top_n: Number of top similar topics to retrieve for each search term.

    Returns:
    - None (writes the relevant topics, their terms, and representative docs to a .txt file
             and topic information to a .csv file)
    """
    representations = {}  # topic id -> tuple of (word, score) pairs
    terms_by_topic = {}   # topic id -> insertion-ordered, de-duplicated search terms

    # UTF-8 is explicit so non-ASCII documents do not depend on the platform default.
    with open(txt_filename, 'w', encoding='utf-8') as file:
        # Display search terms and their related topics at the top
        for term in search_terms:
            file.write(f"Searching for topics related to: '{term}'\n\n")
            topics, similarity = topic_model.find_topics(term, top_n=top_n)
            for topic, score in zip(topics, similarity):
                # Normalise to a built-in int so later lookups behave the same
                # whether the model hands back Python or NumPy integers.
                topic = int(topic)
                file.write(f"Topic {topic} (Similarity: {score:.4f})\n")
                if topic not in representations:
                    representations[topic] = tuple(topic_model.get_topic(topic))
                terms_by_topic.setdefault(topic, {})[term] = None
            file.write("\n" + "-" * 50 + "\n")

        # Append topic details and representative documents at the end in numerical order
        for topic in sorted(representations):
            formatted_terms = ', '.join(
                [f"{word[0]} ({word[1]:.4f})" for word in representations[topic]]
            )
            file.write(f"\nTopic {topic} Details: {formatted_terms}\n\n")
            # A topic may have no stored representative documents; treat that as empty.
            reps = topic_model.get_representative_docs(topic) or []
            file.write(f"Representative Documents for Topic {topic}:\n")
            for doc in reps:
                file.write("\n" + "-" * 30 + "\n")
                file.write(f"{doc}\n")
                file.write("-" * 30 + "\n")
            file.write("-" * 50 + "\n")

    # One row per topic, search terms aggregated; explicit columns keep the
    # CSV header intact even when nothing matched.
    rows = [
        {
            'Topic': topic,
            'Representation': representations[topic],
            'Search Term': ', '.join(terms_by_topic[topic]),
        }
        for topic in sorted(representations)
    ]
    topic_info_df = pd.DataFrame(rows, columns=['Topic', 'Representation', 'Search Term'])

    # Save the topic information to a CSV file
    topic_info_df.to_csv(csv_filename, index=False)

# Updated list of search terms related to your research question.
# Every term is listed once: a repeated term triggers a second identical
# topic search and a duplicated section in the report.
search_terms = [
    "Boundaries", "Limits", "Personal space", "Assertiveness", "Saying no",
    "Interpersonal boundaries", "Relationship limits", "Healthy relationships", "Assertive communication",
    "Personal growth", "Self-improvement", "Interpersonal skills", "Relationship building",
    "Communication skills", "Active listening", "Expressing emotions", "Nonverbal communication",
    "Self-care", "Self-compassion", "Mental health", "Emotional well-being", "Self-love",
    "Social connection", "Belonging", "Interpersonal relationships", "Social support", "Emotional intimacy",
    "Roleplaying", "Immersion", "Character development", "Alternate persona", "Escapism",
    "Authenticity", "Self-expression", "Identity exploration", "True self",
    "Anxiety relief", "Depression relief", "Therapeutic gaming", "Relaxation",
    "Player growth", "Personal development", "Supportive environment", "Encouraging rules",
    "Player education", "Mentoring", "Skill development", "Collaborative learning",
    "Flexibility", "Adaptability", "Open-mindedness", "Embracing change",
    "Time management", "Session planning", "Consistency", "Commitment",
    "Responsibility", "Maturity", "Life skills", "Independence",
    "Emotional intelligence", "Emotional regulation", "Self-awareness", "Empathy",
    "Safety", "Security", "Trust", "Comfort",
    "Trauma recovery", "Emotional healing", "Therapeutic roleplaying", "Coping mechanisms",
    "Genuineness", "Honesty",
    "Recognition", "Acknowledgment", "Validation", "Acceptance",
    "Inclusivity", "Self-acceptance",
    "Affirmation", "Support", "Encouragement", "Understanding",
    "Resilience", "Perseverance", "Problem-solving", "Determination", "Growth mindset"
]

# Output locations for the per-topic table (.csv) and the text report (.txt),
# then run the search over every term in `search_terms`
csv_filename = "rpg/analysis/rpg_small_processed2_relevant_topics.csv"
txt_filename = "rpg/analysis/rpg_small_processed2_relevant_topics.txt"
explore_relevant_topics_to_file(
    topic_model=topic_model,
    search_terms=search_terms,
    txt_filename=txt_filename,
    csv_filename=csv_filename,
)

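**TODO:** The cell below currently raises an error because the documents are not loaded properly. The `docs` passed to `get_document_info` must be the full list of documents the model was fitted on, with the same length and order.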

In [None]:
import pandas as pd

# Assuming you've already loaded the BERTopic model and have the docs list
# Step 1: Get the document info
# `docs` has to be the full document list the model was fitted on. Check this
# up front so a stale or rebound `docs` gives a clear message instead of an
# opaque length error from inside BERTopic.
if len(docs) != len(topic_model.topics_):
    raise ValueError(
        f"`docs` has {len(docs)} entries but the model was fitted on "
        f"{len(topic_model.topics_)} documents; reload the original document list."
    )
document_info = topic_model.get_document_info(docs)

# List of topics you want to extract
topics_to_include = [82, 332]

# Step 2: Filter the DataFrame by the given set of topics
filtered_df = document_info[document_info['Topic'].isin(topics_to_include)]

# Step 3: Select only the relevant columns
# NOTE(review): 'Probability' is presumably absent when the model carries no
# per-document probabilities (e.g. clustering was supplied externally), so only
# columns that actually exist are selected - confirm against your model.
wanted_columns = ['Document', 'Topic', 'Probability', 'Representation']
selected_df = filtered_df[[col for col in wanted_columns if col in filtered_df.columns]]

# Step 4: Save the selected DataFrame to a .csv file with all documents.
# Written to its own file: both exports used to target "test.csv", so this one
# was always overwritten by Step 6.
selected_df.to_csv("test_all.csv", index=False)

# Step 5: Create a DataFrame with only 200 documents for each topic
# (all of them when a topic has fewer). A fixed seed makes the sample repeatable.
per_topic_samples = []
for topic in topics_to_include:
    topic_rows = filtered_df[filtered_df['Topic'] == topic]
    per_topic_samples.append(topic_rows.sample(min(len(topic_rows), 200), random_state=42))
limited_df = pd.concat(per_topic_samples)

# Step 6: Save the limited DataFrame to a .csv file with only 200 documents per topic
limited_df.to_csv("test.csv", index=False)

In [None]:
# Query the model for the 5 topics matching this free-text prompt and print
# each topic id with its similarity score and the topic's terms.
topics, similarity = topic_model.find_topics("Healing through my trauma.", top_n=5)
for topic, score in zip(topics, similarity):
    print(f"Topic {topic} (Similarity: {score:.4f}): {topic_model.get_topic(topic)}")


In [None]:
# Query the model for the 5 topics matching this free-text prompt and print
# each topic id with its similarity score and the topic's terms.
topics, similarity = topic_model.find_topics("Finding out my real identity, sense of self, and who I am.", top_n=5)
for topic, score in zip(topics, similarity):
    print(f"Topic {topic} (Similarity: {score:.4f}): {topic_model.get_topic(topic)}")


In [None]:
# Query the model for the 5 topics matching this free-text prompt and print
# each topic id with its similarity score and the topic's terms.
topics, similarity = topic_model.find_topics("Learning how to set boundaries and love myself.", top_n=5)
for topic, score in zip(topics, similarity):
    print(f"Topic {topic} (Similarity: {score:.4f}): {topic_model.get_topic(topic)}")

In [None]:
# Query the model for the 5 topics matching this free-text prompt and print
# each topic id with its similarity score and the topic's terms.
topics, similarity = topic_model.find_topics("No DnD is better than bad DnD", top_n=5)
for topic, score in zip(topics, similarity):
    print(f"Topic {topic} (Similarity: {score:.4f}): {topic_model.get_topic(topic)}")

In [None]:
# Query the model for the 5 topics matching this free-text prompt and print
# each topic id with its similarity score and the topic's terms.
topics, similarity = topic_model.find_topics("How can I learn to talk with problematic players?", top_n=5)
for topic, score in zip(topics, similarity):
    print(f"Topic {topic} (Similarity: {score:.4f}): {topic_model.get_topic(topic)}")

In [None]:
# Query the model for the 5 topics matching this free-text prompt and print
# each topic id with its similarity score and the topic's terms.
topics, similarity = topic_model.find_topics("Red flags and problem players", top_n=5)
for topic, score in zip(topics, similarity):
    print(f"Topic {topic} (Similarity: {score:.4f}): {topic_model.get_topic(topic)}")