In [1]:

# Add the parent directory to sys.path
import sys
sys.path.insert(0, '..')

# Load Dataset

In [4]:
!pwd
!ls

/code
compose		     notebooks			requirements-dev.in   script.py
data		     output.py			requirements-dev.txt
docker-compose.yaml  pyflow.log			requirements.in
makefile	     recommender_training_data	requirements.txt


In [5]:
import pandas as pd

# Path of the question dataset, resolved against the current working directory.
csv_file = "data/questions.csv"

# Read the dataset, then pull the "question" column out as a plain Python list;
# that list is the document collection used by the embedding and topic steps.
df = pd.read_csv(csv_file)
questions = df.loc[:, "question"].to_list()

In [159]:
# # Reset an entire column
# df["author"] = ['leo audibert'] * len(df)

# # Save the updated DataFrame back to the CSV file
# df.to_csv(csv_file, index=False)

# Prepare Embeddings

In [6]:
from sentence_transformers import SentenceTransformer

# Load the pretrained "all-mpnet-base-v2" sentence-embedding model; the recorded
# output of this cell shows the model files being downloaded.
sentence_model = SentenceTransformer("all-mpnet-base-v2")
# Encode every question once. `embeddings` is reused by the CSV export, the
# pickle cache, BERTopic fitting and the document visualisation cells.
embeddings = sentence_model.encode(questions, show_progress_bar=False)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

### Save embeddings to CSV

In [7]:
# Attach each embedding vector (as a list) to its question row. Passing
# `index=df.index` makes the Series line up with `df` row-for-row and raises if
# the number of embeddings differs from the number of rows, instead of silently
# inserting NaN through index alignment.
df['embedding_all_mpnet_base_v2'] = pd.Series(embeddings.tolist(), index=df.index)

# Persist the augmented frame back to the source CSV (this overwrites csv_file).
df.to_csv(csv_file, index=False)

# Check the updated DataFrame; as the last expression it is rendered as a table.
df.head()

                                            question   
0  What is something that you're afraid of that y...  \
1  Do you think AI will eventually surpass human ...   
2  How can we ensure that social justice movement...   
3            What's your favorite memory with a pet?   
4  Is there anything you wish you had said or don...   

                                                tags   
0                 ['Fear Topic', 'Overcoming Topic']  \
1  ['AI Challenges', 'AI Bias', 'AI Manufacturing...   
2            ['Social Justice', 'Intersectionality']   
3  ['Pet Memories', 'Funniest Joke', 'Crazy Fun',...   
4                              ['Wishes', 'Regrets']   

                                    extracted_topics   
0                    ['fear', 'fears', 'fears what']  \
1                       ['ai', 'think ai', 'ai can']   
2  ['social justice', 'about social', 'justice mo...   
3       ['pet', 'funniest thing', 'favorite animal']   
4               ['wish you', 'said or', 'tell 

### Save embeddings to file for later use in model

In [160]:
import os
import pickle

PICKLED_DIR = "pickled_data"

# The target directory is not guaranteed to exist (it does not appear in the
# `ls` listing recorded near the top of this notebook). Create it first so that
# open() cannot fail with FileNotFoundError on a fresh checkout.
os.makedirs(PICKLED_DIR, exist_ok=True)

# Save the embeddings to a file so they can be reloaded without re-encoding
with open(f"{PICKLED_DIR}/embeddings.pkl", "wb") as file:
    pickle.dump(embeddings, file)

### Load embeddings

In [85]:
# Load the embeddings from the file
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickles that this notebook wrote itself.
# NOTE(review): no visible cell reads `loaded_embeddings`; the cells below keep
# using `embeddings` — confirm which one is meant to feed the model.
with open(f"{PICKLED_DIR}/embeddings.pkl", "rb") as file:
    loaded_embeddings = pickle.load(file)

# Create Model

In [86]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import KeyBERTInspired

# c-TF-IDF transformer with BERTopic's `reduce_frequent_words` option enabled.
# (This object used to be constructed twice with identical arguments; the
# second assignment was redundant and has been removed.)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Representation model used to pick the keywords shown for each topic.
# Swap in the commented alternative to try the KeyBERT-inspired representation.
representation_model = MaximalMarginalRelevance(diversity=0.2)
# representation_model = KeyBERTInspired()

topic_model = BERTopic(
    n_gram_range=(1, 2),
    embedding_model=sentence_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

# Fit on the questions, passing the precomputed embeddings alongside them.
# `topics` holds one topic id per question (-1 marks outliers, per the
# get_topic_info output recorded below).
topics, probs = topic_model.fit_transform(questions, embeddings)



### Save model to file

In [161]:
# Store the fitted topic model in the same directory as the pickled embeddings.
MODEL_PATH = f"{PICKLED_DIR}/my_model"

topic_model.save(MODEL_PATH)


### Load model from file

In [233]:
# Replace the in-memory `topic_model` with the one restored from MODEL_PATH.
topic_model = BERTopic.load(MODEL_PATH)

# Explore

In [89]:
topic_model.get_topic(0)[:10]

[('conflicts in', 0.3039767775919272),
 ('resolve', 0.296112632649439),
 ('handle conflicts', 0.28983048543333045),
 ('resolve conflicts', 0.2418148109839119),
 ('conflicts or', 0.2418148109839119),
 ('this conflict', 0.2418148109839119),
 ('conflict resolution', 0.225654491087144),
 ('handle conflict', 0.2162857421305159),
 ('conflict in', 0.2162857421305159),
 ('approach conflicts', 0.20589528084845654)]

In [90]:
topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name
0,-1,951,-1_learned about_date_partner_about yourself
1,0,101,0_conflicts in_resolve_handle conflicts_resolv...
2,1,91,1_goal_to achieve_achieve_achieve in
3,2,88,2_tradition_traditions_traditions or_festivals
4,3,82,3_social media_media_think technology_social


In [91]:
topic_model.get_topic(0)  # Select the most frequent topic

[('conflicts in', 0.3039767775919272),
 ('resolve', 0.296112632649439),
 ('handle conflicts', 0.28983048543333045),
 ('resolve conflicts', 0.2418148109839119),
 ('conflicts or', 0.2418148109839119),
 ('this conflict', 0.2418148109839119),
 ('conflict resolution', 0.225654491087144),
 ('handle conflict', 0.2162857421305159),
 ('conflict in', 0.2162857421305159),
 ('approach conflicts', 0.20589528084845654)]

## Intertopic Distance Map

In [92]:
topic_model.visualize_topics()

In [93]:
from umap import UMAP

# Project the embeddings to 2-D once and hand the result to the plot. This is
# optional, but much faster when the figure is re-drawn iteratively.
#
# A preceding `topic_model.visualize_documents(questions, embeddings=embeddings)`
# call was removed: it was not the last expression of the cell and its return
# value was not assigned, so the figure it built was never displayed.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(questions, reduced_embeddings=reduced_embeddings)

## Topic Hierarchies

In [94]:
topic_model.visualize_hierarchy(top_n_topics=50)

In [95]:
topic_model.visualize_barchart(top_n_topics=15)

In [96]:
topic_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

In [97]:
topic_model.visualize_term_rank()

# Topic Reduction


In [235]:
# NOTE(review): the recorded run of this cell failed with
# "ValueError: All arrays must be of the same length". Presumably `questions`
# did not have the same length as the topic assignments held by the model when
# it ran (the execution counts show cells were run out of order) — confirm that
# `questions` is still the full document list before re-running.
topic_model.reduce_topics(questions, nr_topics=60)

ValueError: All arrays must be of the same length

In [27]:
topic_model.visualize_topics()

# Create tags and save to file
### Add extracted topics to the rows that have topic != -1

In [152]:
"""Add extracted topics to the rows that have topic != -1"""
# # Get the topic for each question
# for i in range(len(questions)):
#     topic = topic_model.get_topic(topics[i])
#     question = questions[i]
#     topic_integer_name = topics[i]

#     if topic_integer_name != -1:
#         row_index = df.loc[df["question"] == question].index[0]
#         df.at[row_index, "extracted_topics"] = [x[0] for x in topic[:3]]
#         # print(f"Question: {question}\nTopic: {[x[0] for x in topic[:3]]}, topic_integer_name: {topic_integer_name}\n")

# df.to_csv(csv_file, index=False)


### Create tag for identified topics

In [130]:
import os
from getpass import getpass

# SECURITY: API keys must never be committed to the notebook. The key is read
# from the OPENAI_API_KEY environment variable and is only prompted for
# (without echo) when that variable is unset or empty.
# The literal key that used to be hard-coded here has been exposed and should
# be revoked.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

from marvin import ai_fn

# Prompt-only function: there is no Python body.
# NOTE(review): marvin's `ai_fn` presumably builds the LLM request from the
# signature and the docstring, so the docstring text is runtime behaviour rather
# than documentation — edit it with care. The return annotation is a list even
# though the prompt asks for a single label.
@ai_fn
def create_tags_for_topics(DOCUMENTS: str, KEYWORDS: str) -> list[str]:
    """
    I have topic that contains the following documents: \n[DOCUMENTS]
    The topic is described by the following keywords: [KEYWORDS]

    Extract 1 short topic label with at most 2 words in the label, preferrably 1 word in the label
    """

In [149]:
"""Add tag for identified topics to the rows that have topic != -1. 
The tags are created by ChatGPT for each topic given a set of sentences 
sharing the topic, and the set of extracted topic words."""

# Each topic costs one LLM call, so tag generation is opt-in. Leave this False
# to skip the calls (the dict then stays empty); set it to True to (re)generate
# the tags. This replaces commenting the call in and out by hand, which left the
# loop computing keywords and documents that nothing consumed.
REGENERATE_TOPIC_TAGS = False

topic_info = topic_model.get_topic_info()

topic_to_tags_dict = dict()

if REGENERATE_TOPIC_TAGS:
    for topic in topic_info['Topic']:
        if topic == -1:  # -1 represents an outlier topic
            continue

        # Top 10 keywords for this topic. get_topic returns (keyword, score)
        # pairs; they are flattened to text because KEYWORDS is declared as str.
        keywords = ', '.join(word for word, _ in topic_model.get_topic(topic)[:10])

        # Documents assigned to this topic
        docs = [doc for doc, top in zip(questions, topics) if top == topic]

        # Only the first five documents are sent to keep the prompt short.
        topic_to_tags_dict[topic] = create_tags_for_topics('; '.join(docs[:5]), keywords)

print(topic_to_tags_dict)

{0: ['Conflict resolution', 'Handling conflicts'], 1: ['Personal Goals'], 2: ['Traditions', 'Festivals'], 3: ['Social media', 'Technology impact'], 4: ['Gifts received', 'Fashion era'], 5: ['Work-Life', 'Prioritization'], 6: ['Cooking Favorites'], 7: ['Philanthropy experience', 'Volunteer work'], 8: ['Relationship Tips', 'Romance Boosters'], 9: ['Personal Values', 'Core Beliefs'], 10: ['Sustainable Care', 'Environmental Awareness', 'Pet Sustainability', 'Future Sustainability', 'Community Sustainability'], 11: ['Identity and consciousness'], 12: ['Relationship Boundaries'], 13: ['Stress Relief', 'Coping Mechanisms'], 14: ['Trust Building', 'Relationship Trust'], 15: ['Activities', 'Fun'], 16: ['Fear Topic', 'Overcoming Topic'], 17: ['Gratitude Practices', 'Expressing Thanks'], 18: ['Relationship Insights'], 19: ['Masculinity/Femininity', 'Gender', 'Toxic Masculinity', 'Gender/Sexuality Stereotypes', 'Personal Definition of Masculinity'], 20: ['Personal Challenges'], 21: ['Emotional Adv

### Add tags for all sentences that are in category -1 using ChatGPT

In [137]:
"""Add tags for all sentences that are in category -1 using ChatGPT."""

# Accumulates every tag produced so far; starts empty each time this cell runs.
existing_tags = set()

# Prompt-only function: there is no Python body.
# NOTE(review): marvin's `ai_fn` presumably sends the docstring to the LLM, so it
# is runtime text and is left untouched. The prompt mentions `existing_tags`
# while the parameter is named EXISTING_TAGS and annotated `str`; the
# commented-out caller in the next cell passes the `existing_tags` set — confirm
# the model receives the tags in a usable form.
@ai_fn
def create_tags_for_question(QUESTION: str, EXISTING_TAGS: str) -> list[str]:
    """
    Extract a short topic label at most 2 words long for QUESTION. 
    Pick the topic label from existing_tags if it already contains a suitable topic label for QUESTION.
    """

In [158]:
# # Get the topic for each question
# for i in range(len(questions)):
#     topic = topic_model.get_topic(topics[i])
#     question = questions[i]
#     topic_integer_name = topics[i]

#     row_index = df.loc[df["question"] == question].index[0]
#     if topic_integer_name != -1:
#         df.at[row_index, "tags"] = topic_to_tags_dict[topic_integer_name]
#         # print(f"Question: {question}\nTopic: {topic_to_tags_dict[topic_integer_name]}, topic_integer_name: {topic_integer_name}\n")
#     else:
#         tags = create_tags_for_question(question, existing_tags)
#         df.at[row_index, "tags"] = tags
#         existing_tags.update(tags)
#         df.to_csv(csv_file, index=False)

# df.to_csv(csv_file, index=False)
print(existing_tags)


{'Community Challenges', 'Ancestral Influences', 'Learning Environment', 'Cultural Trends', 'Personal Discoveries', 'Recent Trivia', 'Interior Design', 'Resources', 'Sports Equipment', 'Photography Tips', 'Fitness', 'Self-Care Support', 'Relationship Goals', 'Climate Change', 'Retirement Planning', 'Card Games', 'Personal Qualities', 'News Updates', 'Archaeology Discoveries', 'Art', 'Sustainability', 'Personal Challenges', 'Cultural Diversity', 'Home Comfort', 'Parenting Challenges', 'Hobbies and Interests', 'Spirituality', 'Recent Events', 'Self-Esteem Boosting', 'Personal Progress', 'Leadership Lessons', 'Indoor Games', 'Sexuality and Vulnerability', 'Business Growth', 'Family Challenges', 'Personal Goals', 'Social Life', 'Religious Beliefs', 'Aging Support', 'Work Environment', 'Marginalized Communities', 'Communication in Sex', 'Emotional Analysis', 'Cultural Exchange', 'Science Misconceptions', 'Outdoor Experience', 'Business Promotion', 'Social Skills', 'Work Culture', 'Communica

### Add applicable interpersonal categories to each question

In [211]:
# Define the interpersonal categories a question can be tagged with, plus the typed return model for the LLM call

INTERPERSONAL_CATEGORIES = ["self", "friendship", "romantic", "family", "professional", "acquaintance"]

import enum
from pydantic import BaseModel

# Closed set of interpersonal categories. Built with the functional Enum API;
# `type=str` mixes in `str`, so every member is also a string equal to its
# lowercase value.
CategoryEnum = enum.Enum(
    "CategoryEnum",
    [
        ("SELF", "self"),
        ("FRIENDSHIP", "friendship"),
        ("ROMANTIC", "romantic"),
        ("FAMILY", "family"),
        ("PROFESSIONAL", "professional"),
        ("ACQUAINTANCE", "acquaintance"),
    ],
    type=str,
)

class InterpersonalCategoriesForQuestion(BaseModel):
    # Categories judged applicable to a single question.
    categories: list[CategoryEnum]

    def getListOfCategories(self) -> list[str]:
        """Return the selected categories as plain strings, e.g. ``["self", "family"]``.

        The method previously ended in a bare ``return`` and therefore always
        produced ``None``. It now yields each member's ``.value``, in the order
        stored in ``categories``.
        """
        return [category.value for category in self.categories]

# Prompt-only function: there is no Python body.
# NOTE(review): marvin's `ai_fn` presumably sends the docstring to the LLM and
# parses the reply into the annotated return type, so the docstring is runtime
# text and is left untouched.
@ai_fn
def extract_interpersonal_categories_for_question(QUESTION: str, INTERPERSONAL_CATEGORIES: str) -> InterpersonalCategoriesForQuestion:
    """
    Returns the CATEGORIES that a question would be appropriate to ask.
    Determines suitable INTERPERSONAL_CATEGORIES for a given QUESTION and returns a list of these INTERPERSONAL_CATEGORIES.
    """

# Smoke test: build the return model by hand (no LLM call involved) and print it
# to check that enum members are accepted by the `categories` field.
categories_list = [CategoryEnum.ACQUAINTANCE, CategoryEnum.PROFESSIONAL]
interpersonal_categories = InterpersonalCategoriesForQuestion(categories=categories_list)
print(interpersonal_categories)

categories=[<CategoryEnum.ACQUAINTANCE: 'acquaintance'>, <CategoryEnum.PROFESSIONAL: 'professional'>]


In [214]:
# Use the set below to check whether the LLM produced any unsupported categories.
added_interpersonal_categories = set()

# Get the interpersonal categories for each question
# for i in range(len(questions)):
#     question = questions[i]
#     interpersonal_categories_objects = extract_interpersonal_categories_for_question(question, INTERPERSONAL_CATEGORIES)

#     interpersonal_categories = [x.value for x in interpersonal_categories_objects.categories]
#     row_index = df.loc[df["question"] == question].index[0]
#     df.at[row_index, "interpersonal_categories"] = interpersonal_categories
#     df.to_csv(csv_file, index=False)
#     added_interpersonal_categories.update(interpersonal_categories)

# df.to_csv(csv_file, index=False)
print(added_interpersonal_categories)

set()


# Top questions for top topics

In [224]:
# Assign a topic to every question and pair each question with its topic id.
topics, _ = topic_model.transform(questions)
documents_topics_df = pd.DataFrame({"Document": questions, "Topic": topics})

# Keep only the first row seen for each topic (dataset order) and map
# topic id -> that first question.
first_row_per_topic = documents_topics_df.drop_duplicates(subset="Topic", keep="first")
representative_questions = dict(
    zip(first_row_per_topic["Topic"].values, first_row_per_topic["Document"].values)
)

print(representative_questions)


{16: "What is something that you're afraid of that you don't have to be?", 60: 'Do you think AI will eventually surpass human intelligence? Why or why not?', 103: 'How can we ensure that social justice movements are truly inclusive?', 32: "What's your favorite memory with a pet?", 125: 'Is there anything you wish you had said or done before they passed away?', 38: "What's something about me that you've come to appreciate?", -1: 'What can I do to help you stay motivated on your recovery journey?', 66: 'Do you prefer to sleep with or without a pillow?', 4: "What's the best sports-related gift you've ever received?", 7: 'Have you ever volunteered at an animal shelter? If yes, what was your experience?', 46: 'What is the most beautiful natural wonder you have ever seen?', 30: 'What are some ways you involve your family in financial decisions?', 5: 'Do you find it hard to disconnect from work at the end of the day?', 115: "What are some things you've done to take care of yourself and your p

In [236]:
# Collect example questions per topic, in dataset order.
# NOTE(review): topics with fewer than 15 questions keep *all* of their
# questions, while larger topics are cut down to 3 — confirm that this
# asymmetry is intended.
MIN_TOPIC_SIZE_FOR_TRUNCATION = 15
EXAMPLES_PER_LARGE_TOPIC = 3

representative_questions = {}

for topic in documents_topics_df["Topic"].unique():
    topic_questions = documents_topics_df.loc[documents_topics_df["Topic"] == topic, "Document"].values
    if len(topic_questions) >= MIN_TOPIC_SIZE_FOR_TRUNCATION:
        representative_questions[topic] = topic_questions[:EXAMPLES_PER_LARGE_TOPIC]
    else:
        representative_questions[topic] = topic_questions

# The loop variables are deliberately not called `questions` / `question`:
# at cell level they are globals, and reusing those names replaced the
# notebook-wide `questions` document list with the last topic's examples,
# breaking every later call that passes `questions` to the topic model.
for topic, topic_examples in representative_questions.items():
    print(f"Topic {topic} representative questions:")
    for example in topic_examples:
        print(example)


Topic 16 representative questions:
What is something that you're afraid of that you don't have to be?
What is the most inspiring story of overcoming fear that you've heard?
What would you do if fear wasn't holding you back?
Topic 60 representative questions:
Do you think AI will eventually surpass human intelligence? Why or why not?
What are some ways in which AI can be used to improve manufacturing processes?
How can we address the issue of bias in AI algorithms?
Topic 103 representative questions:
How can we ensure that social justice movements are truly inclusive?
How has your background shaped your perspective on social justice?
How do you address the intersection of social justice and mental health?
Topic 32 representative questions:
What's your favorite memory with a pet?
If you could only tell one joke for the rest of your life, what would it be?
What's the craziest thing you've ever done for fun?
Topic 125 representative questions:
Is there anything you wish you had said or don

In [237]:
import pandas as pd

# Flatten {topic: [questions, ...]} into one [topic, question] row per example.
# Neither `questions` nor `df` is rebound here: `questions` is the notebook-wide
# document list and `df` is the dataset frame that other cells write back to
# `csv_file`, so overwriting either one would corrupt later runs of those cells.
rows = [
    [topic, example]
    for topic, topic_examples in representative_questions.items()
    for example in topic_examples
]

# Create DataFrame
questions_by_topic_df = pd.DataFrame(rows, columns=["Topic", "Question"])

# Write to CSV
questions_by_topic_df.to_csv('questions_by_topic.csv', index=False)
