# Two Steps

In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import numpy as np


# Compute sentence embeddings for every document once and cache them on disk,
# so the topic-model cells below can reload them instead of re-encoding.
import os

# Input corpus and the file the embeddings are cached in
data_path = 'attachment/attach_processed_length10.csv'
embeddings_path = 'attachment/models/doc/attach_doc_embeddings.npy'

# Create the output directory up front: np.save does not create missing
# directories, and failing after the long encode step would discard the result.
os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)

# One document per row of the 'text' column
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)
docs = df['text'].tolist()

# Encode the whole corpus (the recorded run below took roughly 34 minutes)
embedding_model = SentenceTransformer('thenlper/gte-large')
embeddings = embedding_model.encode(docs, show_progress_bar=True)

np.save(embeddings_path, embeddings)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 11895/11895 [34:27<00:00,  5.75it/s] 


In [20]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sentence_transformers import SentenceTransformer
import collections
from tqdm import tqdm
from scipy.cluster import hierarchy as sch

# Fit and save the "doc3" BERTopic model on the cached document embeddings.
import numpy as np  # np.load is used below; this cell's import list does not include numpy

# Define file paths
data_path = 'attachment/attach_processed_length10.csv'
embeddings_path = 'attachment/models/doc/attach_doc_embeddings.npy'
model_save_path = 'attachment/models/doc/attach_doc3_model_dir/'

# Hugging Face name of the model the cached embeddings were produced with
embedding_model_name = 'thenlper/gte-large'

# f-string so the path is interpolated (it was previously printed as the literal "{data_path}")
print(f"Step 1: Loading the data {data_path}")
# Load the data
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)

print("Step 2: Preparing the documents...")
# Specify what the 'docs' are
docs = df['text'].tolist()

# Load the precomputed embeddings (one row per document, same order as `docs`)
embeddings = np.load(embeddings_path)

###### Extract vocab to be used in BERTopic
# NOTE(review): `vocab` is built here but never handed to BERTopic in this cell
# (no vectorizer_model is passed), so it does not influence the topics.
# Also, build_tokenizer() only splits text into single tokens; ngram_range is
# applied by build_analyzer(), so these counts are unigram-only.
vocab = collections.Counter()
tokenizer = CountVectorizer(ngram_range=(1, 4)).build_tokenizer()
for doc in tqdm(docs):
    vocab.update(tokenizer(doc))
# Keep only tokens seen at least 30 times across the corpus
vocab = [word for word, frequency in vocab.items() if frequency >= 30]
# The former trailing `; len(vocab)` displayed nothing mid-cell, so print the size explicitly
print(f"Vocabulary size: {len(vocab)}")


umap_model = UMAP(
        n_components=3,  # has a wild impact, hard to predict
        n_neighbors=20,  # higher captures more global structure
        min_dist=0.01,   # lower value means denser packing
        random_state=42, # reproducibility
        metric="cosine", # have to pick something
        n_jobs=-1        # speed
        )

hdbscan_model = HDBSCAN(
            min_cluster_size=100,           # smallest size group considered
            min_samples=20,                 # larger is more conservative - more noise
            leaf_size=40,                   # number of points per leaf node in the tree - default 40
            gen_min_span_tree=True,         # True creates minimum spanning trees - increasing RAM
            prediction_data=True,           # generates extra cached data of prediction labels for new data or reuse
            cluster_selection_method='eom', # eom is normal - leaf might get more homogeneous clusters
            cluster_selection_epsilon=0.0,  # default - merges clusters below threshold
            core_dist_n_jobs=-1,            # for speed
            )

# `embedding_model` is the SentenceTransformer instance created in the embedding cell
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True)
topics, probs = topic_model.fit_transform(docs, embeddings)

# BERTopic documents save_embedding_model as a bool or a Hugging Face model-name
# string for safetensors serialization, so pass the name rather than the model object.
topic_model.save(model_save_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model_name)



Step 1: Loading the data {data_path}
Step 2: Preparing the documents...


100%|██████████| 380624/380624 [00:09<00:00, 40563.78it/s]
2024-05-05 22:10:29,620 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 22:17:26,630 - BERTopic - Dimensionality - Completed ✓
2024-05-05 22:17:26,635 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 22:17:39,393 - BERTopic - Cluster - Completed ✓
2024-05-05 22:17:39,432 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-05 22:18:03,771 - BERTopic - Representation - Completed ✓


In [21]:
# Export the per-topic summary of the fitted "doc3" model to CSV.
import os

topic_info_path = 'attachment/analysis/doc/attach_doc3_topic_info.csv'

# Retrieve the topic information as a DataFrame and assign it to topic_info_df
topic_info_df = topic_model.get_topic_info()
print(topic_info_df)

# Make sure the target directory exists; to_csv does not create it
os.makedirs(os.path.dirname(topic_info_path), exist_ok=True)

# Save topic_info_df as a CSV file and report the path that was actually written
topic_info_df.to_csv(topic_info_path, index=False)
print(f"DataFrame saved as '{topic_info_path}'")



     Topic   Count                                  Name  \
0       -1  210760                      -1_you_to_and_it   
1        0   46044                         0_da_fa_ap_he   
2        1   15655                      1_she_her_you_to   
3        2   10252                      2_he_him_his_you   
4        3    5939            3_text_texting_phone_texts   
..     ...     ...                                   ...   
209    208     104     208_change_changing_habits_people   
210    209     102  209_thread_psychoanalyze_weekly_user   
211    210     101  210_resentment_hurt_forgiveness_them   
212    211     100        211_dating_single_myself_break   
213    212     100           212_needs_need_what_needing   

                                        Representation  \
0        [you, to, and, it, the, that, of, he, in, is]   
1          [da, fa, ap, he, and, to, the, was, me, my]   
2     [she, her, you, to, was, that, and, it, me, for]   
3    [he, him, his, you, it, to, your, and, tha

# Run a bunch of models

In [22]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sentence_transformers import SentenceTransformer
import collections
from tqdm import tqdm
from scipy.cluster import hierarchy as sch

# Fit and save the "doc4" BERTopic model on the cached document embeddings,
# then export its per-topic summary to CSV.
import os

import numpy as np  # np.load is used below; this cell's import list does not include numpy

# Define file paths
data_path = 'attachment/attach_processed_length10.csv'
embeddings_path = 'attachment/models/doc/attach_doc_embeddings.npy'
model_save_path = 'attachment/models/doc/attach_doc4_model_dir/'
topic_info_path = 'attachment/analysis/doc/attach_doc4_topic_info.csv'

# Hugging Face name of the model the cached embeddings were produced with
embedding_model_name = 'thenlper/gte-large'

# f-string so the path is interpolated (it was previously printed as the literal "{data_path}")
print(f"Step 1: Loading the data {data_path}")
# Load the data
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)

print("Step 2: Preparing the documents...")
# Specify what the 'docs' are
docs = df['text'].tolist()

# Load the precomputed embeddings (one row per document, same order as `docs`)
embeddings = np.load(embeddings_path)

###### Extract vocab to be used in BERTopic
# NOTE(review): `vocab` is built here but never handed to BERTopic in this cell
# (no vectorizer_model is passed), so it does not influence the topics.
# Also, build_tokenizer() only splits text into single tokens; ngram_range is
# applied by build_analyzer(), so these counts are unigram-only.
vocab = collections.Counter()
tokenizer = CountVectorizer(ngram_range=(1, 3)).build_tokenizer()
for doc in tqdm(docs):
    vocab.update(tokenizer(doc))
# Keep only tokens seen at least 30 times across the corpus
vocab = [word for word, frequency in vocab.items() if frequency >= 30]
# The former trailing `; len(vocab)` displayed nothing mid-cell, so print the size explicitly
print(f"Vocabulary size: {len(vocab)}")


umap_model = UMAP(
        n_components=3,  # has a wild impact, hard to predict
        n_neighbors=30,  # higher captures more global structure
        min_dist=0.01,   # lower value means denser packing
        random_state=42, # reproducibility
        metric="cosine", # have to pick something
        n_jobs=-1        # speed
        )

hdbscan_model = HDBSCAN(
            min_cluster_size=50,            # smallest size group considered
            min_samples=30,                 # larger is more conservative - more noise
            leaf_size=40,                   # number of points per leaf node in the tree - default 40
            gen_min_span_tree=True,         # True creates minimum spanning trees - increasing RAM
            prediction_data=True,           # generates extra cached data of prediction labels for new data or reuse
            cluster_selection_method='eom', # eom is normal - leaf might get more homogeneous clusters
            cluster_selection_epsilon=0.0,  # default - merges clusters below threshold
            core_dist_n_jobs=-1,            # for speed
            )

# `embedding_model` is the SentenceTransformer instance created in the embedding cell
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True)
topics, probs = topic_model.fit_transform(docs, embeddings)

# BERTopic documents save_embedding_model as a bool or a Hugging Face model-name
# string for safetensors serialization, so pass the name rather than the model object.
topic_model.save(model_save_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model_name)

# Retrieve the topic information as a DataFrame and assign it to topic_info_df
topic_info_df = topic_model.get_topic_info()
print(topic_info_df)

# Make sure the target directory exists; to_csv does not create it
os.makedirs(os.path.dirname(topic_info_path), exist_ok=True)

# Save topic_info_df as a CSV file and report the path that was actually written
topic_info_df.to_csv(topic_info_path, index=False)
print(f"DataFrame saved as '{topic_info_path}'")


Step 1: Loading the data {data_path}
Step 2: Preparing the documents...


100%|██████████| 380624/380624 [00:08<00:00, 42399.63it/s]
2024-05-05 22:18:22,599 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 22:26:31,065 - BERTopic - Dimensionality - Completed ✓
2024-05-05 22:26:31,069 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 22:26:44,308 - BERTopic - Cluster - Completed ✓
2024-05-05 22:26:44,345 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-05 22:27:08,065 - BERTopic - Representation - Completed ✓


     Topic   Count                                               Name  \
0       -1  204313                                    -1_you_to_he_it   
1        0   63760                                     0_da_fa_ap_and   
2        1   14520                                   1_she_her_you_to   
3        2    6215             2_thoughts_journaling_anxiety_emotions   
4        3    5461                  3_avoidant_avoidants_anxious_they   
..     ...     ...                                                ...   
260    259      51                     259_stage_stages_mindset_thank   
261    260      51  260_incompatibility_compatibility_incompatibil...   
262    261      51             261_memories_childhood_remember_memory   
263    262      50                  262_sabotage_self_completing_goal   
264    263      50                  263_intimacy_intimate_ea_physical   

                                        Representation  \
0       [you, to, he, it, and, that, the, of, in, for]   
1      

In [23]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sentence_transformers import SentenceTransformer
import collections
from tqdm import tqdm
from scipy.cluster import hierarchy as sch

# Fit and save the "doc5" BERTopic model on the cached document embeddings,
# then export its per-topic summary to CSV.
import os

import numpy as np  # np.load is used below; this cell's import list does not include numpy

# Define file paths
data_path = 'attachment/attach_processed_length10.csv'
embeddings_path = 'attachment/models/doc/attach_doc_embeddings.npy'
model_save_path = 'attachment/models/doc/attach_doc5_model_dir/'
topic_info_path = 'attachment/analysis/doc/attach_doc5_topic_info.csv'

# Hugging Face name of the model the cached embeddings were produced with
embedding_model_name = 'thenlper/gte-large'

# f-string so the path is interpolated (it was previously printed as the literal "{data_path}")
print(f"Step 1: Loading the data {data_path}")
# Load the data
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)

print("Step 2: Preparing the documents...")
# Specify what the 'docs' are
docs = df['text'].tolist()

# Load the precomputed embeddings (one row per document, same order as `docs`)
embeddings = np.load(embeddings_path)

###### Extract vocab to be used in BERTopic
# NOTE(review): `vocab` is built here but never handed to BERTopic in this cell
# (no vectorizer_model is passed), so it does not influence the topics.
# Also, build_tokenizer() only splits text into single tokens; ngram_range is
# applied by build_analyzer(), so these counts are unigram-only.
vocab = collections.Counter()
tokenizer = CountVectorizer(ngram_range=(1, 3)).build_tokenizer()
for doc in tqdm(docs):
    vocab.update(tokenizer(doc))
# Keep only tokens seen at least 30 times across the corpus
vocab = [word for word, frequency in vocab.items() if frequency >= 30]
# The former trailing `; len(vocab)` displayed nothing mid-cell, so print the size explicitly
print(f"Vocabulary size: {len(vocab)}")


umap_model = UMAP(
        n_components=3,  # has a wild impact, hard to predict
        n_neighbors=40,  # higher captures more global structure
        min_dist=0.01,   # lower value means denser packing
        random_state=42, # reproducibility
        metric="cosine", # have to pick something
        n_jobs=-1        # speed
        )

hdbscan_model = HDBSCAN(
            min_cluster_size=50,            # smallest size group considered
            min_samples=30,                 # larger is more conservative - more noise
            leaf_size=25,                   # number of points per leaf node in the tree - default 40
            gen_min_span_tree=True,         # True creates minimum spanning trees - increasing RAM
            prediction_data=True,           # generates extra cached data of prediction labels for new data or reuse
            cluster_selection_method='eom', # eom is normal - leaf might get more homogeneous clusters
            cluster_selection_epsilon=0.0,  # default - merges clusters below threshold
            core_dist_n_jobs=-1,            # for speed
            )

# `embedding_model` is the SentenceTransformer instance created in the embedding cell
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True)
topics, probs = topic_model.fit_transform(docs, embeddings)

# BERTopic documents save_embedding_model as a bool or a Hugging Face model-name
# string for safetensors serialization, so pass the name rather than the model object.
topic_model.save(model_save_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model_name)

# Retrieve the topic information as a DataFrame and assign it to topic_info_df
topic_info_df = topic_model.get_topic_info()
print(topic_info_df)

# Make sure the target directory exists; to_csv does not create it
os.makedirs(os.path.dirname(topic_info_path), exist_ok=True)

# Save topic_info_df as a CSV file and report the path that was actually written
topic_info_df.to_csv(topic_info_path, index=False)
print(f"DataFrame saved as '{topic_info_path}'")


Step 1: Loading the data {data_path}
Step 2: Preparing the documents...


100%|██████████| 380624/380624 [00:08<00:00, 42750.09it/s]
2024-05-05 22:27:25,036 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 22:36:54,043 - BERTopic - Dimensionality - Completed ✓
2024-05-05 22:36:54,048 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 22:37:06,619 - BERTopic - Cluster - Completed ✓
2024-05-05 22:37:06,656 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-05 22:37:29,906 - BERTopic - Representation - Completed ✓


     Topic   Count                                               Name  \
0       -1  222697                                   -1_you_to_and_it   
1        0   42327                                      0_da_fa_ap_he   
2        1   11854                                   1_she_her_you_to   
3        2   10293                                   2_he_him_his_you   
4        3    5420                3_thoughts_journaling_anxiety_helps   
..     ...     ...                                                ...   
257    256      52                   256_love_magnet_anxiety_yourself   
258    257      52                         257_run_hills_fuck_running   
259    258      52  258_introverted_introversion_introverts_introvert   
260    259      51                            259_him_text_he_contact   
261    260      50                    260_dodged_bullet_dodge_bullets   

                                        Representation  \
0        [you, to, and, it, that, the, of, he, is, in]   
1      

In [24]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sentence_transformers import SentenceTransformer
import collections
from tqdm import tqdm
from scipy.cluster import hierarchy as sch

# Fit and save the "doc6" BERTopic model on the cached document embeddings,
# then export its per-topic summary to CSV.
# NOTE(review): the recorded output of this configuration shows almost every
# document (379,527 of 380,624) assigned to a single topic, so these UMAP
# settings look too coarse for this corpus.
import os

import numpy as np  # np.load is used below; this cell's import list does not include numpy

# Define file paths
data_path = 'attachment/attach_processed_length10.csv'
embeddings_path = 'attachment/models/doc/attach_doc_embeddings.npy'
model_save_path = 'attachment/models/doc/attach_doc6_model_dir/'
topic_info_path = 'attachment/analysis/doc/attach_doc6_topic_info.csv'

# Hugging Face name of the model the cached embeddings were produced with
embedding_model_name = 'thenlper/gte-large'

# f-string so the path is interpolated (it was previously printed as the literal "{data_path}")
print(f"Step 1: Loading the data {data_path}")
# Load the data
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)

print("Step 2: Preparing the documents...")
# Specify what the 'docs' are
docs = df['text'].tolist()

# Load the precomputed embeddings (one row per document, same order as `docs`)
embeddings = np.load(embeddings_path)

###### Extract vocab to be used in BERTopic
# NOTE(review): `vocab` is built here but never handed to BERTopic in this cell
# (no vectorizer_model is passed), so it does not influence the topics.
# Also, build_tokenizer() only splits text into single tokens; ngram_range is
# applied by build_analyzer(), so these counts are unigram-only.
vocab = collections.Counter()
tokenizer = CountVectorizer(ngram_range=(1, 3)).build_tokenizer()
for doc in tqdm(docs):
    vocab.update(tokenizer(doc))
# Keep only tokens seen at least 30 times across the corpus
vocab = [word for word, frequency in vocab.items() if frequency >= 30]
# The former trailing `; len(vocab)` displayed nothing mid-cell, so print the size explicitly
print(f"Vocabulary size: {len(vocab)}")


umap_model = UMAP(
        n_components=5,  # has a wild impact, hard to predict
        n_neighbors=60,  # higher captures more global structure
        min_dist=0.1,    # lower value means denser packing
        random_state=42, # reproducibility
        metric="cosine", # have to pick something
        n_jobs=-1        # speed
        )

hdbscan_model = HDBSCAN(
            min_cluster_size=50,            # smallest size group considered
            min_samples=30,                 # larger is more conservative - more noise
            leaf_size=50,                   # number of points per leaf node in the tree - default 40
            gen_min_span_tree=True,         # True creates minimum spanning trees - increasing RAM
            prediction_data=True,           # generates extra cached data of prediction labels for new data or reuse
            cluster_selection_method='eom', # eom is normal - leaf might get more homogeneous clusters
            cluster_selection_epsilon=0.0,  # default - merges clusters below threshold
            core_dist_n_jobs=-1,            # for speed
            )

# `embedding_model` is the SentenceTransformer instance created in the embedding cell
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True)
topics, probs = topic_model.fit_transform(docs, embeddings)

# BERTopic documents save_embedding_model as a bool or a Hugging Face model-name
# string for safetensors serialization, so pass the name rather than the model object.
topic_model.save(model_save_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model_name)

# Retrieve the topic information as a DataFrame and assign it to topic_info_df
topic_info_df = topic_model.get_topic_info()
print(topic_info_df)

# Make sure the target directory exists; to_csv does not create it
os.makedirs(os.path.dirname(topic_info_path), exist_ok=True)

# Save topic_info_df as a CSV file and report the path that was actually written
topic_info_df.to_csv(topic_info_path, index=False)
print(f"DataFrame saved as '{topic_info_path}'")


Step 1: Loading the data {data_path}
Step 2: Preparing the documents...


100%|██████████| 380624/380624 [00:08<00:00, 42347.64it/s]
2024-05-05 22:37:46,086 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 22:50:45,036 - BERTopic - Dimensionality - Completed ✓
2024-05-05 22:50:45,043 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 22:51:10,623 - BERTopic - Cluster - Completed ✓
2024-05-05 22:51:10,656 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-05 22:51:32,888 - BERTopic - Representation - Completed ✓


   Topic   Count                              Name  \
0     -1     100                 -1_bot_you_the_to   
1      0  379527                  0_to_and_the_you   
2      1     703                 1_dog_and_cat_the   
3      2     141       2_page_please_the_questions   
4      3      85  3_bot_fucking_optout_shakespeare   
5      4      68         4_boop_beep_comments_peek   

                                      Representation  \
0     [bot, you, the, to, this, is, and, of, it, in]   
1     [to, and, the, you, it, that, of, is, in, for]   
2   [dog, and, cat, the, to, cats, my, it, pets, of]   
3  [page, please, the, questions, sub, for, toran...   
4  [bot, fucking, optout, shakespeare, thy, words...   
5  [boop, beep, comments, peek, sneak, downvote, ...   

                                 Representative_Docs  
0  [This is what anxious attachment looks like Ft...  
1  [Yes you are gonna need be self aware of your ...  
2  [I was worried about influencing my dog negati...  
3  [Than