# Upload data and embeddings

In [1]:
import pandas as pd
df_2024 = pd.read_csv('~/Library/Mobile Documents/com~apple~CloudDocs/UNI-kopi 2/Kandidat/virtual_environments/data/short.csv')
df_2023 = pd.read_csv('~/Library/Mobile Documents/com~apple~CloudDocs/UNI-kopi 2/Kandidat/virtual_environments/data/short_2023.csv')

# Combine datasets
df_combined = pd.concat([df_2024, df_2023], ignore_index=True)

documents = df_combined["translated_Comment"].tolist()

In [2]:
import numpy as np

embeddings = np.load("embeddings.npy")

In [None]:
del keybert_model
del mmr_model
del representation_model

# Reload definitions

Redefine definitions needed to upload the models later, so I don't need to run the grid searches again


In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.model_selection import ParameterGrid
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from octis.evaluation_metrics.coherence_metrics import Coherence
from umap import UMAP
from hdbscan import HDBSCAN
import spacy


# Prepare Tokenizer model
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1520000

# Define custom tokenizer function with lemmatization for topic representations
def spacy_tokenizer(doc):
    spacy_doc = nlp(doc)
    # Lemmatize, lowercase, remove stopwords & punctuation
    return [token.lemma_.lower() for token in spacy_doc if not token.is_stop and not token.is_punct] #

base_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, min_df=5) # remove words that appear in less than 5 documents
base_vectorizer.fit(documents) # fit the vectorizer to the documents, to find the vocabulary for topic representations
filtered_vocab = set(base_vectorizer.get_feature_names_out()) # # get the vocabulary of the filtered vectorizer

def filtered_tokenizer(doc): # Now create a custom tokenizer that only keeps words in the filtered vocabulary, which will be passed to the CountVectorizer for topic representations
    tokens = spacy_tokenizer(doc)
    return [t for t in tokens if t in filtered_vocab]

# Create CountVectorizer with custom tokenizer
vectorizer_model = CountVectorizer(tokenizer=filtered_tokenizer, # use the custom tokenizer
                                   #stop_words="english", # already done in the tokenizer, but countvectorizer might have other stop words
                                   lowercase = False, # lowercase is already done in the tokenizer
                                   min_df = 1, #  Excludes words that appear in only one document.
                                   ngram_range=(1, 2)) # consider topic representations that are made up of one or two words

def coherence_tokenizer(doc): # The tokenizer for the coherence score. This is a standard tokenizer that does not filter out words, but lemmatizes and lowercases them.
    spacy_doc = nlp(doc)
    return [
        token.lemma_.lower()
        for token in spacy_doc
        if not token.is_punct  # keep stopwords as I want to calculate the coherence score based on all words
    ]

tokens_for_coherence = [coherence_tokenizer(doc) for doc in documents] 


# The first grid 

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.model_selection import ParameterGrid
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from octis.evaluation_metrics.coherence_metrics import Coherence
from umap import UMAP
from hdbscan import HDBSCAN
import spacy

# Load your data
#df = pd.read_csv('~/Library/Mobile Documents/com~apple~CloudDocs/UNI-kopi 2/Kandidat/virtual_environments/data/short.csv')
#documents = df["translated_Comment"].tolist() # Convert to list of strings


# Define the hyperparameter grid
param_grid = {
    # BERTOPIC parameters
    #"min_topic_size": [50,70,90], # Minimum size of a topic. Should be higher than min_cluster_size
    #"top_n_words": [10],
    #"nr_topics": ["none","auto"], # None means no reduction

    #UMAP parameters
    "n_components": [2,5], # Number of dimensions to reduce to
    "n_neighbors": [10,15,20], # Number of neighbors to consider for UMAP 

    #HDBSCAN parameters
    "min_cluster_size": [50,70,90,110], # Min cluster size for HDBSCAN # Min cluster size for HDBSCAN. 
    "min_samples": [50,70,90,110] # Minimum number of samples in a cluster
}
#Create all possible combinations of the hyperparameter grid
grid = list(ParameterGrid(param_grid))


# Filter: only keep combinations where min_cluster_size <= min_topic_size
valid_grid = [
    params for params in grid
    if params["min_samples"] <= params["min_cluster_size"]
]

# Total number combinations. Used later in the print statement to show progress
total = len(valid_grid)

#total = len(grid)


# Embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
#Compute embeddings
#embeddings = embedding_model.encode(documents, show_progress_bar=True)


# Prepare Tokenizer model
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1520000

# Define custom tokenizer function with lemmatization for topic representations
def spacy_tokenizer(doc):
    spacy_doc = nlp(doc)
    # Lemmatize, lowercase, remove stopwords & punctuation
    return [token.lemma_.lower() for token in spacy_doc if not token.is_stop and not token.is_punct] #

base_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, min_df=5) # remove words that appear in less than 5 documents
base_vectorizer.fit(documents) # fit the vectorizer to the documents, to find the vocabulary for topic representations
filtered_vocab = set(base_vectorizer.get_feature_names_out()) # # get the vocabulary of the filtered vectorizer

def filtered_tokenizer(doc): # Now create a custom tokenizer that only keeps words in the filtered vocabulary, which will be passed to the CountVectorizer for topic representations
    tokens = spacy_tokenizer(doc)
    return [t for t in tokens if t in filtered_vocab]

# Create CountVectorizer with custom tokenizer
vectorizer_model = CountVectorizer(tokenizer=filtered_tokenizer, # use the custom tokenizer
                                   #stop_words="english", # already done in the tokenizer, but countvectorizer might have other stop words
                                   lowercase = False, # lowercase is already done in the tokenizer
                                   min_df = 1, #  Excludes words that appear in only one document.
                                   ngram_range=(1, 2)) # consider topic representations that are made up of one or two words

def coherence_tokenizer(doc): # The tokenizer for the coherence score. This is a standard tokenizer that does not filter out words, but lemmatizes and lowercases them.
    spacy_doc = nlp(doc)
    return [
        token.lemma_.lower()
        for token in spacy_doc
        if not token.is_punct  # keep stopwords as I want to calculate the coherence score based on all words
    ]

tokens_for_coherence = [coherence_tokenizer(doc) for doc in documents] 


# Fine-tune representations 
#keybert_model = KeyBERTInspired()
#mmr_model = MaximalMarginalRelevance(diversity=0.2) #0.2 is used in the book.

#representation_model = {
    #"KeyBERT": keybert_model,
    #"MMR": mmr_model
#}


# Function to calculate Topic Diversity as the proportion of unique words across all top-k topic words (PUW)
def topic_diversity(topics, topk):
    all_words = [word for topic in topics for word in topic[:topk]]
    unique_words = set(all_words)
    return len(unique_words) / len(all_words)


# Defintions to later being able to track best model. 
best_model = None #no best model from the start
best_score = -1 # the first model with a score better than -1 will the current best. Just a placeholder make npmi_score > best_score work.
best_params = None #no best parameters from the start


# Results storage. A list that sores the coherence and diversty score of each model
results = []
# A list which saves all the models, so each model can be easily retrieved by typing its corresponding parameters
saved_models = {}  



#### THE LOOP ####
# Grid search
for i, params in enumerate(valid_grid, 1):#change to valid_grid as the parameter grid to iterate over if min_topic_size is used
    print(f"[{i}/{total}] Running with params: {params}") #print the current parameter combination being evaluated for each iteration 

    # UMAP
    umap_model = UMAP(n_neighbors=params["n_neighbors"],
                      n_components=params["n_components"], 
                      min_dist=0.0, 
                      metric='cosine', 
                      random_state=42)

    # HDBSCAN  
    hdbscan_model = HDBSCAN(min_cluster_size=params["min_cluster_size"], 
                            min_samples=params["min_samples"], 
                            metric='euclidean', 
                            cluster_selection_method='eom',
                            prediction_data=True
                            )

    # BERTopic model
    topic_model = BERTopic(
        #nr_topics=params["nr_topics"],
        #min_topic_size=params["min_topic_size"]
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        #representation_model=representation_model,
        vectorizer_model=vectorizer_model, #this makes some models not able to calculate NPMI
        #ctfidf_model=ctfidf_model,
        top_n_words=10,   
    )

    # Fit model
    topics, _ = topic_model.fit_transform(documents, embeddings)

    # Save meach model in the Saved_models list
    param_key = tuple(sorted(params.items()))  # extracting the parameters for the each model and creates a unique key for every model combination
    saved_models[param_key] = topic_model # save the each model in the saved_models list by its unique key 


    # Get top `top_n_words` words per topic (excluding outlier topic -1)
    topic_words = [] #an empty list that will be used to store the top words of every topic found for each model. So a list (topics) of lists (top words for each topic).
    
    for topic in topic_model.get_topic_freq().Topic: # grabs the topics from the topic_model
        if topic != -1: #skips the outlier topic -1
            topic_content = topic_model.get_topic(topic) #get the top words and scores for the topics.
            if isinstance(topic_content, list) and len(topic_content) > 0: # checks if the topic_content (top words) is a list and has more than 0 elements. To check if there actually is extracted topic words
                if isinstance(topic_content[0], tuple) and len(topic_content[0]) == 2: # checks if the first element of topic_content is a tuple and has two elements (word and score)
                    words = [word for word, _ in topic_content[:topic_model.top_n_words]] #if so, it extracts the words (but not the scores) in a variable called "words"
                else:
                    words = topic_content[:topic_model.top_n_words] #if there is only top words but no scores, extracts the top words in a variabkle called "words"
                topic_words.append(words) #append the topic words (defined in the variable "words") to the topic_words list for the model.

    if topic_words:
        
        # Calculate NPMI
        coherence_model = Coherence(texts=tokens_for_coherence, # reference corpus. Here it's all documents in the dataset
                                    topk=10, #specifies the number of top words to consider for NPMI calculation
                                    measure="c_npmi")
        npmi_score = coherence_model.score({"topics": topic_words}) # average

        # Calculate Topic Diversity
            # the topic_diversity command was defned before the loop. This is not a standard command, so I defined how it works earlier.
        diversity_score = topic_diversity(topics=topic_words, topk=10)

        # Calculate Topic Quality by multiplying NPMI score and Topic Diversity
        topic_quality = npmi_score * diversity_score 


        # Print the number of topics
        num_topics = len(topic_words) #number of topics is the length of the topic_words list
        num_outliers = sum(1 for topic in topics if topic == -1) #number of outliers is the number of topics that are equal to -1
        print(f"Number of Topics: {num_topics} | TC: {npmi_score:.3f} | TD: {diversity_score:.3f} | TQ: {topic_quality:.3f} | num_outliers: {num_outliers}") #4f = 4 decimals 

        # Track best model
        if npmi_score > best_score:
            best_score = npmi_score
            best_model = topic_model
            best_params = params

        # store the params, npmi, diversy, and num_topcis for each model in the results list
        results.append({
            "params": params,
            "num_topics": num_topics,
            "npmi_score": npmi_score,
            "topic_diversity": diversity_score,
            "topic_quality": topic_quality,
            "num_outliers": num_outliers
        })

    else:
        print("No valid topics were generated.")

#####

import csv

# Save the all the models' information to a CSV file

model_info = [] # an empty list to store model information. This is the dataframe that will be saved in the CSV file in the end 

# Loop through the models saved from the loop above and save the information to the model_info dataframe
for param_key, model in saved_models.items(): #activates a loop that goes through all the models (identified by their unique key).
    # Convert the tuple of params back to a dictionary (a formal step)
    params_dict = dict(param_key)
    
    # Get topic information
    #topics_freq = model.get_topic_freq() #retrives the number of topics and documents in each topic
    #num_topics = len(topics_freq[topics_freq.Topic != -1]) #retrives the number of topics (length) and exlcudes the outlier topic -1
    
    ## Get performance metrics if available
    # Matches the models with parameters saved in the loop with the parameters of models saved in the CVS file
    matching_result = next((r for r in results if all(r["params"][k] == v for k, v in params_dict.items())), None)
    
    if matching_result: 
        num_topics =matching_result.get("num_topics", 0) #extract the number of topics for the models
        num_outliers = matching_result.get("num_outliers", 0) #extract the number of outliers for the models
        npmi_score = matching_result.get("npmi_score", 0) #extract the NPMI score for the models
        diversity_score = matching_result.get("topic_diversity", 0) #extract the topic diversity score for the models
        topic_quality = matching_result.get("topic_quality", 0) #extract the topic quality score for the models
        
        model_info.append({  #assigns the number of topics and scores to the differnt models in the model_info list. This is the dataframe that will be saved in the CSV file.
            **params_dict,  
            "num_topics": num_topics,
            "npmi_score": npmi_score,
            "topic_diversity": diversity_score,
            "topic_quality": topic_quality,
            "num_outliers" : num_outliers   
        })

# Now we retrived all the information to the model_info dataframe, 
# Saves it to CSV file
with open('very_first_model.csv', 'w', newline='') as csvfile:
    fieldnames = list(model_info[0].keys()) if model_info else []
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()
    for info in model_info:
        writer.writerow(info)


# Sort the results by NPMI score in descending order
top_20_npmi = sorted(results, key=lambda x: x["npmi_score"], reverse=True)[:10]

# Sort the results by Topic Diversity score in descending order
top_20_diversity = sorted(results, key=lambda x: x["topic_diversity"], reverse=True)[:10]

# Sort the results by both NPMI and Topic Diversity score in descending order
top_20_quality = sorted(results, key=lambda x: (x["topic_quality"]), reverse=True)[:10]

# Print the top 20 based on NPMI
print("\nTop 20 Best Performing Models (Based on NPMI):\n")
for i, result in enumerate(top_20_npmi, start=1):
    print(f"Rank {i}: num_topics: {result['num_topics']} | {result['params']} | TC: {result['npmi_score']:.3f} | TD: {result['topic_diversity']:.3f} | TQ: {result['topic_quality']:.3f} | num_outliers: {result['num_outliers']})")

# Print the top 20 based on Topic Diversity
print("\nTop 20 Best Performing Models (Based on Topic Diversity):\n")
for i, result in enumerate(top_20_diversity, start=1):
    print(f"Rank {i}: num_topics: {result['num_topics']} | {result['params']} | TC: {result['npmi_score']:.3f} | TD: {result['topic_diversity']:.3f} | TQ: {result['topic_quality']:.3f} | num_outliers: {result['num_outliers']})")

# Print the top 20 based on topic quality
print("\nTop 20 Best Performing Models (Based on topic quality):\n")
for i, result in enumerate(top_20_quality, start=1):
    print(f"Rank {i}: num_topics: {result['num_topics']} | {result['params']} | TC: {result['npmi_score']:.3f} | TD: {result['topic_diversity']:.3f} | TQ: {result['topic_quality']:.3f} | num_outliers: {result['num_outliers']})")



[1/60] Running with params: {'min_cluster_size': 50, 'min_samples': 50, 'n_components': 2, 'n_neighbors': 10}
Number of Topics: 36 | TC: 0.058 | TD: 0.683 | TQ: 0.039 | num_outliers: 4382
[2/60] Running with params: {'min_cluster_size': 50, 'min_samples': 50, 'n_components': 2, 'n_neighbors': 15}
Number of Topics: 38 | TC: 0.059 | TD: 0.682 | TQ: 0.040 | num_outliers: 3848
[3/60] Running with params: {'min_cluster_size': 50, 'min_samples': 50, 'n_components': 2, 'n_neighbors': 20}
Number of Topics: 36 | TC: 0.073 | TD: 0.700 | TQ: 0.051 | num_outliers: 4106
[4/60] Running with params: {'min_cluster_size': 50, 'min_samples': 50, 'n_components': 5, 'n_neighbors': 10}
Number of Topics: 2 | TC: 0.102 | TD: 0.950 | TQ: 0.097 | num_outliers: 330
[5/60] Running with params: {'min_cluster_size': 50, 'min_samples': 50, 'n_components': 5, 'n_neighbors': 15}
Number of Topics: 33 | TC: 0.084 | TD: 0.718 | TQ: 0.060 | num_outliers: 4335
[6/60] Running with params: {'min_cluster_size': 50, 'min_samp

# Create models with final grid

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.model_selection import ParameterGrid
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from octis.evaluation_metrics.coherence_metrics import Coherence
from umap import UMAP
from hdbscan import HDBSCAN
import spacy

# Load your data
#df = pd.read_csv('~/Library/Mobile Documents/com~apple~CloudDocs/UNI-kopi 2/Kandidat/virtual_environments/data/short.csv')
#documents = df["translated_Comment"].tolist() # Convert to list of strings


# Define the hyperparameter grid
param_grid = {
    # BERTOPIC parameters
    #"min_topic_size": [50,70,90], # Minimum size of a topic. Should be higher than min_cluster_size
    #"top_n_words": [10],
    # "nr_topics": ["auto",10,15,20], # None means no reduction

    #UMAP parameters
    "n_components": [2,5], # Number of dimensions to reduce to
    "n_neighbors": [14,15,16], # Number of neighbors to consider for UMAP 

    #HDBSCAN parameters
    "min_cluster_size": [60,70,80], # Min cluster size for HDBSCAN # Min cluster size for HDBSCAN. 
    "min_samples": [30,40,50,60,70,80] # Minimum number of samples in a cluster
}
#Create all possible combinations of the hyperparameter grid
grid = list(ParameterGrid(param_grid))


# Filter: only keep combinations where min_cluster_size <= min_topic_size
valid_grid = [
    params for params in grid
    if params["min_samples"] <= params["min_cluster_size"]
]

# Total number combinations. Used later in the print statement to show progress
total = len(valid_grid)

#total = len(grid)



# Embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
#Compute embeddings
#embeddings = embedding_model.encode(documents, show_progress_bar=True)



# Prepare Tokenizer model
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1520000

# Define custom tokenizer function with lemmatization for topic representations
def spacy_tokenizer(doc):
    spacy_doc = nlp(doc)
    # Lemmatize, lowercase, remove stopwords & punctuation
    return [token.lemma_.lower() for token in spacy_doc if not token.is_stop and not token.is_punct] #

base_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, min_df=5) # remove words that appear in less than 5 documents
base_vectorizer.fit(documents) # fit the vectorizer to the documents, to find the vocabulary for topic representations
filtered_vocab = set(base_vectorizer.get_feature_names_out()) # # get the vocabulary of the filtered vectorizer

def filtered_tokenizer(doc): # Now create a custom tokenizer that only keeps words in the filtered vocabulary, which will be passed to the CountVectorizer for topic representations
    tokens = spacy_tokenizer(doc)
    return [t for t in tokens if t in filtered_vocab]

# Create CountVectorizer with custom tokenizer
vectorizer_model = CountVectorizer(tokenizer=filtered_tokenizer, # use the custom tokenizer
                                   #stop_words="english", # already done in the tokenizer, but countvectorizer might have other stop words
                                   lowercase = False, # lowercase is already done in the tokenizer
                                   min_df = 1, #  Excludes words that appear in only one document.
                                   ngram_range=(1, 2)) # consider topic representations that are made up of one or two words

def coherence_tokenizer(doc): # The tokenizer for the coherence score. This is a standard tokenizer that does not filter out words, but lemmatizes and lowercases them.
    spacy_doc = nlp(doc)
    return [
        token.lemma_.lower()
        for token in spacy_doc
        if not token.is_punct  # keep stopwords as I want to calculate the coherence score based on all words
    ]

tokens_for_coherence = [coherence_tokenizer(doc) for doc in documents] 


# Fine-tune representations 
#keybert_model = KeyBERTInspired()
#mmr_model = MaximalMarginalRelevance(diversity=0.2) #0.2 is used in the book.

#representation_model = {
    #"KeyBERT": keybert_model,
    #"MMR": mmr_model
#}


# Function to calculate Topic Diversity as the proportion of unique words across all top-k topic words (PUW)
def topic_diversity(topics, topk):
    all_words = [word for topic in topics for word in topic[:topk]]
    unique_words = set(all_words)
    return len(unique_words) / len(all_words)


# Defintions to later being able to track best model. 
best_model = None #no best model from the start
best_score = -1 # the first model with a score better than -1 will the current best. Just a placeholder make npmi_score > best_score work.
best_params = None #no best parameters from the start


# Results storage. A list that sores the coherence and diversty score of each model
results = []
# A list which saves all the models, so each model can be easily retrieved by typing its corresponding parameters
saved_models = {}  



#### THE LOOP ####
# Grid search
for i, params in enumerate(valid_grid, 1):#change to valid_grid as the parameter grid to iterate over if min_topic_size is used
    print(f"[{i}/{total}] Running with params: {params}") #print the current parameter combination being evaluated for each iteration 

    # UMAP
    umap_model = UMAP(n_neighbors=params["n_neighbors"],
                      n_components=params["n_components"], 
                      min_dist=0.0, 
                      metric='cosine', 
                      random_state=42)

    # HDBSCAN  
    hdbscan_model = HDBSCAN(min_cluster_size=params["min_cluster_size"], 
                            min_samples=params["min_samples"], 
                            metric='euclidean', 
                            cluster_selection_method='eom',
                            prediction_data=True
                            )

    # BERTopic model
    topic_model = BERTopic(
        #nr_topics=params["nr_topics"],
        #min_topic_size=params["min_topic_size"]
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        #representation_model=representation_model,
        vectorizer_model=vectorizer_model, #this makes some models not able to calculate NPMI
        #ctfidf_model=ctfidf_model,
        top_n_words=10,   
    )

    # Fit model
    topics, _ = topic_model.fit_transform(documents, embeddings)

    # Save meach model in the Saved_models list
    param_key = tuple(sorted(params.items()))  # extracting the parameters for the each model and creates a unique key for every model combination
    saved_models[param_key] = topic_model # save the each model in the saved_models list by its unique key 


    # Get top `top_n_words` words per topic (excluding outlier topic -1)
    topic_words = [] #an empty list that will be used to store the top words of every topic found for each model. So a list (topics) of lists (top words for each topic).
    
    for topic in topic_model.get_topic_freq().Topic: # grabs the topics from the topic_model
        if topic != -1: #skips the outlier topic -1
            topic_content = topic_model.get_topic(topic) #get the top words and scores for the topics.
            if isinstance(topic_content, list) and len(topic_content) > 0: # checks if the topic_content (top words) is a list and has more than 0 elements. To check if there actually is extracted topic words
                if isinstance(topic_content[0], tuple) and len(topic_content[0]) == 2: # checks if the first element of topic_content is a tuple and has two elements (word and score)
                    words = [word for word, _ in topic_content[:topic_model.top_n_words]] #if so, it extracts the words (but not the scores) in a variable called "words"
                else:
                    words = topic_content[:topic_model.top_n_words] #if there is only top words but no scores, extracts the top words in a variabkle called "words"
                topic_words.append(words) #append the topic words (defined in the variable "words") to the topic_words list for the model.

    if topic_words:
        
        # Calculate NPMI
        coherence_model = Coherence(texts=tokens_for_coherence, # reference corpus. Here it's all documents in the dataset
                                    topk=10, #specifies the number of top words to consider for NPMI calculation
                                    measure="c_npmi")
        npmi_score = coherence_model.score({"topics": topic_words}) # average

        # Calculate Topic Diversity
            # the topic_diversity command was defned before the loop. This is not a standard command, so I defined how it works earlier.
        diversity_score = topic_diversity(topics=topic_words, topk=10)

        # Calculate Topic Quality by multiplying NPMI score and Topic Diversity
        topic_quality = npmi_score * diversity_score 


        # Print the number of topics
        num_topics = len(topic_words) #number of topics is the length of the topic_words list
        num_outliers = sum(1 for topic in topics if topic == -1) #number of outliers is the number of topics that are equal to -1
        print(f"Number of Topics: {num_topics} | TC: {npmi_score:.3f} | TD: {diversity_score:.3f} | TQ: {topic_quality:.3f} | num_outliers: {num_outliers}") #4f = 4 decimals 

        # Track best model
        if npmi_score > best_score:
            best_score = npmi_score
            best_model = topic_model
            best_params = params

        # store the params, npmi, diversy, and num_topcis for each model in the results list
        results.append({
            "params": params,
            "num_topics": num_topics,
            "npmi_score": npmi_score,
            "topic_diversity": diversity_score,
            "topic_quality": topic_quality,
            "num_outliers": num_outliers
        })

    else:
        print("No valid topics were generated.")

#####

import csv

# Save the all the models' information to a CSV file

model_info = [] # an empty list to store model information. This is the dataframe that will be saved in the CSV file in the end 

# Loop through the models saved from the loop above and save the information to the model_info dataframe
for param_key, model in saved_models.items(): #activates a loop that goes through all the models (identified by their unique key).
    # Convert the tuple of params back to a dictionary (a formal step)
    params_dict = dict(param_key)
    
    # Get topic information
    #topics_freq = model.get_topic_freq() #retrives the number of topics and documents in each topic
    #num_topics = len(topics_freq[topics_freq.Topic != -1]) #retrives the number of topics (length) and exlcudes the outlier topic -1
    
    ## Get performance metrics if available
    # Matches the models with parameters saved in the loop with the parameters of models saved in the CVS file
    matching_result = next((r for r in results if all(r["params"][k] == v for k, v in params_dict.items())), None)
    
    if matching_result: 
        num_topics =matching_result.get("num_topics", 0) #extract the number of topics for the models
        num_outliers = matching_result.get("num_outliers", 0) #extract the number of outliers for the models
        npmi_score = matching_result.get("npmi_score", 0) #extract the NPMI score for the models
        diversity_score = matching_result.get("topic_diversity", 0) #extract the topic diversity score for the models
        topic_quality = matching_result.get("topic_quality", 0) #extract the topic quality score for the models
        
        model_info.append({  #assigns the number of topics and scores to the differnt models in the model_info list. This is the dataframe that will be saved in the CSV file.
            **params_dict,  
            "num_topics": num_topics,
            "npmi_score": npmi_score,
            "topic_diversity": diversity_score,
            "topic_quality": topic_quality,
            "num_outliers" : num_outliers   
        })

# Now we retrived all the information to the model_info dataframe, 
# Saves it to CSV file
with open('simple_model1.csv', 'w', newline='') as csvfile:
    fieldnames = list(model_info[0].keys()) if model_info else []
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()
    for info in model_info:
        writer.writerow(info)

print(f"Model information saved to bertopic_models_info_preprocessing.csv with combined score column")



# Sort the results by NPMI score in descending order
top_20_npmi = sorted(results, key=lambda x: x["npmi_score"], reverse=True)[:10]

# Sort the results by Topic Diversity score in descending order
top_20_diversity = sorted(results, key=lambda x: x["topic_diversity"], reverse=True)[:10]

# Sort the results by both NPMI and Topic Diversity score in descending order
top_20_quality = sorted(results, key=lambda x: (x["topic_quality"]), reverse=True)[:10]

# Print the top 20 based on NPMI
print("\nTop 20 Best Performing Models (Based on NPMI):\n")
for i, result in enumerate(top_20_npmi, start=1):
    print(f"Rank {i}: num_topics: {result['num_topics']} | {result['params']} | TC: {result['npmi_score']:.3f} | TD: {result['topic_diversity']:.3f} | TQ: {result['topic_quality']:.3f} | num_outliers: {result['num_outliers']})")

# Print the top 20 based on Topic Diversity
print("\nTop 20 Best Performing Models (Based on Topic Diversity):\n")
for i, result in enumerate(top_20_diversity, start=1):
    print(f"Rank {i}: num_topics: {result['num_topics']} | {result['params']} | TC: {result['npmi_score']:.3f} | TD: {result['topic_diversity']:.3f} | TQ: {result['topic_quality']:.3f} | num_outliers: {result['num_outliers']})")

# Print the top 20 based on topic quality
print("\nTop 20 Best Performing Models (Based on topic quality):\n")
for i, result in enumerate(top_20_quality, start=1):
    print(f"Rank {i}: num_topics: {result['num_topics']} | {result['params']} | TC: {result['npmi_score']:.3f} | TD: {result['topic_diversity']:.3f} | TQ: {result['topic_quality']:.3f} | num_outliers: {result['num_outliers']})")



[1/90] Running with params: {'min_cluster_size': 60, 'min_samples': 30, 'n_components': 2, 'n_neighbors': 14}
Number of Topics: 33 | TC: 0.073 | TD: 0.682 | TQ: 0.050 | num_outliers: 3231
[2/90] Running with params: {'min_cluster_size': 60, 'min_samples': 30, 'n_components': 2, 'n_neighbors': 15}
Number of Topics: 33 | TC: 0.069 | TD: 0.655 | TQ: 0.045 | num_outliers: 3258
[3/90] Running with params: {'min_cluster_size': 60, 'min_samples': 30, 'n_components': 2, 'n_neighbors': 16}
Number of Topics: 37 | TC: 0.075 | TD: 0.697 | TQ: 0.052 | num_outliers: 3656
[4/90] Running with params: {'min_cluster_size': 60, 'min_samples': 30, 'n_components': 5, 'n_neighbors': 14}
Number of Topics: 29 | TC: 0.078 | TD: 0.748 | TQ: 0.058 | num_outliers: 3900
[5/90] Running with params: {'min_cluster_size': 60, 'min_samples': 30, 'n_components': 5, 'n_neighbors': 15}
Number of Topics: 28 | TC: 0.079 | TD: 0.711 | TQ: 0.056 | num_outliers: 3511
[6/90] Running with params: {'min_cluster_size': 60, 'min_sa

# Model 1

In [79]:
from bertopic import BERTopic

loaded_model = BERTopic.load("simple_model1")
print("Model loaded successfully.")

Model loaded successfully.


## Inspection

In [95]:
loaded_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3027,-1_order_time_app_wolt,"[order, time, app, wolt, work, customer, deliv...","[Hello with you ""Wolt"" I will start by mention..."
1,0,2492,0_pay_wolt_order_payment,"[pay, wolt, order, payment, low, work, money, ...","[The pay per order is too low., Lots of discre..."
2,1,965,1_good_happy_wolt_thank,"[good, happy, wolt, thank, work, day, job, lov...","[wolt is the best i am very happy with wolt, w..."
3,2,430,2_ready_restaurant_food_wait,"[ready, restaurant, food, wait, time, food rea...","[Marking the food ready before it's…, More tha..."
4,3,429,3_support_wolt support_system_chat,"[support, wolt support, system, chat, team, ne...","[Wolt support not good.verybad, Leaf -tender s..."
5,4,373,4_courier_new courier_new_order,"[courier, new courier, new, order, order couri...","[few orders too many couriers, No orders!!! To..."
6,5,366,5_mcdonald_wait_order_mcdonalds,"[mcdonald, wait, order, mcdonalds, ready, mc, ...",[Hi! Today started great but the few times I’v...
7,6,355,6_map_address_google_gps,"[map, address, google, gps, location, app, map...","[Addresses were misleading, Please make it pos..."
8,7,298,7_order_home_order order_home order,"[order, home, order order, home order, order h...","[There are not enough amount of orders, There ..."
9,8,268,8_app_app work_work_time,"[app, app work, work, time, update, time app, ...","[The app was down for me, MOST RIGGED MANIPULA..."


In [80]:
# Display top 10 word representations for each topic (excluding outlier topic -1)
topic_info = loaded_model.get_topic_info()
for topic_id in topic_info.Topic:
    if topic_id != -1:
        words = loaded_model.get_topic(topic_id)
        if words:
            top_words = [word for word, _ in words[:10]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

Topic 0: pay, wolt, order, payment, low, work, money, distance, hour, km
Topic 1: good, happy, wolt, thank, work, day, job, love, great, work wolt
Topic 2: ready, restaurant, food, wait, time, food ready, mark, order, minute, order ready
Topic 3: support, wolt support, system, chat, team, new, bad, help, good, wolt
Topic 4: courier, new courier, new, order, order courier, hour, hire, stop, work, courier order
Topic 5: mcdonald, wait, order, mcdonalds, ready, mc, minute, time, donald, mc donald
Topic 6: map, address, google, gps, location, app, maps, wrong, google map, google maps
Topic 7: order, home, order order, home order, order home, go home, work late, work, get, get order
Topic 8: app, app work, work, time, update, time app, freeze, phone, slow, crash
Topic 9: bag, drink, pack, cup, food, paper, spill, soda, paper bag, need
Topic 10: app, order, order app, time, error, accept, get, support, work, app work
Topic 11: order, hour, wait, 2, minute, order hour, min, hour order, wait o

- Topic 0: en blanding af indtjening, og distance. Ved inspektion ligner det, at det er 50% indtjening 50% andre ting. 
- Topic 1: glad for at arbejde som courier
- Topic 2: Noget omkring at restauranterne ikke er klar med maden, og de markerer ordren klar før den er det 
- Topic 3: Wolt support
- Topic 4: Noget omkring nye kurerer, måske at de ansættes for mange nye
- Topic 5: McDonalds (måske noget med at vente?)
- Topic 6: Google maps, lokation og adresser. 
- Topic 7: Noget omkring orderer? Og hjem? Når jeg kigger i dokumenterne kan jeg se det handler om ikke nok ordrer.
- Topic 8: Appen der ikke virker optimalt 
- Topic 9: Udstyr såsom taske, kopholder, evt. hvordan venues forsejler ordrer
- Topic 10: Måske noget omkring at accepterer ordre, og fejl når man skal acceptere ordre? 
- Topic 11: Man venter lang tid på ordre 
- Topic 12: Burker king og andre restauranter med navnet ”burger” i
- Topic 13: Noget omkring opgaven eller få /acceptere opgaver? Ved inspektion ser det ud til at nogle handler om dette.
- Topic 14:  bundle orders
- Topic 15: blurry. Men i hvert fald noget omkring kompensation og blive trukket i løn
- Topic 16: Glad for wolt
- Topic 17: hvor dkk og tal er nævnt. Nok noget omkring indtjening 
- Topic 18: noget omkring distance. Ved inspektion ligner det, at det omhandler problemer med at appen giver ordrer som er -1 min, eller steder man lige har været.  


### Inspect documents in topics

I inspect the documents in the topics, to see if they seem coherent and belong to the right topic.
The idea is that feedback in the topics should quite similar. 
Moreover for at topic to make sense, we should be able to tell what the topic is about without seeing the topic representations

In [None]:
# Retrieve the topic assignments for each document
topics, _ = loaded_model.transform(df_combined["translated_Comment"].tolist())

In [124]:
from collections import defaultdict
import random

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Iterate over sorted topic IDs
for topic_id in sorted(topic_to_docs.keys()):
    docs = topic_to_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))  # Randomly select up to 10 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")


=== Topic -1 (3027 documents) ===
1. restaurant Kostadious Burger Joint Trøjborg always late. And today I was waiting 15 minutes for nothing - someone else took the order. But they give the orders to the couriers. Totaly dissapointed from thos restaurant. Plese contact them if possible.
2. Everything goes ok and happy but when you pick up food from MacDonald, the wait is very long
3. Bad Service Bad Support
4. Jagger frederiksberg is more than 10 mins behind the time. Please ask them to keep it on time or adjust time accordingly
5. Really bad treatment of the courses in the city's café and pizzaria waited 40 minutes on the order as they kept saying just about every time you asked when the food was ready. Their attitude towards me and other couriers were condescending hope it is taken seriously
6. Durum Bar Frederiksberg Smallegade 56, 2000 Frederiksberg The restaurant is always cheating. They mark the food ready, but they start to make it only when we show up. They do they ask wolt fo

In [115]:
from collections import defaultdict
import random

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = -1
docs = topic_to_docs.get(topic_id, [])

# Filter documents with max length of 50 characters
filtered_docs = [doc for doc in docs if len(doc) <= 140]

if filtered_docs:
    sampled_docs = random.sample(filtered_docs, min(15, len(filtered_docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic -1 (3027 documents) ===
1. App unresponsive at times forcing me to restart the app many times Wolt market was very far behind schedule
2. About half an hour delay at Frankies in spite of having marked the order ready
3. Let's keep wolt as it is now . Let's gooooo
4. Could well wish that extra is given when sent out to an area where there is no possibility of taking some orders with returns
5. J3G thinks there is a problem as I have not received any odre
6. Have an excellent week
7. WTF ? WHERE IS ORDERS ???
8. Disgusting thieves in support
9. The swiping at the resturant doesn’t work
10. Have needed to focus a little on the study, but have been actively watching if there is a busy
11. Worst app after update. Horrible keeps crashing
12. are we going to work ?? no orders .. it's very weak .. what happened to this area ?
13. Larger range for order
14. App still really bad -Offering task 40 minutes ahead…
15. The worst company in the world by a long mile


1. Skulle være endt i app topic (8). Men måske fordi vedkommende også brokker sig over Wolt market, så skaber det støj --> problem ved to topics
2. skulle være endt i topic 2, omkring venues er forsinket
5. måske for dårligt formuleret, ellers skulle den være måske være endt i topic (11)
7. skulle være endt i 


## Merge topics

Based on the inspection of topics, it looks like the following topics can be merged


- 4 + 7 + 11 (lack of orders)

- 2 + 5 + 12 (venues being late)
- 1 + 16 (sentiment)


Now a create a dendogram which clusters the algorithm find the closest

In [82]:
loaded_model.visualize_hierarchy(custom_labels=True)

- confirms 7 + 11 are related (4 could be merged as well)
- comnfirms 2+5+12 are related
- confirms 1 + 16 are related

Suggest
- 8 + 10 could be merged (and 6)
- 3 + 15 could be merged
- 0 + 4 could be merged







In [83]:
# Createas a identical model called reduced_model, which is a copy of the loaded_model
# I'll use this to reduce the number of topics in the model, while keeping the original model intact
import copy

reduced_model = copy.deepcopy(loaded_model)

In [84]:
topics_to_merge = [[7, 11, 4],
                   [1, 16],
                   [2, 5, 12],
                   [8, 10],
                   [3, 15]
]

reduced_model.merge_topics(documents, topics_to_merge)

In [85]:
reduced_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3027,-1_order_time_app_wolt,"[order, time, app, wolt, work, customer, deliv...","[Hello with you ""Wolt"" I will start by mention..."
1,0,2492,0_pay_order_wolt_payment,"[pay, order, wolt, payment, work, low, money, ...","[The pay per order is too low., Lots of discre..."
2,1,1045,1_happy_wolt_good_thank,"[happy, wolt, good, thank, work, happy wolt, d...","[wolt is the best i am very happy with wolt, w..."
3,2,977,2_ready_wait_order_restaurant,"[ready, wait, order, restaurant, time, food, b...",[some of the restaurant marked food as ready w...
4,3,887,3_courier_order_hour_new,"[courier, order, hour, new, new courier, order...","[Didn't get some order, There are No orders at..."
5,4,527,4_support_order_system_wolt support,"[support, order, system, wolt support, team, c...","[Wolt support not good.verybad, Support always..."
6,5,491,5_app_order_app work_work,"[app, order, app work, work, time, error, supp...","[The app works really well, The app was down f..."
7,6,355,6_map_address_google_gps,"[map, address, google, gps, app, location, map...","[Addresses were misleading, Please don’t pley ..."
8,7,241,7_bag_drink_pack_food,"[bag, drink, pack, food, cup, paper, spill, so...",[Please do something about the bags. Big proce...
9,8,146,8_task_task task_time_new task,"[task, task task, time, new task, accept, acce...","[I didn't get any task, I don't get as many ta..."


In [86]:
reduced_model.visualize_hierarchy(custom_labels=True)

In [87]:
topics_to_merge1 = [[0,3]]
                   

reduced_model.merge_topics(documents, topics_to_merge1)

In [88]:
reduced_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3027,-1_order_time_app_wolt,"[order, time, app, wolt, work, customer, deliv...","[Hello with you ""Wolt"" I will start by mention..."
1,0,3379,0_order_pay_courier_hour,"[order, pay, courier, hour, wolt, work, paymen...","[I didn’t get that much order., Didn’t get any..."
2,1,1045,1_happy_wolt_good_thank,"[happy, wolt, good, thank, work, happy wolt, d...","[wolt is the best i am happy with wolt, wolt i..."
3,2,977,2_ready_wait_order_restaurant,"[ready, wait, order, restaurant, time, food, b...",[I had an unacceptable experience at McDonald'...
4,3,527,3_support_order_system_wolt support,"[support, order, system, wolt support, team, c...","[Wolt support not good.verybad, Leaf -tender s..."
5,4,491,4_app_order_app work_work,"[app, order, app work, work, time, support, er...","[The app works really well, The app was down f..."
6,5,355,5_map_address_google_app,"[map, address, google, app, gps, location, map...","[Addresses were misleading, Please make it pos..."
7,6,241,6_bag_drink_pack_food,"[bag, drink, pack, food, cup, paper, spill, so...",[Please do something about the bags. Big proce...
8,7,146,7_task_task task_time_new task,"[task, task task, time, new task, accept, acce...","[Because i always get less task, Then we are t..."
9,8,131,8_bundle_order_bundle order_pay,"[bundle, order, bundle order, pay, task, payme...",[Why the system give bundles when there are no...


Now there are 19 topics.

From inspecting the topic representations and documents in each topic (see further down), I have made the following assessment 

Topic 0: Problems with restaturants (especially being late) = cooperation with venues

Topic 1: Satisfaction of working with Wolt (also a few negative)

Topic 2: Order and salary (two different things, but closely connected)

Topic 3: App functionality

Topic 4: Something about distances in relation to earnings and vehicle

Topic 5: Too many couriers (and hiring new couriers), but some more express they don't care about courier or bad earnings. Too many couriers can also easily be linked to low earnings or too few orders

Topic 6: Wolt support

Topic 7: GPS tracking, google maps, and addresses

Topic 8: Wolt bag, and how restaurant pack/seal the food

Topic 9: Bad earnings 

Topic 10: mixed in relation to taks (too few tasks, some where task is just mentioned but about specific problems or frustrations).

Topic 11: Random topics, where city names are included.

Topic 12: Bundle orders

Topic 13: Low prices (some about new payment model). Prices is related to earnings.

Topic 14: Bonuses (either in relation to pay or schedule)

Topic 15: Lidt rodet topic. Men handler bl.a. om a acceptere / annullere ordre samt unfair penalties og mangel på kompensation når der er sket fejl

Topic 16: Low paypment + some comments about a new / old payment system

Topic 17: Happy with Wolt

Topic 18: No orders (couriers go home)

--------------------------------------------

The folliwing would logically make sense to merge

 1+17: satisfaction with Wolt

 2+5+9+13+17+18: earnings / too many couriers / orders / prices / pay

I inspect the documents 

In [89]:
# Get topic assignments for each document
topics, _ = reduced_model.transform(df_combined["translated_Comment"].tolist())

In [123]:
from collections import defaultdict

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Iterate over sorted topic IDs
for topic_id in sorted(topic_to_docs.keys()):
    docs = topic_to_docs[topic_id]
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 10 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")



=== Topic -1 (3027 documents) ===
1. This update is very bad, for large and double orders it is no longer worth going to deliver them, you earn far too little on them, others receive more, others less, for the same distance, this update is very bad
2. I am really happy when there are tasks but there may be a little quiet when i am online
3. Staff at Otto Frederiksberg gave bad estimate and took no responsibility for the 15 minute delay they caused to my customers
4. I couldn't reach the customer for 20 minutes and delivered it to her boyfriend. Again a problem at rush hour. That's too bad.
5. I took an order from a place, and when I arrived, they told me that the order was to pick in an hour from that time. If wolt allowes to the people to order on a specific time, they should write somewhere in the task so the courier can know and reject it if it’s not useful, or they should send the task to the courier when is going to be ready, not one hour before, because I lost time going to the 

## Reduce outliers

Outliers must be reduced after topics have been merged. It cannot technically work the other way around

In [92]:
import copy

reduced_outlier_model = copy.deepcopy(reduced_model)

I'll now look at some outliers to get an idea what proportion of outliers should be in one of the topics, and what proportion are actual outliers (noise) not belonging to any of the topics. 

I'll now try different approaches to outlier reduction. that is techniques and thresholds

In [98]:
import copy

# Define the strategy-threshold combinations
combinations = [
    ("c-tf-idf", 0.085),
    ("c-tf-idf", 0.09),
    ("c-tf-idf", 0.095),

]

# Store results
results = {}

# Loop through each combination
for strategy, threshold in combinations:
    print(f"Running: strategy = {strategy}, threshold = {threshold}")
    
    # Deep copy the original reduced model
    model_copy = copy.deepcopy(reduced_model)

    # Reduce outliers using specified strategy and threshold
    new_topics = model_copy.reduce_outliers(
        documents=documents,
        topics=topics,
        strategy=strategy,
        threshold=threshold
    )

    # Update topic representations
    model_copy.update_topics(
        docs=documents,  # <-- FIXED HERE
        topics=new_topics,
        vectorizer_model=vectorizer_model,
        #representation_model=representation_model
    )

    # Count number of outliers
    outlier_count = sum(1 for t in new_topics if t == -1)

    # Store results
    results[(strategy, threshold)] = {
        "model": model_copy,
        "topics": new_topics,
        "outlier_count": outlier_count
    }

# Optional: print summary of outlier counts
print("\nSummary of Outlier Counts:")
for (strategy, threshold), res in results.items():
    print(f"{strategy}, threshold={threshold}: {res['outlier_count']} outliers")


Running: strategy = c-tf-idf, threshold = 0.085




Running: strategy = c-tf-idf, threshold = 0.09




Running: strategy = c-tf-idf, threshold = 0.095





Summary of Outlier Counts:
c-tf-idf, threshold=0.085: 1467 outliers
c-tf-idf, threshold=0.09: 1564 outliers
c-tf-idf, threshold=0.095: 1664 outliers


first run 

- c-tf-idf, threshold=0.07: 1115 outliers

- c-tf-idf, threshold=0.075: 1226 outliers
- c-tf-idf, threshold=0.08: 1348 outliers
- c-tf-idf, threshold=0.085: 1467 outliers
- c-tf-idf, threshold=0.09: 1564 outliers

------------------------
Second run
- c-tf-idf, threshold=0.1: 1786 outliers

- c-tf-idf, threshold=0.105: 1953 outliers
- c-tf-idf, threshold=0.11: 2056 outliers

------------------------
third run

- c-tf-idf, threshold=0.085: 1467 outliers

- c-tf-idf, threshold=0.09: 1564 outliers
- c-tf-idf, threshold=0.095: 1664 outliers

Now I inspect the outliers being reassigned to the different topcis

In [102]:

from collections import defaultdict
import random

# Step 2: Get new topics (after outlier reduction using embedding + 0.25 threshold)
topics_c_tf_idf_085 = results[("c-tf-idf", 0.085)]["topics"]

# Step 1: Find reassigned outliers (originally -1, now assigned to a topic)
reassigned_outliers = [
    (i, doc, new_topic)
    for i, (doc, old_topic, new_topic) in enumerate(zip(documents, topics, topics_c_tf_idf_085))
    if old_topic == -1 and new_topic != -1
]

# Step 2: Group reassigned outliers by new topic
topic_to_outlier_docs = defaultdict(list)
for i, doc, new_topic in reassigned_outliers:
    topic_to_outlier_docs[new_topic].append(doc)

# Step 3: Print sampled outlier docs with ORIGINAL top words (from reduced_model2)
for topic_id in sorted(topic_to_outlier_docs.keys()):
    docs = topic_to_outlier_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))

    # Get original top words from reduced_model_manual
    original_topic_words = reduced_model.get_topic(topic_id)
    if original_topic_words:
        top_words = ", ".join([word for word, _ in original_topic_words])
    else:
        top_words = "(No original top words)"

    print(f"Reassigned Outliers → Topic {topic_id} ({len(docs)} docs) | Original top words: {top_words}")
    for j, doc in enumerate(sampled_docs):  
        print(f"{j+1}. {doc}")
    print()

Reassigned Outliers → Topic 0 (357 docs) | Original top words: order, pay, courier, hour, wolt, work, payment, low, money, time
1. No work, low pay, bad restaurants, bad customers
2. So, finally i can give the feedback.. Im very disappointed. 1. I dont know what happened, but since beginning of february (after big freezing days) it feels like you hired 2x more couriers than needed and now I literally drive in the city center near the lakes in peak hours (17-19:30) on friday / saturday and I see area demand: Normal.... and whole day till 17:00 its Quiet... and for 5-10mins I have no order (which before i had like 3 orders pre minute) 2. Im getting those annoying and not true push notifications (which arent even messages i can respond to) saying that "We missed you on friday and saturday so go and earn today" .. i was literally working (freezing outside waiting 5mins for order) both days so why am I receiving these fake untrue messages and why are you trying to motivate me to go to work 

Now I inspect the outliers which are still left

In [103]:
from collections import defaultdict
import random


# Organize documents by new topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics_c_tf_idf_085):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = -1
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic -1 (1467 documents) ===
1. There are quite a few errors in the system
2. You keep log me out for no reason
3. Doesn't seem to be okay to be drawn so much for an error in the order as I had the opportunity to come Back with the right one.
4. Some venues were late, and for other the other was postponed by the customer
5. Have a great weekend
6. Would be nice with a function where you stay online but don't get orders. SKAT said the km report from Wolt should be enough proof for the transport tax deduction. So for example if going with a bike in the train towards a hotspot, and don't want to start before reaching the hotspot, there could be a transportation mode where the Wolt app keep the km counter ticking for going away and back to home.
7. Why isn't possible to talk directly with decisional staff, what are you afraid?!
8. I would like to be able to log in soon
9. Do we have to climb stairs to deliver food, like 3rd or 4th floor?
10. Hello, I work in the Vestegnen Vallensbæk 

In [None]:
reduced_outlier_model.update_topics(documents, topics = topics_c_tf_idf_085, vectorizer_model= vectorizer_model)



In [105]:
reduced_outlier_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1467,-1_order_excellent_customer_time,"[order, excellent, customer, time, delivery, w...","[Hello with you ""Wolt"" I will start by mention..."
1,0,3736,0_order_pay_wolt_courier,"[order, pay, wolt, courier, hour, work, time, ...","[I didn’t get that much order., Didn’t get any..."
2,1,1175,1_happy_wolt_good_day,"[happy, wolt, good, day, thank, work, happy wo...","[wolt is the best i am happy with wolt, wolt i..."
3,2,1332,2_ready_wait_order_time,"[ready, wait, order, time, food, restaurant, m...",[I had an unacceptable experience at McDonald'...
4,3,669,3_support_order_wolt support_contact,"[support, order, wolt support, contact, time, ...","[Wolt support not good.verybad, Leaf -tender s..."
5,4,786,4_app_order_app work_work,"[app, order, app work, work, time, support, er...","[The app works really well, The app was down f..."
6,5,394,5_address_map_google_app,"[address, map, google, app, location, gps, wro...","[Addresses were misleading, Please make it pos..."
7,6,299,6_bag_drink_food_pack,"[bag, drink, food, pack, order, cup, soda, pap...",[Please do something about the bags. Big proce...
8,7,227,7_task_accept_time_accept task,"[task, accept, time, accept task, task task, s...","[Because i always get less task, Then we are t..."
9,8,152,8_bundle_order_bundle order_pay,"[bundle, order, bundle order, pay, task, payme...",[Why the system give bundles when there are no...


In [106]:

# Display top 10 word representations for each topic (excluding outlier topic -1)
topic_info = reduced_outlier_model.get_topic_info()
for topic_id in topic_info.Topic:
    if topic_id != -1:
        words = reduced_outlier_model.get_topic(topic_id)
        if words:
            top_words = [word for word, _ in words[:10]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

Topic 0: order, pay, wolt, courier, hour, work, time, payment, money, low
Topic 1: happy, wolt, good, day, thank, work, happy wolt, job, love, wolt happy
Topic 2: ready, wait, order, time, food, restaurant, minute, burger, mark, mcdonald
Topic 3: support, order, wolt support, contact, time, wolt, system, bad, help, chat
Topic 4: app, order, app work, work, time, support, error, accept, order app, crash
Topic 5: address, map, google, app, location, gps, wrong, maps, google map, customer
Topic 6: bag, drink, food, pack, order, cup, soda, paper, spill, need
Topic 7: task, accept, time, accept task, task task, second, app, minute, new task, new
Topic 8: bundle, order, bundle order, pay, task, payment, new, system, courier, customer
Topic 9: dkk, hour, online, hour online, 3 hour, order, 3, dkk hour, 2, 2 hour
Topic 10: mile, offer, order close, drive mile, drive, offer ready, mile offer, close venue, app let, let drive


## Fine-tune topic representations

In [107]:
import copy

reduced_outlier_model_topics = copy.deepcopy(reduced_outlier_model)


In [108]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

keybert_model = KeyBERTInspired()
mmr_model = MaximalMarginalRelevance(diversity=0.4)  # 0.2 is used in the book

representation_model = {
    "Main": mmr_model,
    "KeyBERT": keybert_model,
}

In [109]:
reduced_outlier_model_topics.update_topics(documents, topics = topics_c_tf_idf_085, vectorizer_model= vectorizer_model, representation_model=representation_model)



In [118]:
# Display top 10 word representations for each topic (excluding outlier topic -1)
topic_info = reduced_outlier_model_topics.get_topic_info()
for topic_id in topic_info.Topic:
    if topic_id != -1:
        words = reduced_outlier_model_topics.get_topic(topic_id)
        if words:
            top_words = [word for word, _ in words[:10]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

Topic 0: courier, payment, delivery, km, bad, kr, earn, bonus, car, go
Topic 1: happy, day, happy wolt, excellent day, work wolt, perfect, partner, happy work, support, help
Topic 2: restaurant, mcdonald, order ready, min, food ready, delay, customer, burger king, pick, venue
Topic 3: order, wolt support, system, deliver, cancel, customer, support team, problem, message, contact support
Topic 4: app work, work, error, order app, restart, delivery, app crash, bug, problem, freeze
Topic 5: map, location, gps, google maps, wrong address, problem, wolt app, copy address, card, house
Topic 6: food, cup, soda, spill, paper bag, mcdonald, lid, delivery, seal, wolt market
Topic 7: accept task, app, hour, receive, time task, venue, receive task, busy, cancel, error
Topic 8: bundle, payment, courier, customer, order bundle, bundle task, delivery, reject, bundle pay, km
Topic 9: hour online, dkk hour, bad, average, friday, hour order, online 2, peak hour, 70 dkk, hour street
Topic 10: order close

In [115]:
reduced_outlier_model_topics.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,1467,-1_order_customer_week_busy,"[order, customer, week, busy, deliver, tip, fl...","[week excellent, excellent week, weekend excel...","[Hello with you ""Wolt"" I will start by mention..."
1,0,3736,0_courier_payment_delivery_km,"[courier, payment, delivery, km, bad, kr, earn...","[order order, order, delivery, wolt market, de...","[I didn’t get that much order., Didn’t get any..."
2,1,1175,1_happy_day_happy wolt_excellent day,"[happy, day, happy wolt, excellent day, work w...","[love wolt, wolt happy, wolt wolt, wolt, happy...","[wolt is the best i am happy with wolt, wolt i..."
3,2,1332,2_restaurant_mcdonald_order ready_min,"[restaurant, mcdonald, order ready, min, food ...","[order ready, food ready, delay, waiting, rest...",[I had an unacceptable experience at McDonald'...
4,3,669,3_order_wolt support_system_deliver,"[order, wolt support, system, deliver, cancel,...","[support help, support, contact support, suppo...","[Wolt support not good.verybad, Leaf -tender s..."
5,4,786,4_app work_work_error_order app,"[app work, work, error, order app, restart, de...","[app work, app app, app, fix app, app say, pro...","[The app works really well, The app was down f..."
6,5,394,5_map_location_gps_google maps,"[map, location, gps, google maps, wrong addres...","[google maps, google map, wrong address, addre...","[Addresses were misleading, Please make it pos..."
7,6,299,6_food_cup_soda_spill,"[food, cup, soda, spill, paper bag, mcdonald, ...","[bag order, use bag, bag food, food bag, bag b...",[Please do something about the bags. Big proce...
8,7,227,7_accept task_app_hour_receive,"[accept task, app, hour, receive, time task, v...","[task wait, task, task come, single task, task...","[Because i always get less task, Then we are t..."
9,8,152,8_bundle_payment_courier_customer,"[bundle, payment, courier, customer, order bun...","[order bundle, bundle payment, pay bundle, bun...",[Why the system give bundles when there are no...


In [126]:
topics, _ = reduced_outlier_model_topics.transform(df_combined["translated_Comment"].tolist())

In [128]:
from collections import defaultdict
import random

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Iterate over sorted topic IDs
for topic_id in sorted(topic_to_docs.keys()):
    docs = topic_to_docs[topic_id]
    
    # Filter to only include feedback with length ≤ 150
    short_docs = [doc for doc in docs if len(doc) <= 4000]
    sampled_docs = random.sample(short_docs, min(20, len(short_docs)))  # Randomly select up to 10 docs
    
    print(f"\n -- Topic {topic_id} -- ")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")




 -- Topic -1 -- 
1. Im happy but im not a fan when i get a High value order (above 60kr)and i accept it and then it disappears and is replaced with a low value order (below 30kr)
2. Same bs: You have programmed the algorithm to wait with offering tasks until one is around 500 meter away from the venue, probably only when order level is quiet/order is not too late…
3. No tasks being sent through. Then taking offline as I wasn’t accepting tasks. Something wrong with app today.
4. Will the app not come soon in Apple CarPlay
5. Hello with you "Wolt" You talk about listening to us cures and fairly done the app. First you have set our starting fee down from 45,- to 35,- earlier it was a bird spacing of 450 meters when the reality was 2 km. Ñu has once again fixed the app in favor of Wolt and not us who have made you rich. In the video on YouTube, the guy tells you that you have listened to us where we used to get payment per piece for full distance. Now you have again changed the app in fav

# Model 2 

In [4]:
from bertopic import BERTopic

loaded_model2 = BERTopic.load("simple_model2")
print("Model loaded successfully.")


Model loaded successfully.


## Inspection

In [5]:
loaded_model2.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4118,-1_order_time_wolt_app,"[order, time, wolt, app, work, delivery, pay, ...","[Hello with you ""Wolt"" I will start by mention..."
1,0,915,0_happy_good_thank_work,"[happy, good, thank, work, wolt, day, job, lov...","[i am very very happy with, I am very happy to..."
2,1,521,1_restaurant_ready_food_wait,"[restaurant, ready, food, wait, time, food rea...",[Some restaurants you have to wait a long time...
3,2,490,2_order_hour_order order_2,"[order, hour, order order, 2, wait, 1, receive...","[Nothing ?? No orders in, I don't have an orde..."
4,3,435,3_km_distance_delivery_pay,"[km, distance, delivery, pay, driver, drive, c...",[The adress of one of my delivery's today was ...
5,4,372,4_mcdonald_wait_order_mcdonalds,"[mcdonald, wait, order, mcdonalds, ready, minu...",[Hi! Today started great but the few times I’v...
6,5,371,5_courier_new courier_new_order,"[courier, new courier, new, order, order couri...","[Too few orders or too many courier, No orders..."
7,6,331,6_support_wolt support_chat_team,"[support, wolt support, chat, team, help, bad,...","[Wolt support not good.verybad, Leaf -tender s..."
8,7,322,7_map_address_google_gps,"[map, address, google, gps, location, app, map...","[No integrated route on in-app map, or easy wa..."
9,8,311,8_app_order_task_time,"[app, order, task, time, error, work, app work...",[Uphill…. Began as courier 5 months ago (it ma...


In [6]:
# Display top 10 word representations for each topic (excluding outlier topic -1)
topic_info = loaded_model2.get_topic_info()
for topic_id in topic_info.Topic:
    if topic_id != -1:
        words = loaded_model2.get_topic(topic_id)
        if words:
            top_words = [word for word, _ in words[:10]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

Topic 0: happy, good, thank, work, wolt, day, job, love, great, work wolt
Topic 1: restaurant, ready, food, wait, time, food ready, minute, mark, order, order ready
Topic 2: order, hour, order order, 2, wait, 1, receive, receive order, get, time
Topic 3: km, distance, delivery, pay, driver, drive, car, money, payment, bike
Topic 4: mcdonald, wait, order, mcdonalds, ready, minute, mc, time, donald, ignore
Topic 5: courier, new courier, new, order, order courier, hire, hour, courier order, money, stop
Topic 6: support, wolt support, chat, team, help, bad, support team, wolt, contact, support good
Topic 7: map, address, google, gps, location, app, maps, wrong, google map, google maps
Topic 8: app, order, task, time, error, work, app work, accept, support, order app
Topic 9: earning, salary, pay, low, wage, bad, get bad, hour, get, money
Topic 10: bag, drink, pack, cup, food, paper, spill, soda, customer, menu
Topic 11: burger, king, burger king, burger concept, concept, wait, minute, dela

### Inspect documents in topics

I inspect the documents in the topics, to see if they seem coherent and belong to the right topic.
The idea is that feedback in the topics should quite similar. 
Moreover for at topic to make sense, we should be able to tell what the topic is about without seeing the topic representations

Some topics seems to be adresseing the same themes, and could be merged. 

In [7]:

# Retrieve the topic assignments for each document
topics, _ = loaded_model2.transform(df_combined["translated_Comment"].tolist())


In [35]:
from collections import defaultdict
import random

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Iterate over sorted topic IDs
for topic_id in sorted(topic_to_docs.keys()):
    docs = topic_to_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))  # Randomly select up to 10 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")



=== Topic -1 (4118 documents) ===
1. ATTENTION PLEASE: Ok so it happened now for the second time that I accept a task, bike all the way to the venue and once there I don't have the task anymore. The first time I contacted support but they couldn't (or wouldn't) help me. This time I understood what happens in these cases: by the time I accepted the task and arrived there to the venue the order got canceled. This last time the venue itself canceled by mistake (they wanted the reject a later order that they couldn't make in time and not my order that they have made already). The first time I don't know who canceled it (venue or customer). The point is when this happend you cannot just take the task away without some sort of notification (possibly with sound too or something). What if I biked 15 or 20 minutes all the way to the venue for nothing? I understand the customer or venue can decide to reject the task but the courier needs to be notified right away. The app right now basically ju

In [167]:
from collections import defaultdict
import random

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = 21
docs = topic_to_docs.get(topic_id, [])

# Filter documents with max length of 50 characters
filtered_docs = [doc for doc in docs if len(doc) <= 156]

if filtered_docs:
    sampled_docs = random.sample(filtered_docs, min(45, len(filtered_docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")



=== Topic 21 (90 documents) ===
1. I will be sent to orders and when I arrive at the specified place, the order is canceled and I will not be compensated
2. The oodren disappeared eftee 10 sec only not taking it
3. Could not mark items as delivered 3 times? Do I get money for waiting for Wolt support?
4. Canceling order is not working. Error Code: Task.Body.order_cancellation.Error.geofence
5. When you pick up an item to a customer, you should be allowed to deliver the item before picking up another delivery you have said yes to
6. Works like a charm. Just need to notify the courir when 2 orders are coming in at tye same time
7. I am still being penalized for notifying a work injury
8. Accellated automatically orders
9. Automatic cancel the order
10. Vendor concerns are not being heard
11. I was punished on a job and instruction that I followed from support
12. My order disappeared without me being informed that it had been canceled
13. There one who drove into my taillight while I wa

In [None]:
topic_to_docs

## Zero shot classification

In [None]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

sequence_to_classify =  " ".join([word for word, _ in loaded_model2.get_topic(1)])

# Our set of potential topic labels
candidate_labels = ['earnings or income', 'waiting', 'bundle deliveries', "wolt support"]
classifier(sequence_to_classify, candidate_labels)

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use mps:0


{'sequence': 'good happy wolt thank work day job love great flexibility',
 'labels': ['wolt support',
  'earnings or income',
  'waiting',
  'bundle deliveries'],
 'scores': [0.6058537364006042,
  0.2803512215614319,
  0.09101628512144089,
  0.022778760641813278]}

## Merge topics

Based on the inspection of topics, it looks like the following topics can be merged
    - topic 2 + 13 + 25 all are about not enough orders / tasks
    - topic 0 + 23 are both about satisfaction of working with Wolt
    - topic 8 + 15 + 24 are all about the apps functionality 
    - topic 9 + 12 + 17 + 18 + 26 are all about earnings / pay 



Now a create a dendogram which clusters the algorithm find the closest

In [8]:
loaded_model2.visualize_hierarchy(custom_labels=True)

confirms that 15 + 8 + 24 are close to each other (app functionality)

confirms 0 + 23 are close to each other and very distinct from the rest.

confirms that 9 + 18 + 12 + 17 are close, but 26 is not really. However it would suggest 3 goes in as well, which after inspection looks ok. Further 22 could go in as last, but this have more with payment model to do. 

2 + 13 + 15 are all in the same area of the plot, but not connected as first links. 

Furthermore the dendogram suggest the following topics could be merged: 
1 (restuarant making wrong time estimate) + 4 (Mcdonalds problems (often being late)). This could make sense, but it could also make sense to keep McDonalds seperate. Senere kunne 11 så blive merged på, da det handler om steder med burger. I alt kunne disse tre topics samlet handle om arbejdet med venues.

5 + 14 giver ikke mening at merge

6 + 21 giver ikke rigtig mening, da 21 er for mudret. 







### Automatisk reducering + manuelt efter

In [None]:
# Createas a identical model called reduced_model, which is a copy of the loaded_model
# I'll use this to reduce the number of topics in the model, while keeping the original model intact
import copy

reduced_model2 = copy.deepcopy(loaded_model2)

In [None]:
# First I try with automatic reduction
reduced_model2.reduce_topics(documents, nr_topics="auto")

<bertopic._bertopic.BERTopic at 0x31e7c09d0>

In [None]:
reduced_model2.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4118,-1_order_time_wolt_app,"[order, time, wolt, app, work, delivery, pay, ...","[Hello with you ""Wolt"" I will start by mention..."
1,0,1064,0_ready_wait_restaurant_order,"[ready, wait, restaurant, order, food, time, m...",[I had an unacceptable experience at McDonald'...
2,1,915,1_good_happy_wolt_thank,"[good, happy, wolt, thank, work, day, job, lov...","[I'm very excited and happy to work with wolt,..."
3,2,874,2_order_hour_pay_earning,"[order, hour, pay, earning, low, order order, ...","[There are not enough orders at all., There co..."
4,3,671,3_app_order_work_app work,"[app, order, work, app work, time, offer, dkk,...","[App crashes. App lets me drive for miles, the..."
5,4,435,4_km_distance_delivery_pay,"[km, distance, delivery, pay, driver, drive, c...",[You say it is 0.1km ekstra but actual distanc...
6,5,371,5_courier_new courier_new_order,"[courier, new courier, new, order, order couri...","[Few orders for too many couriers, No orders!!..."
7,6,331,6_support_wolt support_chat_team,"[support, wolt support, chat, team, help, bad,...","[Haven't used Wolt Support, so is still uncert..."
8,7,322,7_map_address_google_gps,"[map, address, google, gps, location, app, map...",[It would be nice to be able to copy the adres...
9,8,269,8_bag_drink_pack_cup,"[bag, drink, pack, cup, food, paper, spill, so...",[Please do something about the bags. Big proce...


In [None]:
# Display top 10 word representations for each topic (excluding outlier topic -1)
topic_info = reduced_model2.get_topic_info()
for topic_id in topic_info.Topic:
    if topic_id != -1:
        words = reduced_model2.get_topic(topic_id)
        if words:
            top_words = [word for word, _ in words[:10]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

Topic 0: ready, wait, restaurant, order, food, time, minute, burger, mcdonald, mark
Topic 1: good, happy, wolt, thank, work, day, job, love, great, work wolt
Topic 2: order, hour, pay, earning, low, order order, bad, salary, get, time
Topic 3: app, order, work, app work, time, offer, dkk, error, drive, mile
Topic 4: km, distance, delivery, pay, driver, drive, car, money, payment, long
Topic 5: courier, new courier, new, order, order courier, hour, hire, courier order, work, money
Topic 6: support, wolt support, chat, team, help, bad, wolt, support team, good, contact
Topic 7: map, address, google, gps, location, app, maps, wrong, google map, google maps
Topic 8: bag, drink, pack, cup, food, paper, spill, soda, order, customer
Topic 9: wolt, hour, kr, work, pay, money, day, kroner, low, night supplement
Topic 10: task, task task, new task, time, accept task, accept, new, disappear, hour, receive
Topic 11: wolt, wolt market, market vanløse, market, vanløse, danish, order, copenhagen, den

Now there are 19 topics.

From inspecting the topic representations and documents in each topic (see further down), I have made the following assessment 

Topic 0: Problems with restaturants (especially being late) = cooperation with venues

Topic 1: Satisfaction of working with Wolt (also a few negative)

Topic 2: Order and salary (two different things, but closely connected)

Topic 3: App functionality

Topic 4: Something about distances in relation to earnings and vehicle

Topic 5: Too many couriers (and hiring new couriers), but some more express they don't care about courier or bad earnings. Too many couriers can also easily be linked to low earnings or too few orders

Topic 6: Wolt support

Topic 7: GPS tracking, google maps, and addresses

Topic 8: Wolt bag, and how restaurant pack/seal the food

Topic 9: Bad earnings 

Topic 10: mixed in relation to taks (too few tasks, some where task is just mentioned but about specific problems or frustrations).

Topic 11: Random topics, where city names are included.

Topic 12: Bundle orders

Topic 13: Low prices (some about new payment model). Prices is related to earnings.

Topic 14: Bonuses (either in relation to pay or schedule)

Topic 15: Lidt rodet topic. Men handler bl.a. om a acceptere / annullere ordre samt unfair penalties og mangel på kompensation når der er sket fejl

Topic 16: Low paypment + some comments about a new / old payment system

Topic 17: Happy with Wolt

Topic 18: No orders (couriers go home)

--------------------------------------------

The folliwing would logically make sense to merge

 1+17: satisfaction with Wolt

 2+5+9+13+17+18: earnings / too many couriers / orders / prices / pay

I inspect the documents 

In [None]:
# Get topic assignments for each document
topics, _ = reduced_model2.transform(df_combined["translated_Comment"].tolist())


In [54]:
from collections import defaultdict

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Iterate over sorted topic IDs
for topic_id in sorted(topic_to_docs.keys()):
    docs = topic_to_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))  # Randomly select up to 10 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")



=== Topic -1 (4118 documents) ===
1. The app works very badly. Often gets error messages when I have to enter "Items picked up". It causes i have to restart the app, which is very stressful when i'm busy
2. No orders. Not okay when earnings per order has gotten so shit....
3. Since the update of the payment, the waiting time for a task grew a lot, even in peak hours. And today I couldn’t mark my order as picked up from venue, even though my internet connection was fine. The problem was occurring for the 5 mins
4. Wolt application crashed
5. médium but no have orders?
6. Not enough deliveries
7. Hi, When I get drive orders every individual item in the order has to be marked off. For my current order there are 20 items to mark off. That is too much, and they don’t even have meaning, it’s just “parcel one”, “parcel 2”, ect… Could it be made easier ?
8. I took an order from a place, and when I arrived, they told me that the order was to pick in an hour from that time. If wolt allowes to t

In [180]:
from collections import defaultdict
import random

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = 5
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")



=== Topic 5 (371 documents) ===
1. A lot of couriers and some mistakes of distance payment
2. Crazy how we couriers can’t make any money nowadays. Wolt should do something about it and it’s embarrassing
3. It was good, however, it has become somewhat worse root how poor food delivery is accepted. Overfilling courier destroys income potential, and the company arrogant approach to the same problem
4. Your top priority is scamming couriers and second is delivering orders late on purpose
5. Hi I would like to file an official complaint against Wolt Danmark ApS for the violation of its own Employees Code of Conduct Section B, point (1), by failing to provide a fair and accurate financial report, since the company refuses to give a detailed breakdown of the task fees offered to the courier partners. The financial report must include an overview of the various alleged extra fees that the courier partner should receive, like distance-fees, weekend bonus, weather bonus, uphill-fee and so on. U

In [None]:
reduced_model2.visualize_hierarchy(custom_labels=True)

From the dendogram here, we can see that the topics are generally more distinct now, since the scores are higher before topics will be merged

Confirms 1+17 make sense to merge

For other topic it shows are close (2+4, 0+15) I asses their connectiveness and rate them unrelated! No merging

For 2+5+9+13+16+18 which I all conlude in the very end are connected to earnings (too many couriers, to few order, too low pay, too low prices), the model places most of tem in the red area, making them somehow related. 




Now, I manually merges these topics, as their connectiness is not technically present, but logically I rate them connected

In [None]:
import copy

reduced_model2_2 = copy.deepcopy(reduced_model2)

In [None]:
topics_to_merge = [[1, 17],
                   [2, 5, 9, 13, 16, 18]
]

reduced_model2_2.merge_topics(documents, topics_to_merge)

In [None]:
reduced_model2_2.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4118,-1_order_time_wolt_app,"[order, time, wolt, app, work, delivery, pay, ...","[Hello with you ""Wolt"" I will start by mention..."
1,0,1666,0_order_courier_hour_pay,"[order, courier, hour, pay, new, payment, low,...","[More orders would be delicious, There are not..."
2,1,1064,1_ready_wait_order_restaurant,"[ready, wait, order, restaurant, time, food, m...",[Ask Pick up places not to mark the order read...
3,2,997,2_happy_wolt_good_work,"[happy, wolt, good, work, thank, happy wolt, d...","[i am very happy with wolt, i am very happy wi..."
4,3,671,3_app_order_work_app work,"[app, order, work, app work, time, offer, dkk,...","[App crashes. App lets me drive for miles, the..."
5,4,435,4_km_distance_delivery_pay,"[km, distance, delivery, pay, driver, drive, c...",[You say it is 0.1km ekstra but actual distanc...
6,5,331,5_support_wolt support_chat_team,"[support, wolt support, chat, team, bad, help,...","[Haven't used Wolt Support, so is still uncert..."
7,6,322,6_map_address_google_gps,"[map, address, google, gps, app, location, map...",[Still problems with the addressees in Google ...
8,7,269,7_bag_drink_pack_cup,"[bag, drink, pack, cup, food, paper, order, sp...",[McDonalds City putting orders in big bags is ...
9,8,146,8_task_task task_time_new task,"[task, task task, time, new task, accept, acce...","[I don't get as many tasks, It's Sku Slay with..."


In [None]:
# Display top 10 word representations for each topic (excluding outlier topic -1)
topic_info = reduced_model2_2.get_topic_info()
for topic_id in topic_info.Topic:
    if topic_id != -1:
        words = reduced_model2_2.get_topic(topic_id)
        if words:
            top_words = [word for word, _ in words[:10]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

Topic 0: order, courier, hour, pay, new, payment, low, earning, work, wolt
Topic 1: ready, wait, order, restaurant, time, food, minute, burger, mcdonald, min
Topic 2: happy, wolt, good, work, thank, happy wolt, day, wolt happy, love, job
Topic 3: app, order, work, app work, time, offer, dkk, drive, error, task
Topic 4: km, distance, delivery, pay, driver, drive, car, money, payment, long
Topic 5: support, wolt support, chat, team, bad, help, wolt, support team, good, contact
Topic 6: map, address, google, gps, app, location, maps, wrong, google map, google maps
Topic 7: bag, drink, pack, cup, food, paper, order, spill, soda, customer
Topic 8: task, task task, time, new task, accept, accept task, new, hour, disappear, receive
Topic 9: wolt, market, wolt market, market vanløse, order, vanløse, danish, copenhagen, denmark, hour
Topic 10: bundle, bundle order, order, pay, task, accept, payment, new, order bundle, system
Topic 11: bonus, weekend, weekend bonus, order, bonus weekend, bonus t

Tjekker nu dokumenter i de sammensatte topics (0 og 2), for at se at det giver mening at de handler om det samme

In [None]:
# Get topic assignments for each document
topics, _ = reduced_model2_2.transform(df_combined["translated_Comment"].tolist())

In [237]:
from collections import defaultdict
import random

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Select only topic 2
topic_id = 12
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(25, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic 12 (90 documents) ===
1. Why the F do I have to ask support everytime I want to cancel an order. I have enough experience to know when I have to cancel and when not. I’ve done 550 orders this month alone, let me cancel whenever want without support please
2. The two layers of support you have to go through to get anything done are highly frustrating and time wasting. Also if a task is to pickup a missing item from a previous order please let us know about it. Twice today I had venues flatly tell me they had no such order only to find out after 10 minutes back and forth with support that it was for a missing item.
3. Please make it possible to cancel an order instead of contacting support. Unfortunately, experience a wait of +5 min at Venues. And understandingly they are busy, but who bring out is time = money
4. I had an Incident with an order that I got, I went to the address but the girl told me she did not order any but the address is hers, and I asked her if may be her h

In [None]:
reduced_model2_2.visualize_hierarchy(custom_labels=True)

Topic 0 ser god ud, topic 2 ser også god ud (men nogle enkelte outliers her)

### Manuel reducering

In [9]:
# Createas a identical model called reduced_model, which is a copy of the loaded_model
# I'll use this to reduce the number of topics in the model, while keeping the original model intact
import copy

reduced_model_manual = copy.deepcopy(loaded_model2)

In [10]:
topics_to_merge = [[0, 23],
                   [1, 4, 11],
                   [2, 5, 13, 25],
                   [8, 15, 24],
                   [9, 12, 17, 18, 22, 26]
]

reduced_model_manual.merge_topics(documents, topics_to_merge)

In [259]:
reduced_model_manual.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,4118,-1_order_app_work_delivery,"[order, app, work, delivery, pay, hour, day, c...","[wolt market, wolt, earn, fee, app, busy, deli...","[Hello with you ""Wolt"" I will start by mention..."
1,0,1073,0_courier_hour_new courier_work,"[courier, hour, new courier, work, order order...","[get order, order order, order, receive order,...","[Didn't get some order, There are not many ord..."
2,1,1064,1_restaurant_burger_mcdonald_min,"[restaurant, burger, mcdonald, min, delay, foo...","[food ready, order ready, restaurant, waiting ...","[Dear Wolt support, I would like to file anoth..."
3,2,997,2_happy_happy wolt_day_wolt happy,"[happy, happy wolt, day, wolt happy, work wolt...","[love wolt, wolt happy, happy wolt, wolt, wolt...","[i am very happy with wolt, i am very happy wi..."
4,3,803,3_pay_low_hour_payment,"[pay, low, hour, payment, earning, work, salar...","[order pay, pay order, payment, payment order,...","[The payment is a shameless, The payments are ..."
5,4,513,4_app_app work_error_order app,"[app, app work, error, order app, problem, res...","[app work, app bad, app app, app, app buggy, a...",[My app teasing therefore i have not been to w...
6,5,435,5_km_delivery_drive_payment,"[km, delivery, drive, payment, bike, scooter, ...","[distance payment, long distance, distance, co...",[The adress of one of my delivery's today was ...
7,6,331,6_support_wolt support_chat_support team,"[support, wolt support, chat, support team, su...","[wolt support, support support, support good, ...","[Wolt support not good.verybad, Support always..."
8,7,322,7_map_address_gps_maps,"[map, address, gps, maps, google map, google m...","[google map, map app, google maps, app map, ma...",[Sometimes wolt app doesn't redirect to accura...
9,8,269,8_bag_pack_food_spill,"[bag, pack, food, spill, soda, paper bag, lid,...","[bag order, food bag, use bag, paper bag, bag ...",[McDonalds City putting orders in big bags is ...


In [13]:
# Display top 10 word representations for each topic (excluding outlier topic -1)
topic_info = reduced_model_manual.get_topic_info()
for topic_id in topic_info.Topic:
    if topic_id != -1:
        words = reduced_model_manual.get_topic(topic_id)
        if words:
            top_words = [word for word, _ in words[:10]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

Topic 0: courier, order, task, hour, new, new courier, time, work, order order, home
Topic 1: ready, wait, order, restaurant, time, food, minute, burger, mcdonald, min
Topic 2: happy, wolt, good, work, thank, happy wolt, day, wolt happy, love, job
Topic 3: pay, low, hour, payment, earning, order, wolt, price, dkk, bad
Topic 4: app, order, app work, work, time, error, support, task, order app, accept
Topic 5: km, distance, delivery, pay, driver, drive, car, money, payment, long
Topic 6: support, wolt support, chat, team, bad, help, wolt, support team, good, contact
Topic 7: map, address, google, gps, app, location, maps, wrong, google map, google maps
Topic 8: bag, drink, pack, cup, food, paper, order, spill, soda, customer
Topic 9: wolt, wolt market, market, market vanløse, vanløse, order, danish, copenhagen, denmark, hour
Topic 10: bundle, bundle order, order, pay, task, accept, payment, order bundle, new, system
Topic 11: bonus, weekend, weekend bonus, order, bonus weekend, bonus tim

In [14]:
# Get topic assignments for each document
topics, _ = reduced_model_manual.transform(df_combined["translated_Comment"].tolist())

In [15]:
from collections import defaultdict
import random

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Select only topic 2
topic_id = 12
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(25, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic 12 (94 documents) ===
1. App lets me drive for miles, then offers almost ready order when I’m close to the venue. This time was extra special.. I literally drive 2 kilometers from Ingrid Marievej 8, Valby back towards old Valby. I go slow, but no orders. And to no great surprise, I get not one but two orders, as soon as I am back in old Valby. And both orders are late for pick up, so I find it very likely that they have been offered in the market for a minimum of four minutes before they are due for pickup. Just know that it’s very annoying to a courier, and in my case lead me to not accept the next bundle offered by the app..
2. Venues marking the order ready when its not ready and waiting 10-20 mins for the order,the app sucks battery like crazy and if connected to bt,lowers all volumes.
3. App lets me drive for miles, then offers almost ready order when I’m close to the venue. ABSOLUTE JOKE TODAY. THE LEVEL OF MANIPULATION IS BEYOND BELIEF.
4. In my opinion we could have 

## Reduce outliers

### reduce outliers for automatisk + manuel cluster reduction

Outliers must be reduced after topics have been merged. It cannot technically work the other way around

In [None]:
import copy

reduced_outlier_model2 = copy.deepcopy(reduced_model2_2)

I'll now look at some outliers to get an idea what proportion of outliers should be in one of the topics, and what proportion are actual outliers (noise) not belonging to any of the topics. 

I'll now try different approaches to outlier reduction. that is techniques and thresholds

In [None]:
# Reduce outliers using the `embeddings` strategy
new_topics_2 = reduced_outlier_model2.reduce_outliers(documents, topics, threshold=0.7, strategy="embeddings")
# puts the outliers into one of the topics. The documents already assigfned to a topic will not be changed. 

In [None]:
# Now we need to update the model with the new assignments of the outliers, and new topic representations are calculated
# when specifying the representation model, KEYBERT and MMR are also displayed
reduced_outlier_model2.update_topics(documents, topics = new_topics_2, vectorizer_model= vectorizer_model, representation_model=representation_model)




In [None]:
import copy

# Define the strategy-threshold combinations
combinations = [
    ("embeddings", 0.25),
    ("embeddings", 0.45),
    ("embeddings", 0.55),
    ("c-tf-idf", 0.02),
    ("c-tf-idf", 0.07),
    ("c-tf-idf", 0.08),
    ("distributions", 0.03),
    ("distributions", 0.08),
    ("distributions", 0.09),
]

# Store results
results = {}

# Loop through each combination
for strategy, threshold in combinations:
    print(f"Running: strategy = {strategy}, threshold = {threshold}")
    
    # Deep copy the original reduced model
    model_copy = copy.deepcopy(reduced_model2)

    # Reduce outliers using specified strategy and threshold
    new_topics = model_copy.reduce_outliers(
        documents=documents,
        topics=topics,
        strategy=strategy,
        threshold=threshold
    )

    # Update topic representations
    model_copy.update_topics(
        docs=documents,  # <-- FIXED HERE
        topics=new_topics,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model
    )

    # Count number of outliers
    outlier_count = sum(1 for t in new_topics if t == -1)

    # Store results
    results[(strategy, threshold)] = {
        "model": model_copy,
        "topics": new_topics,
        "outlier_count": outlier_count
    }

# Optional: print summary of outlier counts
print("\nSummary of Outlier Counts:")
for (strategy, threshold), res in results.items():
    print(f"{strategy}, threshold={threshold}: {res['outlier_count']} outliers")


First run

embeddings, threshold = 0.3: 302 outliers remain

embeddings, threshold = 0.5: 2044 outliers remain

embeddings, threshold = 0.7: 3996 outliers remain


c-tf-idf, threshold = 0.1: 2541 outliers remain

c-tf-idf, threshold = 0.2: 3956 outliers remain

c-tf-idf, threshold = 0.3: 4074 outliers remain

distributions, threshold = 0.1: 2007 outliers remain

distributions, threshold = 0.2: 3564 outliers remain

distributions, threshold = 0.3: 3984 outliers remain

------------------------------------------------------

Second run

embeddings, threshold = 0.35: 564 outliers remain

embeddings, threshold = 0.4: 915 outliers remain

embeddings, threshold = 0.45: 1443 outliers remain

c-tf-idf, threshold = 0.02: 142 outliers remain

c-tf-idf, threshold = 0.05: 798 outliers remain

c-tf-idf, threshold = 0.08: 1795 outliers remain

distributions, threshold = 0.06: 945 outliers remain

distributions, threshold = 0.07: 1219 outliers remain

distributions, threshold = 0.08: 1459 outliers remain

------------------------------------------------------

Third run 

embeddings, threshold = 0.25: 151 outliers remain

embeddings, threshold = 0.45: 1434 outliers remain

embeddings, threshold = 0.55: 2628 outliers remain

c-tf-idf, threshold = 0.02: 142 outliers remain

c-tf-idf, threshold = 0.07: 1459 outliers remain

c-tf-idf, threshold = 0.08: 1795 outliers remain

distributions, threshold = 0.03: 310 outliers remain

distributions, threshold = 0.08: 1459 outliers remain

distributions, threshold = 0.09: 1720 outliers remain

#### Inspection of embeddings025

In [None]:
topics_emb_025_2 = results[("embeddings", 0.25)]["model"]
topics_emb_025_2.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,151,-1_tax_go_battery_quiet,"[tax, go, battery, quiet, add, tip, come, twic...","[tax, pay tax, deduction, tip, pay, extra step...","[tax, tip, worker right, basic worker, bike, d...","[Hello with you ""Wolt"" I will start by mention..."
1,0,2611,0_order_courier_hour_pay,"[order, courier, hour, pay, wolt, low, payment...","[order order, order, get order, receive order,...","[courier, pay, low, payment, work, day, earnin...","[More orders would be delicious, There are not..."
2,1,1527,1_ready_wait_order_time,"[ready, wait, order, time, food, restaurant, m...","[food ready, restaurant, mark food, order read...","[restaurant, burger, mcdonald, order ready, fo...",[Ask Pick up places not to mark the order read...
3,2,1421,2_happy_wolt_good_excellent,"[happy, wolt, good, excellent, day, work, than...","[love wolt, wolt happy, happy wolt, wolt good,...","[day, happy wolt, wolt happy, excellent week, ...","[i am very happy with wolt, i am very happy wi..."
4,3,1120,3_app_order_work_time,"[app, order, work, time, app work, support, ta...","[app app, app, app work, app crash, app let, o...","[app, app work, error, delivery, hour, crash, ...","[App crashes. App lets me drive for miles, the..."
5,4,947,4_km_distance_delivery_pay,"[km, distance, delivery, pay, car, drive, driv...","[wolt, favor wolt, fee, delivery, distance, lo...","[km, delivery, drive, kr, payment, bicycle, ki...",[You say it is 0.1km ekstra but actual distanc...
6,5,453,5_support_wolt support_chat_help,"[support, wolt support, chat, help, team, bad,...","[wolt support, support support, support good, ...","[support, wolt support, support bad, support g...","[Haven't used Wolt Support, so is still uncert..."
7,6,430,6_map_address_app_google,"[map, address, app, google, location, gps, map...","[address wolt, wrong address, google map, addr...","[map, address, gps, maps, google map, google m...",[Still problems with the addressees in Google ...
8,7,325,7_bag_drink_pack_food,"[bag, drink, pack, food, order, cup, restauran...","[bag order, paper bag, use bag, food bag, bag ...","[bag, food, soda, spill, paper bag, lid, mcdon...",[McDonalds City putting orders in big bags is ...
9,8,329,8_task_work_time_accept,"[task, work, time, accept, hour, busy, new, ac...","[task, task task, add task, single task, new t...","[task, work, accept task, new task, task task,...","[I don't get as many tasks, It's Sku Slay with..."


Now I look at how the outliers were reassigned into the the topics

In [None]:
# Step 1: Get original topics (before outlier reduction)
topics, _ = reduced_model2_2.transform(df_combined["translated_Comment"].tolist()) 

In [None]:

from collections import defaultdict
import random

# Step 2: Get new topics (after outlier reduction using embedding + 0.25 threshold)

new_topics_embedding025_2 = results[("embeddings", 0.25)]["topics"]
outlier_model_embedding025_2 = results[("embeddings", 0.25)]["model"]
documents = df_combined["translated_Comment"].tolist()

# Step 1: Find reassigned outliers (originally -1, now assigned to a topic)
reassigned_outliers = [
    (i, doc, new_topic)
    for i, (doc, old_topic, new_topic) in enumerate(zip(documents, topics, new_topics_embedding025))
    if old_topic == -1 and new_topic != -1
]

# Step 2: Group reassigned outliers by new topic
topic_to_outlier_docs = defaultdict(list)
for i, doc, new_topic in reassigned_outliers:
    topic_to_outlier_docs[new_topic].append(doc)

# Step 3: Print sampled outlier docs with ORIGINAL top words (from reduced_model2)
for topic_id in sorted(topic_to_outlier_docs.keys()):
    docs = topic_to_outlier_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))

    # Get original top words from reduced_model2
    original_topic_words = reduced_model2_2.get_topic(topic_id)
    if original_topic_words:
        top_words = ", ".join([word for word, _ in original_topic_words])
    else:
        top_words = "(No original top words)"

    print(f"Reassigned Outliers → Topic {topic_id} ({len(docs)} docs) | Original top words: {top_words}")
    for j, doc in enumerate(sampled_docs):  
        print(f"{j+1}. {doc}")
    print()



Reassigned Outliers → Topic 0 (945 docs) | Original top words: order, courier, hour, pay, new, payment, low, earning, work, wolt
1. Increase the salaries or no one will work
2. When the weather is like this right now (storms/rain/snowy), you should raise the payment for every order by 25% to 30%. That will motivate the couriers to work on these days. Putting 3x effort than the usual on a day like this, getting vehicle spoilt, and getting paid the same is something where Wolt has failed me.
3. The payments given are low. Night workers should receive extra payment.
4. STOP INCOMING PEOPLE IN AALBORG!!! RISE THE PAYMENT
5. Wolt does not behave fairly. During 1.5 hours in one place, I was without an order, but the other colleagues had several orders
6. Whats going on with Wolt that is super dead? People don’t order anymore? Don’t think so. Maybe the competition is taking wolts place? Used to be great but now is just CRAP
7. The orders are very low and the amount that Volt pays is very low



topic 0: fits

topic 1: fits mostly

topic 2: fits

topic 3: fits

topic 4: most don't fit

topic 5: 50% passer

topic 6: mostly don't fit

topic 7: mostly don't fit

topic 8: mostly don't fit

topic 9: 50% fits

topic 10: 50% fits

topic 11: 50% fits

topic 12: 50% fits

Overordnet ligner det, at der er lidt form meget støj

Kigger nu på de outliers der er tilbage

In [None]:
from collections import defaultdict
import random


# Organize documents by new topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, new_topics_embedding025_2):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = -1
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic -1 (151 documents) ===
1. Wolt it's a fking shit
2. This whole operation is a fraud, a disgrace, a circus, an infantile pipe-dream. You can have it all
3. The last rasturante did not give me the right
4. A complete DISASTER. HATING IT A BIT MORE EVERY DAY HOW RIGGED THIS IS
5. Slavery 21st century version
6. You don’t deserve us
7. The algorithm is really intelligent
8. Seems very quiet whenever I go out lately.
9. Pinza odense needs to get a reality check ,as I was told to stand outside in the snow blizzard ,cause his floor would get dirty otherwise
10. you are thieves who rob us every day
11. So give me something to do, it can't fit this, what are you thinking about mannnnm
12. In do not listen anyway
13. It was raining a lot, but besides it, it was perfectly
14. A little quiet guard
15. Give me something to make, it r a scandal p.t
16. It would be great If I get naviagtions
17. Like an indicator that reminds us when we need a pin code from the customer
18. Stop stealing m

Mostly noise (hard to place in a topic). However most of it are aggresive tones

#### Inspecting of embedding045

In [None]:
topics_emb_045_2 = results[("embeddings", 0.45)]["model"]
topics_emb_045_2.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,1434,-1_work_time_wolt_money,"[work, time, wolt, money, pay, good, day, like...","[earn, work, pay, hire, salary, job, money, le...","[work, pay, day, hour, tax, customer, sunday, ...","[Hello with you ""Wolt"" I will start by mention..."
1,0,2386,0_order_courier_hour_wolt,"[order, courier, hour, wolt, pay, low, payment...","[get order, receive order, order order, pay or...","[courier, wolt, low, payment, work, day, earni...","[More orders would be delicious, There are not..."
2,1,1387,1_ready_wait_order_time,"[ready, wait, order, time, restaurant, food, m...","[food ready, restaurant, order ready, delay, w...","[restaurant, mcdonald, min, order ready, food ...",[Ask Pick up places not to mark the order read...
3,2,1213,2_happy_wolt_good_excellent,"[happy, wolt, good, excellent, day, thank, wor...","[love wolt, wolt happy, happy wolt, wolt good,...","[day, happy wolt, wolt happy, excellent day, e...","[i am very happy with wolt, i am very happy wi..."
4,3,1031,3_app_order_work_time,"[app, order, work, time, app work, support, ta...","[app work, order app, app app, app, app let, w...","[app, app work, error, delivery, crash, app le...","[App crashes. App lets me drive for miles, the..."
5,4,798,4_km_distance_delivery_pay,"[km, distance, delivery, pay, drive, car, driv...","[wolt, favor wolt, fee, delivery, deliver, app...","[km, delivery, driver, kr, payment, bicycle, k...",[You say it is 0.1km ekstra but actual distanc...
6,5,389,5_support_wolt support_team_chat,"[support, wolt support, team, chat, help, bad,...","[wolt support, support support, support good, ...","[support, wolt support, chat, support good, su...","[Haven't used Wolt Support, so is still uncert..."
7,6,386,6_map_address_app_google,"[map, address, app, google, location, gps, map...","[google map, google maps, map app, wrong addre...","[map, address, app, gps, maps, google map, goo...",[Still problems with the addressees in Google ...
8,7,299,7_bag_drink_pack_order,"[bag, drink, pack, order, food, cup, restauran...","[bag order, food bag, paper bag, use bag, bag ...","[bag, pack, soda, spill, paper bag, lid, mcdon...",[McDonalds City putting orders in big bags is ...
9,8,223,8_task_time_accept_work,"[task, time, accept, work, busy, hour, task ta...","[task come, wait task, single task, add task, ...","[busy, task task, accept task, new task, time ...","[I don't get as many tasks, It's Sku Slay with..."


In [None]:
# Step 1: Get original topics (before outlier reduction)
topics, _ = reduced_model2_2.transform(df_combined["translated_Comment"].tolist()) 

In [None]:

from collections import defaultdict
import random

# Step 2: Get new topics (after outlier reduction using embedding + 0.25 threshold)
new_topics_embedding045_2 = results[("embeddings", 0.45)]["topics"]

# Step 1: Find reassigned outliers (originally -1, now assigned to a topic)
reassigned_outliers = [
    (i, doc, new_topic)
    for i, (doc, old_topic, new_topic) in enumerate(zip(documents, topics, new_topics_embedding045))
    if old_topic == -1 and new_topic != -1
]

# Step 2: Group reassigned outliers by new topic
topic_to_outlier_docs = defaultdict(list)
for i, doc, new_topic in reassigned_outliers:
    topic_to_outlier_docs[new_topic].append(doc)

# Step 3: Print sampled outlier docs with ORIGINAL top words (from reduced_model2)
for topic_id in sorted(topic_to_outlier_docs.keys()):
    docs = topic_to_outlier_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))

    # Get original top words from reduced_model2
    original_topic_words = reduced_model2_2.get_topic(topic_id)
    if original_topic_words:
        top_words = ", ".join([word for word, _ in original_topic_words])
    else:
        top_words = "(No original top words)"

    print(f"Reassigned Outliers → Topic {topic_id} ({len(docs)} docs) | Original top words: {top_words}")
    for j, doc in enumerate(sampled_docs):  
        print(f"{j+1}. {doc}")
    print()


Reassigned Outliers → Topic 0 (720 docs) | Original top words: order, courier, hour, pay, new, payment, low, earning, work, wolt
1. This is not a complaint, but I would like to draw your attention to the behavior of the Volt Market Soborg employees. Firstly, they are in no hurry to give orders to the couriers, sometimes pretending that they simply do not notice the couriers, that they are standing and waiting for their orders. And secondly, they give orders with such an air as if they are doing someone a favor. Frankly speaking, on some shifts you don’t even want to take orders there because of such behavior of the staff. I hope that you will help them to respect the couriers a little, since this is your structure and you have a direct influence on them.
2. Wolt needs to do something about the venues who mark the order as ready and it's not ready. We loose time going to the venue, expecting the order is ready. When we get there they say it's not ready and that we need to wait 10 or 15+

topic 0: 50% fits

topic 1: mostly fits

topic 2: fits

topic 3: fits

topic 4: mostly fits

topic 5: fits

topic 6: 50% fits

topic 7: 50% fits

topic 8: mostly don't fit

topic 9: mostly don't fit

topic 10: fits 

topic 11: mostly fits

topic 12: mostly don't fit

Kigger nu på de outliers der er tilbage

In [196]:
from collections import defaultdict
import random


# Organize documents by new topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, new_topics_embedding045):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = -1
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic -1 (1434 documents) ===
1. All good, except for Mc Kastrup. They are without exception, always between 7-10 min backwards. Apology we are so busy they have such a disrespect to us cures
2. Hi It's seemed distances are longer than which are indicated in the tasks. There's no difference between first floor and fourth floor . In my opinion it should be added cost for each sale on destination , because of used energy and time for climbing the stairs . Thanks
3. Have an excellent Sunday
4. Add ratings to each venue after task 1-5 stars. To get the venues to become better. Best rated venue get diplomas to hang in restaurant
5. Stop asking for feedback repeatedly!
6. No extra boost this Saturday due to Monday "free day" ..... come on Wolt!!!! Is that the best way to motivate us to jump on the road??!!??
7. It is a scam, it is not worth putting in physical effort for this company.
8. There's not registered km drive ... Is there an error?
9. Make option in settings for rider to be ab

Nearly 50% could actually fit in a topic

#### Inspecting of embedding055

In [None]:
topics_emb_055_2 = results[("embeddings", 0.55)]["model"]
topics_emb_055_2.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,2628,-1_order_work_time_wolt,"[order, work, time, wolt, excellent, day, hour...","[order, deliver, wolt, delivery, earn, work, b...","[order, work, day, hour, pay, delivery, custom...","[Hello with you ""Wolt"" I will start by mention..."
1,0,2072,0_order_courier_hour_pay,"[order, courier, hour, pay, wolt, low, payment...","[order, get order, order order, receive order,...","[courier, hour, wolt, payment, work, earning, ...","[More orders would be delicious, There are not..."
2,1,1273,1_ready_wait_order_time,"[ready, wait, order, time, food, restaurant, m...","[food ready, order ready, delay, waiting time,...","[restaurant, burger, mcdonald, order ready, fo...",[Ask Pick up places not to mark the order read...
3,2,1024,2_happy_wolt_good_work,"[happy, wolt, good, work, happy wolt, thank, d...","[love wolt, wolt happy, happy wolt, wolt, wolt...","[happy, happy wolt, day, wolt happy, work wolt...","[i am very happy with wolt, i am very happy wi..."
4,3,924,3_app_order_work_time,"[app, order, work, time, app work, support, ta...","[app app, app, app work, fix app, problem app,...","[app work, error, hour, app let, drive mile, o...","[App crashes. App lets me drive for miles, the..."
5,4,634,4_km_distance_delivery_drive,"[km, distance, delivery, drive, pay, order, dr...","[distance payment, wolt, fee, favor wolt, dist...","[km, delivery, drive, payment, kr, bike, app, ...",[You say it is 0.1km ekstra but actual distanc...
6,5,364,5_support_wolt support_team_chat,"[support, wolt support, team, chat, bad, help,...","[wolt support, support wolt, support support, ...","[support, wolt support, chat, support team, su...","[Haven't used Wolt Support, so is still uncert..."
7,6,351,6_map_address_app_google,"[map, address, app, google, gps, location, map...","[google map, google maps, map app, wrong addre...","[map, address, gps, maps, google map, google m...",[Still problems with the addressees in Google ...
8,7,283,7_bag_drink_pack_food,"[bag, drink, pack, food, order, cup, soda, pap...","[bag order, food bag, paper bag, use bag, bag ...","[bag, pack, soda, spill, paper bag, lid, mcdon...",[McDonalds City putting orders in big bags is ...
9,8,178,8_task_time_accept_task task,"[task, time, accept, task task, second, new, n...","[task come, task, task task, wait task, new ta...","[task task, new task, accept task, time task, ...","[I don't get as many tasks, It's Sku Slay with..."


In [None]:
# Step 1: Get original topics (before outlier reduction)
topics, _ = reduced_model2_2.transform(df_combined["translated_Comment"].tolist()) 

In [None]:

from collections import defaultdict
import random

# Step 2: Get new topics (after outlier reduction using embedding + 0.25 threshold)
new_topics_embedding055_2 = results[("embeddings", 0.55)]["topics"]

# Step 1: Find reassigned outliers (originally -1, now assigned to a topic)
reassigned_outliers = [
    (i, doc, new_topic)
    for i, (doc, old_topic, new_topic) in enumerate(zip(documents, topics, new_topics_embedding055))
    if old_topic == -1 and new_topic != -1
]

# Step 2: Group reassigned outliers by new topic
topic_to_outlier_docs = defaultdict(list)
for i, doc, new_topic in reassigned_outliers:
    topic_to_outlier_docs[new_topic].append(doc)

# Step 3: Print sampled outlier docs with ORIGINAL top words (from reduced_model2)
for topic_id in sorted(topic_to_outlier_docs.keys()):
    docs = topic_to_outlier_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))

    # Get original top words from reduced_model2
    original_topic_words = reduced_model2_2.get_topic(topic_id)
    if original_topic_words:
        top_words = ", ".join([word for word, _ in original_topic_words])
    else:
        top_words = "(No original top words)"

    print(f"Reassigned Outliers → Topic {topic_id} ({len(docs)} docs) | Original top words: {top_words}")
    for j, doc in enumerate(sampled_docs):  
        print(f"{j+1}. {doc}")
    print()


Reassigned Outliers → Topic 0 (406 docs) | Original top words: order, courier, hour, pay, new, payment, low, earning, work, wolt
1. Instead of sending 56 messages about "super busy weekend" could try paying more. It's a shame how little are paying per order, day after day a little less.
2. Got my first 2 deliveries ever today, it was impeccable. However I didn't get any other orders after that for an hour
3. 1 every 45 min? MAX 90kr!? In the hour .. Are you serious? People are being sent, why are you slowed down? You literally run KBH round ends with 1-35kr! ? Fair distribution? The r destination, yesterday 35-120 kr lunch time! Is that good? Want to work for 35 kr/ or whatever get entered in the system per hour process is running on? Be honest. It r total disappointment and bad. This one yesterday 15/2 Kl- 9-230 kr 10-35kr. ??? 11- 50kr+ 80kr. ??? 12-1/100+45! ??? 13-125 + 45kr +45 (215)
4. It is a shame how much you're paying us for orders on days when there is rain and wind.
5. Very

topic 0: fits

topic 1: fits

topic 2: fits

topic 3: mostly fits

topic 4: mostly don't fit

topic 5: fits

topic 6: mostly don't fit

topic 7: 50% fits

topic 8: mostly don't fit

topic 9: mostly fit.

topic 10: fits 

topic 11: fits

topic 12: 50% fit

In [242]:
from collections import defaultdict
import random


# Organize documents by new topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, new_topics_embedding055):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = -1
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic -1 (2628 documents) ===
1. Would be smart with a feature where you could choose restaurants from which you did not want to have orders from. Are you will and get extremely many from places I can't get to or park.
2. Annoying with no reject button in bundle mode. Just got spammed with McDonald’s yesterday. And had to wait like 1-2 min x +10 times.
3. People are super happy to have food delivered and other delivered directly to the door. I feel I help some through a busy everyday life and make life a little more festive for others.
4. Customers will have to understand that the green location marking is what we Wolt Bude is running for. It is very important that it is precise and put precisely on their front door
5. Things are very slow in my area.
6. The code required from customers is annoying and a waste of time! Almost gona quit working due to its effects!
7. Have an excellent week
8. Same problem as last time
9. Very thin on orders today, it didn't exactly feel like "peak 

Some are noise, but some could be fitted into topics

#### Inspecting of embedding050

In [None]:
outlier_model_embedding025_2 = copy.deepcopy(reduced_model2_2)

In [None]:
# Reduce outliers using the `embeddings` strategy
new_topics_embedding05_2 = model_copy.reduce_outliers(documents, topics, threshold=0.5, strategy="embeddings")
# puts the outliers into one of the topics. The documents already assigfned to a topic will not be changed. 
# Now we need to update the model with the new assignments of the outliers, and new topic representations are calculated
# when specifying the representation model, KEYBERT and MMR are also displayed
outlier_model_embedding025_2.update_topics(documents, topics = new_topics_embedding05_2, vectorizer_model= vectorizer_model, representation_model=representation_model)




In [None]:
#model_copy = results[("embeddings", 0.25)]["model"]
outlier_model_embedding025.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,2044,-1_work_time_wolt_order,"[work, time, wolt, order, excellent, hour, mon...","[week excellent, excellent week, weekend excel...","[work, hour, money, day, pay, customer, delive...","[Hello with you ""Wolt"" I will start by mention..."
1,0,2237,0_order_courier_hour_pay,"[order, courier, hour, pay, wolt, low, payment...","[order order, get order, order, order low, rec...","[courier, hour, low, payment, work, earning, d...","[More orders would be delicious, There are not..."
2,1,1331,1_ready_wait_order_time,"[ready, wait, order, time, food, restaurant, m...","[food ready, order ready, minute order, time r...","[restaurant, mcdonald, min, order ready, food ...",[Ask Pick up places not to mark the order read...
3,2,1093,2_happy_wolt_good_day,"[happy, wolt, good, day, happy wolt, work, tha...","[love wolt, wolt happy, happy wolt, wolt, wolt...","[happy, day, happy wolt, wolt happy, work wolt...","[i am very happy with wolt, i am very happy wi..."
4,3,991,3_app_order_work_time,"[app, order, work, time, app work, support, ta...","[app app, app work, app, fix app, app crash, a...","[app work, error, delivery, app let, drive mil...","[App crashes. App lets me drive for miles, the..."
5,4,717,4_km_distance_delivery_drive,"[km, distance, delivery, drive, pay, car, orde...","[distance payment, wolt, favor wolt, fee, deli...","[km, delivery, drive, kr, payment, bike, kilom...",[You say it is 0.1km ekstra but actual distanc...
6,5,372,5_support_wolt support_team_chat,"[support, wolt support, team, chat, bad, help,...","[wolt support, support wolt, support support, ...","[support, wolt support, chat, support good, su...","[Haven't used Wolt Support, so is still uncert..."
7,6,365,6_map_address_app_google,"[map, address, app, google, gps, location, map...","[google map, google maps, map app, app map, co...","[map, address, app, gps, maps, google map, goo...",[Still problems with the addressees in Google ...
8,7,290,7_bag_drink_pack_food,"[bag, drink, pack, food, order, cup, soda, pap...","[bag order, paper bag, use bag, food bag, bag ...","[bag, pack, soda, spill, paper bag, lid, mcdon...",[McDonalds City putting orders in big bags is ...
9,8,198,8_task_time_accept_task task,"[task, time, accept, task task, hour, new, sec...","[task, task task, wait task, task come, get ta...","[task, task task, new task, accept task, busy,...","[I don't get as many tasks, It's Sku Slay with..."


In [None]:
# Step 1: Get original topics (before outlier reduction)
topics, _ = reduced_model2_2.transform(df_combined["translated_Comment"].tolist()) 

In [None]:
from collections import defaultdict
import random

# Step 2: Get new topics (after outlier reduction using embedding + 0.25 threshold)


# Step 1: Find reassigned outliers (originally -1, now assigned to a topic)
reassigned_outliers = [
    (i, doc, new_topic)
    for i, (doc, old_topic, new_topic) in enumerate(zip(documents, topics, new_topics_embedding05_2))
    if old_topic == -1 and new_topic != -1
]

# Step 2: Group reassigned outliers by new topic
topic_to_outlier_docs = defaultdict(list)
for i, doc, new_topic in reassigned_outliers:
    topic_to_outlier_docs[new_topic].append(doc)

# Step 3: Print sampled outlier docs with ORIGINAL top words (from reduced_model2)
for topic_id in sorted(topic_to_outlier_docs.keys()):
    docs = topic_to_outlier_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))

    # Get original top words from reduced_model2
    original_topic_words = reduced_model2_2.get_topic(topic_id)
    if original_topic_words:
        top_words = ", ".join([word for word, _ in original_topic_words])
    else:
        top_words = "(No original top words)"

    print(f"Reassigned Outliers → Topic {topic_id} ({len(docs)} docs) | Original top words: {top_words}")
    for j, doc in enumerate(sampled_docs):  
        print(f"{j+1}. {doc}")
    print()

Reassigned Outliers → Topic 0 (571 docs) | Original top words: order, courier, hour, pay, new, payment, low, earning, work, wolt
1. how can you take orders that disappear in a second,
2. The payment is lower than March
3. Way to few deliveries
4. normal?????? but no have orders what happened
5. The order estimates are surreal. Today, between 11.45 and 13.15 not even an order??? It's very strange...
6. I don't get orders in peak hours and distance is still not correct on many orders
7. No respect. You pay the orders even lower every day. I will go out deliver when you start respecting and paying decent amount
8. Is it possible to pay more money on orders because we are using petrol ( benzine) to deliver the food . Otherwise very happy….
9. No orders for an hour. Previous hour paid a lot below acceptable rate, best locations
10. Hello, I'm based in Vestegnen Vallensbæk and Ishøj, handling customer orders where we receive two or three offers per hour. I'm dedicated to contributing to Wolt


topic 0: fits

topic 1: fits mostly

topic 2: fits

topic 3: fits

topic 4: most don't fit

topic 5: 50% passer

topic 6: mostly don't fit

topic 7: mostly don't fit

topic 8: mostly don't fit

topic 9: 50% fits

topic 10: 50% fits

topic 11: 50% fits

topic 12: 50% fits

Overordnet ligner det, at der er lidt form meget støj

In [None]:
from collections import defaultdict
import random


# Organize documents by new topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, new_topics_embedding025_2):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = -1
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")

In [None]:
 
from octis.evaluation_metrics.coherence_metrics import Coherence
coherence_model = Coherence(texts=tokens_for_coherence, # reference corpus. Here it's all documents in the dataset
                            topk=10, #specifies the number of top words to consider for NPMI calculation
                            measure="c_npmi")
npmi_score = coherence_model.score({"topics": topic_words_target_model}) # average

# Calculate Topic Diversity
# the topic_diversity command was defned before the loop. This is not a standard command, so I defined how it works earlier.
diversity_score = topic_diversity(topics = topic_words_target_model, topk=10)

# Calculate Topic Quality by multiplying NPMI score and Topic Diversity
topic_quality = npmi_score * diversity_score 


Mean of empty slice.


invalid value encountered in scalar divide



In [80]:
for i, topic in enumerate(topic_words_target_model):
    score = coherence_model.score({"topics": [topic]})
    print(f"Topic {i}: NPMI = {score}")


Topic 0: NPMI = -0.009005572304378559
Topic 1: NPMI = 0.1402920801882038
Topic 2: NPMI = 0.04223354805439683
Topic 3: NPMI = 0.02140121370663232
Topic 4: NPMI = -0.04580471063209176
Topic 5: NPMI = -0.15760481957677328
Topic 6: NPMI = 0.17599019618695694
Topic 7: NPMI = 0.026913872987460865
Topic 8: NPMI = 0.07887086558831503
Topic 9: NPMI = 0.08594379337345763
Topic 10: NPMI = -0.12377918713783191
Topic 11: NPMI = 0.04619751059702082
Topic 12: NPMI = 0.05604190803107952
Topic 13: NPMI = -0.5272028101349079
Topic 14: NPMI = -0.05510848210649167
Topic 15: NPMI = -0.19818362204376036
Topic 16: NPMI = -0.46947643980528525



Mean of empty slice.


invalid value encountered in scalar divide



Topic 17: NPMI = nan
Topic 18: NPMI = -0.6001775191780095


### redcution of outliers for manuel cluster reduction

In [17]:
import copy

# Define the strategy-threshold combinations
combinations = [
    #("embeddings", 0.36),
    #("embeddings", 0.37),
    #("embeddings", 0.38),
    #("embeddings", 0.39),
    #("embeddings", 0.40),
    #("embeddings", 0.41),
    ("c-tf-idf", 0.075),
    ("c-tf-idf", 0.08),
    ("c-tf-idf", 0.085),
    ("c-tf-idf", 0.09),

    #("distributions", 0.09),
    #("distributions", 0.095),
    #("distributions", 0.10),
]

# Store results
results_c_tf_idf = {}

# Loop through each combination
for strategy, threshold in combinations:
    print(f"Running: strategy = {strategy}, threshold = {threshold}")
    
    # Deep copy the original reduced model
    model_copy = copy.deepcopy(reduced_model_manual)

    # Reduce outliers using specified strategy and threshold
    new_topics = model_copy.reduce_outliers(
        documents=documents,
        topics=topics,
        strategy=strategy,
        threshold=threshold
    )

    # Update topic representations
    model_copy.update_topics(
        docs=documents,  # <-- FIXED HERE
        topics=new_topics,
        vectorizer_model=vectorizer_model,
        #representation_model=representation_model
    )

    # Count number of outliers
    outlier_count = sum(1 for t in new_topics if t == -1)

    # Store results
    results_c_tf_idf[(strategy, threshold)] = {
        "model": model_copy,
        "topics": new_topics,
        "outlier_count": outlier_count
    }

# Optional: print summary of outlier counts
print("\nSummary of Outlier Counts:")
for (strategy, threshold), res in results_c_tf_idf.items():
    print(f"{strategy}, threshold={threshold}: {res['outlier_count']} outliers")


Running: strategy = c-tf-idf, threshold = 0.075




Running: strategy = c-tf-idf, threshold = 0.08




Running: strategy = c-tf-idf, threshold = 0.085




Running: strategy = c-tf-idf, threshold = 0.09





Summary of Outlier Counts:
c-tf-idf, threshold=0.075: 1587 outliers
c-tf-idf, threshold=0.08: 1758 outliers
c-tf-idf, threshold=0.085: 1945 outliers
c-tf-idf, threshold=0.09: 2122 outliers


First run

- embeddings, threshold=0.4: 2415 outliers

- embeddings, threshold=0.45: 3172 outliers

- embeddings, threshold=0.5: 3682 outliers
- c-tf-idf, threshold=0.07: 1406 outliers
- c-tf-idf, threshold=0.08: 1758 outliers
- c-tf-idf, threshold=0.09: 2122 outliers
- distributions, threshold=0.08: 1401 outliers
- distributions, threshold=0.09: 1657 outliers
- distributions, threshold=0.1: 1943 outliers

------------------------------------------------------

Second run

- embeddings, threshold=0.25: 457 outliers

- embeddings, threshold=0.3: 955 outliers
- embeddings, threshold=0.35: 1627 outliers
- c-tf-idf, threshold=0.075: 1587 outliers
- c-tf-idf, threshold=0.08: 1758 outliers
- c-tf-idf, threshold=0.085: 1945 outliers
- distributions, threshold=0.09: 1657 outliers
- distributions, threshold=0.095: 1819 outliers
- distributions, threshold=0.1: 1943 outliers

------------------------------------------------------

third run
- Summary of Outlier Counts:

- embeddings, threshold=0.36: 1772 outliers

- embeddings, threshold=0.37: 1928 outliers
- embeddings, threshold=0.38: 2111 outliers
- embeddings, threshold=0.39: 2260 outliers
- embeddings, threshold=0.4: 2415 outliers
- embeddings, threshold=0.41: 2583 outliers


#### Inspection of embeddings 0,35

In [196]:
# Step 1: Get original topics (before outlier reduction)
topics, _ = reduced_model_manual.transform(df_combined["translated_Comment"].tolist()) 

In [235]:

from collections import defaultdict
import random

# Step 2: Get new topics (after outlier reduction using embedding + 0.25 threshold)
topics_emb_035_manual = results[("embeddings", 0.4)]["topics"]

# Step 1: Find reassigned outliers (originally -1, now assigned to a topic)
reassigned_outliers = [
    (i, doc, new_topic)
    for i, (doc, old_topic, new_topic) in enumerate(zip(documents, topics, topics_emb_035_manual))
    if old_topic == -1 and new_topic != -1
]

# Step 2: Group reassigned outliers by new topic
topic_to_outlier_docs = defaultdict(list)
for i, doc, new_topic in reassigned_outliers:
    topic_to_outlier_docs[new_topic].append(doc)

# Step 3: Print sampled outlier docs with ORIGINAL top words (from reduced_model2)
for topic_id in sorted(topic_to_outlier_docs.keys()):
    docs = topic_to_outlier_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))

    # Get original top words from reduced_model_manual
    original_topic_words = reduced_model_manual.get_topic(topic_id)
    if original_topic_words:
        top_words = ", ".join([word for word, _ in original_topic_words])
    else:
        top_words = "(No original top words)"

    print(f"Reassigned Outliers → Topic {topic_id} ({len(docs)} docs) | Original top words: {top_words}")
    for j, doc in enumerate(sampled_docs):  
        print(f"{j+1}. {doc}")
    print()

Reassigned Outliers → Topic 0 (700 docs) | Original top words: courier, hour, new courier, work, order order, order courier, day, hire, earning, courier order
1. I would like to arrange for delivery.
2. Yesterday I had an order from Wolt Market and there was 6 bags and for that it would be needed 2 couriers. I was waiting for over 15 min to be ready and then decided to ask them about it because is not really usual to be that late. For my surprise they told me that only one of the couriers get the notification that is ready which for me make no sense. Could you please inform that to the IT team so that can be fixed otherwise the customer will receive an order within 10min and the rest who knows when?
3. I took an order that was almost an hour late, i get there and am told that somebody took the order already. Support tells mr i can wait for a remake. I wait for a remake and deliver it. This ordeal takes 30minutes on top of delivering - but according to your guidelines ( as i am told by 

In [198]:
from collections import defaultdict
import random


# Organize documents by new topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics_emb_035_manual):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = -1
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic -1 (1627 documents) ===
1. I could not be allowed to accept but well reject. Tried turning off my phone but teasing me. Trying again tomorrow, unfortunately, has no more time right now :(
2. Today was a very disappointed day, first you put the booster from 25 to 20% and try to sale a ideia of a mistake, typo, come on, we are not stupid. After this I made 25 deliveries and was supposed to be the big day bla bla bla, and with 25 deliveries I made not so different if I compare with 31/12, when I made 26 deliveries more the 1.300, today with 25 I made less than 1.300! For me we were cheated! Very disappointed and sad about.
3. Ah why this keep showing
4. It was my first guard today Seems it went wellited
5. Boost should be start from 18:00 pm to 21:00 pm. There is no use of boost in between 17:00 to 18:00 pm as the day time increases than before. People usually start their dinner after 18:00 pm. You must take action. Very low amount of order in between 17-18 pm . We have observe

#### Inspection of c-tf-idf 0,085

In [18]:

from collections import defaultdict
import random

# Step 2: Get new topics (after outlier reduction using embedding + 0.25 threshold)
topics_c_tf_idf_0085_manual = results_c_tf_idf[("c-tf-idf", 0.085)]["topics"]

# Step 1: Find reassigned outliers (originally -1, now assigned to a topic)
reassigned_outliers = [
    (i, doc, new_topic)
    for i, (doc, old_topic, new_topic) in enumerate(zip(documents, topics, topics_c_tf_idf_0085_manual))
    if old_topic == -1 and new_topic != -1
]

# Step 2: Group reassigned outliers by new topic
topic_to_outlier_docs = defaultdict(list)
for i, doc, new_topic in reassigned_outliers:
    topic_to_outlier_docs[new_topic].append(doc)

# Step 3: Print sampled outlier docs with ORIGINAL top words (from reduced_model2)
for topic_id in sorted(topic_to_outlier_docs.keys()):
    docs = topic_to_outlier_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))

    # Get original top words from reduced_model_manual
    original_topic_words = reduced_model_manual.get_topic(topic_id)
    if original_topic_words:
        top_words = ", ".join([word for word, _ in original_topic_words])
    else:
        top_words = "(No original top words)"

    print(f"Reassigned Outliers → Topic {topic_id} ({len(docs)} docs) | Original top words: {top_words}")
    for j, doc in enumerate(sampled_docs):  
        print(f"{j+1}. {doc}")
    print()

Reassigned Outliers → Topic 0 (242 docs) | Original top words: courier, order, task, hour, new, new courier, time, work, order order, home
1. Busy time without orders
2. I don't know if it's the app or to meny currier/drivers. But the last week I worked from 11 to around 13:30 o'clock with so few orders that I actually lost money on gas and time on it. Just take today's peak time 1 hour and 34 minutes and got only 1 order on 40kr drove the whole city so meny times. There most be a solution on this matter.
3. Absolutely all is good just with 2 things I’m not happy. 1. Waiting time with the restaurants - they don’t care about the couriers and let us wait sometimes more then 20min for 1 order , then this is absolutely loosing time and nobody pay us for that. 2. The busy time is just 17-20h if you’re not able to work at that time in general you will not earn money from Wolt
4. No enough orders. A lot of new partners
5. no work no order what's going on?
6. A lot of new partners. I don't hav

In [19]:
from collections import defaultdict
import random


# Organize documents by new topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics_c_tf_idf_0085_manual):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = -1
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic -1 (1945 documents) ===
1. I can not go online for some weeks
2. Too much spam messaging
3. Being offered too many trips where I can't keep legally with my car.
4. Last 3 weeks get less assignments.
5. Too little money compared to how much gasoline has been used
6. Slavery 21st century version
7. We can’t get any order. You hired a kot of people so we can’t earn money anymore. Area demand always quiet. Please fix this problem
8. I get few orders despite good demand. It's like I've been blacklisted. It's just not clear why.
9. Can you please explain to me why I deserve to be scammed so much
10. one customer's adress was shown not where it was by ~100m. This caused the delivery to be a bit slower
11. Burger King does not get delivery under the hot lamp, and then you have to bring your bike or scooter in. enough to be sure that Wolt is blamed for cold items
12. Still trying to figure out how to earn enough. Maybe an ebike is necessary...
13. I have requested the termination of 

#### Inspection of distribution 0,9

In [212]:

from collections import defaultdict
import random

# Step 2: Get new topics (after outlier reduction using embedding + 0.25 threshold)
topics_distribution009_manual = results[("distributions", 0.1)]["topics"]

# Step 1: Find reassigned outliers (originally -1, now assigned to a topic)
reassigned_outliers = [
    (i, doc, new_topic)
    for i, (doc, old_topic, new_topic) in enumerate(zip(documents, topics, topics_distribution009_manual))
    if old_topic == -1 and new_topic != -1
]

# Step 2: Group reassigned outliers by new topic
topic_to_outlier_docs = defaultdict(list)
for i, doc, new_topic in reassigned_outliers:
    topic_to_outlier_docs[new_topic].append(doc)

# Step 3: Print sampled outlier docs with ORIGINAL top words (from reduced_model2)
for topic_id in sorted(topic_to_outlier_docs.keys()):
    docs = topic_to_outlier_docs[topic_id]
    sampled_docs = random.sample(docs, min(10, len(docs)))

    # Get original top words from reduced_model_manual
    original_topic_words = reduced_model_manual.get_topic(topic_id)
    if original_topic_words:
        top_words = ", ".join([word for word, _ in original_topic_words])
    else:
        top_words = "(No original top words)"

    print(f"Reassigned Outliers → Topic {topic_id} ({len(docs)} docs) | Original top words: {top_words}")
    for j, doc in enumerate(sampled_docs):  
        print(f"{j+1}. {doc}")
    print()

Reassigned Outliers → Topic 0 (230 docs) | Original top words: courier, hour, new courier, work, order order, order courier, day, hire, earning, courier order
1. Everything went wrong :-( Many illum gave my order to the other courier, another restaurant very delayed and seemed nothing wrong about it. Wolt courier should be able to give restaurants reviews
2. Couriers should be notify which floor the order is going to. Today I had all the orders in the 4th floor and I am not paid for that. I should be able to decide if I tak the task knowing the whole complexity of it
3. I don't know if it's the app or to meny currier/drivers. But the last week I worked from 11 to around 13:30 o'clock with so few orders that I actually lost money on gas and time on it. Just take today's peak time 1 hour and 34 minutes and got only 1 order on 40kr drove the whole city so meny times. There most be a solution on this matter.
4. Slightly higher fees for us driving a car would be nice. I get all too often ta

In [205]:
from collections import defaultdict
import random


# Organize documents by new topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics_distribution009_manual):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = -1
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic -1 (1657 documents) ===
1. Ang. It released Memo last Friday that it is expected to be busy in KBH on Sunday night. This did not really turn out to be true, despite many customers returning home from vacation. I had also thought it would be busy, but it just shows how hard it is to predict demand. I hope you are doing an internal evaluation on this announcement, because you can next rely on a future announcement from you about great demand?
2. Macdonald Bellarup wasted more than 20 mints
3. Streeet namesare to smalll. Count not swipe "delivered" 2out of5 times..
4. Not so much and do right now, but that summer
5. Have an excellent Sunday
6. Every day I earn less, a scam!!
7. Almost no order. Sometimes you earn 200kr per hour - other times a maximum of 100.
8. It is thrilled that the extra money does not work when you pick up 15:59 and it starts at 16, so you have to wait to accept instead of taking it.
9. City gave me a free pizza :)
10. The job became so much frustrating an

### Update the topic model for outliers reduction

In [20]:
import copy

reduced_outlier_model_manual = copy.deepcopy(reduced_model_manual)

In [21]:
reduced_outlier_model_manual.update_topics(documents, topics = topics_c_tf_idf_0085_manual, vectorizer_model= vectorizer_model)




In [22]:
reduced_outlier_model_manual.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1945,-1_order_delivery_time_work,"[order, delivery, time, work, wolt, excellent,...","[Hello with you ""Wolt"" I will start by mention..."
1,0,1315,0_order_courier_hour_task,"[order, courier, hour, task, time, new, work, ...","[Didn't get some order, There are not many ord..."
2,1,1440,1_ready_wait_order_time,"[ready, wait, order, time, food, minute, resta...","[Dear Wolt support, I would like to file anoth..."
3,2,1206,2_happy_wolt_good_work,"[happy, wolt, good, work, day, thank, happy wo...","[i am very happy with wolt, i am very happy wi..."
4,3,1120,3_low_pay_payment_hour,"[low, pay, payment, hour, price, order, work, ...","[The payment is a shameless, The payments are ..."
5,4,806,4_app_time_app work_order,"[app, time, app work, order, work, error, supp...",[My app teasing therefore i have not been to w...
6,5,657,5_km_distance_delivery_drive,"[km, distance, delivery, drive, pay, car, driv...",[The adress of one of my delivery's today was ...
7,6,449,6_support_wolt support_help_chat,"[support, wolt support, help, chat, bad, team,...","[Wolt support not good.verybad, Support always..."
8,7,383,7_address_map_google_location,"[address, map, google, location, app, gps, map...",[Sometimes wolt app doesn't redirect to accura...
9,8,314,8_bag_drink_food_pack,"[bag, drink, food, pack, order, cup, soda, pap...",[McDonalds City putting orders in big bags is ...


In [32]:

# Display top 10 word representations for each topic (excluding outlier topic -1)
topic_info = reduced_outlier_model_manual.get_topic_info()
for topic_id in topic_info.Topic:
    if topic_id != -1:
        words = reduced_outlier_model_manual.get_topic(topic_id)
        if words:
            top_words = [word for word, _ in words[:10]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

Topic 0: order, courier, hour, task, time, new, work, 2, order order, receive
Topic 1: ready, wait, order, time, food, minute, restaurant, burger, mark, mcdonald
Topic 2: happy, wolt, good, work, day, thank, happy wolt, job, love, wolt happy
Topic 3: low, pay, payment, hour, price, order, work, bad, earning, wolt
Topic 4: app, time, app work, order, work, error, support, task, accept, wolt
Topic 5: km, distance, delivery, drive, pay, car, driver, order, long, wolt
Topic 6: support, wolt support, help, chat, bad, team, wolt, contact, support bad, support team
Topic 7: address, map, google, location, app, gps, maps, wrong, google map, google maps
Topic 8: bag, drink, food, pack, order, cup, soda, paper, spill, restaurant
Topic 9: wolt market, market, wolt, order wolt, order, market vanløse, vanløse, danish, copenhagen, hour
Topic 10: bundle, order, bundle order, pay, task, payment, system, new, order bundle, accept
Topic 11: bonus, weekend, excellent weekend, weekend excellent, weekend b

## Fine tune topic representations

In [23]:
import copy

reduced_outlier_model_manual_topics = copy.deepcopy(reduced_outlier_model_manual)

In [42]:
keybert_model = KeyBERTInspired()
mmr_model = MaximalMarginalRelevance(diversity=0.4) #0.2 is used in the book.

representation_model = {
    "Main": mmr_model,
    "KeyBERT": keybert_model,
    
}

In [43]:
reduced_outlier_model_manual_topics.update_topics(documents, topics = topics_c_tf_idf_0085_manual, vectorizer_model= vectorizer_model, representation_model=representation_model)




In [44]:
reduced_outlier_model_manual_topics.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,1945,-1_work_customer_deliver_day,"[work, customer, deliver, day, hour, bad, busy...","[week excellent, excellent week, excellent sun...","[delivery, work, customer, deliver, day, hour,...","[Hello with you ""Wolt"" I will start by mention..."
1,0,1315,0_hour_work_order order_receive,"[hour, work, order order, receive, new courier...","[get order, order order, order, wait order, re...","[courier, hour, work, order order, new courier...","[Didn't get some order, There are not many ord..."
2,1,1440,1_order_minute_mark_mcdonald,"[order, minute, mark, mcdonald, food ready, de...","[food ready, order ready, delay, restaurant, m...","[minute, restaurant, burger, mcdonald, order r...","[Dear Wolt support, I would like to file anoth..."
3,2,1206,2_happy_work_day_happy wolt,"[happy, work, day, happy wolt, wolt happy, wor...","[love wolt, wolt happy, happy wolt, wolt, wolt...","[happy, day, happy wolt, wolt happy, excellent...","[i am very happy with wolt, i am very happy wi..."
4,3,1120,3_low_hour_work_wolt,"[low, hour, work, wolt, dkk, system, earn, lik...","[low payment, low pay, order pay, payment, pay...","[low, pay, payment, hour, work, earning, salar...","[The payment is a shameless, The payments are ..."
5,4,806,4_app work_work_error_delivery,"[app work, work, error, delivery, restart, ord...","[app work, fix app, app app, app, problem app,...","[app, app work, error, crash, fix, restart, or...",[My app teasing therefore i have not been to w...
6,5,657,5_delivery_drive_kr_payment,"[delivery, drive, kr, payment, work, bicycle, ...","[wolt, favor wolt, hello wolt, fee, fairly app...","[km, delivery, drive, pay, kr, payment, app, b...",[The adress of one of my delivery's today was ...
7,6,449,6_support_wolt support_chat_support team,"[support, wolt support, chat, support team, su...","[wolt support, support wolt, support bad, supp...","[support, wolt support, chat, support bad, sup...","[Wolt support not good.verybad, Support always..."
8,7,383,7_location_gps_google map_copy,"[location, gps, google map, copy, wrong addres...","[address wrong, wrong address, incorrect addre...","[address, map, app, gps, maps, google map, goo...",[Sometimes wolt app doesn't redirect to accura...
9,8,314,8_pack_order_soda_spill,"[pack, order, soda, spill, customer, paper bag...","[paper bag, bag order, use bag, pack bag, food...","[bag, pack, soda, spill, paper bag, lid, mcdon...",[McDonalds City putting orders in big bags is ...


In [45]:
# Display top 10 word representations for each topic (excluding outlier topic -1)
topic_info = reduced_outlier_model_manual_topics.get_topic_info()
for topic_id in topic_info.Topic:
    if topic_id != -1:
        words = reduced_outlier_model_manual_topics.get_topic(topic_id)
        if words:
            top_words = [word for word, _ in words[:10]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

Topic 0: hour, work, order order, receive, new courier, wolt, order courier, today, people, lot
Topic 1: order, minute, mark, mcdonald, food ready, delay, late, burger king, 5, waste
Topic 2: happy, work, day, happy wolt, wolt happy, work wolt, day excellent, today, team, happy work
Topic 3: low, hour, work, wolt, dkk, system, earn, like, wage, pay order
Topic 4: app work, work, error, delivery, restart, order app, bug, try, app crash, km
Topic 5: delivery, drive, kr, payment, work, bicycle, long distance, trip, kilometer, app favor
Topic 6: support, wolt support, chat, support team, support good, support help, contact support, thank support, support support, message
Topic 7: location, gps, google map, copy, wrong address, wolt app, problem, copy address, house, card
Topic 8: pack, order, soda, spill, customer, paper bag, lid, mcdonald, seal, packaging
Topic 9: wolt market, order, market vanløse, denmark, courier, ote, delivery, today, app wolt, employee
Topic 10: bundle, payment, orde

## Save / Load the final model

In [46]:
reduced_outlier_model_manual_topics.save("reduced_outlier_model_manual_topics")




In [None]:
from bertopic import BERTopic

model = BERTopic.load("reduced_outlier_model_manual_topics")

## Inspect final model

In [50]:
reduced_outlier_model_manual_topics.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,1945,-1_work_customer_deliver_day,"[work, customer, deliver, day, hour, bad, busy...","[week excellent, excellent week, excellent sun...","[delivery, work, customer, deliver, day, hour,...","[Hello with you ""Wolt"" I will start by mention..."
1,0,1315,0_hour_work_order order_receive,"[hour, work, order order, receive, new courier...","[get order, order order, order, wait order, re...","[courier, hour, work, order order, new courier...","[Didn't get some order, There are not many ord..."
2,1,1440,1_order_minute_mark_mcdonald,"[order, minute, mark, mcdonald, food ready, de...","[food ready, order ready, delay, restaurant, m...","[minute, restaurant, burger, mcdonald, order r...","[Dear Wolt support, I would like to file anoth..."
3,2,1206,2_happy_work_day_happy wolt,"[happy, work, day, happy wolt, wolt happy, wor...","[love wolt, wolt happy, happy wolt, wolt, wolt...","[happy, day, happy wolt, wolt happy, excellent...","[i am very happy with wolt, i am very happy wi..."
4,3,1120,3_low_hour_work_wolt,"[low, hour, work, wolt, dkk, system, earn, lik...","[low payment, low pay, order pay, payment, pay...","[low, pay, payment, hour, work, earning, salar...","[The payment is a shameless, The payments are ..."
5,4,806,4_app work_work_error_delivery,"[app work, work, error, delivery, restart, ord...","[app work, fix app, app app, app, problem app,...","[app, app work, error, crash, fix, restart, or...",[My app teasing therefore i have not been to w...
6,5,657,5_delivery_drive_kr_payment,"[delivery, drive, kr, payment, work, bicycle, ...","[wolt, favor wolt, hello wolt, fee, fairly app...","[km, delivery, drive, pay, kr, payment, app, b...",[The adress of one of my delivery's today was ...
7,6,449,6_support_wolt support_chat_support team,"[support, wolt support, chat, support team, su...","[wolt support, support wolt, support bad, supp...","[support, wolt support, chat, support bad, sup...","[Wolt support not good.verybad, Support always..."
8,7,383,7_location_gps_google map_copy,"[location, gps, google map, copy, wrong addres...","[address wrong, wrong address, incorrect addre...","[address, map, app, gps, maps, google map, goo...",[Sometimes wolt app doesn't redirect to accura...
9,8,314,8_pack_order_soda_spill,"[pack, order, soda, spill, customer, paper bag...","[paper bag, bag order, use bag, pack bag, food...","[bag, pack, soda, spill, paper bag, lid, mcdon...",[McDonalds City putting orders in big bags is ...


In [59]:
# Display top 10 word representations for each topic (excluding outlier topic -1)
topic_info = reduced_outlier_model_manual_topics.get_topic_info()
for topic_id in topic_info.Topic:
    if topic_id != -1:
        words = reduced_outlier_model_manual_topics.get_topic(topic_id)
        if words:
            top_words = [word for word, _ in words[:10]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

Topic 0: hour, work, order order, receive, new courier, wolt, order courier, today, people, lot
Topic 1: order, minute, mark, mcdonald, food ready, delay, late, burger king, 5, waste
Topic 2: happy, work, day, happy wolt, wolt happy, work wolt, day excellent, today, team, happy work
Topic 3: low, hour, work, wolt, dkk, system, earn, like, wage, pay order
Topic 4: app work, work, error, delivery, restart, order app, bug, try, app crash, km
Topic 5: delivery, drive, kr, payment, work, bicycle, long distance, trip, kilometer, app favor
Topic 6: support, wolt support, chat, support team, support good, support help, contact support, thank support, support support, message
Topic 7: location, gps, google map, copy, wrong address, wolt app, problem, copy address, house, card
Topic 8: pack, order, soda, spill, customer, paper bag, lid, mcdonald, seal, packaging
Topic 9: wolt market, order, market vanløse, denmark, courier, ote, delivery, today, app wolt, employee
Topic 10: bundle, payment, orde

In [129]:
# Get topic assignments for each document
topics2, _ = reduced_outlier_model_manual_topics.transform(df_combined["translated_Comment"].tolist())

In [136]:
from collections import defaultdict
import random

# Organize documents by their assigned topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics2):
    topic_to_docs[topic].append(doc)

# Iterate over sorted topic IDs
for topic_id in sorted(topic_to_docs.keys()):
    docs = topic_to_docs[topic_id]
    
    # Filter to only include feedback with length ≤ 150
    short_docs = [doc for doc in docs if len(doc) <= 400000]
    sampled_docs = random.sample(short_docs, min(20, len(short_docs)))  # Randomly select up to 10 docs
    
    print(f"\n -- Topic {topic_id} -- ")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")




 -- Topic -1 -- 
1. Fist shift was ok not as busy as i expected
2. After the update, the app works very poorly
3. No money. Working for nothing
4. Nothing works in appropriate condition. App doesn't see the real distance between me and restaurants. DDS in all it's "beauty"
5. Hello with you "Wolt" I will start by mentioning that I have been very happy with my time at Wolt and proud to tell how well we serve, as today is called how well we used to and earn, because that time is over. You talk about listening to us cures and fairly done the app. First you put our starting fee down from 45,- to 35,- earlier it was a bird spacing, such as praying at Islands Brygge, birds distance of 450 meters above the water when reality was 2 km. Ñu has once again fixed the app in favor of Wolt and not us who have made you rich. In the video on YouTube, the guy tells us that you have listened to us, but that is not entirely correct or you have misunderstood something. Now you have again changed the app 

# Model 3

In [90]:
from bertopic import BERTopic

loaded_model_3 = BERTopic.load("simple_model3")
print("Model loaded successfully.")

Model loaded successfully.


In [91]:
loaded_model_3.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4111,-1_order_time_wolt_app,"[order, time, wolt, app, work, delivery, custo...",[The world's bad wolt guard. It requires a lit...
1,0,868,0_good_happy_wolt_work,"[good, happy, wolt, work, thank, day, love, jo...","[wolt is the best i am happy with wolt, I am v..."
2,1,633,1_order_order order_hour_home,"[order, order order, hour, home, wait, home or...","[if there are no orders, I'll be off in a whil..."
3,2,624,2_distance_km_delivery_drive,"[distance, km, delivery, drive, pay, long, car...","[Where is the delivery’s ?!, Deliveries should..."
4,3,452,3_payment_earning_pay_salary,"[payment, earning, pay, salary, money, low, wa...","[The earning are less than before, The payment..."
5,4,380,4_restaurant_ready_food_wait,"[restaurant, ready, food, wait, time, food rea...","[Much behind at last restaurant, Orders coming..."
6,5,374,5_courier_new courier_new_order,"[courier, new courier, new, order, order couri...","[No orders and to much couriers…, few orders t..."
7,6,354,6_map_address_google_location,"[map, address, google, location, gps, app, map...","[Addresses were misleading, Please make it pos..."
8,7,307,7_mcdonald_wait_mcdonalds_order,"[mcdonald, wait, mcdonalds, order, ready, minu...","[15-20 minute wait at McDonald's, Very long wa..."
9,8,274,8_support_wolt support_team_support team,"[support, wolt support, team, support team, ba...","[Haven't used Wolt Support, so is still uncert..."


# Model with zero-shot

In [None]:
from bertopic import BERTopic
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.model_selection import ParameterGrid
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from octis.evaluation_metrics.coherence_metrics import Coherence
from umap import UMAP
from hdbscan import HDBSCAN
import spacy
from tqdm import tqdm

# Your documents
documents = df_combined["translated_Comment"].tolist()

# Your zero-shot topic list
zeroshot_topic_list = [
    "Flexibility in work hours and when to go online",
    "Earnings and pay including bonuses",
    "Planning your time",
    "experience and satisfaction with completing jobs or tasks",
    "Interactions with customers",
    "Interactions and chat with Wolt support team",
    "Interactions with venues or restaurants",
    "Techlogical issues or bugs with the app",
    "Communication",
    "Gear, equipment, bag, clothes, cup,",
    "Safety"
]

# Embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
#Compute embeddings
#embeddings = embedding_model.encode(documents, show_progress_bar=True)

# Prepare Tokenizer model
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1520000

# Define custom tokenizer function with lemmatization for topic representations
def spacy_tokenizer(doc):
    spacy_doc = nlp(doc)
    # Lemmatize, lowercase, remove stopwords & punctuation
    return [token.lemma_.lower() for token in spacy_doc if not token.is_stop and not token.is_punct] #

base_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, min_df=5) # remove words that appear in less than 5 documents
base_vectorizer.fit(documents) # fit the vectorizer to the documents, to find the vocabulary for topic representations
filtered_vocab = set(base_vectorizer.get_feature_names_out()) # # get the vocabulary of the filtered vectorizer

def filtered_tokenizer(doc): # Now create a custom tokenizer that only keeps words in the filtered vocabulary, which will be passed to the CountVectorizer for topic representations
    tokens = spacy_tokenizer(doc)
    return [t for t in tokens if t in filtered_vocab]

# Create CountVectorizer with custom tokenizer
vectorizer_model = CountVectorizer(tokenizer=filtered_tokenizer, # use the custom tokenizer
                                   #stop_words="english", # already done in the tokenizer, but countvectorizer might have other stop words
                                   lowercase = False, # lowercase is already done in the tokenizer
                                   #min_df = 1, #  Excludes words that appear in only one document.
                                   ngram_range=(1, 2)) # consider topic representations that are made up of one or two words

def coherence_tokenizer(doc): # The tokenizer for the coherence score. This is a standard tokenizer that does not filter out words, but lemmatizes and lowercases them.
    spacy_doc = nlp(doc)
    return [
        token.lemma_.lower()
        for token in spacy_doc
        if not token.is_punct  # keep stopwords as I want to calculate the coherence score based on all words
    ]

tokens_for_coherence = [coherence_tokenizer(doc) for doc in documents] 


# Fine-tune representations 
#keybert_model = KeyBERTInspired()
#mmr_model = MaximalMarginalRelevance(diversity=0.2) #0.2 is used in the book.

#representation_model = {
    #"KeyBERT": keybert_model,
    #"MMR": mmr_model
#}


# Function to calculate Topic Diversity as the proportion of unique words across all top-k topic words (PUW)
def topic_diversity(topics, topk):
    all_words = [word for topic in topics for word in topic[:topk]]
    unique_words = set(all_words)
    return len(unique_words) / len(all_words)


# Results storage. A list that sores the coherence and diversty score of each model
results = []
# A list which saves all the models, so each model can be easily retrieved by typing its corresponding parameters
saved_models = {}  


# Loop through models 1 to 10
for i in tqdm(range(1, 11)):
    # Load existing saved model
    loaded_model = BERTopic.load(f"simple_model{i}")

    umap_params = loaded_model.umap_model.get_params()
    hdbscan_params = loaded_model.hdbscan_model.get_params()

    # Re-initialize the model with zero-shot topic modeling enabled
    # Note: We must create a new BERTopic instance with the same saved parameters plus zero-shot params
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model = UMAP(**umap_params),
        hdbscan_model = HDBSCAN(**hdbscan_params),
        vectorizer_model=vectorizer_model,
        zeroshot_topic_list=zeroshot_topic_list,
        zeroshot_min_similarity=0.5,
        top_n_words=10,
    )

    # Optionally, if you want to keep the original fitted topics and just add zero-shot, you might try:
    # topic_model.topics_, topic_model.probs_ = loaded_model.topics_, loaded_model.probs_

    # Fit or transform documents
    topics, probs = topic_model.fit_transform(documents)


    # Get top `top_n_words` words per topic (excluding outlier topic -1)
    topic_words = [] #an empty list that will be used to store the top words of every topic found for each model. So a list (topics) of lists (top words for each topic).
    
    for topic in topic_model.get_topic_freq().Topic: # grabs the topics from the topic_model
        if topic != -1: #skips the outlier topic -1
            topic_content = topic_model.get_topic(topic) #get the top words and scores for the topics.
            if isinstance(topic_content, list) and len(topic_content) > 0: # checks if the topic_content (top words) is a list and has more than 0 elements. To check if there actually is extracted topic words
                if isinstance(topic_content[0], tuple) and len(topic_content[0]) == 2: # checks if the first element of topic_content is a tuple and has two elements (word and score)
                    words = [word for word, _ in topic_content[:topic_model.top_n_words]] #if so, it extracts the words (but not the scores) in a variable called "words"
                else:
                    words = topic_content[:topic_model.top_n_words] #if there is only top words but no scores, extracts the top words in a variabkle called "words"
                topic_words.append(words) #append the topic words (defined in the variable "words") to the topic_words list for the model.

    if topic_words:
        
        # Calculate NPMI
        coherence_model = Coherence(texts=tokens_for_coherence, # reference corpus. Here it's all documents in the dataset
                                    topk=10, #specifies the number of top words to consider for NPMI calculation
                                    measure="c_npmi")
        npmi_score = coherence_model.score({"topics": topic_words}) # average

        # Calculate Topic Diversity
            # the topic_diversity command was defned before the loop. This is not a standard command, so I defined how it works earlier.
        diversity_score = topic_diversity(topics=topic_words, topk=10)

        # Calculate Topic Quality by multiplying NPMI score and Topic Diversity
        topic_quality = npmi_score * diversity_score 

        # Print the number of topics
        num_topics = len(topic_words) #number of topics is the length of the topic_words list
        num_outliers = sum(1 for topic in topics if topic == -1) #number of outliers is the number of topics that are equal to -1
        print(f"Number of Topics: {num_topics} | TC: {npmi_score:.3f} | TD: {diversity_score:.3f} | TQ: {topic_quality:.3f} | num_outliers: {num_outliers}") #4f = 4 decimals 


    # Save the updated model with zero-shot topics included
    topic_model.save(f"simple_model{i}_zeroshot")

  0%|          | 0/10 [00:00<?, ?it/s]OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.

Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



Number of Topics: 30 | TC: 0.072 | TD: 0.703 | TQ: 0.051 | num_outliers: 3716



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



Number of Topics: 32 | TC: 0.089 | TD: 0.725 | TQ: 0.065 | num_outliers: 4063



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



Number of Topics: 32 | TC: 0.087 | TD: 0.716 | TQ: 0.062 | num_outliers: 4046



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



Number of Topics: 33 | TC: 0.087 | TD: 0.718 | TQ: 0.062 | num_outliers: 3877



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



Number of Topics: 31 | TC: 0.079 | TD: 0.713 | TQ: 0.056 | num_outliers: 3898



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



Number of Topics: 35 | TC: 0.088 | TD: 0.731 | TQ: 0.064 | num_outliers: 4345



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



Number of Topics: 30 | TC: 0.081 | TD: 0.703 | TQ: 0.057 | num_outliers: 3811



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



Number of Topics: 28 | TC: 0.086 | TD: 0.689 | TQ: 0.059 | num_outliers: 3882



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



Number of Topics: 31 | TC: 0.088 | TD: 0.723 | TQ: 0.063 | num_outliers: 4126



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



Number of Topics: 10 | TC: 0.061 | TD: 0.810 | TQ: 0.050 | num_outliers: 64


100%|██████████| 10/10 [17:32<00:00, 105.28s/it]


In [None]:
from bertopic import BERTopic

# Load the saved model
simple_model1_zeroshot = BERTopic.load("simple_model1_zeroshot")

# Get topic information (frequency, topic ID, etc.)
simple_model1_zeroshot.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3716,-1_order_time_wolt_work,"[order, time, wolt, work, delivery, customer, ...",[50- 90 kr / 30-40 min.. when it's 140-200 kr....
1,0,11,Flexibility in work hours and when to go online,"[online, flexible, hour, flexibility, earn mon...",[I am very happy with wolt and I like to go on...
2,1,22,Earnings and pay including bonuses,"[earning, bonus, earning hour, earning increas...","[Okay-ish earnings per hour, Earnings per hour..."
3,2,3,experience and satisfaction with completing jo...,"[happy like, satisfy happy, job satisfy, verry...","[Satisfied with everything, I am happy, but I ..."
4,3,4,Interactions with customers,"[customer, wonder, donalds, store, wonder eye,...","[Very friendly customer service!, I love the n..."
5,4,74,Interactions and chat with Wolt support team,"[wolt support, wolt, support, work wolt, team,...",[Wolt support very very very bad.too much bad ...
6,5,34,Interactions with venues or restaurants,"[venue, restaurant, venue mark, ready, mark, o...",[app throws orders with a certain distance and...
7,6,329,Techlogical issues or bugs with the app,"[app, app work, work, crash, error, app crash,...","[The app is working very bad, The app is worki..."
8,7,1,"Gear, equipment, bag, clothes, cup,","[buy gear, sadly buy, sadly, gear, buy, , , , , ]",[Sadly that you have to buy gear yourself]
9,8,152,8_task_task task_new task_accept task,"[task, task task, new task, accept task, accep...","[It's Sku Slay with Tasks, Because i always ge..."


In [38]:
topics = simple_model1_zeroshot.topics_
documents = df_combined["translated_Comment"].tolist()  # if this was the original list


In [44]:
from collections import defaultdict
import random

# Organize documents by new topic
topic_to_docs = defaultdict(list)
for doc, topic in zip(documents, topics):
    topic_to_docs[topic].append(doc)

# Select only topic 5
topic_id = 0
docs = topic_to_docs.get(topic_id, [])

if docs:
    sampled_docs = random.sample(docs, min(20, len(docs)))  # Randomly select up to 20 docs
    print(f"\n=== Topic {topic_id} ({len(docs)} documents) ===")
    for i, doc in enumerate(sampled_docs):
        print(f"{i+1}. {doc}")
else:
    print(f"No documents found for Topic {topic_id}.")


=== Topic 0 (11 documents) ===
1. I think it's great that I can be self -employed and I can go online or offline whenever I want. Also I earn money to cycle around and deliver
2. Super nice that you are so flexible with working hours. I couldn't wish for a better job
3. Very flexible workplace
4. Flexible working hours
5. The flexibility is absolutely great. I can work when I want. No more stress about thinking about the child.
6. I can log in and work when I have time.
7. I worked 12 13 hours as online on of my friend worked 5 6 hours as online but we earn same money. Do you distribute fairly?
8. Time flexibility is important, and we can have all the hours we want.
9. I consider that the freedom of hours that we can work is important.
10. I am very happy with wolt and I like to go online every day since this is the only job I have right now and I would like to have more orders during the time I am online
11. I stay online long time but have a few tasks only with long distanses


In [89]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.model_selection import ParameterGrid
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from octis.evaluation_metrics.coherence_metrics import Coherence
from umap import UMAP
from hdbscan import HDBSCAN
import spacy

# Load your data
#df = pd.read_csv('~/Library/Mobile Documents/com~apple~CloudDocs/UNI-kopi 2/Kandidat/virtual_environments/data/short.csv')
#documents = df["translated_Comment"].tolist() # Convert to list of strings


# Define the hyperparameter grid
param_grid = {
    # BERTOPIC parameters
    #"min_topic_size": [50,70,90], # Minimum size of a topic. Should be higher than min_cluster_size
    #"top_n_words": [10],
    #"nr_topics": ["none","auto"], # None means no reduction

    #UMAP parameters
    "n_components": [2], # Number of dimensions to reduce to
    "n_neighbors": [15], # Number of neighbors to consider for UMAP 

    #HDBSCAN parameters
    "min_cluster_size": [30,50,70,90,110,130,150], # Min cluster size for HDBSCAN # Min cluster size for HDBSCAN. 
    #"min_samples": [70] # Minimum number of samples in a cluster
}
#Create all possible combinations of the hyperparameter grid
grid = list(ParameterGrid(param_grid))


# Filter: only keep combinations where min_cluster_size <= min_topic_size
#valid_grid = [
   # params for params in grid
    #if params["min_samples"] <= params["min_cluster_size"]
#]

# Total number combinations. Used later in the print statement to show progress
#total = len(valid_grid)

total = len(grid)


# Embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
#Compute embeddings
#embeddings = embedding_model.encode(documents, show_progress_bar=True)



# Prepare Tokenizer model
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1520000

# Define custom tokenizer function with lemmatization for topic representations
def spacy_tokenizer(doc):
    spacy_doc = nlp(doc)
    # Lemmatize, lowercase, remove stopwords & punctuation
    return [token.lemma_.lower() for token in spacy_doc if not token.is_stop and not token.is_punct] #

base_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, min_df=5) # remove words that appear in less than 5 documents
base_vectorizer.fit(documents) # fit the vectorizer to the documents, to find the vocabulary for topic representations
filtered_vocab = set(base_vectorizer.get_feature_names_out()) # # get the vocabulary of the filtered vectorizer

def filtered_tokenizer(doc): # Now create a custom tokenizer that only keeps words in the filtered vocabulary, which will be passed to the CountVectorizer for topic representations
    tokens = spacy_tokenizer(doc)
    return [t for t in tokens if t in filtered_vocab]

# Create CountVectorizer with custom tokenizer
vectorizer_model = CountVectorizer(tokenizer=filtered_tokenizer, # use the custom tokenizer
                                   #stop_words="english", # already done in the tokenizer, but countvectorizer might have other stop words
                                   lowercase = False, # lowercase is already done in the tokenizer
                                   min_df = 1, #  Excludes words that appear in only one document.
                                   ngram_range=(1, 2)) # consider topic representations that are made up of one or two words

def coherence_tokenizer(doc): # The tokenizer for the coherence score. This is a standard tokenizer that does not filter out words, but lemmatizes and lowercases them.
    spacy_doc = nlp(doc)
    return [
        token.lemma_.lower()
        for token in spacy_doc
        if not token.is_punct  # keep stopwords as I want to calculate the coherence score based on all words
    ]

tokens_for_coherence = [coherence_tokenizer(doc) for doc in documents] 


# Fine-tune representations 
#keybert_model = KeyBERTInspired()
#mmr_model = MaximalMarginalRelevance(diversity=0.2) #0.2 is used in the book.

#representation_model = {
    #"KeyBERT": keybert_model,
    #"MMR": mmr_model
#}


# Function to calculate Topic Diversity as the proportion of unique words across all top-k topic words (PUW)
def topic_diversity(topics, topk):
    all_words = [word for topic in topics for word in topic[:topk]]
    unique_words = set(all_words)
    return len(unique_words) / len(all_words)


# Defintions to later being able to track best model. 
best_model = None #no best model from the start
best_score = -1 # the first model with a score better than -1 will the current best. Just a placeholder make npmi_score > best_score work.
best_params = None #no best parameters from the start


# Results storage. A list that sores the coherence and diversty score of each model
results = []
# A list which saves all the models, so each model can be easily retrieved by typing its corresponding parameters
saved_models = {}  



#### THE LOOP ####
# Grid search
for i, params in enumerate(grid, 1):#change to valid_grid as the parameter grid to iterate over if min_topic_size is used
    print(f"[{i}/{total}] Running with params: {params}") #print the current parameter combination being evaluated for each iteration 

    # UMAP
    umap_model = UMAP(n_neighbors=params["n_neighbors"],
                      n_components=params["n_components"], 
                      min_dist=0.0, 
                      metric='cosine', 
                      random_state=42)

    # HDBSCAN  
    hdbscan_model = HDBSCAN(min_cluster_size=params["min_cluster_size"], 
                            #min_samples=params["min_samples"], 
                            metric='euclidean', 
                            cluster_selection_method='eom',
                            prediction_data=True
                            )

    # BERTopic model
    topic_model = BERTopic(
        #nr_topics=params["nr_topics"],
        #min_topic_size=params["min_topic_size"]
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        #representation_model=representation_model,
        vectorizer_model=vectorizer_model, #this makes some models not able to calculate NPMI
        #ctfidf_model=ctfidf_model,
        top_n_words=10,   
    )

    # Fit model
    topics, _ = topic_model.fit_transform(documents, embeddings)

    # Save meach model in the Saved_models list
    param_key = tuple(sorted(params.items()))  # extracting the parameters for the each model and creates a unique key for every model combination
    saved_models[param_key] = topic_model # save the each model in the saved_models list by its unique key 


    # Get top `top_n_words` words per topic (excluding outlier topic -1)
    topic_words = [] #an empty list that will be used to store the top words of every topic found for each model. So a list (topics) of lists (top words for each topic).
    
    for topic in topic_model.get_topic_freq().Topic: # grabs the topics from the topic_model
        if topic != -1: #skips the outlier topic -1
            topic_content = topic_model.get_topic(topic) #get the top words and scores for the topics.
            if isinstance(topic_content, list) and len(topic_content) > 0: # checks if the topic_content (top words) is a list and has more than 0 elements. To check if there actually is extracted topic words
                if isinstance(topic_content[0], tuple) and len(topic_content[0]) == 2: # checks if the first element of topic_content is a tuple and has two elements (word and score)
                    words = [word for word, _ in topic_content[:topic_model.top_n_words]] #if so, it extracts the words (but not the scores) in a variable called "words"
                else:
                    words = topic_content[:topic_model.top_n_words] #if there is only top words but no scores, extracts the top words in a variabkle called "words"
                topic_words.append(words) #append the topic words (defined in the variable "words") to the topic_words list for the model.

    if topic_words:
        
        # Calculate NPMI
        coherence_model = Coherence(texts=tokens_for_coherence, # reference corpus. Here it's all documents in the dataset
                                    topk=10, #specifies the number of top words to consider for NPMI calculation
                                    measure="c_npmi")
        npmi_score = coherence_model.score({"topics": topic_words}) # average

        # Calculate Topic Diversity
            # the topic_diversity command was defned before the loop. This is not a standard command, so I defined how it works earlier.
        diversity_score = topic_diversity(topics=topic_words, topk=10)

        # Calculate Topic Quality by multiplying NPMI score and Topic Diversity
        topic_quality = npmi_score * diversity_score 


        # Print the number of topics
        num_topics = len(topic_words) #number of topics is the length of the topic_words list
        num_outliers = sum(1 for topic in topics if topic == -1) #number of outliers is the number of topics that are equal to -1
        print(f"Number of Topics: {num_topics} | TC: {npmi_score:.3f} | TD: {diversity_score:.3f} | TQ: {topic_quality:.3f} | num_outliers: {num_outliers}") #4f = 4 decimals 

        # Track best model
        if npmi_score > best_score:
            best_score = npmi_score
            best_model = topic_model
            best_params = params

        # store the params, npmi, diversy, and num_topcis for each model in the results list
        results.append({
            "params": params,
            "num_topics": num_topics,
            "npmi_score": npmi_score,
            "topic_diversity": diversity_score,
            "topic_quality": topic_quality,
            "num_outliers": num_outliers
        })

    else:
        print("No valid topics were generated.")

#####

import csv

# Save the all the models' information to a CSV file

model_info = [] # an empty list to store model information. This is the dataframe that will be saved in the CSV file in the end 

# Loop through the models saved from the loop above and save the information to the model_info dataframe
for param_key, model in saved_models.items(): #activates a loop that goes through all the models (identified by their unique key).
    # Convert the tuple of params back to a dictionary (a formal step)
    params_dict = dict(param_key)
    
    # Get topic information
    #topics_freq = model.get_topic_freq() #retrives the number of topics and documents in each topic
    #num_topics = len(topics_freq[topics_freq.Topic != -1]) #retrives the number of topics (length) and exlcudes the outlier topic -1
    
    ## Get performance metrics if available
    # Matches the models with parameters saved in the loop with the parameters of models saved in the CVS file
    matching_result = next((r for r in results if all(r["params"][k] == v for k, v in params_dict.items())), None)
    
    if matching_result: 
        num_topics =matching_result.get("num_topics", 0) #extract the number of topics for the models
        num_outliers = matching_result.get("num_outliers", 0) #extract the number of outliers for the models
        npmi_score = matching_result.get("npmi_score", 0) #extract the NPMI score for the models
        diversity_score = matching_result.get("topic_diversity", 0) #extract the topic diversity score for the models
        topic_quality = matching_result.get("topic_quality", 0) #extract the topic quality score for the models
        
        model_info.append({  #assigns the number of topics and scores to the differnt models in the model_info list. This is the dataframe that will be saved in the CSV file.
            **params_dict,  
            "num_topics": num_topics,
            "npmi_score": npmi_score,
            "topic_diversity": diversity_score,
            "topic_quality": topic_quality,
            "num_outliers" : num_outliers   
        })

# Now we retrived all the information to the model_info dataframe, 
# Saves it to CSV file
with open('min_cluster_size', 'w', newline='') as csvfile:
    fieldnames = list(model_info[0].keys()) if model_info else []
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()
    for info in model_info:
        writer.writerow(info)


# Sort the results by NPMI score in descending order
top_20_npmi = sorted(results, key=lambda x: x["npmi_score"], reverse=True)[:10]

# Sort the results by Topic Diversity score in descending order
top_20_diversity = sorted(results, key=lambda x: x["topic_diversity"], reverse=True)[:10]

# Sort the results by both NPMI and Topic Diversity score in descending order
top_20_quality = sorted(results, key=lambda x: (x["topic_quality"]), reverse=True)[:10]

# Print the top 20 based on NPMI
print("\nTop 20 Best Performing Models (Based on NPMI):\n")
for i, result in enumerate(top_20_npmi, start=1):
    print(f"Rank {i}: num_topics: {result['num_topics']} | {result['params']} | TC: {result['npmi_score']:.3f} | TD: {result['topic_diversity']:.3f} | TQ: {result['topic_quality']:.3f} | num_outliers: {result['num_outliers']})")

# Print the top 20 based on Topic Diversity
print("\nTop 20 Best Performing Models (Based on Topic Diversity):\n")
for i, result in enumerate(top_20_diversity, start=1):
    print(f"Rank {i}: num_topics: {result['num_topics']} | {result['params']} | TC: {result['npmi_score']:.3f} | TD: {result['topic_diversity']:.3f} | TQ: {result['topic_quality']:.3f} | num_outliers: {result['num_outliers']})")

# Print the top 20 based on topic quality
print("\nTop 20 Best Performing Models (Based on topic quality):\n")
for i, result in enumerate(top_20_quality, start=1):
    print(f"Rank {i}: num_topics: {result['num_topics']} | {result['params']} | TC: {result['npmi_score']:.3f} | TD: {result['topic_diversity']:.3f} | TQ: {result['topic_quality']:.3f} | num_outliers: {result['num_outliers']})")



[1/7] Running with params: {'min_cluster_size': 30, 'n_components': 2, 'n_neighbors': 15}
Number of Topics: 63 | TC: 0.063 | TD: 0.735 | TQ: 0.046 | num_outliers: 3378
[2/7] Running with params: {'min_cluster_size': 50, 'n_components': 2, 'n_neighbors': 15}
Number of Topics: 38 | TC: 0.059 | TD: 0.682 | TQ: 0.040 | num_outliers: 3848
[3/7] Running with params: {'min_cluster_size': 70, 'n_components': 2, 'n_neighbors': 15}
Number of Topics: 26 | TC: 0.100 | TD: 0.688 | TQ: 0.069 | num_outliers: 4059
[4/7] Running with params: {'min_cluster_size': 90, 'n_components': 2, 'n_neighbors': 15}
Number of Topics: 18 | TC: 0.065 | TD: 0.683 | TQ: 0.044 | num_outliers: 4458
[5/7] Running with params: {'min_cluster_size': 110, 'n_components': 2, 'n_neighbors': 15}
Number of Topics: 15 | TC: 0.064 | TD: 0.680 | TQ: 0.043 | num_outliers: 4448
[6/7] Running with params: {'min_cluster_size': 130, 'n_components': 2, 'n_neighbors': 15}
Number of Topics: 10 | TC: 0.055 | TD: 0.690 | TQ: 0.038 | num_outli