In [4]:
import pandas as pd

df = pd.read_csv('data/processed/customer_support_tickets_processed.csv')

In [5]:
from bertopic import BERTopic
import pandas as pd # Assuming you have pandas for DataFrame manipulation

# Use the processed descriptions from the dataframe
documents = df['Processed Description'].tolist()


# --- Step 1: Instantiate the BERTopic Model ---
# We configure the model with some basic parameters.
# 'language="english"' helps with stop word removal.
# 'calculate_probabilities=True' allows us to see how confident the model is about each assignment.
# 'verbose=True' will print progress updates during the training process.
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)

# --- Step 2: Train the Model and Transform the Documents ---
# This single line of code runs the entire pipeline:
# 1. Creates embeddings for all documents.
# 2. Reduces their dimensionality with UMAP.
# 3. Clusters them with HDBSCAN.
# 4. Extracts topic representations with c-TF-IDF.
# This may take a few minutes depending on your hardware and the number of documents.
topics, probs = topic_model.fit_transform(documents)

print("\nTraining complete!")
print(f"BERTopic found {len(topic_model.get_topic_info()) - 1} topics (excluding the outlier topic -1).")

# --- Step 3: Inspect the Results ---
# Let's see the most frequent topics found by the model.
# The `get_topic_info()` method gives a nice summary DataFrame.
print("\nTop 10 most frequent topics:")
display(topic_model.get_topic_info().head(10))

# --- Step 4: Examine a Specific Topic ---
# Let's look at the keywords for the first topic (Topic 0).
# The output shows the most important words and their scores for that topic.
print("\nKeywords for the most frequent topic (Topic 0):")
display(topic_model.get_topic(0))

  from .autonotebook import tqdm as notebook_tqdm
2025-10-04 15:36:05,208 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 265/265 [00:11<00:00, 23.84it/s]
2025-10-04 15:36:18,215 - BERTopic - Embedding - Completed ✓
2025-10-04 15:36:18,216 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-10-04 15:36:26,154 - BERTopic - Dimensionality - Completed ✓
2025-10-04 15:36:26,155 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-04 15:36:28,134 - BERTopic - Cluster - Completed ✓
2025-10-04 15:36:28,139 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-04 15:36:28,253 - BERTopic - Representation - Completed ✓



Training complete!
BERTopic found 125 topics (excluding the outlier topic -1).

Top 10 most frequent topics:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1071,-1_as_my_the_but,"[as, my, the, but, troubleshooting, to, steps,...",[ive recently set up my productpurchased but i...
1,0,342,0_desired_through_find_guide,"[desired, through, find, guide, perform, actio...",[im having an issue with the productpurchased ...
2,1,336,1_acts_unexpectedly_intermittent_sometimes,"[acts, unexpectedly, intermittent, sometimes, ...",[im having an issue with the productpurchased ...
3,2,331,2_errormessage_popping_peculiar_mean,"[errormessage, popping, peculiar, mean, messag...",[im having an issue with the productpurchased ...
4,3,319,3_concerned_safe_security_ensure,"[concerned, safe, security, ensure, like, abou...",[im having an issue with the productpurchased ...
5,4,300,4_factory_performed_hoping_reset,"[factory, performed, hoping, reset, didnt, hel...",[im having an issue with the productpurchased ...
6,5,285,5_afterward_firmware_happening_related,"[afterward, firmware, happening, related, upda...",[im having an issue with the productpurchased ...
7,6,285,6_occurring_recent_havent_changes,"[occurring, recent, havent, changes, after, ma...",[im having an issue with the productpurchased ...
8,7,217,7_battery_sudden_decrease_life,"[battery, sudden, decrease, life, longer, much...",[im having an issue with the productpurchased ...
9,8,189,8_unexpected_causing_errors_app,"[unexpected, causing, errors, app, loss, bug, ...",[ive noticed a software bug in the productpurc...



Keywords for the most frequent topic (Topic 0):


[('desired', np.float64(0.03945779478386017)),
 ('through', np.float64(0.03777341038217167)),
 ('find', np.float64(0.03475229503527543)),
 ('guide', np.float64(0.03442105083211244)),
 ('perform', np.float64(0.03419909613582983)),
 ('action', np.float64(0.03410623101257848)),
 ('option', np.float64(0.03325472916925416)),
 ('could', np.float64(0.02722273435063374)),
 ('unable', np.float64(0.027109222003893717)),
 ('me', np.float64(0.024853232278179604))]

In [6]:
# Assuming 'topic_model' and 'documents' are already loaded and the model is fitted.

if 'topic_model' in globals():
	# --- 1. Visualize the Intertopic Distance Map ---
	# This helps us see how topics relate to each other.
	# Good model: distinct, well-separated clusters.
	# Bad model: all clusters overlapping in one big blob.
	intertopic_map_fig = topic_model.visualize_topics()
	intertopic_map_fig.show()

	# --- 2. Visualize Topic Keyword Scores ---
	# This helps us judge the coherence of individual topics.
	# Do the words in the top topic make sense together?
	barchart_fig = topic_model.visualize_barchart(top_n_topics=10) # Visualize top 10 topics
	barchart_fig.show()

	# --- 3. Visualize the Topic Hierarchy ---
	# This shows if some topics are actually sub-topics of a larger theme.
	# It can help us decide if we have too many or too few topics.
	hierarchy_fig = topic_model.visualize_hierarchy(top_n_topics=20)
	hierarchy_fig.show()
else:
	print("Error: 'topic_model' is not defined. Please run the cell where you train the BERTopic model.")

In [7]:
import optuna
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# This objective function is what Optuna will try to optimize.
def objective(trial):
    """
    An Optuna objective function to find the best BERTopic hyperparameters.
    We aim to maximize the number of topics while penalizing a large outlier topic.
    """
    # --- Suggest Hyperparameters ---
    # We define the search space for our key parameters.
    min_cluster_size = trial.suggest_int("min_cluster_size", 15, 150)
    ngram_range = trial.suggest_categorical("ngram_range", [(1, 1), (1, 2)])
    min_df = trial.suggest_int("min_df", 1, 5)

    # --- Create the Topic Model with Suggested Parameters ---
    vectorizer_model = CountVectorizer(ngram_range=ngram_range, stop_words="english", min_df=min_df)
    
    topic_model_to_tune = BERTopic(
        min_topic_size=min_cluster_size,
        vectorizer_model=vectorizer_model,
        language="english",
        calculate_probabilities=False, # Disable for speed during tuning
        verbose=False
    )
    
    topics, _ = topic_model_to_tune.fit_transform(documents)
    
    # --- Define our Metric: What makes a "good" model? ---
    # 1. We want a reasonable number of topics.
    num_topics = len(topic_model_to_tune.get_topic_info())
    
    # 2. We want to minimize the number of outliers (unclassified documents).
    outlier_ratio = topic_model_to_tune.get_topic_info().iloc[0]['Count'] / len(documents)
    
    # A simple scoring function: higher is better.
    # We penalize models with a very high outlier ratio (> 30%).
    score = num_topics - (outlier_ratio * 100 if outlier_ratio > 0.3 else 0)

    return score


# --- Run the Optimization Study ---
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20) # Run 20 different trials

# --- Print the Best Results ---
print("\n--- Optuna Study Complete ---")
print("Best trial:")
trial = study.best_trial
print(f"  Value (Score): {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-10-04 15:36:53,820] A new study created in memory with name: no-name-1319905f-d182-4619-9643-af14c02075aa
[I 2025-10-04 15:37:08,958] Trial 0 finished with value: 32.0 and parameters: {'min_cluster_size': 99, 'ngram_range': (1, 2), 'min_df': 2}. Best is trial 0 with value: 32.0.
[I 2025-10-04 15:37:22,732] Trial 1 finished with value: 41.0 and parameters: {'min_cluster_size': 52, 'ngram_range': (1, 1), 'min_df': 3}. Best is trial 1 with value: 41.0.
[I 2025-10-04 15:37:35,743] Trial 2 finished with value: 42.0 and parameters: {'min_cluster_size': 38, 'ngram_range': (1, 1), 'min_df': 1}. Best is trial 2 with value: 42.0.
[I 2025-10-04 15:37:48,513] Trial 3 finished with value: 42.0 and parameters: {'min_cluster_size': 44, 'ngram_range': (1, 2), 'min_df': 3}. Best is trial 2 with value: 42.0.
[I 2025-10-04 15:38:01,341] Trial 4 finished with value: 41.0 and parameters: {'min_cluster_size': 52, 'ngram_range': (1, 2), 'min_df': 3}. Best is trial 2 with value: 42.0.
[I 2025-10-04 15


--- Optuna Study Complete ---
Best trial:
  Value (Score): 80.0
  Params: 
    min_cluster_size: 15
    ngram_range: (1, 1)
    min_df: 2


In [11]:
import mlflow
import pandas as pd
import numpy as np
from IPython.display import display

def objective_with_mlflow(trial):
    with mlflow.start_run():
        params = {
            "min_cluster_size": trial.suggest_int("min_cluster_size", 15, 150),
            "ngram_range": trial.suggest_categorical("ngram_range", ["(1, 1)", "(1, 2)"]),
            "min_df": trial.suggest_int("min_df", 1, 5)
        }
        mlflow.log_params(params)

        # Convert ngram_range back to tuple
        ngram_tuple = eval(params["ngram_range"])

        # Fit the model
        vectorizer_model = CountVectorizer(ngram_range=ngram_tuple, stop_words="english", min_df=params["min_df"])
        topic_model_to_tune = BERTopic(min_topic_size=params["min_cluster_size"], vectorizer_model=vectorizer_model, verbose=False)
        topics, _ = topic_model_to_tune.fit_transform(documents)

        # Calculate and log metrics
        num_topics = len(topic_model_to_tune.get_topic_info())
        outlier_count = topic_model_to_tune.get_topic_info().iloc[0]['Count']
        total_docs = len(documents)
        outlier_ratio = outlier_count / total_docs

        metrics = {"num_topics": num_topics, "outlier_ratio": outlier_ratio}
        mlflow.log_metrics(metrics)

        # Score for Optuna
        score = num_topics - (outlier_ratio * 100 if outlier_ratio > 0.3 else 0)

    return score

study_mlflow = optuna.create_study(direction="maximize")
study_mlflow.optimize(objective_with_mlflow, n_trials=20)

# Display MLflow runs inline
runs = mlflow.search_runs(order_by=["start_time DESC"])
if runs.empty:
    print("No mlflow runs found in the local tracking store (check ./mlruns or the MLFLOW_TRACKING_URI).")
else:
    df = runs.copy()
    for col in ("metrics.num_topics", "metrics_num_topics"):
        if col in df.columns:
            df["num_topics"] = pd.to_numeric(df[col], errors="coerce")
            break
    for col in ("metrics.outlier_ratio", "metrics_outlier_ratio"):
        if col in df.columns:
            df["outlier_ratio"] = pd.to_numeric(df[col], errors="coerce")
            break

    def compute_score(r):
        nt = r.get("num_topics", np.nan)
        orr = r.get("outlier_ratio", np.nan)
        if np.isnan(nt):
            return np.nan
        if np.isnan(orr):
            return nt
        return nt - (orr * 100 if orr > 0.3 else 0)

    df["optuna_score"] = df.apply(compute_score, axis=1)

    param_cols = [c for c in df.columns if c.startswith("params.")]
    display_cols = ["run_id", "start_time", "optuna_score", "num_topics", "outlier_ratio"] + param_cols
    display_df = df.reindex(columns=[c for c in display_cols if c in df.columns])
    display(display_df.sort_values("optuna_score", ascending=False).head(10))

    best = df.sort_values("optuna_score", ascending=False).iloc[0]
    print("\nBest run id:", best.run_id)
    print("Score:", best.optuna_score)
    print("Params:")
    for c in param_cols:
        print(" ", c.replace("params.", ""), "=", best.get(c))


[I 2025-10-04 15:50:31,587] A new study created in memory with name: no-name-a133df47-c55b-469b-8749-ec01c7fb1bb7
[I 2025-10-04 15:50:46,961] Trial 0 finished with value: 29.0 and parameters: {'min_cluster_size': 135, 'ngram_range': '(1, 1)', 'min_df': 4}. Best is trial 0 with value: 29.0.
[I 2025-10-04 15:51:01,459] Trial 1 finished with value: 31.0 and parameters: {'min_cluster_size': 126, 'ngram_range': '(1, 1)', 'min_df': 4}. Best is trial 1 with value: 31.0.
[I 2025-10-04 15:51:16,440] Trial 2 finished with value: 69.0 and parameters: {'min_cluster_size': 18, 'ngram_range': '(1, 2)', 'min_df': 2}. Best is trial 2 with value: 69.0.
[I 2025-10-04 15:51:30,152] Trial 3 finished with value: 41.0 and parameters: {'min_cluster_size': 47, 'ngram_range': '(1, 2)', 'min_df': 4}. Best is trial 2 with value: 69.0.
[I 2025-10-04 15:51:45,534] Trial 4 finished with value: 28.0 and parameters: {'min_cluster_size': 137, 'ngram_range': '(1, 1)', 'min_df': 3}. Best is trial 2 with value: 69.0.
[I 

Unnamed: 0,run_id,start_time,optuna_score,num_topics,outlier_ratio,params.min_cluster_size,params.min_df,params.ngram_range
6,b8ea98bf2f814d5f9dedc2e9e4325904,2025-10-04 14:53:36.509000+00:00,81.0,81.0,0.150313,15,2,"(1, 2)"
8,c69737bba7484cb29ba60eb5570bf24f,2025-10-04 14:53:07.860000+00:00,80.0,80.0,0.166135,15,1,"(1, 2)"
27,50bede8e83b246e3acfddad236bcd41e,2025-10-04 14:45:31.823000+00:00,76.0,76.0,0.137088,15,2,"(1, 2)"
20,842eb4bd8d2343d38714f7df2a7684f0,2025-10-04 14:47:14.290000+00:00,73.0,73.0,0.136144,16,2,"(1, 2)"
23,89c9fdb2b2f444e8b01d1c5de7e9ce4a,2025-10-04 14:46:32.865000+00:00,71.0,71.0,0.134136,17,2,"(1, 2)"
17,14c7bda35da44069bc2c36a14a609ea4,2025-10-04 14:51:01.462000+00:00,69.0,69.0,0.141811,18,2,"(1, 2)"
28,63a49b1cba2b45e38f302fca3749122d,2025-10-04 14:45:17.797000+00:00,68.0,68.0,0.125044,19,4,"(1, 2)"
9,799b05131d9d48919fe0c745c0a55779,2025-10-04 14:52:54.485000+00:00,68.0,68.0,0.115598,18,1,"(1, 2)"
31,962c8c3d66af41d9aed2490b1ce6246e,2025-10-04 14:44:34.185000+00:00,67.0,67.0,0.103554,19,3,"(1, 1)"
2,a5f11a3d1c384c728c8e59625e78f360,2025-10-04 14:54:32.620000+00:00,63.0,63.0,0.109222,19,1,"(1, 2)"



Best run id: b8ea98bf2f814d5f9dedc2e9e4325904
Score: 81.0
Params:
  min_cluster_size = 15
  min_df = 2
  ngram_range = (1, 2)
