# Imports

In [10]:
import sys

import optuna
import plotly.express as px
from git_root import git_root
from optuna import visualization
from optuna.trial import TrialState
from optuna.visualization import plot_pareto_front

my_git_root = git_root()
sys.path.append(my_git_root)

# Loading the relevant study from storage

In [11]:
storage_url = f"sqlite:////{my_git_root}/final_topic_model_studies.db"

In [12]:
study_summaries =optuna.get_all_study_summaries(storage_url)

In [13]:
print("Most Recent Run for Each Study:")
for summary in study_summaries:
    study_name = summary.study_name

    # Load the study
    study = optuna.load_study(study_name=study_name, storage=storage_url)

    # Find the most recent trial based on `datetime_start`
    most_recent_trial = max(
        (trial for trial in study.trials if trial.datetime_start is not None),
        key=lambda trial: trial.datetime_start,
        default=None,
    )

    if most_recent_trial:
        print(f"Study: {study_name}")
        print(f"Most Recent Run: {most_recent_trial.datetime_start}")
        print(f'Number of Trials: {len(study.trials)}')
    else:
        print(f"Study: {study_name}")
        print("No trials have been run yet.\n")

Most Recent Run for Each Study:
Study: bertopic_double_optimization_v3
Most Recent Run: 2025-01-28 10:22:40.855067
Number of Trials: 208
Study: bertopic_double_optimization_v2
No trials have been run yet.

Study: LDA_double_optimization_v1
Most Recent Run: 2025-02-02 09:45:30.064091
Number of Trials: 517
Study: bertopic_double_optimization_v1
Most Recent Run: 2025-02-02 10:17:18.269488
Number of Trials: 515
Study: bertopic_double_optimization_pre_processed_dataset_v1
Most Recent Run: 2025-02-02 09:47:47.540056
Number of Trials: 604
Study: visualize_topic_numbers
Most Recent Run: 2025-02-02 10:16:59.892119
Number of Trials: 511


In [14]:
study = optuna.create_study(directions=['maximize', 'maximize'], study_name='bertopic_double_optimization_pre_processed_dataset_v1', storage=storage_url, load_if_exists=True)

[I 2025-02-03 12:54:50,650] Using an existing study with name 'bertopic_double_optimization_pre_processed_dataset_v1' instead of creating a new one.


In [15]:
study.trials

[FrozenTrial(number=0, state=TrialState.PRUNED, values=None, datetime_start=datetime.datetime(2025, 2, 1, 20, 16, 37, 582040), datetime_complete=datetime.datetime(2025, 2, 1, 20, 17, 0, 103630), params={'n_neighbors': 39, 'min_dist': 0.9556428757689246, 'min_cluster_size': 75, 'min_samples': 62}, user_attrs={'num_topics': 4}, system_attrs={}, intermediate_values={}, distributions={'n_neighbors': IntDistribution(high=100, log=False, low=2, step=1), 'min_dist': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'min_cluster_size': IntDistribution(high=100, log=False, low=5, step=1), 'min_samples': IntDistribution(high=100, log=False, low=5, step=1)}, trial_id=4214, value=None),
 FrozenTrial(number=1, state=TrialState.PRUNED, values=None, datetime_start=datetime.datetime(2025, 2, 1, 20, 17, 0, 235638), datetime_complete=datetime.datetime(2025, 2, 1, 20, 17, 15, 3620), params={'n_neighbors': 17, 'min_dist': 0.2403950683025824, 'min_cluster_size': 10, 'min_samples': 88}, user_attrs

In [31]:
fig = plot_pareto_front(study, target_names=['Coherence', 'Diversity'], include_dominated_trials=False)
fig.update_layout(width=500, height=500, yaxis=dict(scaleanchor="x", scaleratio=1),xaxis=dict(scaleanchor="y", scaleratio=1))

In [17]:
fig = visualization.plot_contour(study, target_name = 'coherence', target=lambda t: (t.values[0]))
fig.update_layout(width=2500, height=2500, template='plotly')
fig.write_html(f"{my_git_root}/final_notebooks/images/contour_plots/bertopic_pre_processed_coherence_contour.html")
fig.write_image(f"{my_git_root}/final_notebooks/images/contour_plots/bertopic_pre_processed_coherence_contour.pdf")

In [18]:
fig = visualization.plot_contour(study, target_name = 'diversity', target=lambda t: (t.values[1]))
fig.update_layout(width=2500, height=2500, template='plotly')
fig.write_html(f"{my_git_root}/final_notebooks/images/contour_plots/bertopic_pre_processed_diversity_contour.html")
fig.write_image(f"{my_git_root}/final_notebooks/images/contour_plots/bertopic_pre_processed_diversity_contour.pdf")

In [19]:
# Delete the study if it exists and create a new study
optuna.delete_study(storage=storage_url, study_name='visualize_topic_numbers')
visualize_topic_numbers_study = optuna.create_study(
    direction='maximize',
    study_name='visualize_topic_numbers',
    storage=storage_url,
    load_if_exists=True
)

# Assume `study` is defined; replace this with your actual study object
my_trials = []

for index, trial in enumerate(study.trials):
    if 'num_topics' in trial.user_attrs:
        frozen_trial = optuna.trial.FrozenTrial(
            number=index,
            state=TrialState.COMPLETE,
            value=trial.user_attrs['num_topics'],
            datetime_start=trial.datetime_start,
            datetime_complete=trial.datetime_complete,
            params=trial.params,
            distributions=trial.distributions,
            user_attrs= {},
            system_attrs=trial.system_attrs,
            intermediate_values=trial.intermediate_values,
            trial_id=None
        )

        print(frozen_trial)
        my_trials.append(frozen_trial)

# Add trials to the new study
visualize_topic_numbers_study.add_trials(my_trials)


[I 2025-02-03 12:54:55,757] A new study created in RDB with name: visualize_topic_numbers


FrozenTrial(number=0, state=TrialState.COMPLETE, values=[4], datetime_start=datetime.datetime(2025, 2, 1, 20, 16, 37, 582040), datetime_complete=datetime.datetime(2025, 2, 1, 20, 17, 0, 103630), params={'n_neighbors': 39, 'min_dist': 0.9556428757689246, 'min_cluster_size': 75, 'min_samples': 62}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_neighbors': IntDistribution(high=100, log=False, low=2, step=1), 'min_dist': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'min_cluster_size': IntDistribution(high=100, log=False, low=5, step=1), 'min_samples': IntDistribution(high=100, log=False, low=5, step=1)}, trial_id=None, value=None)
FrozenTrial(number=1, state=TrialState.COMPLETE, values=[9], datetime_start=datetime.datetime(2025, 2, 1, 20, 17, 0, 235638), datetime_complete=datetime.datetime(2025, 2, 1, 20, 17, 15, 3620), params={'n_neighbors': 17, 'min_dist': 0.2403950683025824, 'min_cluster_size': 10, 'min_samples': 88}, user_attrs={}, system_attr

In [20]:
fig = visualization.plot_contour(visualize_topic_numbers_study, target_name = 'Number of Topics')
fig.update_layout(width=2500, height=2500, template='plotly')
fig.write_html(f"{my_git_root}/final_notebooks/images/contour_plots/bertopic_pre_processed_num_topics_contour.html")
fig.write_image(f"{my_git_root}/final_notebooks/images/contour_plots/bertopic_pre_processed_num_topics_contour.pdf")

In [21]:
num_clusters = []
coherences = []
diversities = []

for trial in study.trials:
    if trial.values is not None:
        num_clusters.append(trial.user_attrs['num_topics'])
        coherences.append(trial.values[0])
        diversities.append(trial.values[1])

In [22]:
fig = px.scatter(x=num_clusters, y=coherences, template='plotly_white')
fig.update_layout(xaxis_title="Number of Clusters", yaxis_title="Coherence Score")
fig.write_image(f'{my_git_root}/notebooks/images/bertopic_pre_processed_coherences.pdf')

In [23]:
fig = px.scatter(x=num_clusters, y=diversities, template='plotly_white')
fig.update_layout(xaxis_title="Number of Clusters", yaxis_title="Diversity Score")
fig.write_image(f'{my_git_root}/notebooks/images/bertopic_pre_processed_diversities.pdf')

In [24]:
def calculate_harmonic_mean(coherence, diversity):
    coherence_normalized = (coherence+ 1) / 2
    harmonic_mean = 2 * (coherence_normalized*diversity) / (coherence_normalized+diversity)
    return harmonic_mean

In [25]:
def evaluate_trial(trial, measurement):
    if measurement == 'coherence':
        return trial.values[0]
    elif measurement == 'diversity':
        return trial.values[1]
    elif measurement == 'harmonic_mean':
        return calculate_harmonic_mean(trial.values[0], trial.values[1])
    else:
        raise ValueError(f"Measurement {measurement} not recognized. Must be 'coherence', 'diversity' or 'harmonic_mean'.")

In [26]:
def get_max_value(trials, measurement):
    max_value = -1
    max_trial = None
    for trial in trials:
        if trial.values is not None:
            value = evaluate_trial(trial, measurement)
            if value > max_value:
                max_value = value
                max_trial = trial

    return max_value, max_trial

In [27]:
max_coherence, max_coherence_trial = get_max_value(study.trials, 'coherence')
max_coherence_harmonic_mean = calculate_harmonic_mean(max_coherence_trial.values[0], max_coherence_trial.values[1])

print(f'max_coherence: {max_coherence}\n')
print(f'max_coherence_trial: {max_coherence_trial}\n')
print(f'harmonic_mean: {max_coherence_harmonic_mean}\n')

max_coherence: 0.22037868694636017

max_coherence_trial: FrozenTrial(number=384, state=TrialState.COMPLETE, values=[0.22037868694636017, 0.84], datetime_start=datetime.datetime(2025, 2, 1, 21, 58, 33, 220694), datetime_complete=datetime.datetime(2025, 2, 1, 21, 58, 51, 597310), params={'n_neighbors': 52, 'min_dist': 0.7654024396138743, 'min_cluster_size': 5, 'min_samples': 5}, user_attrs={'num_topics': 11}, system_attrs={}, intermediate_values={}, distributions={'n_neighbors': IntDistribution(high=100, log=False, low=2, step=1), 'min_dist': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'min_cluster_size': IntDistribution(high=100, log=False, low=5, step=1), 'min_samples': IntDistribution(high=100, log=False, low=5, step=1)}, trial_id=7377, value=None)

harmonic_mean: 0.706885691615828



In [28]:
max_diversity, max_diversity_trial = get_max_value(study.trials, 'diversity')
max_diversity_harmonic_mean = calculate_harmonic_mean(max_diversity_trial.values[0], max_diversity_trial.values[1])

print(f'max_diversity: {max_diversity}\n')
print(f'max_diversity_trial: {max_diversity_trial}\n')
print(f'harmonic_mean: {max_diversity_harmonic_mean}')

max_diversity: 0.928

max_diversity_trial: FrozenTrial(number=438, state=TrialState.COMPLETE, values=[0.08667008067860961, 0.928], datetime_start=datetime.datetime(2025, 2, 1, 22, 13, 18, 619415), datetime_complete=datetime.datetime(2025, 2, 1, 22, 13, 37, 870069), params={'n_neighbors': 100, 'min_dist': 0.5355763362518458, 'min_cluster_size': 8, 'min_samples': 5}, user_attrs={'num_topics': 10}, system_attrs={}, intermediate_values={}, distributions={'n_neighbors': IntDistribution(high=100, log=False, low=2, step=1), 'min_dist': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'min_cluster_size': IntDistribution(high=100, log=False, low=5, step=1), 'min_samples': IntDistribution(high=100, log=False, low=5, step=1)}, trial_id=7845, value=None)

harmonic_mean: 0.6853842307984425


In [29]:
max_harmonic_mean, max_harmonic_mean_trial = get_max_value(study.trials, 'harmonic_mean')
print(f'max_harmonic_mean: {max_harmonic_mean}\n')
print(f'max_harmonic_mean_trial: {max_harmonic_mean_trial}')

max_harmonic_mean: 0.706885691615828

max_harmonic_mean_trial: FrozenTrial(number=384, state=TrialState.COMPLETE, values=[0.22037868694636017, 0.84], datetime_start=datetime.datetime(2025, 2, 1, 21, 58, 33, 220694), datetime_complete=datetime.datetime(2025, 2, 1, 21, 58, 51, 597310), params={'n_neighbors': 52, 'min_dist': 0.7654024396138743, 'min_cluster_size': 5, 'min_samples': 5}, user_attrs={'num_topics': 11}, system_attrs={}, intermediate_values={}, distributions={'n_neighbors': IntDistribution(high=100, log=False, low=2, step=1), 'min_dist': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'min_cluster_size': IntDistribution(high=100, log=False, low=5, step=1), 'min_samples': IntDistribution(high=100, log=False, low=5, step=1)}, trial_id=7377, value=None)
