# Visualise results from model trained in clustering_analysis_training.ipynb

Author: Katharine Leney, April 2025

In [1]:
from bertopic import BERTopic # topic modelling
import joblib
import pandas as pd
import pickle
import os
import json
import sys
sys.path.append("../src")
from utils.parse_model_folder import parse_model_folder
from utils.generate_clean_labels import generate_clean_labels
from utils.generate_opportunity_table import generate_opportunity_table

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ====================================
# Load the trained model and its companion data
#
# !! Pick the model you want to visualise by setting model_name below !!
#
# ====================================

# Option A: model trained on the full dataset
#model_name = "bertopic_cluster_model__20250428"

# Option B: model trained on 2015--2019, 2022--2024 data
model_name = "bertopic_cluster_model_from_2015_excl_2020_2021_20250428"

# All artefacts of one training run are read from the same model folder
model_dir = os.path.join("../models/", model_name)
model_path = os.path.join(model_dir, "model")

# Deserialise the fitted topic model
topic_model = joblib.load(model_path)

# Documents and their timestamps, stored as pickles next to the model.
# NOTE(review): pickle/joblib loading executes arbitrary code from the file,
# so only point model_name at folders you trust.
with open(os.path.join(model_dir, "texts.pkl"), "rb") as texts_file:
    texts = pickle.load(texts_file)

with open(os.path.join(model_dir, "timestamps.pkl"), "rb") as timestamps_file:
    timestamps = pickle.load(timestamps_file)

# Label derived from the folder name; reused in plot titles further down
model_label = parse_model_folder(model_name)
print(model_label)

# Later cells write into ../outputs, so create it up front if missing
os.makedirs("../outputs", exist_ok=True)

From 2015 | Excluding 2020, 2021


In [3]:
# Show the first 10 rows of the model's topic overview table
topic_info = topic_model.get_topic_info()
topic_info.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,130,-1_fuel_data_airlines_s,"[fuel, data, airlines, s, solutions, timatic, ...","[Geneva Executive Office 33, Route de l’Aér..."
1,0,190,0_passenger_year_cargo_growth,"[passenger, year, cargo, growth, load, domesti...",[01 13 750 700 650 600 550 500 450 400 350 300...
2,1,151,1_saf_emissions_carbon_corsia,"[saf, emissions, carbon, corsia, environmental...",[Environment & Sustainability 28 IATA Annual ...
3,2,147,2_safety_iosa_accidents_accident,"[safety, iosa, accidents, accident, aircraft, ...",[Safety continues to improve The last decade s...
4,3,119,3_tax_regulation_passenger_taxation,"[tax, regulation, passenger, taxation, unruly,...",[The industry is asking governments to adopt s...
5,4,103,4_airport_charges_infrastructure_airports,"[airport, charges, infrastructure, airports, c...",[Cost-efficiency and lower charges Infrastruct...
6,5,99,5_security_information_smart security_smart,"[security, information, smart security, smart,...",[The UN resolution confirms ICAO’s leadership...
7,6,93,6_cargo_lithium_ceiv_e,"[cargo, lithium, ceiv, e, goods, batteries, da...",[Special cargo Standardized air cargo processe...
8,7,64,7_settlement_billion_processed_iss,"[settlement, billion, processed, iss, bsp, fin...","[At the end of 2017, CASS was processing 94 ..."
9,8,61,8_ndc_order_travel_airline,"[ndc, order, travel, airline, retailing, airli...","[In addition, NDC hackathons and other initia..."


In [4]:
# ------------------------------------------------------------
# Intertopic distance map (2D)
# ------------------------------------------------------------

# N.B. UMAP (Uniform Manifold Approximation and Projection)
# is used for the dimensionality reduction.
fig_2d = topic_model.visualize_topics(custom_labels=True)

# Both axes share the same look: no grid, visible zero line and axis line,
# outside ticks, no axis title.
axis_style = {
    "showgrid": False,
    "zeroline": True,
    "showline": True,
    "ticks": "outside",
    "title": None,
}

# Title carries the model label as a subtitle
map_title = {
    "text": f"Intertopic Distance Map<br><sup>{model_label}</sup>",
    "x": 0.15,
    "font": {"size": 20},
}

# Horizontal legend centred just above the plotting area
legend_style = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": 1.02,
    "xanchor": "center",
    "x": 0.5,
}

# White background, fixed canvas size, and the styles defined above
fig_2d.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    font_family="Arial",
    title=map_title,
    width=1000,
    height=800,
    xaxis=dict(axis_style),
    yaxis=dict(axis_style),
    legend=legend_style,
)

# Keep a standalone interactive copy, then render inline
fig_2d.write_html("../outputs/intertopic_distance_map_2D.html")

fig_2d.show()

In [5]:
# ------------------------------------------------------
# Replace raw topic names with clean labels and collect junk topics
# ------------------------------------------------------

# Snapshot the model's current labels so they can be printed next to the new ones
old_labels = topic_model.topic_labels_

# Build the replacement labels; junk_topics is used later to filter plots
clean_labels, junk_topics = generate_clean_labels(topic_model, top_n_words=1)

# Side-by-side listing (new vs. old) in ascending topic-id order for review
print("\nGenerated topic labels:")
for topic_id in sorted(clean_labels):
    new_label = clean_labels[topic_id]
    previous_label = old_labels.get(topic_id)
    print(f"Topic {topic_id}: {new_label} (OLD: {previous_label})")

# Register the new labels on the model
topic_model.set_topic_labels(clean_labels)


Generated topic labels:
Topic -1: NULL (OLD: -1_fuel_data_airlines_s)
Topic 0: Passenger (OLD: 0_passenger_year_cargo_growth)
Topic 1: SAF (OLD: 1_saf_emissions_carbon_corsia)
Topic 2: Safety (OLD: 2_safety_iosa_accidents_accident)
Topic 3: Tax (OLD: 3_tax_regulation_passenger_taxation)
Topic 4: Airport (OLD: 4_airport_charges_infrastructure_airports)
Topic 5: Security (OLD: 5_security_information_smart security_smart)
Topic 6: Cargo (OLD: 6_cargo_lithium_ceiv_e)
Topic 7: Settlement (OLD: 7_settlement_billion_processed_iss)
Topic 8: New Distribution Capability (OLD: 8_ndc_order_travel_airline)
Topic 9: Commercial (OLD: 9_commercial_world_contents_s)
Topic 10: Travel (OLD: 10_travel_experience_passengers_passenger)
Topic 11: Chairman (OLD: 11_chairman_president_airlines_board)
Topic 12: Airlines (OLD: 12_airlines_airways_latam_china)
Topic 13: Diversity (OLD: 13_diversity_inclusion_diversity inclusion_female)
Topic 14: Slot (OLD: 14_slot_wsg_slots_regulation)
Topic 15: Training (OLD: 1

In [6]:
# ------------------------------------------------------------
# Generate topic time series and visualize trends
# ------------------------------------------------------------

# Timestamps are parsed as years before the per-timestamp series is built
timestamps = pd.to_datetime(timestamps, format="%Y")
topics_over_time = topic_model.topics_over_time(texts, timestamps, topic_model.topics_)

# Save time series data and topic labels for the RAG assistant to use later
topics_over_time.to_csv("../outputs/topics_over_time.csv", index=False)
with open("../outputs/topic_labels.json", "w") as f:
    json.dump(clean_labels, f, indent=2)

# Filter out junk and noise topics from list and count rate
topic_freq = topic_model.get_topic_freq()
valid_topic_freq = topic_freq[
    (~topic_freq["Topic"].isin(junk_topics)) &
    (topic_freq["Topic"] != -1)
]

# Select top 10 most frequent clean topics
top_topics = valid_topic_freq.head(10)["Topic"].tolist()

# Make plot of topic frequency by year
# y-axis = number of text chunks assigned to this topic in a given year
# (the "Frequency" column of topics_over_time is a raw count, not a percentage)
fig = topic_model.visualize_topics_over_time(topics_over_time, topics=top_topics)

# The figure holds one trace per plotted topic in ascending topic-id order,
# and only for topics that actually have rows in topics_over_time. top_topics
# is ordered by frequency instead, so indexing it by trace position can attach
# the wrong label to a line (e.g. when two topics have equal counts).
# Rebuild the plotted order explicitly and fail loudly if it does not match.
topics_with_data = set(topics_over_time["Topic"])
plotted_topics = sorted(t for t in top_topics if t in topics_with_data)
assert len(plotted_topics) == len(fig.data), (
    f"Expected {len(plotted_topics)} traces, found {len(fig.data)}; "
    "cannot safely map traces to topic labels"
)

# Apply custom labels to legend entries and hover text
for trace, topic_id in zip(fig.data, plotted_topics):
    #trace.line.color = iata_palette[i % len(iata_palette)]
    trace.name = clean_labels[topic_id]
    trace.hovertemplate = f"{clean_labels[topic_id]}<extra></extra>"

# Clean up the figure layout
fig.update_layout(
    title=f"Topic Trends Over Time<br><sup>{model_label}</sup>",
    width=1200,
    height=600,
    xaxis_title="Year",
    yaxis_title="Topic Frequency (text chunks per year)",
    legend_title="Topics",
    font=dict(family="Arial", size=12),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.25,
        xanchor="center",
        x=0.5
    )
)

# Export both an interactive (HTML) and a static (PNG) version of the plot
output_filename = f"../outputs/topic_trends_{model_name}"
fig.write_html(f"{output_filename}.html")
fig.write_image(f"{output_filename}.png")

fig.show()

8it [00:00,  8.13it/s]


In [7]:
# ===============================================
# Build the table that links detected trends to
# business opportunities, with a priority ranking
# based on trend size and growth
# ===============================================

# Reuses the topics_over_time frame computed in the trends cell
matched_opportunities_df, unmatched_topics = generate_opportunity_table(
    topic_model,
    topics_over_time,
)

# Short header identifying which model/filtering the table belongs to
print(
    f"Filtering (if any): {model_label}",
    f"Model Name: {model_name}\n",
    "=== Matched Business Opportunities ===\n",
    sep="\n",
)
display(matched_opportunities_df)

# Persist the table as CSV, named after the model
output_filename = f"../outputs/business_opportunities_{model_name}.csv"
matched_opportunities_df.to_csv(output_filename, index=False)

print(f"\nSaved opportunity table to: {output_filename}")

Filtering (if any): From 2015 | Excluding 2020, 2021
Model Name: bertopic_cluster_model_from_2015_excl_2020_2021_20250428

=== Matched Business Opportunities ===



Unnamed: 0,Trend,Detected Label,Implication,Opportunity,Priority (Recommended),Keyword Fraction (%),Topic Growth
0,Growth,0_passenger_year_cargo_growth,Strong recovery in air travel post-pandemic,Expand network capacity and optimize route pla...,High,14.76,9
1,Emissions,1_saf_emissions_carbon_corsia,Increasing pressure for environmental responsi...,Adopt Sustainable Aviation Fuels (SAF) and car...,High,11.73,16
2,Safety,2_safety_iosa_accidents_accident,Continued industry focus on safety standards,Invest in IOSA certification and safety audits,Medium,11.42,-3
3,Regulation,3_tax_regulation_passenger_taxation,Tighter regulatory environment for passenger r...,Implement compliance monitoring and customer c...,Medium,9.25,-8
4,Airport,4_airport_charges_infrastructure_airports,Growing airport-related fees and costs,Negotiate airport charges and improve operatio...,Medium,8.0,-1
5,Security,5_security_information_smart security_smart,Passenger data security and travel security co...,Enhance digital identity management and passen...,Medium,7.69,-1
6,Cargo,6_cargo_lithium_ceiv_e,Continued strength in air cargo and freight de...,Expand cargo services and invest in logistics ...,Medium,7.23,-4
7,Diversity,13_diversity_inclusion_diversity inclusion_female,"Industry-wide DEI (Diversity, Equity, Inclusio...",Strengthen diversity and inclusion initiatives,Medium,2.25,10
8,Regulation,14_slot_wsg_slots_regulation,Tighter regulatory environment for passenger r...,Implement compliance monitoring and customer c...,Medium,2.25,2
9,Settlement,7_settlement_billion_processed_iss,"Changes in financial settlement systems (BSP, ...",Optimize settlement systems and implement fast...,Low,4.97,-2



Saved opportunity table to: ../outputs/business_opportunities_bertopic_cluster_model_from_2015_excl_2020_2021_20250428.csv
