# Visualise results from the model trained in `clustering_analysis_training.ipynb`

Author: Katharine Leney, April 2025

In [1]:
import joblib
import pandas as pd
import pickle
import os
import sys
import plotly.express as px  
sys.path.append("../src")
from utils.filter_dataframe import filter_dataframe_by_year, get_top_topics
from utils.parse_model_folder import parse_model_folder
from utils.generate_clean_labels import generate_clean_labels
from utils.generate_opportunity_table import generate_opportunity_table

In [2]:
# ====================================
# Load model and associated data
# ====================================

# Name of the folder under ../models/ that holds the saved artefacts
model_name = "bertopic_cluster_model"

# Every artefact for this model sits in the same directory
model_dir = os.path.join("../models/", model_name)
model_path = os.path.join(model_dir, "model")

# Restore the topic model.
# NOTE: joblib/pickle deserialisation can execute arbitrary code, so only
# load artefacts that were produced by a trusted run of the training notebook.
topic_model = joblib.load(model_path)

# Restore the texts and timestamps saved alongside the model
with open(os.path.join(model_dir, "texts.pkl"), "rb") as texts_file:
    texts = pickle.load(texts_file)

with open(os.path.join(model_dir, "timestamps.pkl"), "rb") as timestamps_file:
    timestamps = pickle.load(timestamps_file)

# Label returned by parse_model_folder for this folder name;
# it is reused below in plot titles and printed summaries
model_label = parse_model_folder(model_name)
print(model_label)

# Create the outputs folder if it is not there yet
os.makedirs("../outputs", exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# Explore the topics found by the model.
# The full table is displayed; use topic_info.head(10) for a shorter preview.
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,595,-1_fuel_emissions_passenger_global,"[fuel, emissions, passenger, global, aircraft,...",[Demand for air transport services was not mat...
1,0,233,0_safety_iosa_accidents_accident,"[safety, iosa, accidents, accident, audit, rat...",[22 Auditing to global standards and practices...
2,1,194,1_security_information_smart security_passenger,"[security, information, smart security, passen...",[Smart Security Smart Security is joint initia...
3,2,170,2_settlement_billion_processed_bsp,"[settlement, billion, processed, bsp, cass, is...",[IATA Cargo Account Settlement System (CASS) s...
4,3,146,3_charges_airport_airports_infrastructure,"[charges, airport, airports, infrastructure, c...",[Airports Savings in airport charges of $1.5 b...
5,4,143,4_global_governments_important_board,"[global, governments, important, board, great,...",[Annual Review 2021\n\nWillie Walsh Director G...
6,5,92,5_ndc_order_travel_retailing,"[ndc, order, travel, retailing, distribution, ...","[NDC offer and order management coverage, mean..."
7,6,90,6_baggage_fast travel_fast_boarding,"[baggage, fast travel, fast, boarding, passeng...",[The program provides self-service options at ...
8,7,89,7_e-freight_freight_e-awb_chain,"[e-freight, freight, e-awb, chain, supply chai...",[Key areas of focus for 2009 included Protecti...
9,8,84,8_saf_corsia_emissions_production,"[saf, corsia, emissions, production, carbon, f...",[Sustainable aviation fuels One of the most pr...


In [4]:
# ------------------------------------------------------------
# Visualize intertopic distances
# (move this to a function)
# ------------------------------------------------------------

# N.B. UMAP (Uniform Manifold Approximation and Projection)
# used for dimensionality reduction.
# NOTE(review): custom_labels=True is requested here, but set_topic_labels is
# only called in a later cell -- confirm the saved model already carries
# custom labels, otherwise this map presumably shows the default topic names.
fig_2d = topic_model.visualize_topics(custom_labels=True)

# Both axes get the same look: no grid, visible zero and axis lines,
# ticks on the outside, and no axis title
axis_style = {
    "showgrid": False,
    "zeroline": True,
    "showline": True,
    "ticks": "outside",
    "title": None,
}

# Horizontal legend, centred just above the plotting area
legend_style = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": 1.02,
    "xanchor": "center",
    "x": 0.5,
}

# White background, Arial font, and a title with the model label as subtitle
fig_2d.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    font_family="Arial",
    title=dict(
        text=f"Intertopic Distance Map<br><sup>{model_label}</sup>",
        x=0.15,
        font=dict(size=20),
    ),
    width=1000,
    height=800,
    xaxis=dict(axis_style),
    yaxis=dict(axis_style),
    legend=legend_style,
)

# Save an interactive copy, then render the figure inline
fig_2d.write_html("../outputs/intertopic_distance_map_2D.html")

fig_2d.show()

In [5]:
# ------------------------------------------------------
# Clean up labels and flag junk topics
# ------------------------------------------------------

# Keep hold of the model's existing labels so they can be printed next to the new ones
old_labels = topic_model.topic_labels_

# Build the replacement labels; the helper also returns the topics it treats as junk
clean_labels, junk_topics = generate_clean_labels(topic_model, top_n_words=1)

# Print new vs. old label for every topic, in ascending topic-id order
print("\nGenerated topic labels:")
for topic_id in sorted(clean_labels):
    new_label = clean_labels[topic_id]
    previous_label = old_labels.get(topic_id)
    print(f"Topic {topic_id}: {new_label} (OLD: {previous_label})")

# Register the new labels on the model
topic_model.set_topic_labels(clean_labels)

Topic 4 : global_governments_important_board flagged as junk
Topic 12 : tam_sia_mauritius_niugini flagged as junk
Topic 23 identified as junk:
[('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]
Topic 25 : board_chair_southern_managing flagged as junk

Generated topic labels:
Topic -1: NULL (OLD: -1_fuel_emissions_passenger_global)
Topic 0: Safety (OLD: 0_safety_iosa_accidents_accident)
Topic 1: Security (OLD: 1_security_information_smart security_passenger)
Topic 2: Settlement/BSP (OLD: 2_settlement_billion_processed_bsp)
Topic 3: Airport Charges/Infrastructure (OLD: 3_charges_airport_airports_infrastructure)
Topic 4:  (OLD: 4_global_governments_important_board)
Topic 5: NDC (OLD: 5_ndc_order_travel_retailing)
Topic 6: Baggage (OLD: 6_baggage_fast travel_fast_boarding)
Topic 7: E-freight (OLD: 7_e-freight_freight_e-awb_chain)
Topic 8: SAF (OLD: 8_saf_corsia_emissions_production)
Topic 9: Regulation (OLD: 9_

In [None]:
# ------------------------------------------------------------
# Generate topic time series and visualize trends
# ------------------------------------------------------------

# Timestamps are parsed with the "%Y" format, i.e. as years
timestamps = pd.to_datetime(timestamps, format="%Y")
topics_over_time = topic_model.topics_over_time(texts, timestamps, topic_model.topics_)

# OPTIONAL: Set filter criteria
# Set to None to ignore
min_year = 2018
max_year = None
exclude_years = None  # e.g. [2020]

# Apply year filtering; filter_label is shown as the plot subtitle below
filtered_topics, filter_label = filter_dataframe_by_year(
    topics_over_time,
    min_year=min_year,
    max_year=max_year,
    exclude_years=exclude_years
)

# Apply topic filtering (remove junk and noise topics; -1 is the outlier topic)
filtered_topics = filtered_topics[
    (~filtered_topics["Topic"].isin(junk_topics)) &
    (filtered_topics["Topic"] != -1)
]

# Select top k most frequent clean topics
top_k = 10
top_topics = get_top_topics(filtered_topics, top_k)

# Make plot of topic frequency by year
# y-axis = % of all text chunks assigned to this topic in a given year
# NOTE(review): BERTopic's topics_over_time reports per-timestamp document
# counts unless they are normalised elsewhere -- confirm that the "%" in the
# y-axis title below is accurate.
fig = topic_model.visualize_topics_over_time(filtered_topics, topics=top_topics)

# Apply custom colors and labels
palette = px.colors.qualitative.T10

# BERTopic adds one trace per plotted topic in ascending topic-id order,
# regardless of the order of the `topics=` argument. Pairing traces with
# top_topics by position would therefore attach the wrong label whenever
# top_topics is ordered by frequency, so rebuild the plotted order explicitly.
present_topic_ids = set(filtered_topics["Topic"])
plotted_topic_ids = sorted({t for t in top_topics if t in present_topic_ids})

if len(plotted_topic_ids) != len(fig.data):
    # Refuse to relabel rather than silently mislabel the lines
    raise ValueError(
        f"Expected {len(plotted_topic_ids)} traces (one per plotted topic) "
        f"but the figure has {len(fig.data)}; cannot map traces to topic labels."
    )

for i, (trace, topic_id) in enumerate(zip(fig.data, plotted_topic_ids)):
    trace.line.color = palette[i % len(palette)]
    trace.name = clean_labels[topic_id]
    # Hover shows only the clean label; <extra></extra> hides the secondary box
    trace.hovertemplate = f"{clean_labels[topic_id]}<extra></extra>"

# Clean up the figure layout
fig.update_layout(
    title=f"Topic Trends Over Time<br><sup>{filter_label}</sup>",
    width=1200,
    height=600,
    xaxis_title="Year",
    yaxis_title="Topic Frequency (% of Annual Report)",
    legend_title="Topics",
    font=dict(family="Arial", size=12),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.4,
        xanchor="center",
        x=0.5
    )
)

# Save both an interactive (HTML) and a static (PNG) copy of the figure
output_filename = "../outputs/topic_trends"
fig.write_html(f"{output_filename}.html")
fig.write_image(f"{output_filename}.png")

fig.show()

19it [00:05,  3.60it/s]


In [7]:
# ===============================================
# Generate a table matching trends to business
# opportunities, and assign a priority ranking
# based on trend size and growth
# ===============================================

# Build the table from the year/junk-filtered topic frame, using the clean labels
opportunity_result = generate_opportunity_table(
    topic_model,
    filtered_topics,
    clean_labels=clean_labels,
)
matched_opportunities_df, unmatched_topics = opportunity_result

# Header lines, then the table itself rendered as a rich DataFrame
print(
    f"Filtering (if any): {model_label}",
    f"Model Name: {model_name}\n",
    "=== Matched Business Opportunities ===\n",
    sep="\n",
)
display(matched_opportunities_df)

# Save table as CSV (row index omitted)
output_filename = "../outputs/business_opportunities.csv"
matched_opportunities_df.to_csv(output_filename, index=False)

print(f"\nSaved opportunity table to: {output_filename}")

Filtering (if any): 
Model Name: bertopic_cluster_model

=== Matched Business Opportunities ===



Unnamed: 0,Trend,Detected Label,Implication,Ongoing Activity,Priority (Recommended),Keyword Fraction (%),Topic Growth
0,SAF,8_saf_corsia_emissions_production,Increased scrutiny on aviation’s environmental...,"Invest in low-carbon technologies, emissions t...",High,3.47,5.6676
1,Baggage,6_baggage_fast travel_fast_boarding,"Growing passenger expectations for faster, mor...",Implement RFID and real-time tracking to impro...,Medium,3.72,1.0087
2,Safety,0_safety_iosa_accidents_accident,Persistent industry focus on accident preventi...,Strengthen IOSA participation and implement pr...,Low,9.62,-0.6366
3,Security,1_security_information_smart security_passenger,Rising demand for seamless and secure passenge...,Adopt biometric screening and reinforce data p...,Low,8.01,-0.5408
4,NDC,5_ndc_order_travel_retailing,"Shift towards modern, personalised airline ret...",Deploy NDC to gain control over distribution a...,Low,3.8,-1.5662
5,Tax,13_tax_taxes_taxation_government,Increasing government taxation and levies on a...,Engage in policy dialogue and assess tax-effic...,Low,2.73,-0.5746
6,Training,19_training_itdi_strategic_courses,"Strategic need for skilled, future-ready aviat...",Modernise training delivery and align curricul...,Low,2.11,-0.0958
7,Emissions,20_emissions_carbon_environmental_fuel efficiency,Increased scrutiny on aviation’s environmental...,"Invest in low-carbon technologies, emissions t...",Low,2.11,-0.0958
8,Profit,21_profit_billion_net_profits,Airline profitability rebounding but under pre...,Streamline operations and explore ancillary re...,Low,1.94,0.6085
9,IENVA,29_ienva_waste_environmental_assessment,Airlines under pressure to align with environm...,Adopt IENVA tools and embed environmental mana...,Low,1.16,0.6085



Saved opportunity table to: ../outputs/business_opportunities.csv
