# Clustering analysis using unsupervised learning
This notebook loads IATA annual review text chunks and applies BERTopic to identify dominant topics and trends over time.
BERTopic: https://arxiv.org/abs/2203.05794

Author: Katharine Leney, April 2025

In [1]:
from bertopic import BERTopic # topic modelling
from sentence_transformers import SentenceTransformer # embeddings
import json
from sklearn.feature_extraction.text import CountVectorizer
import random
import numpy as np
import torch
import pickle
from datetime import datetime   
from umap import UMAP
from hdbscan import HDBSCAN
import os
import shutil
import sys
sys.path.append("../src")
from utils.stopwords import CUSTOM_STOPWORDS

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data (chunks extracted from annual
# reports using src/extract.py).
# The encoding is passed explicitly so the JSON file is decoded the same way
# on every platform (open() otherwise falls back to the locale's encoding).
with open("../data/chunks.json", encoding="utf-8") as f:
    data = json.load(f)

# Parallel lists: texts[i] and timestamps[i] come from the same entry
texts = [entry["text"] for entry in data]
timestamps = [entry["year"] for entry in data]  # used for topic over time

# Make sure the models and outputs folders exist
os.makedirs("../outputs", exist_ok=True)
os.makedirs("../models", exist_ok=True)

In [3]:
# --------------------------------------------------
# Set hyperparameters for UMAP and HDBSCAN
# (chosen using grid_search_clustering_analysis.ipynb)
# Set random seeds everywhere to improve stability
# of model (needed with small dataset)
# --------------------------------------------------

# Single seed value shared by every random number generator seeded below
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# If using GPU (for completeness):
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Fix UMAP state
# (controls topical separation)
umap_model = UMAP(
    n_neighbors=10, # larger = clusters more blended, smaller = tighter clusters
    n_components=5,
    # Minimum spacing between points in the reduced space. The UMAP docs give
    # the usable range as 0.0--0.99: smaller = tighter, more clumped clusters,
    # larger = points spread out more evenly.
    min_dist=0.1,
    metric='cosine',
    random_state=SEED
    )

# Fix HDBSCAN state
# (controls topic stability)
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
    )


In [4]:
# Sentence-embedding model: turns sentences into dense vectors
# https://medium.com/@yasindusanjeewa8/dense-vectors-in-natural-language-processing-06818dff5cd7
#
# all-MiniLM-L6-v2 is a lightweight sentence transformer from Hugging Face
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
embedding_model_name = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding_model_name)

In [5]:
# Tokens are runs of word characters and hyphens, at least 2 characters long,
# that contain at least one ASCII letter.
# The lookahead scans hyphens as well as word characters, so the required
# letter may sit after a hyphen: "5-year" is kept as a single token (rather
# than being cut down to "year"), while purely numeric tokens such as
# "2019-2023" are still rejected.
token_pattern = r"(?u)\b(?=[\w-]*[A-Za-z])[\w-]{2,}\b"

# Count-based vectorizer, later handed to BERTopic as its vectorizer_model
custom_vectorizer = CountVectorizer(
    token_pattern=token_pattern,  # custom tokenisation rule defined above
    ngram_range=(1, 2),           # unigrams and bigrams
    stop_words=CUSTOM_STOPWORDS,  # project stopword list from utils.stopwords
)

In [6]:
# Assemble the BERTopic pipeline from the embedding model, vectorizer,
# UMAP and HDBSCAN objects configured earlier in the notebook
bertopic_settings = dict(
    embedding_model=embedding_model,
    vectorizer_model=custom_vectorizer,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True,  # log the progress of each pipeline stage
)
topic_model = BERTopic(**bertopic_settings)

# Fit on the text chunks
topics, probs = topic_model.fit_transform(texts)

2025-04-30 01:32:02,928 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 95/95 [00:10<00:00,  9.18it/s]
2025-04-30 01:32:13,360 - BERTopic - Embedding - Completed ✓
2025-04-30 01:32:13,361 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-30 01:32:23,551 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:32:23,551 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-30 01:32:23,707 - BERTopic - Cluster - Completed ✓
2025-04-30 01:32:23,709 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-30 01:32:24,134 - BERTopic - Representation - Completed ✓


In [7]:
# Save the fitted model, plus the texts and timestamps, into one folder
save_model = True
# Set to True to append today's date (YYYYMMDD) to the folder name instead of
# overwriting the undated folder
datestamp_model_dir = False

if save_model:
    model_dir = "../models/bertopic_cluster_model"
    if datestamp_model_dir:
        model_dir += "_" + datetime.now().strftime("%Y%m%d")

    print("Saving model to ", model_dir)

    # If directory exists, delete it (clean overwrite)
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)

    # Create the directory
    os.makedirs(model_dir)

    # Save the BERTopic model
    topic_model.save(os.path.join(model_dir, "model"))

    # Save the texts and timestamps
    with open(os.path.join(model_dir, "texts.pkl"), "wb") as f:
        pickle.dump(texts, f)

    with open(os.path.join(model_dir, "timestamps.pkl"), "wb") as f:
        pickle.dump(timestamps, f)



Saving model to  ../models/bertopic_cluster_model


In [8]:
# Inspect the parameters of the vectorizer attached to the fitted model
# (the output is long because it includes the full stop-word list)
fitted_vectorizer = topic_model.vectorizer_model
fitted_vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'preprocessor': None,
 'stop_words': ['a',
  'about',
  'above',
  'across',
  'after',
  'afterwards',
  'again',
  'against',
  'air',
  'airline',
  'airlines',
  'airway',
  'airways',
  'all',
  'almost',
  'alone',
  'along',
  'already',
  'also',
  'although',
  'always',
  'am',
  'among',
  'amongst',
  'amoungst',
  'amount',
  'an',
  'and',
  'annual',
  'another',
  'any',
  'anyhow',
  'anyone',
  'anything',
  'anyway',
  'anywhere',
  'apr',
  'april',
  'are',
  'around',
  'as',
  'association',
  'at',
  'aug',
  'august',
  'aviation',
  'back',
  'be',
  'became',
  'because',
  'become',
  'becomes',
  'becoming',
  'been',
  'before',
  'beforehand',
  'behind',
  'being',
  'below',
  'beside',
  'besides',
  'between',
  'beyond',
  '

In [9]:
# Explore top topics: show the first 10 rows of the topic summary table
# (drop .head(10) to display every topic)
topic_overview = topic_model.get_topic_info()
topic_overview.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,595,-1_fuel_emissions_passenger_global,"[fuel, emissions, passenger, global, cargo, ai...",[Region 2023 2022 5-year average (2019-2023) A...
1,0,233,0_safety_iosa_accidents_accident,"[safety, iosa, accidents, accident, audit, rat...",[19 SAFETY Safety audits Airlines on the IOSA ...
2,1,194,1_security_information_smart security_passenger,"[security, information, smart security, passen...",[Smart Security Smart Security is joint initia...
3,2,170,2_settlement_billion_processed_bsp,"[settlement, billion, processed, bsp, cass, is...",[IATA Cargo Account Settlement System (CASS) s...
4,3,146,3_charges_airport_airports_infrastructure,"[charges, airport, airports, infrastructure, c...",[Airports Savings in airport charges of $1.5 b...
5,4,143,4_global_governments_important_great,"[global, governments, important, great, future...",[Annual Review 2021\n\nWillie Walsh Director G...
6,5,92,5_ndc_order_travel_retailing,"[ndc, order, travel, retailing, distribution, ...",[With passenger volume expected to double by 2...
7,6,90,6_baggage_fast travel_fast_boarding,"[baggage, fast travel, fast, boarding, passeng...",[The program provides self-service options at ...
8,7,89,7_cargo_e-freight_freight_e-awb,"[cargo, e-freight, freight, e-awb, chain, supp...","[In partnership with shippers, freight forward..."
9,8,84,8_saf_corsia_emissions_production,"[saf, corsia, emissions, production, carbon, i...",[Sustainable aviation fuels One of the most pr...


In [10]:

# Write the complete topic table to a CSV file for easier viewing/cross-checks
topic_info_csv = "../data/topic_info.csv"
topic_model.get_topic_info().to_csv(topic_info_csv, index=False)

In [11]:
# ------------------------------------------------------------
# Intertopic distance map (2D)
# ------------------------------------------------------------

# N.B. UMAP (Uniform Manifold Approximation and Projection)
# used for dimensionality reduction.
fig_2d = topic_model.visualize_topics(custom_labels=True)

# Both axes share one look: no grid, visible zero line and axis line,
# outward ticks and no axis title
axis_style = dict(
    showgrid=False,
    zeroline=True,
    showline=True,
    ticks="outside",
    title=None,
)

# Horizontal legend, centred just above the plotting area
legend_style = dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="center",
    x=0.5,
)

# Update background and grid
fig_2d.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    font_family="Arial",
    title_font_size=20,
    title_x=0.15,  # horizontal title position in normalized coordinates (0 = left, 1 = right)
    width=1000,
    height=800,
    xaxis=dict(axis_style),
    yaxis=dict(axis_style),
    legend=legend_style,
)

# Save a static PNG and an interactive HTML copy, then display inline
fig_2d.write_image("../outputs/intertopic_distance_map_2D.png")
fig_2d.write_html("../outputs/intertopic_distance_map_2D.html")

fig_2d.show()

