# Clustering analysis using unsupervised learning
This notebook loads IATA annual review text chunks and applies BERTopic to identify dominant topics and trends over time.
BERTopic: https://arxiv.org/abs/2203.05794

Author: Katharine Leney, April 2025

In [None]:
from bertopic import BERTopic # topic modelling
from sentence_transformers import SentenceTransformer # embeddings
import json
from sklearn.feature_extraction.text import CountVectorizer
import random
import numpy as np
import torch
import pickle
from datetime import datetime   
from umap import UMAP
from hdbscan import HDBSCAN
import os
import shutil
import sys
sys.path.append("../src")
from stopwords import CUSTOM_STOPWORDS

In [2]:
# ================================================================
# Function to filter documents by minimum year, maximum year, 
# and/or specific years to exclude.
# ================================================================

def filter_data(data, min_year=None, max_year=None, exclude_years=None):
    """
    Keep the entries of ``data`` whose year passes every active filter.

    Args:
        data (list): List of dicts with at least a 'year' field. The value
            is converted with ``int()`` before any comparison.
        min_year (int, optional): Inclusive lower bound on the year.
        max_year (int, optional): Inclusive upper bound on the year.
        exclude_years (list of int, optional): Years to drop.

    Returns:
        tuple: (filtered_data, description_string). The description joins
        one tag per active filter with underscores; it is "all data" when
        no tag is produced.
    """
    def _rejected(record):
        # The three checks short-circuit in the order min, max, exclusions.
        yr = int(record["year"])
        return (
            (min_year is not None and yr < min_year)
            or (max_year is not None and yr > max_year)
            or (exclude_years is not None and yr in exclude_years)
        )

    kept = [record for record in data if not _rejected(record)]

    # One optional tag per filter, concatenated in a fixed order.
    lower_tag = [f"from_{min_year}"] if min_year is not None else []
    upper_tag = [f"up_to_{max_year}"] if max_year is not None else []
    if exclude_years:
        skipped = "_".join(str(yr) for yr in exclude_years)
        excluded_tag = [f"excl_{skipped}"]
    else:
        excluded_tag = []

    description = "_".join(lower_tag + upper_tag + excluded_tag) or "all data"

    return kept, description

In [None]:
# Load data (chunks extracted from annual
# reports using src/extract.py)
# The file is read explicitly as UTF-8 so that non-ASCII characters in the
# report text decode the same way on every platform, instead of depending
# on the locale's default encoding.
with open("../data/chunks.json", encoding="utf-8") as f:
    data = json.load(f)

# Possibility to filter data (e.g. focus on last 10 years but exclude pandemic)
apply_filter = True
# Stays empty when no filter is applied; the value is reused further down
# in the name of the saved-model directory.
filter_description = ""
if apply_filter:
    filtered_data, filter_description = filter_data(data, min_year=2015, exclude_years=[2020, 2021])
    data = filtered_data
    print(filter_description)

# Parallel lists: texts[i] and timestamps[i] come from the same chunk
texts = [entry["text"] for entry in data]
timestamps = [entry["year"] for entry in data]  # used for topic over time

from_2015_excl_2020_2021


In [4]:
# --------------------------------------------------
# Hyperparameters for UMAP and HDBSCAN
# (chosen using grid_search_clustering_analysis.ipynb)
# Every random number generator is seeded with the
# same value to improve stability of the model
# (needed with small dataset)
# --------------------------------------------------

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Seed the GPU generators too when CUDA is present (for completeness)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Dimensionality reduction, with a fixed random state
# (controls topical separation)
umap_model = UMAP(
    n_neighbors=10,     # larger = clusters more blended, smaller = tighter clusters
    n_components=5,     # dimensionality of the reduced embeddings
    min_dist=0.1,       # minimum spacing between embedded points (sensible range 0.0-0.99):
                        # smaller = tighter, denser clumps; larger = looser, more even spread
    metric='cosine',
    random_state=SEED,
)

# Clustering of the reduced embeddings
# (controls topic stability)
hdbscan_model = HDBSCAN(
    min_cluster_size=20,              # smallest group of points accepted as a cluster
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [5]:
# Sentence-embedding model: a lightweight sentence transformer from Hugging Face
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# It turns each sentence into a dense vector; background reading:
# https://medium.com/@yasindusanjeewa8/dense-vectors-in-natural-language-processing-06818dff5cd7

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Keep only purely alphabetic tokens (ASCII letters a-z / A-Z). Tokens that
# contain a digit or an underscore are not matched at all, which is what
# excludes numbers from the vocabulary.
# The pattern is the simplified, equivalent form of
# [a-zA-Z]*[a-zA-Z][a-zA-Z]* (same matches, no redundant backtracking).
# NOTE: single-letter tokens still match; they are only removed when they
# appear in CUSTOM_STOPWORDS.
token_pattern = r"(?u)\b[a-zA-Z]+\b"

custom_vectorizer = CountVectorizer(
    stop_words=CUSTOM_STOPWORDS, # Remove stopwords
    ngram_range=(1, 2),          # Include unigrams and bigrams
    token_pattern=token_pattern
)

In [7]:
# Assemble the BERTopic pipeline from the components configured above,
# then fit it on the text chunks
topic_model = BERTopic(
    embedding_model=embedding_model,      # sentence embeddings
    umap_model=umap_model,                # dimensionality reduction
    hdbscan_model=hdbscan_model,          # clustering
    vectorizer_model=custom_vectorizer,   # tokenisation for topic words
    calculate_probabilities=True,
    verbose=True,
)

# fit_transform returns one topic assignment per text, plus the probabilities
topics, probs = topic_model.fit_transform(texts)

2025-04-26 15:14:02,424 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/45 [00:00<?, ?it/s]

2025-04-26 15:14:07,762 - BERTopic - Embedding - Completed ✓
2025-04-26 15:14:07,763 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-26 15:14:12,059 - BERTopic - Dimensionality - Completed ✓
2025-04-26 15:14:12,060 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-26 15:14:12,106 - BERTopic - Cluster - Completed ✓
2025-04-26 15:14:12,107 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-26 15:14:12,265 - BERTopic - Representation - Completed ✓


In [8]:
# Persist the fitted model together with its inputs, in a directory
# named after the active filter and today's date
save_model = True

if save_model:
    today = datetime.now().strftime("%Y%m%d")
    model_dir = f"../models/bertopic_cluster_model_{filter_description}_{today}"

    print("Saving model to ", model_dir)

    # Clean overwrite: remove any earlier save with the same name,
    # then start again from an empty directory
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)

    # The BERTopic model itself
    topic_model.save(os.path.join(model_dir, "model"))

    # The texts and timestamps the model was fitted on, one pickle each
    for pickle_name, payload in (("texts.pkl", texts), ("timestamps.pkl", timestamps)):
        with open(os.path.join(model_dir, pickle_name), "wb") as f:
            pickle.dump(payload, f)



Saving model to  ../models/bertopic_cluster_model_from_2015_excl_2020_2021_20250426


In [9]:
# Inspect the parameters of the vectorizer attached to the fitted model
# (the returned dict includes the full stop-word list, so the output is long)
topic_model.vectorizer_model.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'preprocessor': None,
 'stop_words': ['a',
  'about',
  'above',
  'across',
  'after',
  'afterwards',
  'again',
  'against',
  'air',
  'all',
  'almost',
  'alone',
  'along',
  'already',
  'also',
  'although',
  'always',
  'am',
  'among',
  'amongst',
  'amoungst',
  'amount',
  'an',
  'and',
  'annual',
  'another',
  'any',
  'anyhow',
  'anyone',
  'anything',
  'anyway',
  'anywhere',
  'apr',
  'april',
  'are',
  'around',
  'as',
  'association',
  'at',
  'aug',
  'august',
  'aviation',
  'back',
  'be',
  'became',
  'because',
  'become',
  'becomes',
  'becoming',
  'been',
  'before',
  'beforehand',
  'behind',
  'being',
  'below',
  'beside',
  'besides',
  'between',
  'beyond',
  'bill',
  'bisignani',
  'both',
  'bottom',
  'but',

In [10]:
# Explore top topics: show the first 10 rows of the topic summary table
# (drop .head(10) to display every topic)
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,142,-1_fuel_airlines_s_timatic,"[fuel, airlines, s, timatic, airport, jet fuel...","[60 IATA OFFICES Geneva Executive Office 33,..."
1,0,201,0_passenger_year_cargo_growth,"[passenger, year, cargo, growth, load, demand,...",[Africa Russia Brazil Jan Feb Mar Apr May Jun ...
2,1,151,1_saf_emissions_carbon_corsia,"[saf, emissions, carbon, corsia, environmental...",[Environment & Sustainability 28 IATA Annual ...
3,2,147,2_safety_iosa_accidents_accident,"[safety, iosa, accidents, accident, aircraft, ...",[Safety continues to improve The last decade s...
4,3,110,3_tax_regulation_taxation_passenger,"[tax, regulation, taxation, passenger, unruly,...",[The industry is asking governments to adopt s...
5,4,108,4_cargo_data_lithium_ceiv,"[cargo, data, lithium, ceiv, e, freight, goods...",[Special cargo Standardized air cargo processe...
6,5,102,5_airport_charges_infrastructure_airports,"[airport, charges, infrastructure, airports, d...",[Cost-efficiency and lower charges Infrastruct...
7,6,98,6_security_information_smart security_smart,"[security, information, smart security, smart,...",[The UN resolution confirms ICAO’s leadership...
8,7,64,7_settlement_billion_processed_iss,"[settlement, billion, processed, iss, bsp, fin...","[At the end of 2017, CASS was processing 94 ..."
9,8,60,8_ndc_order_travel_airline,"[ndc, order, travel, airline, retailing, airli...",[IATA has drafted a recommended practice for ...


In [11]:

# Export full topic info to csv file for easier viewing/cross-checks
# (index=False leaves the DataFrame row index out of the file)
topic_model.get_topic_info().to_csv("../data/topic_info.csv", index=False)


In [12]:
# ------------------------------------------------------------
# Visualize intertopic distances
# ------------------------------------------------------------

# N.B. UMAP (Uniform Manifold Approximation and Projection)
# used for dimensionality reduction.
fig_2d = topic_model.visualize_topics(custom_labels=True)

# Both axes share one look: no grid, visible zero line and axis line,
# ticks outside the plot area, no axis title
axis_style = dict(
    showgrid=False,
    zeroline=True,
    showline=True,
    ticks="outside",
    title=None
)

# Update background, fonts, size, axes and legend
fig_2d.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    font_family="Arial",
    title_font_size=20,
    title_x=0.15,  # horizontal title position as a fraction of the width (0.5 would centre it)
    width=1000,
    height=800,
    xaxis=axis_style,
    yaxis=axis_style,
    legend=dict(
        orientation="h",  # horizontal legend placed just above the plot area
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5
    )
)

# Make sure the output directory exists before writing, so the export does
# not fail on a fresh checkout
os.makedirs("../outputs", exist_ok=True)

fig_2d.write_image("../outputs/intertopic_distance_map_2D.png")
fig_2d.write_html("../outputs/intertopic_distance_map_2D.html")

fig_2d.show()

