In [1]:
import os
import pdfplumber
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import re
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kruu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kruu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kruu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


****
# Load Report
****

In [2]:
# Raw text is a string concatenating the different lines in a page"
# "Page 1 Line 1\nPage 1 Line 2\nPage 1 Line 3\n...\nPage 2 Line 1\n..."

# Idea: 
# - Get rid of front page and different titles


def extract_text_from_pdf(pdf_path):
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            full_text += page.extract_text() + "\n"
    return full_text

pdf_path = "../data/iata-annual-review-2024.pdf"
raw_text = extract_text_from_pdf(pdf_path)

# First 1000 char in the text
print(raw_text[:1000])

# Is it better to extrck the paragraphs, the lines, the dealiines, etc... ?

IATA
Annual
Review
2024
Contents
02
IATA
Annual
Review
2024
Contents
03 Members’ list 35 Infrastructure
05 Willie Walsh, Director General 40 Regulations & Taxes
08 Yvonne Manzi Makolo, Chair, Board of Governors 46 Cargo
11 Members of the Board of Governors 48 Security
12 Economics 52 Diversity & Inclusion
20 Environment & Sustainability 54 Modern Airline Retailing
26 Safety 57 Financial Services
32 Passenger Experience
International Air Transport Association
Annual Review 2024
80th Annual General Meeting and
World Air Transport Summit, Dubai, United Arab Emirates
Members’
list
03
IATA
Annual
Review
2024
F
Members’ list
FedEx Express
Fiji Airways
Finnair
Fly Baghdad
Fly Namibia
Albastar Cargojet Airways flydubai
A Allied Air Cargolux FlyEgypt
AlMasria Universal Caribbean Airlines Flynas
ABX Air Airlines Carpatair Flyone
Aegean Airlines Amelia (Regourd Aviation) Cathay Pacific Freebird Airlines
Aer Lingus American Airlines Cebu Pacific French Bee
Aero Republica ANA CemAir Fuzhou Airlines

****
# Text Processing
****

In [3]:
# Detect sentence boundaries in the previously built string and do chunks of 5 sentences
# sent_tokenize() uses the Punkt Sentence Tokenizer, which is a pretrained unsupervised algorithm trained to detect sentence boundaries using punctuation and capitalization patterns.
# chunks[i] is a single string made by joining 5 consecutive sentences together.

from nltk.tokenize import sent_tokenize

def chunk_text(text, chunk_size=5):
    sentences = sent_tokenize(text)
    chunks = [" ".join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]
    return chunks

chunks = chunk_text(raw_text)
print(f"Total chunks: {len(chunks)}")

Total chunks: 187


In [4]:
def clean_chunk(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation and non-word characters
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Remove stopwords
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

cleaned_chunks = [clean_chunk(chunk) for chunk in chunks]

****
# Create Embeddings
****

In [7]:
# MiniLM is a smaller version of BERT for semantic similarity task or natural language inference, clustering, QA, etc... 
# Pretrained by HuggingFace
# This model outputs 384 dimensional vectors
# Default batch size = 32

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(cleaned_chunks, show_progress_bar=True)

Batches: 100%|██████████| 6/6 [00:01<00:00,  3.15it/s]


****
# Topic Modeling
****

In [22]:
# Idea: 
# - find the topic for each report + see if there is an evolution in the most cited topic over the years. 

# BERTopic performs dimensionality reduction using UMAP
# Then applies HDBSCAN to find cluster of chunks with similar semantics (-1: no topic assigned)
# Uses class-based TF-IDF to find keywords that represent each topic

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

my_stopwords = text.ENGLISH_STOP_WORDS.union(["iata", "aviation", "air", "industry"])
custom_vectorizer = CountVectorizer(stop_words=list(my_stopwords))
topic_model = BERTopic(vectorizer_model=custom_vectorizer, verbose=True)
topics, probs = topic_model.fit_transform(cleaned_chunks, embeddings)

# Explore topics
topic_model.get_topic_info().head(10)

2025-04-22 11:35:50,198 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 11:35:50,504 - BERTopic - Dimensionality - Completed ✓
2025-04-22 11:35:50,504 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 11:35:50,521 - BERTopic - Cluster - Completed ✓
2025-04-22 11:35:50,521 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 11:35:50,556 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,44,-1_airlines_executive_cargo_officer,"[airlines, executive, cargo, officer, chief, c...",[governors asked iata develop index launched h...
1,0,44,0_safety_security_accident_digital,"[safety, security, accident, digital, accident...",[affect accident rate well fatality risk calcu...
2,1,39,1_airport_charges_airports_development,"[airport, charges, airports, development, infr...",[factors contribute increase ansp charges affe...
3,2,23,2_saf_emissions_corsia_zero,"[saf, emissions, corsia, zero, carbon, net, fu...",[custody saf accounting adds momentum output g...
4,3,22,3_cargo_economics_growth_jul,"[cargo, economics, growth, jul, yoy, source, d...",[industry load factor decreased percentage poi...
5,4,15,4_airlines_airways_diversity_governors,"[airlines, airways, diversity, governors, boar...",[facilitate efficient secure costeffective avi...


In [23]:
topic_model.get_topic_info()["Representation"]

0    [airlines, executive, cargo, officer, chief, c...
1    [safety, security, accident, digital, accident...
2    [airport, charges, airports, development, infr...
3    [saf, emissions, corsia, zero, carbon, net, fu...
4    [cargo, economics, growth, jul, yoy, source, d...
5    [airlines, airways, diversity, governors, boar...
Name: Representation, dtype: object

****
# Visualize
****

In [24]:
topic_model.visualize_topics()

In [None]:
# Idea: Compare the importance of eachtopic over the years. 

In [25]:
topic_model.visualize_barchart(top_n_topics=10)