# Overview

# Importing Necessary Libraries

In [None]:
import pandas as pd
import os
import spacy
import re
import numpy as np
import random

# !python -m spacy download en_core_web_sm   # only needed if the model loaded by spacy.load('en_core_web_sm') is missing

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans

# Importing Topic modelling libraries
!pip install bertopic[all] sentence-transformers
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

Collecting bertopic[all]
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5

# Loading Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
dataset_path = '/content/drive/MyDrive/bbc'
dataset = []

First, I combine the individual text files into a list of records and then into a DataFrame, to make processing easier.

In [None]:
# Build one record per article by walking <dataset_path>/<category>/<file>.
# The list is reset here so that re-running this cell does not append every
# article a second time.
dataset = []

# os.listdir() returns entries in arbitrary order, so both listings are sorted
# to give the same row order (and therefore the same DataFrame index) on every run.
for category in sorted(os.listdir(dataset_path)):
    category_path = os.path.join(dataset_path, category)
    if not os.path.isdir(category_path):
        continue  # loose files in the dataset root are not categories

    for filename in sorted(os.listdir(category_path)):
        file_path = os.path.join(category_path, filename)
        if not os.path.isfile(file_path):
            continue  # a nested folder cannot be opened as a text file

        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().strip()

        # Identifier = first three letters of the category + file name without
        # its extension (splitext works for extensions of any length).
        stem = os.path.splitext(filename)[0]
        dataset.append({'id': category[:3] + '_' + stem, 'text': text, 'category': category})

In [None]:
text_df = pd.DataFrame(dataset)      # converting the resulting list to a dataframe

In [None]:
text_df

Unnamed: 0,id,text,category
0,ent_115,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment
1,ent_352,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment
2,ent_094,God cut from Dark Materials film\n\nThe direct...,entertainment
3,ent_336,Films on war triumph at Sundance\n\nA study of...,entertainment
4,ent_073,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment
...,...,...,...
2220,pol_065,Guantanamo pair's passport ban\n\nThe governme...,politics
2221,pol_344,Will Tory tax cuts lift spirits?\n\nMichael Ho...,politics
2222,pol_084,Job cuts 'false economy' - TUC\n\nPlans to sh...,politics
2223,pol_318,Labour in constituency race row\n\nLabour's ch...,politics


# Preprocessing Text Data

First step: check the raw texts for exact duplicates.

In [None]:
# Identify duplicate texts; keep=False flags every copy, not only the repeats
duplicate_ind = text_df[text_df.duplicated(subset=['text'], keep=False)].index.tolist()

# Print a summary instead of the full list of index labels
print(f"{len(duplicate_ind)} rows share their text with at least one other row; "
      f"first 10 index labels: {duplicate_ind[:10]}")

[4, 6, 12, 13, 17, 19, 52, 67, 68, 84, 110, 122, 127, 144, 153, 165, 169, 170, 182, 185, 208, 219, 223, 228, 235, 240, 247, 265, 268, 270, 295, 306, 318, 369, 421, 446, 469, 494, 545, 572, 588, 679, 682, 705, 797, 850, 952, 962, 1008, 1039, 1042, 1056, 1088, 1106, 1196, 1284, 1312, 1338, 1340, 1393, 1409, 1410, 1411, 1412, 1422, 1426, 1428, 1436, 1439, 1455, 1461, 1464, 1468, 1469, 1473, 1476, 1478, 1479, 1484, 1487, 1491, 1492, 1493, 1496, 1504, 1505, 1511, 1514, 1525, 1526, 1528, 1533, 1546, 1553, 1559, 1560, 1569, 1570, 1573, 1577, 1578, 1579, 1585, 1588, 1590, 1592, 1593, 1594, 1597, 1599, 1601, 1606, 1607, 1612, 1613, 1618, 1622, 1623, 1630, 1633, 1635, 1636, 1637, 1638, 1641, 1642, 1645, 1651, 1659, 1666, 1667, 1669, 1674, 1675, 1679, 1687, 1688, 1689, 1690, 1698, 1702, 1703, 1704, 1706, 1707, 1709, 1711, 1717, 1718, 1719, 1720, 1722, 1726, 1729, 1741, 1746, 1752, 1754, 1756, 1757, 1759, 1771, 1772, 1782, 1791, 1794, 1803, 1805, 1816, 1834, 1835, 1849, 1909, 1928, 1940, 1953, 195

In [None]:
text_df.loc[duplicate_ind].sort_values(by='text')

Unnamed: 0,id,text,category
1464,tec_166,'Brainwave' cap controls computer\n\nA team of...,tech
1794,tec_165,'Brainwave' cap controls computer\n\nA team of...,tech
1973,pol_298,'Debate needed' on donations cap\n\nA cap on d...,politics
1849,pol_059,'Debate needed' on donations cap\n\nA cap on d...,politics
2009,pol_108,'Super union' merger plan touted\n\nTwo of Bri...,politics
...,...,...,...
1410,tec_315,Warning over tsunami aid website\n\nNet users ...,tech
1439,tec_312,Web radio takes Spanish rap global\n\nSpin the...,tech
1659,tec_090,Web radio takes Spanish rap global\n\nSpin the...,tech
1805,tec_313,What high-definition will do to DVDs\n\nFirst ...,tech


Now that the duplicated rows have been identified, remove them, keeping the first occurrence of each text.

In [None]:
# Drop rows whose raw text repeats an earlier row (the first occurrence stays)
text_df.drop_duplicates(subset='text', keep='first', inplace=True)

In [None]:
text_df

Unnamed: 0,id,text,category,preprocessed_text
0,ent_115,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment,rapper snoop dogg sue rape rapper snoop dogg ...
1,ent_352,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment,vera drake bafta triumph hope bafta film awar...
2,ent_094,God cut from Dark Materials film\n\nThe direct...,entertainment,god cut dark materials film director screenwr...
3,ent_336,Films on war triumph at Sundance\n\nA study of...,entertainment,film war triumph sundance study united states...
4,ent_073,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment,ray dvd beats box office oscarnominate film b...
...,...,...,...,...
2220,pol_065,Guantanamo pair's passport ban\n\nThe governme...,politics,guantanamo pair passport ban government write...
2221,pol_344,Will Tory tax cuts lift spirits?\n\nMichael Ho...,politics,tory tax cut lift spirit michael howard final...
2222,pol_084,Job cuts 'false economy' - TUC\n\nPlans to sh...,politics,job cut false economy tuc plan shed 71000 ...
2223,pol_318,Labour in constituency race row\n\nLabour's ch...,politics,labour constituency race row labour choice wh...


In [None]:
nlp = spacy.load('en_core_web_sm')

# Stopword list from http://mlg.ucd.ie/files/datasets/stopwords.txt, stored in
# the dataset folder and read one entry per line. The path is built from
# dataset_path so the folder location is configured in a single place.
# Blank lines are skipped so the empty string never becomes a "stopword".
stopwords_path = os.path.join(dataset_path, 'stopwords.txt')
with open(stopwords_path, 'r', encoding='utf-8') as file:
    custom_stopwords = [line.strip() for line in file if line.strip()]

# Show a summary rather than dumping the whole list into the notebook output
print(f"Loaded {len(custom_stopwords)} stopwords, e.g. {custom_stopwords[:10]}")

['a', 'about', 'above', 'according', 'across', 'actually', 'adj', 'after', 'afterwards', 'again', 'all', 'almost', 'along', 'already', 'also', 'although', 'always', 'among', 'amongst', 'an', 'am', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anywhere', 'are', 'aren', "aren't", 'around', 'as', 'at', 'be', 'became', 'because', 'become', 'becomes', 'been', 'beforehand', 'begin', 'being', 'below', 'beside', 'besides', 'between', 'both', 'but', 'by', 'can', 'cannot', "can't", 'caption', 'co', 'come', 'could', 'couldn', "couldn't", 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'don', "don't", 'down', 'during', 'each', 'early', 'eg', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'etc', 'even', 'ever', 'every', 'everywhere', 'except', 'few', 'for', 'found', 'from', 'further', 'had', 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'him', 'his', 'how', 'however', 'ie', 'i

In [None]:
#Creating a function for preprocessing - lowercasing, lemmatization, stopword removal

def preprocess(text):
    """Turn one raw article into a space-separated string of lowercase lemmas.

    Steps:
      1. delete every character that is not an ASCII letter, a digit or whitespace;
      2. run the result through the spaCy pipeline ``nlp``;
      3. lowercase each token's lemma and drop lemmas listed in ``custom_stopwords``.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces.
    """
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Set membership is O(1); building the set once per call replaces a scan
    # of the whole stopword list for every single token.
    stopword_set = set(custom_stopwords)

    # Lemmatization and stopword removal using the custom list
    kept_lemmas = []
    for token in nlp(text):
        lemma = token.lemma_.lower()  # computed once, used for both test and output
        if lemma not in stopword_set:
            kept_lemmas.append(lemma)

    return ' '.join(kept_lemmas)

text_df['preprocessed_text'] = text_df['text'].apply(preprocess)

In [None]:
# Delete the line-break characters left in the preprocessed text, then preview
text_df['preprocessed_text'] = text_df['preprocessed_text'].str.replace('\n', '', regex=False)
text_df.head()

Unnamed: 0,id,text,category,preprocessed_text
0,ent_115,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment,rapper snoop dogg sue rape rapper snoop dogg ...
1,ent_352,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment,vera drake bafta triumph hope bafta film awar...
2,ent_094,God cut from Dark Materials film\n\nThe direct...,entertainment,god cut dark materials film director screenwr...
3,ent_336,Films on war triumph at Sundance\n\nA study of...,entertainment,film war triumph sundance study united states...
4,ent_073,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment,ray dvd beats box office oscarnominate film b...


Next, check the preprocessed text for rows that became identical after preprocessing, so that these duplicates can be removed as well.

In [None]:
# Index labels of every row whose preprocessed text also occurs in another row
is_repeated = text_df.duplicated(subset='preprocessed_text', keep=False)
ind = text_df.loc[is_repeated].index.tolist()

In [None]:
text_df.loc[ind].sort_values(by='preprocessed_text')

Unnamed: 0,id,text,category,preprocessed_text
1755,tec_224,Disney backs Sony DVD technology\n\nA next gen...,tech,disney back sony dvd technology generation dv...
1796,tec_219,Disney backs Sony DVD technology\n\nA next gen...,tech,disney back sony dvd technology generation dv...
1538,tec_155,Games win for Blu-ray DVD format\n\nThe next-g...,tech,game win bluray dvd format nextgeneration dvd...
1547,tec_294,Games win for Blu-ray DVD format\n\nThe next-g...,tech,game win bluray dvd format nextgeneration dvd...
731,spo_332,Harinordoquy suffers France axe\n\nNumber eigh...,sport,harinordoquy suffer france axe number eight i...
750,spo_344,Harinordoquy suffers France axe\n\nNumber eigh...,sport,harinordoquy suffer france axe number eight i...
497,spo_470,Moya emotional at Davis Cup win\n\nCarlos Moya...,sport,moya emotional davis cup win carlos moya desc...
615,spo_469,Moya emotional after Davis Cup win\n\nCarlos M...,sport,moya emotional davis cup win carlos moya desc...
1661,tec_342,'No re-draft' for EU patent law\n\nA proposed ...,tech,redraft eu patent law propose european law so...
1701,tec_048,'No re-draft' for EU patent law\n\nA proposed ...,tech,redraft eu patent law propose european law so...


In [None]:
# Drop rows whose preprocessed text repeats an earlier row (the first occurrence stays)
text_df.drop_duplicates(subset='preprocessed_text', keep='first', inplace=True)

# Exploratory Data Analysis

# Sub-categorizing Main Categories

To break each main category down into sub-categories, I use BERTopic.

## Reusable Functions

In [None]:
# Creating a function to run the initial clustering of the top categories
def run_bertopic_pipeline(
    df,
    top_category,
    embedding_model = None,
    min_topic_size = 5
):
    """Fit a BERTopic model on the documents of one top-level category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'category' column and a 'preprocessed_text' column.
    top_category : str
        Value of the 'category' column selecting the rows to model.
    embedding_model : optional
        Forwarded to BERTopic as ``embedding_model``. When None,
        ``SentenceTransformer("all-MiniLM-L6-v2")`` is created and used.
    min_topic_size : int, default 5
        Forwarded to BERTopic as ``min_topic_size``.

    Returns
    -------
    dict
        'topic_info', 'topic_distance_map', 'topic_documents_map', 'topics',
        'probabilities', 'bbc_topic_model' (the fitted model) and 'dataframe'
        (an independent copy of the category's rows with the added columns
        'bertopic_topic' and 'bertopic_topic_label').

    Raises
    ------
    ValueError
        If the category has no row with a non-empty string in 'preprocessed_text'.

    NOTE(review): no random seed is fixed anywhere in this function; the
    BERTopic documentation says results can differ between runs unless the
    dimensionality-reduction step is seeded. Confirm whether that matters here.
    """
    print (f" Running pipeline for {top_category} category\n\n")

    # Filtering the specified top category
    in_category = df[df['category'] == top_category]

    # Keep only rows holding a non-blank string. The frame itself is filtered
    # (not just the text list) so that rows and fitted topics stay aligned
    # one-to-one; otherwise a single dropped text makes the column assignment
    # below fail with a length mismatch.
    has_text = in_category['preprocessed_text'].map(
        lambda t: isinstance(t, str) and bool(t.strip())
    ).astype(bool)

    # .copy() gives an independent frame, so adding columns below neither
    # touches `df` nor triggers pandas' SettingWithCopyWarning.
    top_category_df = in_category[has_text].copy()

    # Converting to list for easy processing
    top_category_texts = top_category_df['preprocessed_text'].tolist()
    if not top_category_texts:
        raise ValueError(
            f"No non-empty 'preprocessed_text' documents found for category {top_category!r}"
        )

    # Specifying embedding model
    if embedding_model is None:
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Creating an instance of BERTopic for my data
    bbc_topic_model = BERTopic(embedding_model=embedding_model,
                           language="english",
                           verbose=True,
                           min_topic_size = min_topic_size)

    # Fit and transform data
    topics, probabilities = bbc_topic_model.fit_transform(top_category_texts)

    # Topic info
    topic_info = bbc_topic_model.get_topic_info()

    # Visualizing document and topic maps
    topic_documents_map = bbc_topic_model.visualize_documents(top_category_texts)
    topic_distance_map = bbc_topic_model.visualize_topics()

    # Appending topics and labels to dataframe (one topic per remaining row)
    top_category_df['bertopic_topic'] = topics
    top_category_df['bertopic_topic_label'] = top_category_df['bertopic_topic'].apply(lambda x: bbc_topic_model.get_topic(x))

    return {'topic_info': topic_info,
            'topic_distance_map': topic_distance_map,
            'topic_documents_map': topic_documents_map,
            'topics': topics,
            'probabilities': probabilities,
            'bbc_topic_model': bbc_topic_model,
            'dataframe': top_category_df}

In [None]:
# Creating a function to reduce topics, if needed

def bert_reduce_topics(results, nr_topics=5):
    """Reduce the topics of an already fitted model and record the outcome.

    Parameters
    ----------
    results : dict
        Dictionary holding the fitted model under 'bbc_topic_model' and the
        matching DataFrame (with a 'preprocessed_text' column) under 'dataframe'.
    nr_topics : int, default 5
        Forwarded to the model's ``reduce_topics`` call.

    Returns
    -------
    dict
        The same ``results`` object, updated in place with 'reduced_topics',
        'reduced_topic_info' and 'dataframe'. The DataFrame gains the columns
        'bertopic_topic_reduced' and 'bertopic_topic_reduced_label'.
    """
    topic_model = results['bbc_topic_model']
    frame = results['dataframe']

    # Only non-blank strings are handed to the model
    documents = [
        doc
        for doc in frame['preprocessed_text'].tolist()
        if isinstance(doc, str) and doc.strip()
    ]

    # Reduce the topics, then read back the new assignments and summary table
    topic_model.reduce_topics(documents, nr_topics=nr_topics)
    merged_topics = topic_model.topics_
    merged_topic_info = topic_model.get_topic_info()

    # Attach the reduced topic id and its term list to every row
    frame['bertopic_topic_reduced'] = merged_topics
    frame['bertopic_topic_reduced_label'] = frame['bertopic_topic_reduced'].map(topic_model.get_topic)

    # Update input dictionary
    results.update(
        reduced_topics=merged_topics,
        dataframe=frame,
        reduced_topic_info=merged_topic_info,
    )
    return results

## Business Category

In [None]:
# Run bertopic on business top category
business_results = run_bertopic_pipeline(text_df, 'business')

 Running pipeline for business category




2025-07-05 09:42:09,261 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2025-07-05 09:43:48,737 - BERTopic - Embedding - Completed ✓
2025-07-05 09:43:48,739 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-05 09:43:49,797 - BERTopic - Dimensionality - Completed ✓
2025-07-05 09:43:49,800 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-05 09:43:49,839 - BERTopic - Cluster - Completed ✓
2025-07-05 09:43:49,844 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-05 09:43:50,119 - BERTopic - Representation - Completed ✓


In [None]:
# Visualize intertopic distance map
business_results['topic_distance_map']

In [None]:
# Visualize documents and topics
business_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **business** category


---



In [None]:
business_df = business_results['dataframe']
business_df.head()

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
897,bus_289,UK economy facing 'major risks'\n\nThe UK manu...,business,uk economy face major risk uk manufacturing s...,-1,"[(year, 0.015182345151810179), (growth, 0.0115..."
898,bus_011,Ask Jeeves tips online ad revival\n\nAsk Jeeve...,business,ask jeeves tip online ad revival ask jeeve th...,4,"[(profit, 0.036662731342130855), (share, 0.033..."
899,bus_402,US interest rate rise expected\n\nUS interest ...,business,interest rate rise expect interest rate expec...,16,"[(rate, 0.05867959020034519), (fed, 0.05660672..."
900,bus_207,EMI shares hit by profit warning\n\nShares in ...,business,emi share hit profit warn share music giant e...,4,"[(profit, 0.036662731342130855), (share, 0.033..."
901,bus_457,Germany calls for EU reform\n\nGerman Chancell...,business,germany call eu reform german chancellor gerh...,7,"[(eu, 0.03576855719164767), (german, 0.0346082..."


In [None]:
business_topic_info = business_results['topic_info']
business_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,101,-1_year_growth_firm_uk,"[year, growth, firm, uk, rise, trade, economic...",[consumer drive french economy frances econom...
1,0,30,0_ebbers_fraud_worldcom_sullivan,"[ebbers, fraud, worldcom, sullivan, former, ma...",[worldcom director admit lie former chief fin...
2,1,28,1_car_gm_fiat_sale,"[car, gm, fiat, sale, vehicle, bmw, year, mode...",[saab build cadillacs sweden general motors w...
3,2,28,2_yukos_russian_gazprom_court,"[yukos, russian, gazprom, court, rosneft, auct...",[yukos bankruptcy matter russian authority ab...
4,3,27,3_airline_air_flight_passenger,"[airline, air, flight, passenger, plane, carri...",[probe airline travel chaos government invest...
5,4,26,4_profit_share_company_news,"[profit, share, company, news, game, sale, nas...",[news corp eye video game market news corp me...
6,5,22,5_oil_crude_price_barrel,"[oil, crude, price, barrel, gas, energy, cairn...",[oil price reach threemonth low oil price fal...
7,6,18,6_sri_disaster_damage_lanka,"[sri, disaster, damage, lanka, indonesia, peop...",[asia quake increase poverty risk nearly two ...
8,7,17,7_eu_german_european_economy,"[eu, german, european, economy, lisbon, growth...",[germany 1990 jobless level german unemployme...
9,8,16,8_drug_tobacco_patient_firm,"[drug, tobacco, patient, firm, company, smokin...",[seek 280bn smoker ruling justice department ...


In [None]:
# Print the document count and top terms of every business topic except topic -1.
# The loop variable is `topic_id` rather than `id`, so the builtin id() is not
# overwritten in the notebook's global namespace.
business_model = business_results['bbc_topic_model']
for topic_id in business_topic_info['Topic']:
    if topic_id == -1:
        continue
    print(f"\n--- Topic {topic_id} ---")
    print(f" Number of docs: {business_model.get_topic_freq(topic_id)}")
    print(business_model.get_topic(topic_id))


--- Topic 0 ---
 Number of docs: 30
[('ebbers', np.float64(0.04548791583515919)), ('fraud', np.float64(0.043181954136533085)), ('worldcom', np.float64(0.03169484350155235)), ('sullivan', np.float64(0.030875341823155496)), ('former', np.float64(0.02723696935860929)), ('marsh', np.float64(0.0259213757808122)), ('sec', np.float64(0.022935968211486937)), ('guilty', np.float64(0.02174061125904669)), ('charge', np.float64(0.02141620019860655)), ('firm', np.float64(0.02053342805626506))]

--- Topic 1 ---
 Number of docs: 28
[('car', np.float64(0.06659805412272009)), ('gm', np.float64(0.05818071699380475)), ('fiat', np.float64(0.05001313194751807)), ('sale', np.float64(0.033663523869079615)), ('vehicle', np.float64(0.03225406851489289)), ('bmw', np.float64(0.031582630854229916)), ('year', np.float64(0.024788521058758412)), ('model', np.float64(0.0243240899500705)), ('motors', np.float64(0.022627040061304143)), ('carmaker', np.float64(0.021473203734877436))]

--- Topic 2 ---
 Number of docs: 2

In [None]:
# Print sample documents for every business topic except topic -1.
# `topic_id` is used instead of `id` so the builtin id() is not overwritten.
business_model = business_results['bbc_topic_model']
for topic_id in business_topic_info['Topic']:
    if topic_id == -1:
        continue
    print(f"\nSample docs for Topic {topic_id}")
    docs = business_model.get_representative_docs(topic_id)
    for doc in docs[:5]:  # Show at most 5 representative docs
        print("-", doc[:300])  # Truncate to 300 characters for readability



Sample docs for Topic 0
- worldcom director admit lie  former chief financial officer telecom firm worldcom admit before york court lie fellow board member  speak trial former boss bernard ebbers scott sullivan lie board cover hole worldcoms finance ebbers trial fraud conspiracy relation worldcoms collapse 2002 plead guilty 
- worldcom director evidence  former chief financial officer telecom firm worldcom finish give evidence trial exboss bernie ebbers  scott sullivan admit juror willing commit fraud meet wall street earning projection ebbers trial fraud conspiracy relation worldcoms collapse 2002 plead guilty sullivan s
- ebber aware worldcom fraud  former worldcom boss bernie ebbers directly involve 11bn financial fraud firm close associate tell court  give evidence criminal trial ebbers exfinance chief scott sullivan implicate colleague accounting scandal firm sullivan worldcoms former number two government chief w

Sample docs for Topic 1
- saab build cadillacs sweden  general mo

**Inspection after reducing topics**

In [None]:
business_results = bert_reduce_topics(business_results, nr_topics=10)

2025-07-05 09:45:38,845 - BERTopic - Topic reduction - Reducing number of topics
2025-07-05 09:45:38,882 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-05 09:45:39,828 - BERTopic - Representation - Completed ✓
2025-07-05 09:45:39,872 - BERTopic - Topic reduction - Reduced number of topics from 30 to 10


In [None]:
business_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,101,-1_year_firm_growth_rise,"[year, firm, growth, rise, company, bank, uk, ...",[consumer drive french economy frances econom...
1,0,170,0_company_firm_year_share,"[company, firm, year, share, profit, oil, mark...",[fiat chief take steering wheel chief executi...
2,1,131,1_economy_rate_year_economic,"[economy, rate, year, economic, growth, bank, ...",[uk interest rate hold 475 bank england leave...
3,2,28,2_yukos_russian_gazprom_court,"[yukos, russian, gazprom, court, rosneft, auct...",[yukos bankruptcy matter russian authority ab...
4,3,22,3_deutsche_lse_boerse_euronext,"[deutsche, lse, boerse, euronext, exchange, bi...",[german bidder talk lse deutsche boerse boss ...
5,4,16,4_sale_store_retail_retailer,"[sale, store, retail, retailer, christmas, mcd...",[car pull retail figure retail sale fall 03 j...
6,5,11,5_argentina_venezuela_president_land,"[argentina, venezuela, president, land, govern...",[argentina venezuela oil deal argentina venez...
7,6,10,6_wine_fosters_beer_southcorp,"[wine, fosters, beer, southcorp, brewer, buy, ...",[french wine get 70 m euro topup french gover...
8,7,8,7_club_glazer_united_manchester,"[club, glazer, united, manchester, proposal, b...",[qa malcolm glazer man utd battle control man...
9,8,6,8_pension_age_scheme_retirement,"[pension, age, scheme, retirement, employer, e...",[pension hitch longlive man male life expecta...


In [None]:
# Print the document count and top terms of every reduced business topic except topic -1.
# `topic_id` is used instead of `id` so the builtin id() is not overwritten.
business_model = business_results['bbc_topic_model']
for topic_id in business_results['reduced_topic_info']['Topic']:
    if topic_id == -1:
        continue
    print(f"\n--- Topic {topic_id} ---")
    print(f" Number of docs: {business_model.get_topic_freq(topic_id)}")
    print(business_model.get_topic(topic_id))


--- Topic 0 ---
 Number of docs: 170
[('company', np.float64(0.028409818304422033)), ('firm', np.float64(0.025523224038315754)), ('year', np.float64(0.024982859593290403)), ('share', np.float64(0.022448525531365705)), ('profit', np.float64(0.018906793860979293)), ('oil', np.float64(0.017649974325354242)), ('market', np.float64(0.017107029367451475)), ('price', np.float64(0.016950927442051657)), ('sale', np.float64(0.016046407855205114)), ('car', np.float64(0.015701612485432098))]

--- Topic 1 ---
 Number of docs: 131
[('economy', np.float64(0.0303972349200157)), ('rate', np.float64(0.027318759042330314)), ('year', np.float64(0.026410973701049292)), ('economic', np.float64(0.025930519782388573)), ('growth', np.float64(0.025919195227796496)), ('bank', np.float64(0.02447278516708383)), ('rise', np.float64(0.024275224361537864)), ('market', np.float64(0.021076500806682257)), ('fall', np.float64(0.018885985718304788)), ('price', np.float64(0.018784000541090185))]

--- Topic 2 ---
 Number o

In [None]:
# Print sample documents for every reduced business topic except topic -1.
# `topic_id` is used instead of `id` so the builtin id() is not overwritten.
business_model = business_results['bbc_topic_model']
for topic_id in business_results['reduced_topic_info']['Topic']:
    if topic_id == -1:
        continue
    print(f"\nSample docs for Topic {topic_id}")
    docs = business_model.get_representative_docs(topic_id)
    for doc in docs[:5]:  # Show at most 5 representative docs
        print("-", doc[:300])  # Truncate to 300 characters for readability



Sample docs for Topic 0
- fiat chief take steering wheel  chief executive fiat conglomerate take daytoday control struggle car business effort turn  sergio marchionne replace herbert demel chief executive fiat auto demel leave company marchionne fourth head business   expect 800 m euro 1bn loss 2004   year fiat underperform 
- gm issue 2005 profit warn  general motors warn expect earning year low 2004  world big car maker grapple loss european business weak sale gm high healthcare cost north america low profit financial service subsidiary hurt performance 2005 gm expect meet 2004 earning target despite tough competitive e
- gm ford cut output sale fall  car firm general motors gm ford force cut production face fall car sale  sale gm sink 127 february compare year ago ford sale drop 3 foreign rival take big share market asian carmaker fare   toyota sale jump 11 rival nissan notch 10 increase overall sale industry fall 1

Sample docs for Topic 1
- uk interest rate hold 475  bank england

## Entertainment Category

In [None]:
# Run bertopic on Entertainment top category
ent_results = run_bertopic_pipeline(text_df, 'entertainment')

 Running pipeline for entertainment category




2025-07-05 08:28:53,046 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

2025-07-05 08:29:55,407 - BERTopic - Embedding - Completed ✓
2025-07-05 08:29:55,411 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-05 08:29:56,159 - BERTopic - Dimensionality - Completed ✓
2025-07-05 08:29:56,160 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-05 08:29:56,181 - BERTopic - Cluster - Completed ✓
2025-07-05 08:29:56,185 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-05 08:29:56,322 - BERTopic - Representation - Completed ✓


In [None]:
ent_results['topic_distance_map']

In [None]:
ent_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **entertainment** category

In [None]:
ent_df = ent_results['dataframe']
ent_df.head()

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
0,ent_115,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment,rapper snoop dogg sue rape rapper snoop dogg ...,0,"[(music, 0.040330304226001604), (band, 0.02865..."
1,ent_352,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment,vera drake bafta triumph hope bafta film awar...,3,"[(film, 0.06210800858991049), (british, 0.0413..."
2,ent_094,God cut from Dark Materials film\n\nThe direct...,entertainment,god cut dark materials film director screenwr...,-1,"[(film, 0.056865312381085054), (year, 0.020445..."
3,ent_336,Films on war triumph at Sundance\n\nA study of...,entertainment,film war triumph sundance study united states...,9,"[(film, 0.09858691959397585), (festival, 0.087..."
4,ent_073,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment,ray dvd beats box office oscarnominate film b...,2,"[(film, 0.05600442735282721), (box, 0.05190293..."


In [None]:
ent_topic_info = ent_results['topic_info']
ent_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,63,-1_film_year_award_star,"[film, year, award, star, director, movie, inc...",[oscar nominee lack pull power year clutch os...
1,0,138,0_music_band_song_album,"[music, band, song, album, year, one, good, re...",[brit debate urban music joss stone 17yearold...
2,1,60,1_show_tv_bbc_series,"[show, tv, bbc, series, programme, people, cha...",[little britain vie tv trophy bbc hit little ...
3,2,29,2_film_box_office_take,"[film, box, office, take, comedy, star, weeken...",[horror film head box office lowbudget horror...
4,3,19,3_film_british_role_vera,"[film, british, role, vera, actress, good, dra...",[star gear bafta ceremony film star globe pre...
5,4,15,4_theatre_ballet_musical_show,"[theatre, ballet, musical, show, good, poppins...",[fear raise ballet future child uk follow dai...
6,5,11,5_good_swank_aviator_foxx,"[good, swank, aviator, foxx, win, actor, direc...",[foxx swank win award jamie foxx hilary swank...
7,6,10,6_good_award_film_win,"[good, award, film, win, academy, oscar, actre...",[academy awards flourish 77th annual academy ...
8,7,9,7_book_prize_winner_novel,"[book, prize, winner, novel, award, win, judge...",[paraguay novel win book prize novel set 19th...
9,8,8,8_film_festival_berlin_european,"[film, festival, berlin, european, daylewis, f...",[berlin hail european cinema organiser year b...


In [None]:
# Print the document count and top terms of every entertainment topic except topic -1.
# `topic_id` is used instead of `id` so the builtin id() is not overwritten.
ent_model = ent_results['bbc_topic_model']
for topic_id in ent_topic_info['Topic']:
    if topic_id == -1:
        continue
    print(f"\n--- Topic {topic_id} ---")
    print(f" Number of docs: {ent_model.get_topic_freq(topic_id)}")
    print(ent_model.get_topic(topic_id))


--- Topic 0 ---
 Number of docs: 138
[('music', np.float64(0.040330304226001604)), ('band', np.float64(0.02865941024494266)), ('song', np.float64(0.027291524191327805)), ('album', np.float64(0.02662289206607263)), ('year', np.float64(0.02310181700666336)), ('one', np.float64(0.020753094826834705)), ('good', np.float64(0.020011092136135195)), ('record', np.float64(0.019079634155829014)), ('single', np.float64(0.018435642898558674)), ('chart', np.float64(0.01699053609591897))]

--- Topic 1 ---
 Number of docs: 60
[('show', np.float64(0.05793216568944639)), ('tv', np.float64(0.03564469488491754)), ('bbc', np.float64(0.03026484469871411)), ('series', np.float64(0.025970923210028148)), ('programme', np.float64(0.021993475488618235)), ('people', np.float64(0.01961252337428942)), ('channel', np.float64(0.019559804549881318)), ('television', np.float64(0.017485170292037097)), ('comedy', np.float64(0.016882273167830378)), ('audience', np.float64(0.01679471081652168))]

--- Topic 2 ---
 Number 

In [None]:
# Print sample documents for every entertainment topic except topic -1.
# `topic_id` is used instead of `id` so the builtin id() is not overwritten.
ent_model = ent_results['bbc_topic_model']
for topic_id in ent_topic_info['Topic']:
    if topic_id == -1:
        continue
    print(f"\nSample docs for Topic {topic_id}")
    docs = ent_model.get_representative_docs(topic_id)
    for doc in docs[:5]:  # Show at most 5 representative docs
        print("-", doc[:300])  # Truncate to 300 characters for readability



Sample docs for Topic 0
- brit debate urban music  joss stone 17yearold soul singer devon beat dizzee rascal jamelia lemar streets win good british urban act brit awards victory reignite debate urban music  i m really comfortable word urban word s manufacture country america describe black music word urban cover broad range 
- grammys honour soul star charles  memory soul legend ray charles dominate music world lead music ceremony sunday give eight posthumous grammy awards  charles die 2004 get honour include record album year alicia key actor jamie foxx perform musical tribute rb star keys win four award herself grammy c
- scissor sisters triumph brit  band scissor sisters lead winner uk music industry brit awards walk three prize flamboyant act score hattrick international category win good group good album good newcomer award glasgow group franz ferdinand win two prize keane joss stone vote good urban act digital tv

Sample docs for Topic 1
- little britain vie tv trophy  bbc hit l

**Inspection after reducing topics**

In [None]:
ent_results = bert_reduce_topics(ent_results, nr_topics=10)

2025-07-05 08:31:00,802 - BERTopic - Topic reduction - Reducing number of topics
2025-07-05 08:31:00,811 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-05 08:31:00,921 - BERTopic - Representation - Completed ✓
2025-07-05 08:31:00,924 - BERTopic - Topic reduction - Reduced number of topics from 11 to 10


In [None]:
ent_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,63,-1_film_year_award_star,"[film, year, award, star, director, movie, inc...",[oscar nominee lack pull power year clutch os...
1,0,138,0_music_band_song_album,"[music, band, song, album, year, one, good, re...",[brit debate urban music joss stone 17yearold...
2,1,60,1_show_tv_bbc_series,"[show, tv, bbc, series, programme, people, cha...",[little britain vie tv trophy bbc hit little ...
3,2,29,2_film_box_office_take,"[film, box, office, take, comedy, star, weeken...",[horror film head box office lowbudget horror...
4,3,21,3_good_win_film_actor,"[good, win, film, actor, award, aviator, direc...",[aviator win top globe accolade aviator name ...
5,4,19,4_film_british_role_good,"[film, british, role, good, vera, actress, awa...",[star gear bafta ceremony film star globe pre...
6,5,15,5_theatre_ballet_musical_show,"[theatre, ballet, musical, show, good, poppins...",[fear raise ballet future child uk follow dai...
7,6,9,6_book_prize_winner_novel,"[book, prize, winner, novel, award, win, judge...",[paraguay novel win book prize novel set 19th...
8,7,8,7_film_festival_berlin_european,"[film, festival, berlin, european, daylewis, f...",[berlin hail european cinema organiser year b...
9,8,7,8_film_festival_sundance_baghdad,"[film, festival, sundance, baghdad, cinema, di...",[hollywood hunt hit sundance sundance film fe...


In [None]:
# Print each reduced entertainment topic's document count and its (word, score) pairs.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in ent_results['reduced_topic_info']['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket)
        continue
    print(f"\n--- Topic {topic_id} ---")
    print(f" Number of docs: {ent_results['bbc_topic_model'].get_topic_freq(topic_id)}")
    print(ent_results['bbc_topic_model'].get_topic(topic_id))


--- Topic 0 ---
 Number of docs: 138
[('music', np.float64(0.041650783116952333)), ('band', np.float64(0.029456870158707654)), ('song', np.float64(0.02804664664531717)), ('album', np.float64(0.027337793977043758)), ('year', np.float64(0.02392239472626125)), ('one', np.float64(0.02139917935501023)), ('good', np.float64(0.020725186217131906)), ('record', np.float64(0.01956074629350989)), ('single', np.float64(0.018882232911112608)), ('number', np.float64(0.017422735822045208))]

--- Topic 1 ---
 Number of docs: 60
[('show', np.float64(0.059855120764446164)), ('tv', np.float64(0.03660902502828401)), ('bbc', np.float64(0.031050951863581963)), ('series', np.float64(0.02658847882700148)), ('programme', np.float64(0.022457935553673387)), ('people', np.float64(0.02014049862870044)), ('channel', np.float64(0.019959864814165767)), ('television', np.float64(0.017856622355689772)), ('comedy', np.float64(0.017287518183429967)), ('audience', np.float64(0.017165352986091027))]

--- Topic 2 ---
 Numb

In [None]:
# Print the representative documents of each reduced entertainment topic.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in ent_results['reduced_topic_info']['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket)
        continue
    print(f"\nSample docs for Topic {topic_id}")
    docs = ent_results['bbc_topic_model'].get_representative_docs(topic_id)
    for doc in docs[:5]:  # show at most 5 docs; the model may return fewer
        print("-", doc[:300])  # truncate to 300 characters for readability



Sample docs for Topic 0
- brit debate urban music  joss stone 17yearold soul singer devon beat dizzee rascal jamelia lemar streets win good british urban act brit awards victory reignite debate urban music  i m really comfortable word urban word s manufacture country america describe black music word urban cover broad range 
- grammys honour soul star charles  memory soul legend ray charles dominate music world lead music ceremony sunday give eight posthumous grammy awards  charles die 2004 get honour include record album year alicia key actor jamie foxx perform musical tribute rb star keys win four award herself grammy c
- scissor sisters triumph brit  band scissor sisters lead winner uk music industry brit awards walk three prize flamboyant act score hattrick international category win good group good album good newcomer award glasgow group franz ferdinand win two prize keane joss stone vote good urban act digital tv

Sample docs for Topic 1
- little britain vie tv trophy  bbc hit l

## Politics Category

In [None]:
# Run the BERTopic pipeline on the 'politics' category of text_df.
# The returned results are indexed below with the keys 'topic_distance_map',
# 'topic_documents_map', 'dataframe', 'topic_info' and 'bbc_topic_model'.
politics_results = run_bertopic_pipeline(text_df, 'politics')

 Running pipeline for politics category




2025-07-05 08:31:03,121 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2025-07-05 08:32:23,249 - BERTopic - Embedding - Completed ✓
2025-07-05 08:32:23,254 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-05 08:32:26,338 - BERTopic - Dimensionality - Completed ✓
2025-07-05 08:32:26,340 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-05 08:32:26,486 - BERTopic - Cluster - Completed ✓
2025-07-05 08:32:26,496 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-05 08:32:26,650 - BERTopic - Representation - Completed ✓


In [None]:
# Display the intertopic distance map stored in the politics pipeline results
politics_results['topic_distance_map']

In [None]:
# Display the documents-and-topics map stored in the politics pipeline results
politics_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **politics** category

In [None]:
# Per-document frame for politics: the preview shows the original and
# preprocessed text next to the bertopic_topic / bertopic_topic_label columns.
politics_df = politics_results['dataframe']
politics_df.head()

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
1808,pol_099,Blair stresses prosperity goals\n\nTony Blair ...,politics,blair stress prosperity goal tony blair party...,1,"[(brown, 0.0484655184559364), (blair, 0.047174..."
1809,pol_134,Ban on hunting comes into force\n\nFox hunting...,politics,ban hunting force fox hunting dog illegal eng...,7,"[(hunt, 0.11473500731046785), (ban, 0.06261751..."
1810,pol_266,Peers debate Crown succession law\n\nPeers are...,politics,peer debate crown succession law peers debate...,13,"[(lords, 0.0787601276565084), (lord, 0.0524311..."
1811,pol_406,Profile: Gordon Brown\n\nThe ultimate prize of...,politics,profile gordon brown ultimate prize 10 downin...,1,"[(brown, 0.0484655184559364), (blair, 0.047174..."
1812,pol_064,Blair hails Turkey-EU talks deal\n\nTony Blair...,politics,blair hail turkeyeu talk deal tony blair hail...,3,"[(eu, 0.09855830399223424), (straw, 0.05233523..."


In [None]:
# Topic summary table for politics (Topic, Count, Name, Representation,
# Representative_Docs), shown here before the topic-reduction step further down.
politics_topic_info = politics_results['topic_info']
politics_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,98,-1_labour_government_people_minister,"[labour, government, people, minister, electio...",[tory leader unveil spending plan tory leader...
1,0,55,0_law_police_lord_suspect,"[law, police, lord, suspect, government, trial...",[antiterror plan face first test plan allow h...
2,1,38,1_brown_blair_chancellor_prime,"[brown, blair, chancellor, prime, election, mi...",[blair brown criticise mps labour mp angrily ...
3,2,24,2_lib_kennedy_party_dems,"[lib, kennedy, party, dems, tax, labour, dem, ...",[taxis trust kennedy public trust taxis bre...
4,3,17,3_eu_straw_constitution_referendum,"[eu, straw, constitution, referendum, embargo,...",[eu referendum question unveil question ask r...
5,4,16,4_howard_tory_election_tax,"[howard, tory, election, tax, labour, michael,...",[uk head wrong way howard tony blair chance...
6,5,16,5_aid_africa_world_g8,"[aid, africa, world, g8, brown, poverty, count...",[brown call 55bn aids fund gordon brown call ...
7,6,15,6_asylum_immigration_uk_system,"[asylum, immigration, uk, system, plan, refuge...",[howard attack cost asylum michael howard lau...
8,7,13,7_hunt_ban_hunting_animal,"[hunt, ban, hunting, animal, police, law, dog,...",[minister defend hunt ban law law ban hunt do...
9,8,13,8_child_student_parent_education,"[child, student, parent, education, access, un...",[student inequality expose teenager welloff b...


In [None]:
# Print each politics topic's document count and its (word, score) pairs.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in politics_topic_info['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket)
        continue
    print(f"\n--- Topic {topic_id} ---")
    print(f" Number of docs: {politics_results['bbc_topic_model'].get_topic_freq(topic_id)}")
    print(politics_results['bbc_topic_model'].get_topic(topic_id))


--- Topic 0 ---
 Number of docs: 55
[('law', np.float64(0.025920582040330594)), ('police', np.float64(0.024702510473433612)), ('lord', np.float64(0.018678066470238223)), ('suspect', np.float64(0.01849641126386632)), ('government', np.float64(0.01848519784702171)), ('trial', np.float64(0.018382643231681428)), ('home', np.float64(0.018368796232043037)), ('human', np.float64(0.017278737430409485)), ('right', np.float64(0.016881854575597783)), ('power', np.float64(0.016562960790232915))]

--- Topic 1 ---
 Number of docs: 38
[('brown', np.float64(0.0484655184559364)), ('blair', np.float64(0.0471749244256451)), ('chancellor', np.float64(0.0335676469065368)), ('prime', np.float64(0.0333395929557906)), ('election', np.float64(0.03005849849165126)), ('minister', np.float64(0.028587466597491347)), ('labour', np.float64(0.027971831832941167)), ('gordon', np.float64(0.020160911329724498)), ('book', np.float64(0.019265815600287003)), ('claim', np.float64(0.018554834502587367))]

--- Topic 2 ---
 N

In [None]:
# Print the representative documents of each politics topic.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in politics_topic_info['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket)
        continue
    print(f"\nSample docs for Topic {topic_id}")
    docs = politics_results['bbc_topic_model'].get_representative_docs(topic_id)
    for doc in docs[:5]:  # show at most 5 docs; the model may return fewer
        print("-", doc[:300])  # truncate to 300 characters for readability



Sample docs for Topic 0
- antiterror plan face first test  plan allow home secretary charles clarke place terror suspect house arrest trial set first real test parliament  tories lib dems labour mp poise vote against plan clarke power need counter terror threat opponent judge politician able order detention uk citizen govern
- terror power expose tyranny  lord chancellor defend government plan introduce control order keep foreign british terrorist suspect house arrest evidence put trial  lord falconer insist proposal equate police state strike balance protect public against threat terrorism uphold civil liberty thriller w
- terror suspect face house arrest  uk citizen suspect involvement terrorism face house arrest part series measure outline home secretary  law lord rule detention 12 foreign terror suspect trial breach human right charles clarkes plan control order mean suspect involve terrorism subject house arrest c

Sample docs for Topic 1
- blair brown criticise mps  labour mp an

**Inspection after reducing topics**

In [None]:
# Reduce the politics topics to nr_topics=10 via the notebook's
# bert_reduce_topics helper and rebind the returned results to politics_results.
# NOTE(review): the cell feeds politics_results back into itself, so re-running
# it starts from already-reduced results — confirm the helper tolerates that.
politics_results = bert_reduce_topics(politics_results, nr_topics=10)

2025-07-05 08:33:47,998 - BERTopic - Topic reduction - Reducing number of topics
2025-07-05 08:33:48,012 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-05 08:33:48,156 - BERTopic - Representation - Completed ✓
2025-07-05 08:33:48,160 - BERTopic - Topic reduction - Reduced number of topics from 21 to 10


In [None]:
# Show the topic table stored under 'reduced_topic_info' as the cell output.
politics_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,98,-1_labour_government_people_minister,"[labour, government, people, minister, electio...",[tory leader unveil spending plan tory leader...
1,0,147,0_election_labour_blair_party,"[election, labour, blair, party, government, b...",[tory tax cut lift spirit michael howard fina...
2,1,75,1_law_police_home_government,"[law, police, home, government, people, right,...",[terror suspect face house arrest uk citizen ...
3,2,17,2_eu_straw_constitution_referendum,"[eu, straw, constitution, referendum, embargo,...",[eu referendum question unveil question ask r...
4,3,16,3_aid_world_africa_brown,"[aid, world, africa, brown, g8, country, pover...",[brown call 55bn aids fund gordon brown call ...
5,4,13,4_hunt_ban_hunting_police,"[hunt, ban, hunting, police, law, animal, dog,...",[minister defend hunt ban law law ban hunt do...
6,5,13,5_child_education_student_parent,"[child, education, student, parent, access, un...",[student inequality expose teenager welloff b...
7,6,12,6_ukip_kilroysilk_party_veritas,"[ukip, kilroysilk, party, veritas, robert, ele...",[kilroy name election seat target exchat show...
8,7,7,7_livingstone_jewish_mayor_apologise,"[livingstone, jewish, mayor, apologise, ken, r...",[boris oppose mayor apology ken livingstone s...
9,8,5,8_regiment_drinking_drink_drunk,"[regiment, drinking, drink, drunk, mcconnell, ...",[army chief regiment decision military chief ...


In [None]:
# Print each reduced politics topic's document count and its (word, score) pairs.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in politics_results['reduced_topic_info']['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket)
        continue
    print(f"\n--- Topic {topic_id} ---")
    print(f" Number of docs: {politics_results['bbc_topic_model'].get_topic_freq(topic_id)}")
    print(politics_results['bbc_topic_model'].get_topic(topic_id))


--- Topic 0 ---
 Number of docs: 147
[('election', np.float64(0.032523593334034914)), ('labour', np.float64(0.030822554058157908)), ('blair', np.float64(0.026189084164420486)), ('party', np.float64(0.02611992472331493)), ('government', np.float64(0.02292396221122315)), ('brown', np.float64(0.022355927078816392)), ('tax', np.float64(0.020096179556233832)), ('people', np.float64(0.020025025003152272)), ('minister', np.float64(0.019756731843604894)), ('prime', np.float64(0.018324294135320777))]

--- Topic 1 ---
 Number of docs: 75
[('law', np.float64(0.025744053505871025)), ('police', np.float64(0.024415432253894205)), ('home', np.float64(0.022939924621201625)), ('government', np.float64(0.022028160729709666)), ('people', np.float64(0.01890491770369509)), ('right', np.float64(0.018869628830206504)), ('plan', np.float64(0.018492029371962763)), ('lord', np.float64(0.017520258635940002)), ('human', np.float64(0.0173539669941825)), ('case', np.float64(0.017047624342550565))]

--- Topic 2 ---

In [None]:
# Print the representative documents of each reduced politics topic.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in politics_results['reduced_topic_info']['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket)
        continue
    print(f"\nSample docs for Topic {topic_id}")
    docs = politics_results['bbc_topic_model'].get_representative_docs(topic_id)
    for doc in docs[:5]:  # show at most 5 docs; the model may return fewer
        print("-", doc[:300])  # truncate to 300 characters for readability



Sample docs for Topic 0
- tory tax cut lift spirit  michael howard finally reveal full scale plan tory tax cut  win general election earmark 4 billion reduce taxis   preelection message party press voter believe warm simple vote tory way government stick labour spending plan core public service include health education incre
- lib dems unveil election slogan  liberal democrats present real alternative forthcoming general election campaign charles kennedy  unveil slogan partys spring conference glass ceiling ambition tell delegate labour abuse public trust tories fail oppose response conservatives insist theirs party under
- blair pledge unity labour mp  tony blair seek reassure labour backbencher nothing stand way partys bid third term power  blair speak mps amid fresh rumour rift gordon brown book prime minister go back pledge brown stand before general election chancellor focus win poll due join election supremo alan

Sample docs for Topic 1
- terror suspect face house arrest  uk ci

## Sports Category

In [None]:
# Run the BERTopic pipeline on the 'sport' category of text_df.
# The returned results are indexed below with the keys 'topic_distance_map',
# 'topic_documents_map', 'dataframe', 'topic_info' and 'bbc_topic_model'.
sport_results = run_bertopic_pipeline(text_df, 'sport')

 Running pipeline for sport category




2025-07-05 08:33:49,250 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2025-07-05 08:35:19,987 - BERTopic - Embedding - Completed ✓
2025-07-05 08:35:20,005 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-05 08:35:23,313 - BERTopic - Dimensionality - Completed ✓
2025-07-05 08:35:23,317 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-05 08:35:23,366 - BERTopic - Cluster - Completed ✓
2025-07-05 08:35:23,375 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-05 08:35:23,657 - BERTopic - Representation - Completed ✓


In [None]:
# Display the intertopic distance map stored in the sport pipeline results
sport_results['topic_distance_map']

In [None]:
# Display the documents-and-topics map stored in the sport pipeline results
sport_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **sport** category

In [None]:
# Per-document frame for sport: the preview shows the original and
# preprocessed text next to the bertopic_topic / bertopic_topic_label columns.
sport_df = sport_results['dataframe']
sport_df.head()

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
386,spo_464,Connors boost for British tennis\n\nFormer wor...,sport,connor boost british tennis former world numb...,2,"[(open, 0.05404425623817514), (win, 0.04855295..."
387,spo_336,Corry backs skipper Robinson\n\nEngland forwar...,sport,corry back skipper robinson england forward m...,1,"[(england, 0.04592286744118714), (wales, 0.036..."
388,spo_087,GB quartet get cross country call\n\nFour Brit...,sport,gb quartet get cross country call four britis...,3,"[(race, 0.049586451390209826), (world, 0.04356..."
389,spo_406,Fuming Robinson blasts officials\n\nEngland co...,sport,fume robinson blast official england coach an...,1,"[(england, 0.04592286744118714), (wales, 0.036..."
390,spo_341,Charvis set to lose fitness bid\n\nFlanker Col...,sport,charvis set lose fitness bid flanker colin ch...,1,"[(england, 0.04592286744118714), (wales, 0.036..."


In [None]:
# Topic summary table for sport (Topic, Count, Name, Representation,
# Representative_Docs), shown here before the topic-reduction step further down.
sport_topic_info = sport_results['topic_info']
sport_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,176,0_club_chelsea_game_play,"[club, chelsea, game, play, player, goal, go, ...",[parry put gerrard money listen full intervie...
1,1,148,1_england_wales_ireland_game,"[england, wales, ireland, game, rugby, six, ag...",[ogara revel ireland victory ireland flyhalf ...
2,2,86,2_open_win_play_roddick,"[open, win, play, roddick, set, seed, match, a...",[davenport dismantle young rival top seed lin...
3,3,67,3_race_world_olympic_win,"[race, world, olympic, win, indoor, run, europ...",[britain boost holmes double athletics fan en...
4,4,28,4_test_drug_iaaf_kenteris,"[test, drug, iaaf, kenteris, greek, thanou, ba...",[greek duo clear dope case sprinters kostas k...


In [None]:
# Print each sport topic's document count and its (word, score) pairs.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in sport_topic_info['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket), if present
        continue
    print(f"\n--- Topic {topic_id} ---")
    print(f" Number of docs: {sport_results['bbc_topic_model'].get_topic_freq(topic_id)}")
    print(sport_results['bbc_topic_model'].get_topic(topic_id))


--- Topic 0 ---
 Number of docs: 176
[('club', np.float64(0.03342681800031293)), ('chelsea', np.float64(0.028337452427108868)), ('game', np.float64(0.02712156040991748)), ('play', np.float64(0.02578682008223782)), ('player', np.float64(0.024984737555735687)), ('goal', np.float64(0.023502765541563355)), ('go', np.float64(0.02233653522799926)), ('league', np.float64(0.022124583924377314)), ('liverpool', np.float64(0.022001680241800377)), ('arsenal', np.float64(0.02166553901398831))]

--- Topic 1 ---
 Number of docs: 148
[('england', np.float64(0.04592286744118714)), ('wales', np.float64(0.03680074423522527)), ('ireland', np.float64(0.03311802156908291)), ('game', np.float64(0.03234029902647557)), ('rugby', np.float64(0.031749221757020885)), ('six', np.float64(0.02758517431480737)), ('against', np.float64(0.026873235853403902)), ('france', np.float64(0.026318401608198965)), ('win', np.float64(0.025137374939847964)), ('side', np.float64(0.024943616672721908))]

--- Topic 2 ---
 Number of 

In [None]:
# Print the representative documents of each sport topic.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in sport_topic_info['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket), if present
        continue
    print(f"\nSample docs for Topic {topic_id}")
    docs = sport_results['bbc_topic_model'].get_representative_docs(topic_id)
    for doc in docs[:5]:  # show at most 5 docs; the model may return fewer
        print("-", doc[:300])  # truncate to 300 characters for readability



Sample docs for Topic 0
- parry put gerrard money  listen full interview sport five bbc sport website 1900 gmt  parry speak exclusively bbc sport admit gerrard constantly link chelsea final future tell bbc five live steven money future liverpool matter 30 m 40 m 50 m accept offer realistic know keep steven against subject li
- mourinho plot impressive course  chelsea win fulham   confirm position premiership summit   prove everything place mount serious challenge front season  get strength depth great player outstanding manager jose mourinho finance club world match need add big prize know difficult part one thing certain
- liverpool pledge keep gerrard  liverpool chief executive rick parry insist club sell steven gerrard amid report chelsea renew bid lure anfield  gerrard reiterate desire win trophy reds superb champions league winner wednesday parry move scotch claim chelsea launch 35 m bid chance stevie go january p

Sample docs for Topic 1
- ogara revel ireland victory  ireland fl

**Inspection after reducing topics**

In [None]:
# Request a reduction of the sport topics to nr_topics=10.
# NOTE(review): in the recorded run only 5 topics were found and BERTopic logged
# that 10 "is equal or higher than the clustered topics", so the table shown
# afterwards matches the unreduced one — this step looks like a no-op for sport.
# Consider a smaller nr_topics or dropping the step for this category.
sport_results = bert_reduce_topics(sport_results, nr_topics=10)

2025-07-05 08:36:55,685 - BERTopic - Topic reduction - Reducing number of topics
2025-07-05 08:36:55,687 - BERTopic - Topic reduction - Number of topics (10) is equal or higher than the clustered topics(5).
2025-07-05 08:36:55,688 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-05 08:36:55,973 - BERTopic - Representation - Completed ✓


In [None]:
# Show the topic table stored under 'reduced_topic_info' as the cell output.
sport_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,176,0_club_chelsea_game_play,"[club, chelsea, game, play, player, goal, go, ...",[parry put gerrard money listen full intervie...
1,1,148,1_england_wales_ireland_game,"[england, wales, ireland, game, rugby, six, ag...",[ogara revel ireland victory ireland flyhalf ...
2,2,86,2_open_win_play_roddick,"[open, win, play, roddick, set, seed, match, a...",[davenport dismantle young rival top seed lin...
3,3,67,3_race_world_olympic_win,"[race, world, olympic, win, indoor, run, europ...",[britain boost holmes double athletics fan en...
4,4,28,4_test_drug_iaaf_kenteris,"[test, drug, iaaf, kenteris, greek, thanou, ba...",[greek duo clear dope case sprinters kostas k...


In [None]:
# Print each sport topic's document count and its (word, score) pairs,
# iterating over the 'reduced_topic_info' table.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in sport_results['reduced_topic_info']['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket), if present
        continue
    print(f"\n--- Topic {topic_id} ---")
    print(f" Number of docs: {sport_results['bbc_topic_model'].get_topic_freq(topic_id)}")
    print(sport_results['bbc_topic_model'].get_topic(topic_id))


--- Topic 0 ---
 Number of docs: 176
[('club', np.float64(0.03342681800031293)), ('chelsea', np.float64(0.028337452427108868)), ('game', np.float64(0.02712156040991748)), ('play', np.float64(0.02578682008223782)), ('player', np.float64(0.024984737555735687)), ('goal', np.float64(0.023502765541563355)), ('go', np.float64(0.02233653522799926)), ('league', np.float64(0.022124583924377314)), ('liverpool', np.float64(0.022001680241800377)), ('arsenal', np.float64(0.02166553901398831))]

--- Topic 1 ---
 Number of docs: 148
[('england', np.float64(0.04592286744118714)), ('wales', np.float64(0.03680074423522527)), ('ireland', np.float64(0.03311802156908291)), ('game', np.float64(0.03234029902647557)), ('rugby', np.float64(0.031749221757020885)), ('six', np.float64(0.02758517431480737)), ('against', np.float64(0.026873235853403902)), ('france', np.float64(0.026318401608198965)), ('win', np.float64(0.025137374939847964)), ('side', np.float64(0.024943616672721908))]

--- Topic 2 ---
 Number of 

In [None]:
# Print the representative documents of each sport topic,
# iterating over the 'reduced_topic_info' table.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in sport_results['reduced_topic_info']['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket), if present
        continue
    print(f"\nSample docs for Topic {topic_id}")
    docs = sport_results['bbc_topic_model'].get_representative_docs(topic_id)
    for doc in docs[:5]:  # show at most 5 docs; the model may return fewer
        print("-", doc[:300])  # truncate to 300 characters for readability



Sample docs for Topic 0
- parry put gerrard money  listen full interview sport five bbc sport website 1900 gmt  parry speak exclusively bbc sport admit gerrard constantly link chelsea final future tell bbc five live steven money future liverpool matter 30 m 40 m 50 m accept offer realistic know keep steven against subject li
- mourinho plot impressive course  chelsea win fulham   confirm position premiership summit   prove everything place mount serious challenge front season  get strength depth great player outstanding manager jose mourinho finance club world match need add big prize know difficult part one thing certain
- liverpool pledge keep gerrard  liverpool chief executive rick parry insist club sell steven gerrard amid report chelsea renew bid lure anfield  gerrard reiterate desire win trophy reds superb champions league winner wednesday parry move scotch claim chelsea launch 35 m bid chance stevie go january p

Sample docs for Topic 1
- ogara revel ireland victory  ireland fl

## Tech Category

In [None]:
# Run the BERTopic pipeline on the 'tech' category of text_df.
# The returned results are indexed below with the keys 'topic_distance_map',
# 'topic_documents_map', 'dataframe', 'topic_info' and 'bbc_topic_model'.
tech_results = run_bertopic_pipeline(text_df, 'tech')

 Running pipeline for tech category




2025-07-05 08:36:57,006 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2025-07-05 08:38:15,400 - BERTopic - Embedding - Completed ✓
2025-07-05 08:38:15,402 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-05 08:38:16,064 - BERTopic - Dimensionality - Completed ✓
2025-07-05 08:38:16,065 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-05 08:38:16,084 - BERTopic - Cluster - Completed ✓
2025-07-05 08:38:16,089 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-05 08:38:16,225 - BERTopic - Representation - Completed ✓


In [None]:
# Display the intertopic distance map stored in the tech pipeline results
tech_results['topic_distance_map']

In [None]:
# Display the documents-and-topics map stored in the tech pipeline results
tech_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **tech** category

In [None]:
# Per-document frame for tech: the preview shows the original and
# preprocessed text next to the bertopic_topic / bertopic_topic_label columns.
tech_df = tech_results['dataframe']
tech_df.head()

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
1407,tec_174,Gadgets galore on show at fair\n\nThe 2005 Con...,tech,gadget galore show fair 2005 consumer electro...,7,"[(technology, 0.04804024725556728), (device, 0..."
1408,tec_369,Microsoft plans 'safer ID' system\n\nMicrosoft...,tech,microsoft plan safe id system microsoft plan ...,1,"[(security, 0.0425664464232506), (virus, 0.039..."
1409,tec_332,Apple iPod family expands market\n\nApple has ...,tech,apple ipod family expand market apple expand ...,2,"[(mobile, 0.06454288497275254), (phone, 0.0563..."
1410,tec_315,Warning over tsunami aid website\n\nNet users ...,tech,warn tsunami aid website net user tell avoid ...,6,"[(blog, 0.08058520863955349), (site, 0.0326815..."
1411,tec_381,Kenyan school turns to handhelds\n\nAt the Mbi...,tech,kenyan school turn handheld mbita point prima...,-1,"[(apple, 0.022314903446728106), (people, 0.020..."


In [None]:
# Topic summary table for tech (Topic, Count, Name, Representation,
# Representative_Docs), shown here before the topic-reduction step further down.
tech_topic_info = tech_results['topic_info']
tech_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,47,-1_apple_people_computer_technology,"[apple, people, computer, technology, get, dig...",[broadband set revolutionise tv bt start push...
1,0,64,0_game_play_console_title,"[game, play, console, title, gaming, gamer, ye...",[lose online gaming online role playing game ...
2,1,59,1_security_virus_attack_program,"[security, virus, attack, program, email, user...",[microsoft antipiracy move microsoft clamp pe...
3,2,49,2_mobile_phone_music_people,"[mobile, phone, music, people, service, techno...",[mobile music challenge ipod age nokia micros...
4,3,25,3_search_google_yahoo_people,"[search, google, yahoo, people, engine, web, d...",[search site get close user search site want ...
5,4,19,4_tv_programme_content_channel,"[tv, programme, content, channel, digital, vie...",[confusion highdefinition tv critical mass pe...
6,5,17,5_broadband_bt_service_net,"[broadband, bt, service, net, speed, connectio...",[broadband soar 2004 broadband jumbo jet 2003...
7,6,15,6_blog_site_news_ink,"[blog, site, news, ink, online, aid, blogger, ...",[blog pick word year term blog choose top wor...
8,7,12,7_technology_device_gadget_consumer,"[technology, device, gadget, consumer, digital...",[gates open big gadget fair bill gates open c...
9,8,12,8_p2p_file_movie_bittorrent,"[p2p, file, movie, bittorrent, network, system...",[fileswapper ready network legal attack websi...


In [None]:
# Print each tech topic's document count and its (word, score) pairs.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in tech_topic_info['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket)
        continue
    print(f"\n--- Topic {topic_id} ---")
    print(f" Number of docs: {tech_results['bbc_topic_model'].get_topic_freq(topic_id)}")
    print(tech_results['bbc_topic_model'].get_topic(topic_id))


--- Topic 0 ---
 Number of docs: 64
[('game', np.float64(0.07566864412241564)), ('play', np.float64(0.02721067518307488)), ('console', np.float64(0.024158385360424285)), ('title', np.float64(0.023112498639458773)), ('gaming', np.float64(0.020943842281091956)), ('gamer', np.float64(0.0191853287249207)), ('year', np.float64(0.018766961775956634)), ('nintendo', np.float64(0.018575211391352927)), ('time', np.float64(0.01768871589849712)), ('sony', np.float64(0.016418320147861164))]

--- Topic 1 ---
 Number of docs: 59
[('security', np.float64(0.0425664464232506)), ('virus', np.float64(0.03954002665414305)), ('attack', np.float64(0.029770176758398558)), ('program', np.float64(0.027744279723125664)), ('email', np.float64(0.026960680257521703)), ('user', np.float64(0.026577295947460552)), ('site', np.float64(0.024956690638820365)), ('spam', np.float64(0.023680962422837043)), ('net', np.float64(0.022333758197374383)), ('firm', np.float64(0.02143415263952536))]

--- Topic 2 ---
 Number of docs

In [None]:
# Print the representative documents of each tech topic.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in tech_topic_info['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket)
        continue
    print(f"\nSample docs for Topic {topic_id}")
    docs = tech_results['bbc_topic_model'].get_representative_docs(topic_id)
    for doc in docs[:5]:  # show at most 5 docs; the model may return fewer
        print("-", doc[:300])  # truncate to 300 characters for readability



Sample docs for Topic 0
- mobile game age  bbc news website take look game mobile phone mature brief roundup follow skip straight review click link part two follow monday  reviews call duty splinter cell   pandora tomorrow lord rings pocket kingdom follow monday think snake mention mobile game bit surprise mobile game long w
- mobile game age  bbc news website take look game mobile phone mature brief roundup follow skip straight review click link  think snake mention mobile game bit surprise mobile game long way short time before nokias ngage game phone launch 2003 mobile operator realise audience look something play hand

Sample docs for Topic 1
- microsoft antipiracy move  microsoft clamp people run pirate version windows operating system restrict access security feature  windows genuine advantage scheme mean people prove software genuine mid2005 allow unauthorised copy get crucial security fix automatic update option limit microsoft release
- microsoft debut security tool  microsof

**Inspection after reducing topics**

In [None]:
# Reduce the tech topics to nr_topics=10 via the notebook's
# bert_reduce_topics helper and rebind the returned results to tech_results.
# NOTE(review): the cell feeds tech_results back into itself, so re-running it
# starts from already-reduced results — confirm the helper tolerates that.
tech_results = bert_reduce_topics(tech_results, nr_topics=10)

2025-07-05 08:39:34,098 - BERTopic - Topic reduction - Reducing number of topics
2025-07-05 08:39:34,109 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-05 08:39:34,247 - BERTopic - Representation - Completed ✓
2025-07-05 08:39:34,250 - BERTopic - Topic reduction - Reduced number of topics from 14 to 10


In [None]:
# Show the topic table stored under 'reduced_topic_info' as the cell output.
tech_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,47,-1_apple_people_computer_technology,"[apple, people, computer, technology, get, dig...",[broadband set revolutionise tv bt start push...
1,0,97,0_mobile_phone_people_service,"[mobile, phone, people, service, technology, t...",[look music drive mobile mobile phone enjoy b...
2,1,64,1_game_play_console_title,"[game, play, console, title, gaming, year, gam...",[lose online gaming online role playing game ...
3,2,59,2_security_virus_attack_program,"[security, virus, attack, program, user, email...",[microsoft antipiracy move microsoft clamp pe...
4,3,40,3_search_blog_people_google,"[search, blog, people, google, web, site, onli...",[year search personal odd fire browser go str...
5,4,12,4_p2p_file_network_system,"[p2p, file, network, system, movie, bittorrent...",[fileswapper ready network legal attack websi...
6,5,9,5_law_patent_directive_eu,"[law, patent, directive, eu, european, parliam...",[reboot order eu patent law european parliame...
7,6,7,6_robot_vehicle_lift_asimo,"[robot, vehicle, lift, asimo, car, human, pass...",[robot learn robotiquette rule robot learn le...
8,7,6,7_mac_gadget_apple_mini,"[mac, gadget, apple, mini, pc, computer, first...",[rival 400 apple mac mini cheap apple compute...
9,8,6,8_dvd_bluray_film_format,"[dvd, bluray, film, format, highdefinition, hd...",[disney back sony dvd technology generation d...


In [None]:
# Print each reduced tech topic's document count and its (word, score) pairs.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in tech_results['reduced_topic_info']['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket)
        continue
    print(f"\n--- Topic {topic_id} ---")
    print(f" Number of docs: {tech_results['bbc_topic_model'].get_topic_freq(topic_id)}")
    print(tech_results['bbc_topic_model'].get_topic(topic_id))


--- Topic 0 ---
 Number of docs: 97
[('mobile', np.float64(0.04217488029474503)), ('phone', np.float64(0.03588212787145205)), ('people', np.float64(0.030004980371252987)), ('service', np.float64(0.02943485861340146)), ('technology', np.float64(0.02659407014733386)), ('tv', np.float64(0.024704925371321095)), ('music', np.float64(0.02459490264634999)), ('broadband', np.float64(0.021860522057695647)), ('digital', np.float64(0.019486166186666227)), ('video', np.float64(0.018317064877925882))]

--- Topic 1 ---
 Number of docs: 64
[('game', np.float64(0.08563467319954807)), ('play', np.float64(0.029816626297161614)), ('console', np.float64(0.026171261258369327)), ('title', np.float64(0.024954723104409406)), ('gaming', np.float64(0.022701758356980115)), ('year', np.float64(0.020927650894062108)), ('gamer', np.float64(0.020658239122659162)), ('nintendo', np.float64(0.01995224981890803)), ('time', np.float64(0.019556873401588366)), ('sony', np.float64(0.017786294059085953))]

--- Topic 2 ---
 

In [None]:
# Print the representative documents of each reduced tech topic.
# The loop variable is `topic_id` rather than `id` so the builtin id() is not
# overwritten for the rest of the notebook.
for topic_id in tech_results['reduced_topic_info']['Topic']:
    if topic_id == -1:  # skip the -1 row (BERTopic's outlier bucket)
        continue
    print(f"\nSample docs for Topic {topic_id}")
    docs = tech_results['bbc_topic_model'].get_representative_docs(topic_id)
    for doc in docs[:5]:  # show at most 5 docs; the model may return fewer
        print("-", doc[:300])  # truncate to 300 characters for readability



Sample docs for Topic 0
- look music drive mobile  mobile phone enjoy boom time sale accord research technology analyst gartner  674 million mobile sell year globally report high total sell date figure 30 2003 surpass optimistic prediction gartner good design look mobile service music download go way push sale 2005 analyst p
- look music drive mobile  mobile phone enjoy boom time sale accord research technology analyst gartner  674 million mobile sell year globally report high total sell date figure 30 2003 surpass optimistic prediction gartner good design look mobile service music download go way push sale 2005 analyst p
- mobile medium player  mobile ready allsinge alldance multimedia device replace portable medium player two report  despite move bring music download service mobile people want trade multimedia service size battery life jupiter separate study gartner realtime tv broadcast mobile europe 2007 technical 

Sample docs for Topic 1
- mobile game age  bbc news website take 