# Overview

# Importing Necessary Libraries

In [1]:
# Installing packages
!pip install 'bertopic[all]' sentence-transformers



In [2]:
import pandas as pd
import os
import spacy
import re
import numpy as np
import random
import umap
import torch

# !python -m spacy download en_core_web_md

# from sklearn.feature_extraction.text import TfidfVectorizer

# from sklearn.cluster import KMeans

# Importing Topic modelling libraries
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP

In [3]:
# Setting all seeds
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# Loading Dataset

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
dataset_path = 'bbc'
dataset = []

Firstly, I would combine the various text files into a list, and then a dataframe, to make processing easier

In [6]:
for category in os.listdir(dataset_path):       #loops through the items in root dataset folder
    category_path = os.path.join(dataset_path, category)       #constructs the path for each item
    if os.path.isdir(category_path):       #checks what items are directories
        for filename in os.listdir(category_path):       #loops through directories
            file_path = os.path.join(category_path, filename)       #constructs the path for each filee
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read().strip()
                dataset.append({'id':category[:3]+'_'+filename[:-4], 'text':text, 'category':category})  #Adding an identifier to each text consisting of the category and file name

In [7]:
text_df = pd.DataFrame(dataset)      # converting the resulting list to a dataframe

In [8]:
text_df

Unnamed: 0,id,text,category
0,bus_001,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,bus_003,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,bus_004,High fuel prices hit BA's profits\n\nBritish A...,business
4,bus_005,Pernod takeover talk lifts Domecq\n\nShares in...,business
...,...,...,...
2220,tec_397,BT program to beat dialler scams\n\nBT is intr...,tech
2221,tec_398,Spam e-mails tempt net shoppers\n\nComputer us...,tech
2222,tec_399,Be careful how you code\n\nA new European dire...,tech
2223,tec_400,US cyber security chief resigns\n\nThe man mak...,tech


In [9]:
text_df['category'].value_counts()

category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

# Preprocessing Text Data

First step, check raw texts for duplicated texts

In [10]:
# Identify duplicate texts
duplicate_ind = text_df[text_df.duplicated(subset=['text'], keep=False)].index.tolist()
print(duplicate_ind)

[6, 213, 214, 239, 252, 255, 257, 264, 291, 332, 355, 370, 415, 493, 512, 532, 548, 552, 555, 558, 577, 582, 583, 591, 595, 597, 605, 612, 614, 636, 638, 642, 647, 654, 667, 671, 744, 759, 770, 781, 782, 817, 837, 843, 844, 847, 859, 865, 900, 902, 916, 954, 964, 978, 1002, 1003, 1012, 1018, 1036, 1112, 1113, 1118, 1148, 1180, 1182, 1186, 1193, 1195, 1205, 1206, 1216, 1236, 1237, 1246, 1251, 1254, 1324, 1332, 1364, 1396, 1564, 1567, 1569, 1570, 1647, 1720, 1742, 1748, 1826, 1827, 1831, 1835, 1837, 1844, 1848, 1850, 1859, 1862, 1863, 1864, 1865, 1867, 1870, 1874, 1878, 1882, 1885, 1886, 1889, 1890, 1898, 1902, 1904, 1906, 1912, 1913, 1917, 1919, 1921, 1926, 1941, 1949, 1951, 1952, 1954, 1955, 1956, 1970, 1972, 1979, 1982, 1985, 1986, 1988, 1989, 1990, 1992, 1994, 1998, 1999, 2000, 2002, 2003, 2016, 2034, 2038, 2049, 2050, 2051, 2056, 2075, 2083, 2086, 2093, 2102, 2108, 2112, 2114, 2115, 2116, 2118, 2119, 2120, 2121, 2122, 2125, 2126, 2127, 2131, 2133, 2135, 2136, 2137, 2138, 2140, 2141,

In [11]:
text_df.loc[duplicate_ind].sort_values(by='text')

Unnamed: 0,id,text,category
1989,tec_166,'Brainwave' cap controls computer\n\nA team of...,tech
1988,tec_165,'Brainwave' cap controls computer\n\nA team of...,tech
954,pol_059,'Debate needed' on donations cap\n\nA cap on d...,politics
1193,pol_298,'Debate needed' on donations cap\n\nA cap on d...,politics
1003,pol_108,'Super union' merger plan touted\n\nTwo of Bri...,politics
...,...,...,...
1850,tec_027,Warning over tsunami aid website\n\nNet users ...,tech
2135,tec_312,Web radio takes Spanish rap global\n\nSpin the...,tech
1913,tec_090,Web radio takes Spanish rap global\n\nSpin the...,tech
2136,tec_313,What high-definition will do to DVDs\n\nFirst ...,tech


Now that the duplicated rows have been identified, they have to be removed

In [12]:
text_df.drop_duplicates(subset=['text'], inplace=True)

In [13]:
text_df

Unnamed: 0,id,text,category
0,bus_001,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,bus_003,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,bus_004,High fuel prices hit BA's profits\n\nBritish A...,business
4,bus_005,Pernod takeover talk lifts Domecq\n\nShares in...,business
...,...,...,...
2219,tec_396,New consoles promise big problems\n\nMaking ga...,tech
2220,tec_397,BT program to beat dialler scams\n\nBT is intr...,tech
2222,tec_399,Be careful how you code\n\nA new European dire...,tech
2223,tec_400,US cyber security chief resigns\n\nThe man mak...,tech


In [14]:
nlp = spacy.load('en_core_web_sm')

#Using the stopword list from http://mlg.ucd.ie/files/datasets/stopwords.txt to preprocess the dataset

custom_stopwords = []
with open('bbc/stopwords.txt', 'r', encoding='utf-8') as file:
    for line in file:
        custom_stopwords.append(line.strip())

print(custom_stopwords)

['a', 'about', 'above', 'according', 'across', 'actually', 'adj', 'after', 'afterwards', 'again', 'all', 'almost', 'along', 'already', 'also', 'although', 'always', 'among', 'amongst', 'an', 'am', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anywhere', 'are', 'aren', "aren't", 'around', 'as', 'at', 'be', 'became', 'because', 'become', 'becomes', 'been', 'beforehand', 'begin', 'being', 'below', 'beside', 'besides', 'between', 'both', 'but', 'by', 'can', 'cannot', "can't", 'caption', 'co', 'come', 'could', 'couldn', "couldn't", 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'don', "don't", 'down', 'during', 'each', 'early', 'eg', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'etc', 'even', 'ever', 'every', 'everywhere', 'except', 'few', 'for', 'found', 'from', 'further', 'had', 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'him', 'his', 'how', 'however', 'ie', 'i

In [15]:
#Creating a function for preprocessing - lowercasing, lemmatization, stopword removal

def preprocess(text):

    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Processing with Spacy
    doc = nlp(text)

    # Lemmatization and removing stopwords using thee custom list
    tokens = [token.lemma_.lower() for token in doc if token.lemma_.lower() not in custom_stopwords]
    return ' '.join(tokens)

text_df['preprocessed_text'] = text_df['text'].apply(preprocess)

In [16]:
# Remove newline characters
text_df['preprocessed_text'] = text_df['preprocessed_text'].str.replace('\n', '')
text_df.head()

Unnamed: 0,id,text,category,preprocessed_text
0,bus_001,Ad sales boost Time Warner profit\n\nQuarterly...,business,ad sale boost time warner profit quarterly pr...
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gain greenspan speech dollar hit high ...
2,bus_003,Yukos unit buyer faces loan claim\n\nThe owner...,business,yukos unit buyer face loan claim owner embatt...
3,bus_004,High fuel prices hit BA's profits\n\nBritish A...,business,high fuel price hit bas profit british airway...
4,bus_005,Pernod takeover talk lifts Domecq\n\nShares in...,business,pernod takeover talk lift domecq share uk dri...


Checking the preprocessed text after preprocessing to remove any duplicates

In [17]:
preprocessed_ind = text_df[text_df.duplicated(subset=['preprocessed_text'], keep=False)].index.tolist()

In [18]:
text_df.loc[preprocessed_ind].sort_values(by='preprocessed_text')

Unnamed: 0,id,text,category,preprocessed_text
2042,tec_219,Disney backs Sony DVD technology\n\nA next gen...,tech,disney back sony dvd technology generation dv...
2047,tec_224,Disney backs Sony DVD technology\n\nA next gen...,tech,disney back sony dvd technology generation dv...
1978,tec_155,Games win for Blu-ray DVD format\n\nThe next-g...,tech,game win bluray dvd format nextgeneration dvd...
2117,tec_294,Games win for Blu-ray DVD format\n\nThe next-g...,tech,game win bluray dvd format nextgeneration dvd...
1644,spo_332,Harinordoquy suffers France axe\n\nNumber eigh...,sport,harinordoquy suffer france axe number eight i...
1656,spo_344,Harinordoquy suffers France axe\n\nNumber eigh...,sport,harinordoquy suffer france axe number eight i...
1781,spo_469,Moya emotional after Davis Cup win\n\nCarlos M...,sport,moya emotional davis cup win carlos moya desc...
1782,spo_470,Moya emotional at Davis Cup win\n\nCarlos Moya...,sport,moya emotional davis cup win carlos moya desc...
1871,tec_048,'No re-draft' for EU patent law\n\nA proposed ...,tech,redraft eu patent law propose european law so...
2165,tec_342,'No re-draft' for EU patent law\n\nA proposed ...,tech,redraft eu patent law propose european law so...


In [19]:
text_df.drop_duplicates(subset=['preprocessed_text'], inplace=True)

# Exploratory Data Analysis

# Sub-categorizing Main Categories

To break down the texts into sub-categories, I would make use of BERTopic

## Reusable Functions

In [20]:
# Creating a function to run the initial clustering of the top categories
def run_bertopic_pipeline(
    df,
    top_category = None,
    embedding_model = None,
    min_topic_size = 5
):

    #Filtering the specified top category
    if top_category is not None:
        print (f" Running pipeline for {top_category} category\n\n")
        top_category_df = df[df['category'] == top_category]
    else:
        top_category_df = df

    # Converting to list for easy processing
    top_category_texts = top_category_df['preprocessed_text'].tolist()
    top_category_texts = [t for t in top_category_texts if isinstance(t, str) and t.strip()]

    # Specifying embedding model
    if embedding_model is None:
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Creating an instance of BERTopic for my data
    bbc_topic_model = BERTopic(embedding_model=embedding_model,
                           language="english",
                           verbose=True,
                           min_topic_size = min_topic_size,
                           umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.1, metric='cosine', random_state=SEED)) #Specified random state for reproducibility

    # Fit and transform data
    topics, probabilities = bbc_topic_model.fit_transform(top_category_texts)

    # Topic info
    topic_info = bbc_topic_model.get_topic_info()

    # Visualizing document and topic maps
    topic_documents_map = bbc_topic_model.visualize_documents(top_category_texts)
    topic_distance_map = bbc_topic_model.visualize_topics()


    # Appending topics and labels to dataframe
    top_category_df['bertopic_topic'] = topics
    top_category_df['bertopic_topic_label'] = top_category_df['bertopic_topic'].apply(lambda x: bbc_topic_model.get_topic(x))

    return {'topic_info': topic_info,
            'topic_distance_map': topic_distance_map,
            'topic_documents_map': topic_documents_map,
            'topics': topics,
            'probabilities': probabilities,
            'bbc_topic_model': bbc_topic_model,
            'dataframe': top_category_df}

In [21]:
# Creating a function to reduce topics, if needed

def bert_reduce_topics(results, nr_topics=5):
    model = results['bbc_topic_model']
    df = results['dataframe']
    top_category_texts = df['preprocessed_text'].tolist()
    top_category_texts = [t for t in top_category_texts if isinstance(t, str) and t.strip()]

    # Reduce the topics
    model.reduce_topics(top_category_texts, nr_topics=nr_topics)
    reduced_topics = model.topics_

    # Computing updated topic info
    reduced_topic_info = model.get_topic_info()

    # Add new topics to dataframe
    df['bertopic_topic_reduced'] = reduced_topics
    df['bertopic_topic_reduced_label'] = df['bertopic_topic_reduced'].apply(lambda x: model.get_topic(x))

    #Update input dictionary
    results.update({'reduced_topics': reduced_topics,
                    'dataframe': df,
                    'reduced_topic_info': reduced_topic_info})

    return results

## Business Category

In [22]:
# Run bertopic on business top category
business_results = run_bertopic_pipeline(text_df, top_category='business')

 Running pipeline for business category




2025-07-06 14:26:54,851 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2025-07-06 14:27:14,914 - BERTopic - Embedding - Completed ✓
2025-07-06 14:27:14,914 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-06 14:27:24,776 - BERTopic - Dimensionality - Completed ✓
2025-07-06 14:27:24,778 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-06 14:27:24,792 - BERTopic - Cluster - Completed ✓
2025-07-06 14:27:24,795 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:27:24,875 - BERTopic - Representation - Completed ✓


In [23]:
# Visualize intertopic distance map
business_results['topic_distance_map']

In [24]:
# Visualize documents and topics
business_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **business** category


---



In [25]:
business_df = business_results['dataframe']
business_df.head()

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
0,bus_001,Ad sales boost Time Warner profit\n\nQuarterly...,business,ad sale boost time warner profit quarterly pr...,1,"[(profit, 0.04203002085677433), (share, 0.0345..."
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gain greenspan speech dollar hit high ...,-1,"[(year, 0.014855408772173095), (us, 0.01291355..."
2,bus_003,Yukos unit buyer faces loan claim\n\nThe owner...,business,yukos unit buyer face loan claim owner embatt...,0,"[(yukos, 0.08016952884034413), (russian, 0.049..."
3,bus_004,High fuel prices hit BA's profits\n\nBritish A...,business,high fuel price hit bas profit british airway...,4,"[(airline, 0.06444251699397231), (air, 0.04691..."
4,bus_005,Pernod takeover talk lifts Domecq\n\nShares in...,business,pernod takeover talk lift domecq share uk dri...,5,"[(sale, 0.05170048308837637), (store, 0.037310..."


In [26]:
business_topic_info = business_results['topic_info']
business_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,109,-1_year_us_world_company,"[year, us, world, company, eu, economic, bank,...",[s korea spending boost economy south korea b...
1,0,31,0_yukos_russian_gazprom_court,"[yukos, russian, gazprom, court, yugansk, oil,...",[yukos accuse lie court russian oil firm yuko...
2,1,29,1_profit_share_steel_news,"[profit, share, steel, news, company, firm, ga...",[news corp eye video game market news corp me...
3,2,29,2_ebbers_fraud_worldcom_sullivan,"[ebbers, fraud, worldcom, sullivan, former, ma...",[worldcom director admit lie former chief fin...
4,3,28,3_car_gm_fiat_sale,"[car, gm, fiat, sale, vehicle, bmw, year, mode...",[saab build cadillacs sweden general motors w...
5,4,27,4_airline_air_flight_passenger,"[airline, air, flight, passenger, plane, carri...",[us probe airline travel chaos us government ...
6,5,26,5_sale_store_retail_retailer,"[sale, store, retail, retailer, christmas, lab...",[us retail sale surge december us retail sale...
7,6,22,6_sri_disaster_country_damage,"[sri, disaster, country, damage, lanka, indone...",[asia quake increase poverty risk nearly two ...
8,7,16,7_drug_tobacco_patient_firm,"[drug, tobacco, patient, firm, company, smokin...",[us seek 280bn smoker ruling us justice depar...
9,8,16,8_deutsche_lse_boerse_euronext,"[deutsche, lse, boerse, euronext, bid, exchang...",[german bidder talk lse deutsche boerse boss ...


In [27]:
# Printing topic top terms and their score for each topic

for id in business_topic_info['Topic']:
    if id == -1:   # Skip the outliers
        continue
    print(f"\n--- Topic {id} ---")
    print(f" Number of docs: {business_results['bbc_topic_model'].get_topic_freq(id)}")
    print(business_results['bbc_topic_model'].get_topic(id))


--- Topic 0 ---
 Number of docs: 31
[('yukos', 0.08016952884034413), ('russian', 0.049438383516321564), ('gazprom', 0.04039355614965654), ('court', 0.036110865899023646), ('yugansk', 0.03388933318667126), ('oil', 0.03330142120178454), ('rosneft', 0.033294783481641936), ('auction', 0.031048983299874145), ('russia', 0.026752901095794514), ('khodorkovsky', 0.026324461178262017)]

--- Topic 1 ---
 Number of docs: 29
[('profit', 0.04203002085677433), ('share', 0.034547439904159996), ('steel', 0.03005622175037005), ('news', 0.02478429207734607), ('company', 0.02272916849817927), ('firm', 0.0227099864511706), ('game', 0.02215152630473687), ('sale', 0.02149504073287301), ('year', 0.018503612436470997), ('corp', 0.01752137114630759)]

--- Topic 2 ---
 Number of docs: 29
[('ebbers', 0.04642526731052048), ('fraud', 0.03907449195268859), ('worldcom', 0.0323185803964334), ('sullivan', 0.03144048122991052), ('former', 0.027194581209716243), ('marsh', 0.026375872457714358), ('firm', 0.02591666371181

In [28]:
# Displaying the representative documents for each topic to get a sense of the theme

for id in business_topic_info['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = business_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs:
        print("-", doc[:300])  # Truncate to first 300 characters



Sample docs for Topic 0
- yukos accuse lie court  russian oil firm yukos lie us court attempt russian government sell key production unit court hear  unit yugansk sell pay 275bn 145bn back tax bill yukos argue us subsidiary local bank account us court declare bankrupt auction yugansk deutsche bank   target yukos lawsuit   do
- yukos bankruptcy us matter  russian authority abide us court decision take regard troubled oil giant yukos houston court tell  legal expert william butler treaty us russia recognise legal ruling mean moscow adhere us ruling yukos case yukos us court entitle declare bankrupt before yugansk unit sell 
- yukos sue four firm 20bn  russian oil firm yukos sue four company role year force state auction key oil production unit yuganskneftegas  yukos claim 20bn 11bn damage yugansk sell december settle back taxis four company name law suit gas giant gazprom unit gazpromneft investment company baikal state 

Sample docs for Topic 1
- news corp eye video game market  news c

From the sample docs above, there is an overlap between several topics so i would reduce this further to 10 topics

### Reducing Topics

**Inspection after reducing topics**

In [29]:
business_results = bert_reduce_topics(business_results, nr_topics=11)

2025-07-06 14:27:46,900 - BERTopic - Topic reduction - Reducing number of topics
2025-07-06 14:27:46,906 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:27:46,970 - BERTopic - Representation - Completed ✓
2025-07-06 14:27:46,972 - BERTopic - Topic reduction - Reduced number of topics from 28 to 11


In [30]:
business_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,109,-1_year_us_world_company,"[year, us, world, company, firm, rise, bank, e...",[s korea spending boost economy south korea b...
1,0,178,0_sale_company_year_firm,"[sale, company, year, firm, share, us, profit,...",[us retail sale surge december us retail sale...
2,1,78,1_economy_rate_growth_year,"[economy, rate, growth, year, economic, us, do...",[steady job growth continue us us create job ...
3,2,50,2_country_government_economic_sri,"[country, government, economic, sri, disaster,...",[asia quake increase poverty risk nearly two ...
4,3,31,3_yukos_russian_gazprom_court,"[yukos, russian, gazprom, court, oil, yugansk,...",[yukos bankruptcy us matter russian authority...
5,4,15,4_oil_crude_price_barrel,"[oil, crude, price, barrel, cairn, us, supply,...",[oil price reach threemonth low oil price fal...
6,5,11,5_price_house_mortgage_housing,"[price, house, mortgage, housing, rise, fall, ...",[uk house price dip november uk house price d...
7,6,10,6_wine_fosters_beer_southcorp,"[wine, fosters, beer, southcorp, brewer, buy, ...",[french wine get 70 m euro topup french gover...
8,7,8,7_club_glazer_united_manchester,"[club, glazer, united, manchester, proposal, b...",[qa malcolm glazer man utd battle control man...
9,8,7,8_parmalat_bank_italian_company,"[parmalat, bank, italian, company, sue, victim...",[parmalat return stockmarket parmalat italian...


In [31]:
# Printing topic top terms and their score for each topic
for id in business_results['reduced_topic_info']['Topic']:
    if id == -1:
        continue
    print(f"\n--- Topic {id} ---")
    print(f" Number of docs: {business_results['bbc_topic_model'].get_topic_freq(id)}")
    print(business_results['bbc_topic_model'].get_topic(id))


--- Topic 0 ---
 Number of docs: 178
[('sale', 0.026390174418312415), ('company', 0.023755650788012693), ('year', 0.023346656100316204), ('firm', 0.023314427987421976), ('share', 0.020159281284108946), ('us', 0.01911181065537952), ('profit', 0.018736136657751104), ('market', 0.017861787955804443), ('executive', 0.016676450360668886), ('car', 0.015539391145192408)]

--- Topic 1 ---
 Number of docs: 78
[('economy', 0.0394465892301281), ('rate', 0.03742356273187164), ('growth', 0.03531422807192053), ('year', 0.02957850981385048), ('economic', 0.028511142878987823), ('us', 0.028340906165695057), ('dollar', 0.02741627648460837), ('rise', 0.026354819294155864), ('deficit', 0.021857108811606223), ('bank', 0.0214483553944524)]

--- Topic 2 ---
 Number of docs: 50
[('country', 0.031587563427033144), ('government', 0.029400995553881596), ('economic', 0.023423527185230393), ('sri', 0.02161630654166585), ('disaster', 0.021195430026908352), ('people', 0.021125662172743823), ('bank', 0.019011054768

In [32]:
# Displaying the representative documents for each topic to get a sense of the theme

for id in business_results['reduced_topic_info']['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = business_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs:
        print("-", doc[:300])



Sample docs for Topic 0
- us retail sale surge december  us retail sale year high note solid gain december boost strong car sale  seasonally adjust sale rise 12 month compare 01 month boost surge shop just before christmas sales climb 8 year good performance 85 rise 1999 commerce department add gain lead 43 jump auto sale de
- gm issue 2005 profit warn  general motors warn expect earning year low 2004  world big car maker grapple loss european business weak us sale gm high healthcare cost north america low profit financial service subsidiary hurt performance 2005 gm expect meet 2004 earning target despite tough competitiv
- gm ford cut output sale fall  us car firm general motors gm ford force cut production face fall car sale  us sale gm sink 127 february compare year ago ford sale drop 3 foreign rival take big share market asian carmaker fare   toyota sale jump 11 rival nissan notch 10 increase overall sale industry 

Sample docs for Topic 1
- steady job growth continue us  us creat

### Adding Custom Labels

Based on the reduced topics above, i will manually assign sub-categories to each topic_id

In [33]:
# topic_labels = {
#     0: "Company News & Performance",
#     1: "Macroeconomic Trends & Policy",
#     2: "Oil & Russian Corporate Conflict",
#     3: "Stock Exchange Mergers & Acquisitions",
#     4: "Retail Sales & Holiday Trends",
#     5: "Latin America's Economic & Political Crisis",
#     6: "Food & Beverage Industry M&A",
#     7: "Manchester United Takeover Bid",
#     8: "Pensions & Retirement Reform"
# }

topic_labels = {
    0: "Retail & Automotive Industry Performance",
    1: "Economic Growth & Policy",
    2: "South & Southeast Asia Disaster Impact",
    3: "Russian Oil Corporate Legal Issues",
    4: "UK Housing Market Trends",
    5: "Wine Industry & Corporate Takeovers",
    6: "Manchester United Takeover News",
    7: "Corporate Scandals & Stock Market",
    8: "Pensions & Retirement Policy Reform"
}

In [34]:
# Assigning the topic labels to the topic IDs
business_results['bbc_topic_model'].set_topic_labels(topic_labels)

In [35]:
#
business_df['Sub-category'] = business_df['bertopic_topic_reduced'].map(topic_labels)

In [36]:
# Visualizing top topics
business_results['bbc_topic_model'].visualize_barchart(top_n_topics=10)

### Dealing with Outliers

Majority of the business docs have been assigned a sub-categorybut there are still about 100 outliers that need to be handled

In [37]:
# Fitering out the texts with topic, -1
uncategorized_df = business_df[business_df['bertopic_topic_reduced'] == -1][['id', 'text', 'category', 'preprocessed_text']]
uncategorized_df.head()

Unnamed: 0,id,text,category,preprocessed_text
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gain greenspan speech dollar hit high ...
6,bus_007,Jobs growth still slow in the US\n\nThe US cre...,business,job growth slow us us create job expect janua...
7,bus_008,"India calls for fair trade rules\n\nIndia, whi...",business,india call fair trade rule india attend g7 me...
8,bus_009,Ethiopia's crop production up 24%\n\nEthiopia ...,business,ethiopias crop production 24 ethiopia produce...
13,bus_014,Telegraph newspapers axe 90 jobs\n\nThe Daily ...,business,telegraph newspaper axe 90 job daily sunday t...


In [38]:
# Count the number of tokens per text and assign this to
uncategorized_df['token_count'] = uncategorized_df['preprocessed_text'].apply(lambda x: len(x.split()))

In [39]:
uncategorized_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
token_count,109.0,184.834862,77.738702,87.0,137.0,166.0,210.0,464.0


In [40]:
uncategorized_df.sort_values(by='token_count', ascending=False)

Unnamed: 0,id,text,category,preprocessed_text,token_count
276,bus_277,Turkey turns on the economic charm\n\nThree ye...,business,turkey turn economic charm three year gruelli...,464
242,bus_243,Making your office work for you\n\nOur mission...,business,office work mission brighten work life contin...,422
240,bus_241,G7 backs Africa debt relief plan\n\nG7 finance...,business,g7 back africa debt relief plan g7 finance mi...,412
486,bus_487,World leaders gather to face uncertainty\n\nMo...,business,world leader gather face uncertainty 2000 bus...,406
267,bus_268,Giving financial gifts to children\n\nYour chi...,business,give financial gift child child grandchild wa...,400
...,...,...,...,...,...
17,bus_018,India's rupee hits five-year high\n\nIndia's r...,business,indias rupee hit fiveyear high indias rupee h...,97
472,bus_473,Two Nigerian banks set to merge\n\nNigerian ba...,business,two nigerian bank set merge nigerian bank uni...,96
354,bus_355,Israeli economy picking up pace\n\nIsrael's ec...,business,israeli economy pick pace israels economy for...,96
95,bus_096,Burren awarded Egyptian contracts\n\nBritish e...,business,burren award egyptian contract british energy...,92


After reviewing the uncategorized texts, below are a few reasons why they were uncategorized

- Rare and unusual words e.g bus_373
- Bad preprocessing with all stopwords removed leaving nothing meaningful e.g bus_510

Many of the docs can still be reclustered

In [41]:
# Reclustering -1 topics
uncategorized_results = run_bertopic_pipeline(uncategorized_df, min_topic_size=3)

2025-07-06 14:27:48,290 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-07-06 14:27:52,678 - BERTopic - Embedding - Completed ✓
2025-07-06 14:27:52,679 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-06 14:27:52,845 - BERTopic - Dimensionality - Completed ✓
2025-07-06 14:27:52,846 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-06 14:27:52,852 - BERTopic - Cluster - Completed ✓
2025-07-06 14:27:52,855 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:27:52,883 - BERTopic - Representation - Completed ✓


In [42]:
out_topic_info = uncategorized_results['topic_info']
out_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,19,-1_oil_child_company_parent,"[oil, child, company, parent, pay, report, fir...",[soar oil hit world economy soar cost oil hit...
1,0,21,0_us_deficit_export_year,"[us, deficit, export, year, economic, trade, d...",[us trade deficit widen sharply gap us export...
2,1,15,1_bank_standard_barclays_year,"[bank, standard, barclays, year, rise, report,...",[barclay share merger talk share uk banking g...
3,2,10,2_eu_us_european_aid,"[eu, us, european, aid, company, airline, euro...",[euus seek deal air dispute eu us agree talk ...
4,3,7,3_growth_french_france_consumer,"[growth, french, france, consumer, economy, sp...",[french consumer spending rise french consume...
5,4,7,4_walmart_gun_company_california,"[walmart, gun, company, california, sina, sale...",[chinas shanda buy stake sina chinese online ...
6,5,7,5_card_lg_creditor_club,"[card, lg, creditor, club, borussia, credit, d...",[card fraudster target web safeguard credit d...
7,6,6,6_turkey_turkeys_turkish_lira,"[turkey, turkeys, turkish, lira, islamic, bank...",[turkey knock six zero lira turkey relaunch c...
8,7,5,7_wmc_xstrata_bid_deal,"[wmc, xstrata, bid, deal, cement, australian, ...",[swiss cement firm buy spree swiss cement fir...
9,8,4,8_call_centre_work_telegraph,"[call, centre, work, telegraph, customer, colo...",[telegraph newspaper axe 90 job daily sunday ...


In [43]:
# Displaying the representative documents for each topic to get a sense of the theme

for id in out_topic_info['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = uncategorized_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs:
        print("-", doc[:500])  # Truncate for readability



Sample docs for Topic 0
- us trade deficit widen sharply  gap us export import widen 60bn 317bn alltime record  figure commerce department november show export 23 956bn import grow 13 1558bn rise consumer demand part expand deficit high price oil import number suggest slide dollar   export expensive   little impact indicate slow economic growth  trade deficit   far big 54bn widely expect wall street   prompt rapid response currency market  1650 gmt dollar trade against euro 13280 cent half weak before announcement agains
- g7 back africa debt relief plan  g7 finance minister back plan write 100 debt world poor country  uk chancellor gordon brown london meeting world seven rich nation remember 100 debt relief summit 37 country benefit casebycase review body include world bank imf us support browns international finance facility boost aid develop country bbc correspondent meeting produce movement uks ambition work need brown major breakthrough international organisation offer 100 multil

After inspecting the representative documents of the new outlier topics, a lot of them can be merged with the existing set of topic labels.
*  Outlier topic 0 is related to 'Company News'
*  Outlier topic 1 is related to



and finally, adding a last topic 7 to represent 'Other'

In [44]:
# Mapping outlier topics to existing topics
outlier_to_existing_topic = {
    0:0,
    1:1,
    2:1,
    3:5,
    4:1,
    5:1,
    6:1,
    7:9
}

uncategorized_df['mapped_topic'] = uncategorized_df['bertopic_topic'].map(outlier_to_existing_topic)

There are still a few docs that are uncategorized and they will be dealt with manually. They will either be assigned to an existing topic or assigned to a new topic representing 'Other'

In [45]:
# Displaying all uncategorized topics
uncategorized_df[uncategorized_df['bertopic_topic']==-1]

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label,mapped_topic
29,bus_030,Soaring oil 'hits world economy'\n\nThe soarin...,business,soar oil hit world economy soar cost oil hit ...,233,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
33,bus_034,"Rover deal 'may cost 2,000 jobs'\n\nSome 2,000...",business,rover deal cost 2000 job 2000 job mg rovers m...,170,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
41,bus_042,UK Coal plunges into deeper loss\n\nShares in ...,business,uk coal plunge deep loss share uk coal fall m...,130,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
95,bus_096,Burren awarded Egyptian contracts\n\nBritish e...,business,burren award egyptian contract british energy...,92,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
101,bus_102,US company admits Benin bribery\n\nA US defenc...,business,us company admit benin bribery us defence tel...,139,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
107,bus_108,Slowdown hits US factory growth\n\nUS industri...,business,slowdown hit us factory growth us industrial ...,99,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
182,bus_183,Business fears over sluggish EU economy\n\nAs ...,business,business fear sluggish eu economy european le...,388,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
221,bus_222,Oil companies get Russian setback\n\nInternati...,business,oil company get russian setback international...,254,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
257,bus_258,S Korea spending boost to economy\n\nSouth Kor...,business,s korea spending boost economy south korea bo...,232,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
267,bus_268,Giving financial gifts to children\n\nYour chi...,business,give financial gift child child grandchild wa...,400,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",


In [46]:
for i, row in uncategorized_df[uncategorized_df['bertopic_topic']==-1].iterrows():
    print(f"\nDoc {i}:\n{row['text'][:500]}")


Doc 29:
Soaring oil 'hits world economy'

The soaring cost of oil has hit global economic growth, although world's major economies should weather the storm of price rises, according to the OECD.

In its latest bi-annual report, the OECD cut its growth predictions for the world's main industrialised regions. US growth would reach 4.4% in 2004, but fall to 3.3% next year from a previous estimate of 3.7%, the OECD said. However, the Paris-based economics think tank said it believed the global economy could

Doc 33:
Rover deal 'may cost 2,000 jobs'

Some 2,000 jobs at MG Rover's Midlands plant may be cut if investment in the firm by a Chinese car maker goes ahead, the Financial Times has reported.

Shanghai Automotive Industry Corp plans to shift production of the Rover 25 to China and export it to the UK, sources close to the negotiations tell the FT. But Rover told BBC News that reports of job cuts were "speculation". A tie-up, seen as Rover's last chance to save its Longbridge plant, h

In [47]:
# Manually update the uncategorized texts
# Dictionary mapping docs to topic indices
doc_topic_map = {
    'bus_900': 0,
    'bus_958': 9,
    'bus_985': 1,
    'bus_1010': 1,
    'bus_1034': 0,
    'bus_1069': 1,
    'bus_1088': 1,
    'bus_1102': 0,
    'bus_1116': 0,
    'bus_1149': 0,
    'bus_1209': 9,
    'bus_1233': 9,
    'bus_1242': 0,
    'bus_1256': 5,
    'bus_1345': 1,
    'bus_1367': 5,
    'bus_1381': 3,
    'bus_1396': 1
}

uncategorized_df["mapped_topic"] = uncategorized_df.apply(lambda row: doc_topic_map.get(row["id"], row["mapped_topic"]), axis=1
)


In [48]:
uncategorized_df

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label,mapped_topic
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gain greenspan speech dollar hit high ...,212,0,"[(us, 0.035971775223655356), (deficit, 0.02838...",0.0
6,bus_007,Jobs growth still slow in the US\n\nThe US cre...,business,job growth slow us us create job expect janua...,151,3,"[(growth, 0.07462725475288558), (french, 0.073...",5.0
7,bus_008,"India calls for fair trade rules\n\nIndia, whi...",business,india call fair trade rule india attend g7 me...,179,0,"[(us, 0.035971775223655356), (deficit, 0.02838...",0.0
8,bus_009,Ethiopia's crop production up 24%\n\nEthiopia ...,business,ethiopias crop production 24 ethiopia produce...,143,0,"[(us, 0.035971775223655356), (deficit, 0.02838...",0.0
13,bus_014,Telegraph newspapers axe 90 jobs\n\nThe Daily ...,business,telegraph newspaper axe 90 job daily sunday t...,261,8,"[(call, 0.08142255474472622), (centre, 0.07027...",
...,...,...,...,...,...,...,...,...
489,bus_490,Brazil jobless rate hits new low\n\nBrazil's u...,business,brazil jobless rate hit low brazils unemploym...,187,10,"[(brazil, 0.05595540624658833), (government, 0...",
498,bus_499,China Aviation seeks rescue deal\n\nScandal-hi...,business,china aviation seek rescue deal scandalhit je...,143,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
501,bus_502,Algeria hit by further gas riots\n\nAlgeria su...,business,algeria hit gas riot algeria suffer weekend v...,132,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",
503,bus_504,Aids and climate top Davos agenda\n\nClimate c...,business,aid climate top davos agenda climate change f...,240,-1,"[(oil, 0.024815433329690648), (child, 0.022411...",


In [49]:
# Update the main business_df
business_df.loc[uncategorized_df.index, 'bertopic_topic_reduced'] = uncategorized_df['mapped_topic']

In [50]:
business_df

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label,bertopic_topic_reduced,bertopic_topic_reduced_label,Sub-category
0,bus_001,Ad sales boost Time Warner profit\n\nQuarterly...,business,ad sale boost time warner profit quarterly pr...,1,"[(profit, 0.04203002085677433), (share, 0.0345...",0.0,"[(sale, 0.026390174418312415), (company, 0.023...",Retail & Automotive Industry Performance
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gain greenspan speech dollar hit high ...,-1,"[(year, 0.014855408772173095), (us, 0.01291355...",0.0,"[(year, 0.02245313977568396), (us, 0.019191454...",
2,bus_003,Yukos unit buyer faces loan claim\n\nThe owner...,business,yukos unit buyer face loan claim owner embatt...,0,"[(yukos, 0.08016952884034413), (russian, 0.049...",3.0,"[(yukos, 0.10519130083632863), (russian, 0.062...",Russian Oil Corporate Legal Issues
3,bus_004,High fuel prices hit BA's profits\n\nBritish A...,business,high fuel price hit bas profit british airway...,4,"[(airline, 0.06444251699397231), (air, 0.04691...",0.0,"[(sale, 0.026390174418312415), (company, 0.023...",Retail & Automotive Industry Performance
4,bus_005,Pernod takeover talk lifts Domecq\n\nShares in...,business,pernod takeover talk lift domecq share uk dri...,5,"[(sale, 0.05170048308837637), (store, 0.037310...",0.0,"[(sale, 0.026390174418312415), (company, 0.023...",Retail & Automotive Industry Performance
...,...,...,...,...,...,...,...,...,...
505,bus_506,Trial begins of Spain's top banker\n\nThe tria...,business,trial spains top banker trial emilio botin ch...,22,"[(parmalat, 0.1182596437461906), (italian, 0.0...",8.0,"[(parmalat, 0.1408055357648977), (bank, 0.0808...",Pensions & Retirement Policy Reform
506,bus_507,UK economy ends year with spurt\n\nThe UK econ...,business,uk economy year spurt uk economy grow estimat...,9,"[(rate, 0.05609878096496134), (mpc, 0.03409099...",1.0,"[(economy, 0.0394465892301281), (rate, 0.03742...",Economic Growth & Policy
507,bus_508,HealthSouth ex-boss goes on trial\n\nThe forme...,business,healthsouth exboss go trial former head us me...,2,"[(ebbers, 0.04642526731052048), (fraud, 0.0390...",0.0,"[(sale, 0.026390174418312415), (company, 0.023...",Retail & Automotive Industry Performance
508,bus_509,Euro firms miss out on optimism\n\nMore than 9...,business,euro firm optimism 90 large company world hig...,-1,"[(year, 0.014855408772173095), (us, 0.01291355...",1.0,"[(year, 0.02245313977568396), (us, 0.019191454...",


## Entertainment Category

In [51]:
# Run bertopic on Entertainment top category
ent_results = run_bertopic_pipeline(text_df, 'entertainment')

 Running pipeline for entertainment category




2025-07-06 14:27:58,613 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

2025-07-06 14:28:11,851 - BERTopic - Embedding - Completed ✓
2025-07-06 14:28:11,852 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-06 14:28:12,380 - BERTopic - Dimensionality - Completed ✓
2025-07-06 14:28:12,381 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-06 14:28:12,393 - BERTopic - Cluster - Completed ✓
2025-07-06 14:28:12,396 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:28:12,457 - BERTopic - Representation - Completed ✓


In [52]:
ent_results['topic_distance_map']

In [53]:
ent_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **entertainment** category

In [54]:
ent_df = ent_results['dataframe']
ent_df.head()

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
510,ent_001,Gallery unveils interactive tree\n\nA Christma...,entertainment,gallery unveil interactive tree christmas tre...,-1,"[(film, 0.046832837915743435), (star, 0.019116..."
511,ent_002,Jarre joins fairytale celebration\n\nFrench mu...,entertainment,jarre join fairytale celebration french music...,0,"[(music, 0.04064247381019031), (band, 0.029388..."
512,ent_003,Musical treatment for Capra film\n\nThe classi...,entertainment,musical treatment capra film classic film won...,2,"[(theatre, 0.08046079345961737), (ballet, 0.06..."
513,ent_004,Richard and Judy choose top books\n\nThe 10 au...,entertainment,richard judy choose top book 10 author shortl...,6,"[(book, 0.099658642246669), (prize, 0.05445365..."
514,ent_005,Poppins musical gets flying start\n\nThe stage...,entertainment,poppin musical get fly start stage adaptation...,2,"[(theatre, 0.08046079345961737), (ballet, 0.06..."


In [55]:
ent_topic_info = ent_results['topic_info']
ent_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,89,-1_film_star_year_movie,"[film, star, year, movie, actor, award, see, d...",[oscar nominee lack pull power year clutch os...
1,0,127,0_music_band_song_album,"[music, band, song, album, year, one, good, si...",[brit debate urban music joss stone 17yearold...
2,1,58,1_show_tv_bbc_series,"[show, tv, bbc, series, channel, programme, us...",[jungle tv show rating drop 4 m finale itv1s ...
3,2,15,2_theatre_ballet_musical_show,"[theatre, ballet, musical, show, poppins, good...",[fear raise ballet future child uk follow dai...
4,3,14,3_film_vera_drake_award,"[film, vera, drake, award, british, good, actr...",[vera drake bafta triumph hope bafta film awa...
5,4,14,4_box_office_take_top,"[box, office, take, top, weekend, comedy, film...",[de niro complete box office coup robert de n...
6,5,11,5_film_festival_sundance_redford,"[film, festival, sundance, redford, dutch, van...",[van gogh festival film withdraw murder direc...
7,6,10,6_book_prize_novel_winner,"[book, prize, novel, winner, award, win, judge...",[paraguay novel win us book prize novel set 1...
8,7,10,7_good_award_film_director,"[good, award, film, director, win, sideways, a...",[us critic laud comedy sideways road trip com...
9,8,8,8_good_aviator_win_swank,"[good, aviator, win, swank, foxx, director, ac...",[aviator win top globe accolade aviator name ...


In [56]:
for id in ent_topic_info['Topic']:
    if id == -1:
        continue
    print(f"\n--- Topic {id} ---")
    print(f" Number of docs: {ent_results['bbc_topic_model'].get_topic_freq(id)}")
    print(ent_results['bbc_topic_model'].get_topic(id))


--- Topic 0 ---
 Number of docs: 127
[('music', 0.04064247381019031), ('band', 0.02938806635763178), ('song', 0.027698597618137068), ('album', 0.027277985976249166), ('year', 0.02266477887268695), ('one', 0.02034001037798799), ('good', 0.02026487050575704), ('single', 0.019222450259997324), ('record', 0.018830562045856813), ('chart', 0.017704646240673014)]

--- Topic 1 ---
 Number of docs: 58
[('show', 0.058803809998232474), ('tv', 0.03474328992541589), ('bbc', 0.032833799691990816), ('series', 0.026630422488687525), ('channel', 0.022093415917680343), ('programme', 0.02035886250558621), ('us', 0.020327209946374474), ('television', 0.018256168258242965), ('celebrity', 0.018240883871480533), ('comedy', 0.017713818844107977)]

--- Topic 2 ---
 Number of docs: 15
[('theatre', 0.08046079345961737), ('ballet', 0.06368601093797782), ('musical', 0.05868642145985571), ('show', 0.029450784124818472), ('poppins', 0.028926048927692908), ('good', 0.02862141138370591), ('actor', 0.02789829849486477

In [57]:
for id in ent_topic_info['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = ent_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs:
        print("-", doc[:300])  # Truncate for readability



Sample docs for Topic 0
- brit debate urban music  joss stone 17yearold soul singer devon beat dizzee rascal jamelia lemar streets win good british urban act brit awards victory reignite debate urban music  i m really comfortable word urban word s manufacture country america describe black music word urban cover broad range 
- scissor sisters triumph brit  us band scissor sisters lead winner uk music industry brit awards walk three prize flamboyant act score hattrick international category win good group good album good newcomer award glasgow group franz ferdinand win two prize keane joss stone vote good urban act digital
- grammys honour soul star charles  memory soul legend ray charles dominate music world lead music ceremony sunday give eight posthumous grammy awards  charles die 2004 get honour include record album year alicia key actor jamie foxx perform musical tribute rb star keys win four award herself grammy c

Sample docs for Topic 1
- jungle tv show rating drop 4 m  finale 

**Inspection after reducing topics**

In [58]:
ent_results = bert_reduce_topics(ent_results, nr_topics=10)

2025-07-06 14:28:26,468 - BERTopic - Topic reduction - Reducing number of topics
2025-07-06 14:28:26,473 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:28:26,528 - BERTopic - Representation - Completed ✓
2025-07-06 14:28:26,530 - BERTopic - Topic reduction - Reduced number of topics from 12 to 10


In [59]:
ent_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,89,-1_film_star_year_movie,"[film, star, year, movie, award, actor, see, d...",[oscar nominee lack pull power year clutch os...
1,0,127,0_music_band_song_album,"[music, band, song, album, year, good, one, si...",[brit debate urban music joss stone 17yearold...
2,1,58,1_show_tv_bbc_series,"[show, tv, bbc, series, channel, us, programme...",[jungle tv show rating drop 4 m finale itv1s ...
3,2,32,2_good_film_win_award,"[good, film, win, award, actress, director, ac...",[aviator win top globe accolade aviator name ...
4,3,15,3_theatre_ballet_musical_show,"[theatre, ballet, musical, show, good, poppins...",[fear raise ballet future child uk follow dai...
5,4,14,4_box_office_take_top,"[box, office, take, top, weekend, comedy, film...",[de niro complete box office coup robert de n...
6,5,11,5_film_festival_sundance_dutch,"[film, festival, sundance, dutch, redford, van...",[van gogh festival film withdraw murder direc...
7,6,10,6_book_prize_novel_winner,"[book, prize, novel, winner, award, win, judge...",[paraguay novel win us book prize novel set 1...
8,7,7,7_film_berlin_festival_european,"[film, berlin, festival, european, daylewis, f...",[berlin hail european cinema organiser year b...
9,8,6,8_godzilla_film_japanese_yen,"[godzilla, film, japanese, yen, castle, moving...",[animation charm japan box office oscarwinnin...


In [60]:
for id in ent_results['reduced_topic_info']['Topic']:
    if id == -1:
        continue
    print(f"\n--- Topic {id} ---")
    print(f" Number of docs: {ent_results['bbc_topic_model'].get_topic_freq(id)}")
    print(ent_results['bbc_topic_model'].get_topic(id))


--- Topic 0 ---
 Number of docs: 127
[('music', 0.043252177344701284), ('band', 0.030986696397231898), ('song', 0.02919668242661591), ('album', 0.028709016257537173), ('year', 0.024246203493878845), ('good', 0.02168553896311466), ('one', 0.0215806188723308), ('single', 0.020130460852927908), ('record', 0.01975714255911084), ('number', 0.018576712577714222)]

--- Topic 1 ---
 Number of docs: 58
[('show', 0.06263280535634662), ('tv', 0.03657986011815657), ('bbc', 0.03449890826790027), ('series', 0.027864822603648313), ('channel', 0.022972059959913776), ('us', 0.0215251387412709), ('programme', 0.021195278707898692), ('television', 0.01901073777434786), ('celebrity', 0.018936871133014146), ('comedy', 0.018542829847104043)]

--- Topic 2 ---
 Number of docs: 32
[('good', 0.06858256798600412), ('film', 0.06383072624765833), ('win', 0.049618966484812045), ('award', 0.04766158247716313), ('actress', 0.04564421768086122), ('director', 0.04100920370193908), ('actor', 0.0404618918030892), ('avia

In [61]:
for id in ent_results['reduced_topic_info']['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = ent_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs[:5]:  # View first 2
        print("-", doc[:300])  # Truncate for readability



Sample docs for Topic 0
- brit debate urban music  joss stone 17yearold soul singer devon beat dizzee rascal jamelia lemar streets win good british urban act brit awards victory reignite debate urban music  i m really comfortable word urban word s manufacture country america describe black music word urban cover broad range 
- scissor sisters triumph brit  us band scissor sisters lead winner uk music industry brit awards walk three prize flamboyant act score hattrick international category win good group good album good newcomer award glasgow group franz ferdinand win two prize keane joss stone vote good urban act digital
- grammys honour soul star charles  memory soul legend ray charles dominate music world lead music ceremony sunday give eight posthumous grammy awards  charles die 2004 get honour include record album year alicia key actor jamie foxx perform musical tribute rb star keys win four award herself grammy c

Sample docs for Topic 1
- jungle tv show rating drop 4 m  finale 

## Politics Category

In [62]:
# Run bertopic on Politics top category
politics_results = run_bertopic_pipeline(text_df, 'politics')

 Running pipeline for politics category




2025-07-06 14:28:27,634 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2025-07-06 14:28:43,782 - BERTopic - Embedding - Completed ✓
2025-07-06 14:28:43,786 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-06 14:28:44,377 - BERTopic - Dimensionality - Completed ✓
2025-07-06 14:28:44,378 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-06 14:28:44,391 - BERTopic - Cluster - Completed ✓
2025-07-06 14:28:44,393 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:28:44,468 - BERTopic - Representation - Completed ✓


In [63]:
# Visualize intertopic distance map
politics_results['topic_distance_map']

In [64]:
# Visualize documents and topics
politics_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **business** category

In [65]:
politics_df = politics_results['dataframe']
politics_df.head()

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
896,pol_001,Labour plans maternity pay rise\n\nMaternity p...,politics,labour plan maternity pay rise maternity pay ...,-1,"[(government, 0.017387056446624017), (people, ..."
897,pol_002,Watchdog probes e-mail deletions\n\nThe inform...,politics,watchdog probe email deletion information com...,-1,"[(government, 0.017387056446624017), (people, ..."
898,pol_003,Hewitt decries 'career sexism'\n\nPlans to ext...,politics,hewitt decry career sexism plan extend pay ma...,-1,"[(government, 0.017387056446624017), (people, ..."
899,pol_004,Labour chooses Manchester\n\nThe Labour Party ...,politics,labour choose manchester labour party hold 20...,14,"[(tory, 0.057616449397442655), (conservative, ..."
900,pol_005,Brown ally rejects Budget spree\n\nChancellor ...,politics,brown ally reject budget spree chancellor gor...,0,"[(brown, 0.0403427677626557), (blair, 0.039533..."


In [66]:
politics_topic_info = politics_results['topic_info']
politics_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,102,-1_government_people_year_labour,"[government, people, year, labour, minister, p...",[blair blast tory spending plan tony blair la...
1,0,63,0_brown_blair_election_labour,"[brown, blair, election, labour, prime, minist...",[labour seek quell feud talk labours leadersh...
2,1,27,1_school_child_education_government,"[school, child, education, government, sport, ...",[election really general election good chance...
3,2,26,2_trial_suspect_law_human,"[trial, suspect, law, human, right, terror, go...",[lord wrong detainee straw jack straw attac...
4,3,24,3_lib_kennedy_party_dems,"[lib, kennedy, party, dems, tax, labour, dem, ...",[taxis trust kennedy public trust taxis bre...
5,4,19,4_union_pension_worker_service,"[union, pension, worker, service, government, ...",[civil servant strike ballot uks big civil se...
6,5,17,5_police_sentence_murder_law,"[police, sentence, murder, law, lord, plan, dr...",[tory outline policing plan local community a...
7,6,16,6_howard_tory_tax_election,"[howard, tory, tax, election, labour, party, m...",[howard blair tax pledge clash tony blair vot...
8,7,16,7_eu_straw_constitution_referendum,"[eu, straw, constitution, referendum, embargo,...",[eu referendum question unveil question ask r...
9,8,15,8_lords_reform_woman_lord,"[lords, reform, woman, lord, minority, mp, hou...",[baron kinnock lords debut former labour lead...


In [67]:
for id in politics_topic_info['Topic']:
    if id == -1:
        continue
    print(f"\n--- Topic {id} ---")
    print(f" Number of docs: {politics_results['bbc_topic_model'].get_topic_freq(id)}")
    print(politics_results['bbc_topic_model'].get_topic(id))


--- Topic 0 ---
 Number of docs: 63
[('brown', 0.0403427677626557), ('blair', 0.039533286254590266), ('election', 0.03399467492899181), ('labour', 0.03170002768939986), ('prime', 0.030897485797785645), ('minister', 0.027958514489291136), ('chancellor', 0.027217317665694844), ('campaign', 0.020704063125027386), ('gordon', 0.01664989410030784), ('tell', 0.01628080967523174)]

--- Topic 1 ---
 Number of docs: 27
[('school', 0.030348141531855048), ('child', 0.027344490337456763), ('education', 0.02415403190976061), ('government', 0.02364838998885058), ('sport', 0.021977253198702633), ('people', 0.02066517398410499), ('age', 0.020518348626078625), ('student', 0.02037490806232474), ('parent', 0.019235549615378246), ('report', 0.0180572740609153)]

--- Topic 2 ---
 Number of docs: 26
[('trial', 0.0332104909456277), ('suspect', 0.032381046392657216), ('law', 0.032275841303273184), ('human', 0.028656505502752345), ('right', 0.028183859793457174), ('terror', 0.027529454037036022), ('government'

In [68]:
for id in politics_topic_info['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = politics_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs[:5]:  # View first 2
        print("-", doc[:300])  # Truncate for readability



Sample docs for Topic 0
- labour seek quell feud talk  labours leadership put show unity campaign poster launch mps criticise tony blair gordon brown report rift  brown join launch john prescott alan milburn man controversially put charge election planning blair private meeting monday see normally loyal mp warn feuding jeopa
- blair pledge unity labour mp  tony blair seek reassure labour backbencher nothing stand way partys bid third term power  blair speak mps amid fresh rumour rift gordon brown book prime minister go back pledge brown stand before general election chancellor focus win poll due join election supremo alan
- blair brown criticise mps  labour mp angrily criticise tony blair gordon brown amid renew report rift two man  meeting parliamentary labour party see succession normally loyal member warn feuding jeopardise labours election hope pm insist nothing derail labours campaign despite book upset chancellor

Sample docs for Topic 1
- election really  general election good 

**Inspection after reducing topics**

In [69]:
politics_results = bert_reduce_topics(politics_results, nr_topics=10)

2025-07-06 14:29:01,245 - BERTopic - Topic reduction - Reducing number of topics
2025-07-06 14:29:01,251 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:29:01,314 - BERTopic - Representation - Completed ✓
2025-07-06 14:29:01,316 - BERTopic - Topic reduction - Reduced number of topics from 18 to 10


In [70]:
politics_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,102,-1_government_people_year_labour,"[government, people, year, labour, minister, p...",[blair blast tory spending plan tony blair la...
1,0,98,0_party_government_plan_people,"[party, government, plan, people, lib, electio...",[taxis trust kennedy public trust taxis bre...
2,1,97,1_blair_election_labour_brown,"[blair, election, labour, brown, prime, minist...",[blair brown criticise mps labour mp angrily ...
3,2,43,2_law_police_home_trial,"[law, police, home, trial, government, suspect...",[lord wrong detainee straw jack straw attac...
4,3,16,3_eu_straw_constitution_referendum,"[eu, straw, constitution, referendum, embargo,...",[eu referendum question unveil question ask r...
5,4,12,4_world_aid_brown_g8,"[world, aid, brown, g8, africa, poverty, debt,...",[brown call 55bn aids fund gordon brown call ...
6,5,11,5_hunt_hunting_ban_police,"[hunt, hunting, ban, police, animal, law, dog,...",[minister defend hunt ban law law ban hunt do...
7,6,11,6_ukip_kilroysilk_party_veritas,"[ukip, kilroysilk, party, veritas, robert, ele...",[kilroy name election seat target exchat show...
8,7,7,7_livingstone_jewish_mayor_apologise,"[livingstone, jewish, mayor, apologise, ken, r...",[boris oppose mayor apology ken livingstone s...
9,8,6,8_mcconnell_regiment_drinking_drink,"[mcconnell, regiment, drinking, drink, scottis...",[scot smoke ban detail set smokefree area sav...


In [71]:
for id in politics_results['reduced_topic_info']['Topic']:
    if id == -1:
        continue
    print(f"\n--- Topic {id} ---")
    print(f" Number of docs: {politics_results['bbc_topic_model'].get_topic_freq(id)}")
    print(politics_results['bbc_topic_model'].get_topic(id))


--- Topic 0 ---
 Number of docs: 98
[('party', 0.025430483901400673), ('government', 0.025363188133670996), ('plan', 0.02350115459759589), ('people', 0.023041289805312882), ('lib', 0.020471004157792485), ('election', 0.02038894273768005), ('labour', 0.020274233882996196), ('council', 0.019614592247756964), ('tax', 0.01880461104332673), ('kennedy', 0.01741293013664313)]

--- Topic 1 ---
 Number of docs: 97
[('blair', 0.03933039393301757), ('election', 0.038686527821013436), ('labour', 0.03677290812444929), ('brown', 0.034827951610674625), ('prime', 0.028291057747549232), ('minister', 0.026606924127785887), ('howard', 0.026007121135847158), ('chancellor', 0.025007659772954532), ('campaign', 0.021884796231863547), ('party', 0.021066940270816275)]

--- Topic 2 ---
 Number of docs: 43
[('law', 0.035542816588283864), ('police', 0.03323774909818742), ('home', 0.02739945154754429), ('trial', 0.026749103125830977), ('government', 0.02667969854738585), ('suspect', 0.026500696260475083), ('right

In [72]:
for id in politics_results['reduced_topic_info']['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = politics_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs[:5]:  # View first 2
        print("-", doc[:300])  # Truncate for readability



Sample docs for Topic 0
- taxis trust   kennedy  public trust taxis break labour tories straight people issue lib dem leader charles kennedy  day ahead government prebudget report kennedy speech party face painful economic reality current level taxation right put 50 tax top earner party accuse lib dems uncosted promise kenne
- kennedys cautious optimism  charles kennedy far canny grand claim party fare general election  22 year commons see fair share claim dash rock bitter experience uks political electoral system caution hide fact party leader believe way something special month time look i i go put artificial limit ambi
- lib dems unveil election slogan  liberal democrats present real alternative forthcoming general election campaign charles kennedy  unveil slogan partys spring conference glass ceiling ambition tell delegate labour abuse public trust tories fail oppose response conservatives insist theirs party under

Sample docs for Topic 1
- blair brown criticise mps  labour mp an

## Sports Category

In [73]:
# Run bertopic on Sports top category
sport_results = run_bertopic_pipeline(text_df, 'sport')

 Running pipeline for sport category




2025-07-06 14:29:02,419 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2025-07-06 14:29:20,328 - BERTopic - Embedding - Completed ✓
2025-07-06 14:29:20,330 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-06 14:29:21,125 - BERTopic - Dimensionality - Completed ✓
2025-07-06 14:29:21,126 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-06 14:29:21,141 - BERTopic - Cluster - Completed ✓
2025-07-06 14:29:21,144 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:29:21,211 - BERTopic - Representation - Completed ✓


In [74]:
# Visualize intertopoc distance map
sport_results['topic_distance_map']

In [75]:
# Visualize documents and topics
sport_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **business** category

In [76]:
sport_df = sport_results['dataframe']
sport_df.head()

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
1313,spo_001,Claxton hunting first major medal\n\nBritish h...,sport,claxton hunt first major medal british hurdle...,3,"[(race, 0.047606982522375496), (world, 0.04150..."
1314,spo_002,O'Sullivan could run in Worlds\n\nSonia O'Sull...,sport,osullivan run worlds sonia osullivan indicate...,3,"[(race, 0.047606982522375496), (world, 0.04150..."
1315,spo_003,Greene sets sights on world title\n\nMaurice G...,sport,greene set sight world title maurice greene a...,3,"[(race, 0.047606982522375496), (world, 0.04150..."
1316,spo_004,IAAF launches fight against drugs\n\nThe IAAF ...,sport,iaaf launch fight against drug iaaf athleti...,4,"[(test, 0.09581743297828185), (drug, 0.0786182..."
1317,spo_005,"Dibaba breaks 5,000m world record\n\nEthiopia'...",sport,dibaba break 5000 m world record ethiopias ti...,3,"[(race, 0.047606982522375496), (world, 0.04150..."


In [77]:
sport_topic_info = sport_results['topic_info']
sport_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2,-1_israeli_sakhnin_israel_club,"[israeli, sakhnin, israel, club, bnei, muramba...",[iranian israel match iranian striker vahid h...
1,0,174,0_club_chelsea_game_play,"[club, chelsea, game, play, player, goal, live...",[parry firm gerrard listen full interview spo...
2,1,147,1_england_wales_ireland_rugby,"[england, wales, ireland, rugby, game, six, ag...",[ogara revel ireland victory ireland flyhalf ...
3,2,85,2_open_win_roddick_play,"[open, win, roddick, play, set, seed, match, a...",[davenport dismantle young rival top seed lin...
4,3,67,3_race_world_olympic_indoor,"[race, world, olympic, indoor, win, run, europ...",[britain boost holmes double athletics fan en...
5,4,28,4_test_drug_iaaf_kenteris,"[test, drug, iaaf, kenteris, greek, thanou, ba...",[greek duo clear dope case sprinters kostas k...


In [78]:
for id in sport_topic_info['Topic']:
    if id == -1:
        continue
    print(f"\n--- Topic {id} ---")
    print(f" Number of docs: {sport_results['bbc_topic_model'].get_topic_freq(id)}")
    print(sport_results['bbc_topic_model'].get_topic(id))


--- Topic 0 ---
 Number of docs: 174
[('club', 0.031188161627800918), ('chelsea', 0.027367578915541632), ('game', 0.025901230691117445), ('play', 0.024313957998508842), ('player', 0.023668385432110067), ('goal', 0.022721896055036165), ('liverpool', 0.021302086168390008), ('go', 0.02125227534584314), ('league', 0.021222876452861125), ('arsenal', 0.020982777270782276)]

--- Topic 1 ---
 Number of docs: 147
[('england', 0.043850387956979715), ('wales', 0.034926802488796684), ('ireland', 0.03138086687358901), ('rugby', 0.030665463450694572), ('game', 0.03058234725076813), ('six', 0.026365488572176018), ('against', 0.02534614642047196), ('france', 0.025059590196585185), ('side', 0.024003182858546922), ('win', 0.023637971927190506)]

--- Topic 2 ---
 Number of docs: 85
[('open', 0.052504695689000416), ('win', 0.04582990859565072), ('roddick', 0.040201902472635505), ('play', 0.040157691710466986), ('set', 0.039557755794523856), ('seed', 0.036254643529521485), ('match', 0.03586029489270669), 

In [79]:
for id in sport_topic_info['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = sport_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs[:5]:  # View first 2
        print("-", doc[:300])  # Truncate for readability



Sample docs for Topic 0
- parry firm gerrard  listen full interview sport five bbc sport website 1900 gmt  parry speak exclusively bbc sport admit gerrard constantly link chelsea final future tell bbc five live steven money future liverpool matter 30 m 40 m 50 m accept offer realistic know keep steven against subject liverpo
- liverpool pledge keep gerrard  liverpool chief executive rick parry insist club sell steven gerrard amid report chelsea renew bid lure anfield  gerrard reiterate desire win trophy reds superb champions league winner wednesday parry move scotch claim chelsea launch 35 m bid chance stevie go january p
- parry put gerrard money  listen full interview sport five bbc sport website 1900 gmt  parry speak exclusively bbc sport admit gerrard constantly link chelsea final future tell bbc five live steven money future liverpool matter 30 m 40 m 50 m accept offer realistic know keep steven against subject li

Sample docs for Topic 1
- ogara revel ireland victory  ireland fl

**Inspection after reducing topics**

In [80]:
sport_results = bert_reduce_topics(sport_results, nr_topics=10)

2025-07-06 14:29:39,368 - BERTopic - Topic reduction - Reducing number of topics
2025-07-06 14:29:39,369 - BERTopic - Topic reduction - Number of topics (10) is equal or higher than the clustered topics(6).
2025-07-06 14:29:39,370 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:29:39,522 - BERTopic - Representation - Completed ✓


In [81]:
sport_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2,-1_israeli_sakhnin_israel_club,"[israeli, sakhnin, israel, club, bnei, muramba...",[iranian israel match iranian striker vahid h...
1,0,174,0_club_chelsea_game_play,"[club, chelsea, game, play, player, goal, live...",[parry firm gerrard listen full interview spo...
2,1,147,1_england_wales_ireland_rugby,"[england, wales, ireland, rugby, game, six, ag...",[ogara revel ireland victory ireland flyhalf ...
3,2,85,2_open_win_roddick_play,"[open, win, roddick, play, set, seed, match, a...",[davenport dismantle young rival top seed lin...
4,3,67,3_race_world_olympic_indoor,"[race, world, olympic, indoor, win, run, europ...",[britain boost holmes double athletics fan en...
5,4,28,4_test_drug_iaaf_kenteris,"[test, drug, iaaf, kenteris, greek, thanou, ba...",[greek duo clear dope case sprinters kostas k...


In [82]:
for id in sport_results['reduced_topic_info']['Topic']:
    if id == -1:
        continue
    print(f"\n--- Topic {id} ---")
    print(f" Number of docs: {sport_results['bbc_topic_model'].get_topic_freq(id)}")
    print(sport_results['bbc_topic_model'].get_topic(id))


--- Topic 0 ---
 Number of docs: 174
[('club', 0.031188161627800918), ('chelsea', 0.027367578915541632), ('game', 0.025901230691117445), ('play', 0.024313957998508842), ('player', 0.023668385432110067), ('goal', 0.022721896055036165), ('liverpool', 0.021302086168390008), ('go', 0.02125227534584314), ('league', 0.021222876452861125), ('arsenal', 0.020982777270782276)]

--- Topic 1 ---
 Number of docs: 147
[('england', 0.043850387956979715), ('wales', 0.034926802488796684), ('ireland', 0.03138086687358901), ('rugby', 0.030665463450694572), ('game', 0.03058234725076813), ('six', 0.026365488572176018), ('against', 0.02534614642047196), ('france', 0.025059590196585185), ('side', 0.024003182858546922), ('win', 0.023637971927190506)]

--- Topic 2 ---
 Number of docs: 85
[('open', 0.052504695689000416), ('win', 0.04582990859565072), ('roddick', 0.040201902472635505), ('play', 0.040157691710466986), ('set', 0.039557755794523856), ('seed', 0.036254643529521485), ('match', 0.03586029489270669), 

In [83]:
for id in sport_results['reduced_topic_info']['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = sport_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs[:5]:  # View first 2
        print("-", doc[:300])  # Truncate for readability



Sample docs for Topic 0
- parry firm gerrard  listen full interview sport five bbc sport website 1900 gmt  parry speak exclusively bbc sport admit gerrard constantly link chelsea final future tell bbc five live steven money future liverpool matter 30 m 40 m 50 m accept offer realistic know keep steven against subject liverpo
- liverpool pledge keep gerrard  liverpool chief executive rick parry insist club sell steven gerrard amid report chelsea renew bid lure anfield  gerrard reiterate desire win trophy reds superb champions league winner wednesday parry move scotch claim chelsea launch 35 m bid chance stevie go january p
- parry put gerrard money  listen full interview sport five bbc sport website 1900 gmt  parry speak exclusively bbc sport admit gerrard constantly link chelsea final future tell bbc five live steven money future liverpool matter 30 m 40 m 50 m accept offer realistic know keep steven against subject li

Sample docs for Topic 1
- ogara revel ireland victory  ireland fl

## Tech Category

In [84]:
# Run bertopic on business top category
tech_results = run_bertopic_pipeline(text_df, 'tech')

 Running pipeline for tech category




2025-07-06 14:29:40,663 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2025-07-06 14:29:55,433 - BERTopic - Embedding - Completed ✓
2025-07-06 14:29:55,434 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-06 14:29:55,920 - BERTopic - Dimensionality - Completed ✓
2025-07-06 14:29:55,921 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-06 14:29:55,932 - BERTopic - Cluster - Completed ✓
2025-07-06 14:29:55,935 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:29:56,003 - BERTopic - Representation - Completed ✓


In [85]:
# Visualize intertopoc distance map
tech_results['topic_distance_map']

In [86]:
# Visualize documents and topics
tech_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **business** category

In [87]:
tech_df = tech_results['dataframe']
tech_df.head()

Unnamed: 0,id,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
1824,tec_001,Ink helps drive democracy in Asia\n\nThe Kyrgy...,tech,ink help drive democracy asia kyrgyz republic...,5,"[(blog, 0.09509242962294914), (online, 0.03219..."
1825,tec_002,China net cafe culture crackdown\n\nChinese au...,tech,china net cafe culture crackdown chinese auth...,-1,"[(china, 0.10667238231602963), (rfid, 0.083832..."
1826,tec_003,Microsoft seeking spyware trojan\n\nMicrosoft ...,tech,microsoft seek spyware trojan microsoft inves...,2,"[(security, 0.04852580757471009), (virus, 0.04..."
1827,tec_004,Digital guru floats sub-$100 PC\n\nNicholas Ne...,tech,digital guru float sub100 pc nicholas negropo...,0,"[(mobile, 0.03458537450546705), (phone, 0.0303..."
1828,tec_005,Technology gets the creative bug\n\nThe hi-tec...,tech,technology get creative bug hitech art world ...,3,"[(search, 0.0741557109380161), (patent, 0.0451..."


In [88]:
tech_topic_info = tech_results['topic_info']
tech_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7,-1_china_rfid_tag_chinese,"[china, rfid, tag, chinese, net, cafe, people,...",[china overtake us net chinese netusing popul...
1,0,136,0_mobile_phone_people_technology,"[mobile, phone, people, technology, service, t...",[look music drive mobile mobile phone enjoy b...
2,1,63,1_game_play_console_title,"[game, play, console, title, gaming, year, gam...",[lose online gaming online role playing game ...
3,2,60,2_security_virus_attack_program,"[security, virus, attack, program, user, email...",[microsoft antipiracy move microsoft clamp pe...
4,3,36,3_search_patent_software_google,"[search, patent, software, google, people, yah...",[search site get close user search site want ...
5,4,17,4_dvd_file_film_p2p,"[dvd, file, film, p2p, network, system, movie,...",[dvd copy protection strengthen dvd hard copy...
6,5,16,5_blog_online_site_ink,"[blog, online, site, ink, blogger, people, hel...",[web log aid disaster recovery vivid descript...
7,6,7,6_apple_product_journalist_us,"[apple, product, journalist, us, leak, softwar...",[apple sue tiger file sharer apple take legal...


In [89]:
for id in tech_topic_info['Topic']:
    if id == -1:
        continue
    print(f"\n--- Topic {id} ---")
    print(f" Number of docs: {tech_results['bbc_topic_model'].get_topic_freq(id)}")
    print(tech_results['bbc_topic_model'].get_topic(id))


--- Topic 0 ---
 Number of docs: 136
[('mobile', 0.03458537450546705), ('phone', 0.030328846834367006), ('people', 0.029582587570980323), ('technology', 0.025053543377293156), ('service', 0.02366854953571038), ('tv', 0.022333693716646705), ('music', 0.020600120138382704), ('broadband', 0.019623916041958612), ('digital', 0.019298049912657794), ('one', 0.017518227321759575)]

--- Topic 1 ---
 Number of docs: 63
[('game', 0.0926637811915251), ('play', 0.03182057662065184), ('console', 0.02776665961447161), ('title', 0.02648854709463149), ('gaming', 0.024134322827143763), ('year', 0.022033565024649676), ('gamer', 0.021854820467638064), ('nintendo', 0.021028178462894585), ('time', 0.021007601866205827), ('sony', 0.019076926762925925)]

--- Topic 2 ---
 Number of docs: 60
[('security', 0.04852580757471009), ('virus', 0.04422218598582246), ('attack', 0.03300678870042525), ('program', 0.03122893655866186), ('user', 0.0307669004224736), ('email', 0.030195656578791346), ('site', 0.0284878440628

In [90]:
for id in tech_topic_info['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = tech_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs[:5]:  # View first 2
        print("-", doc[:300])  # Truncate for readability



Sample docs for Topic 0
- look music drive mobile  mobile phone enjoy boom time sale accord research technology analyst gartner  674 million mobile sell year globally report high total sell date figure 30 2003 surpass optimistic prediction gartner good design look mobile service music download go way push sale 2005 analyst p
- mobile medium player  mobile ready allsinge alldance multimedia device replace portable medium player two report  despite move bring music download service mobile people want trade multimedia service size battery life jupiter separate study gartner realtime tv broadcast mobile europe 2007 technical 
- look music drive mobile  mobile phone enjoy boom time sale accord research technology analyst gartner  674 million mobile sell year globally report high total sell date figure 30 2003 surpass optimistic prediction gartner good design look mobile service music download go way push sale 2005 analyst p

Sample docs for Topic 1
- mobile game age  bbc news website take 

**Inspection after reducing topics**

In [91]:
tech_results = bert_reduce_topics(tech_results, nr_topics=10)

2025-07-06 14:30:11,169 - BERTopic - Topic reduction - Reducing number of topics
2025-07-06 14:30:11,171 - BERTopic - Topic reduction - Number of topics (10) is equal or higher than the clustered topics(8).
2025-07-06 14:30:11,171 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 14:30:11,339 - BERTopic - Representation - Completed ✓


In [92]:
tech_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7,-1_china_rfid_tag_chinese,"[china, rfid, tag, chinese, net, cafe, people,...",[china overtake us net chinese netusing popul...
1,0,136,0_mobile_phone_people_technology,"[mobile, phone, people, technology, service, t...",[look music drive mobile mobile phone enjoy b...
2,1,63,1_game_play_console_title,"[game, play, console, title, gaming, year, gam...",[lose online gaming online role playing game ...
3,2,60,2_security_virus_attack_program,"[security, virus, attack, program, user, email...",[microsoft antipiracy move microsoft clamp pe...
4,3,36,3_search_patent_software_google,"[search, patent, software, google, people, yah...",[search site get close user search site want ...
5,4,17,4_dvd_file_film_p2p,"[dvd, file, film, p2p, network, system, movie,...",[dvd copy protection strengthen dvd hard copy...
6,5,16,5_blog_online_site_ink,"[blog, online, site, ink, blogger, people, hel...",[web log aid disaster recovery vivid descript...
7,6,7,6_apple_product_journalist_us,"[apple, product, journalist, us, leak, softwar...",[apple sue tiger file sharer apple take legal...


In [93]:
for id in tech_results['reduced_topic_info']['Topic']:
    if id == -1:
        continue
    print(f"\n--- Topic {id} ---")
    print(f" Number of docs: {tech_results['bbc_topic_model'].get_topic_freq(id)}")
    print(tech_results['bbc_topic_model'].get_topic(id))


--- Topic 0 ---
 Number of docs: 136
[('mobile', 0.03458537450546705), ('phone', 0.030328846834367006), ('people', 0.029582587570980323), ('technology', 0.025053543377293156), ('service', 0.02366854953571038), ('tv', 0.022333693716646705), ('music', 0.020600120138382704), ('broadband', 0.019623916041958612), ('digital', 0.019298049912657794), ('one', 0.017518227321759575)]

--- Topic 1 ---
 Number of docs: 63
[('game', 0.0926637811915251), ('play', 0.03182057662065184), ('console', 0.02776665961447161), ('title', 0.02648854709463149), ('gaming', 0.024134322827143763), ('year', 0.022033565024649676), ('gamer', 0.021854820467638064), ('nintendo', 0.021028178462894585), ('time', 0.021007601866205827), ('sony', 0.019076926762925925)]

--- Topic 2 ---
 Number of docs: 60
[('security', 0.04852580757471009), ('virus', 0.04422218598582246), ('attack', 0.03300678870042525), ('program', 0.03122893655866186), ('user', 0.0307669004224736), ('email', 0.030195656578791346), ('site', 0.0284878440628

In [94]:
for id in tech_results['reduced_topic_info']['Topic']:
    if id == -1:
        continue
    print(f"\nSample docs for Topic {id}")
    docs = tech_results['bbc_topic_model'].get_representative_docs(id)
    for doc in docs[:5]:  # View first 2
        print("-", doc[:300])  # Truncate for readability



Sample docs for Topic 0
- look music drive mobile  mobile phone enjoy boom time sale accord research technology analyst gartner  674 million mobile sell year globally report high total sell date figure 30 2003 surpass optimistic prediction gartner good design look mobile service music download go way push sale 2005 analyst p
- mobile medium player  mobile ready allsinge alldance multimedia device replace portable medium player two report  despite move bring music download service mobile people want trade multimedia service size battery life jupiter separate study gartner realtime tv broadcast mobile europe 2007 technical 
- look music drive mobile  mobile phone enjoy boom time sale accord research technology analyst gartner  674 million mobile sell year globally report high total sell date figure 30 2003 surpass optimistic prediction gartner good design look mobile service music download go way push sale 2005 analyst p

Sample docs for Topic 1
- mobile game age  bbc news website take 