# Overview

# Importing Necessary Libraries

In [None]:
import pandas as pd
import os
import spacy
import re
import numpy as np
import random

# !python -m spacy download en_core_web_md

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans

# Importing Topic modelling libraries
!pip install bertopic[all] sentence-transformers
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer



# Loading Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
dataset_path = '/content/drive/MyDrive/bbc'
dataset = []

First, I combine the individual text files into a list of records, and then convert that list into a DataFrame to make processing easier.

In [None]:
# Start from an empty list so that re-running this cell does not append duplicate rows
dataset = []

# NOTE: os.listdir returns entries in arbitrary order, so row positions can differ between environments
for category in os.listdir(dataset_path):       # loops through the items in the root dataset folder
    category_path = os.path.join(dataset_path, category)       # constructs the path for each item
    if not os.path.isdir(category_path):       # only sub-folders are categories (this skips loose files such as stopwords.txt)
        continue
    for filename in os.listdir(category_path):       # loops through the entries of the category folder
        file_path = os.path.join(category_path, filename)       # constructs the path for each file
        if not os.path.isfile(file_path):       # a nested folder cannot be opened as a text file
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().strip()
        dataset.append({'text': text, 'category': category})

In [None]:
text_df = pd.DataFrame(dataset)      # converting the resulting list to a dataframe

In [None]:
text_df

Unnamed: 0,text,category
0,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment
1,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment
2,God cut from Dark Materials film\n\nThe direct...,entertainment
3,Films on war triumph at Sundance\n\nA study of...,entertainment
4,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment
...,...,...
2220,Guantanamo pair's passport ban\n\nThe governme...,politics
2221,Will Tory tax cuts lift spirits?\n\nMichael Ho...,politics
2222,Job cuts 'false economy' - TUC\n\nPlans to sh...,politics
2223,Labour in constituency race row\n\nLabour's ch...,politics


# Preprocessing Text Data

In [None]:
# spaCy pipeline used by the preprocessing function below
nlp = spacy.load('en_core_web_sm')

# Using the stopword list from http://mlg.ucd.ie/files/datasets/stopwords.txt to preprocess the dataset
# Each line of the file becomes one entry, with surrounding whitespace stripped
with open('/content/drive/MyDrive/bbc/stopwords.txt', 'r', encoding='utf-8') as file:
    custom_stopwords = [line.strip() for line in file]

print(custom_stopwords)

['a', 'about', 'above', 'according', 'across', 'actually', 'adj', 'after', 'afterwards', 'again', 'all', 'almost', 'along', 'already', 'also', 'although', 'always', 'among', 'amongst', 'an', 'am', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anywhere', 'are', 'aren', "aren't", 'around', 'as', 'at', 'be', 'became', 'because', 'become', 'becomes', 'been', 'beforehand', 'begin', 'being', 'below', 'beside', 'besides', 'between', 'both', 'but', 'by', 'can', 'cannot', "can't", 'caption', 'co', 'come', 'could', 'couldn', "couldn't", 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'don', "don't", 'down', 'during', 'each', 'early', 'eg', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'etc', 'even', 'ever', 'every', 'everywhere', 'except', 'few', 'for', 'found', 'from', 'further', 'had', 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'him', 'his', 'how', 'however', 'ie', 'i

In [None]:
#Creating a function for preprocessing - lowercasing, lemmatization, stopword removal

def preprocess(text):
    """Clean one document: strip special characters, lemmatize, lowercase, drop stopwords.

    Args:
        text: Raw document string.

    Returns:
        The lowercased lemmas that are not in ``custom_stopwords``, joined by single spaces.
    """
    # Remove special characters (everything that is not a letter, digit or whitespace)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Processing with spaCy
    doc = nlp(text)

    # A set gives constant-time lookups; the list would otherwise be scanned once per token
    stopwords = set(custom_stopwords)

    # Lemmatize and lowercase each token once, then remove stopwords using the custom list
    lemmas = (token.lemma_.lower() for token in doc)
    return ' '.join(lemma for lemma in lemmas if lemma not in stopwords)

text_df['preprocessed_text'] = text_df['text'].apply(preprocess)

In [None]:
# Remove newline characters (regex=False forces a literal match regardless of the pandas default)
text_df['preprocessed_text'] = text_df['preprocessed_text'].str.replace('\n', '', regex=False)
text_df.head()

Unnamed: 0,text,category,preprocessed_text
0,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment,rapper snoop dogg sue rape rapper snoop dogg ...
1,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment,vera drake bafta triumph hope bafta film awar...
2,God cut from Dark Materials film\n\nThe direct...,entertainment,god cut dark materials film director screenwr...
3,Films on war triumph at Sundance\n\nA study of...,entertainment,film war triumph sundance study united states...
4,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment,ray dvd beats box office oscarnominate film b...


# Exploratory Data Analysis

# Sub-categorizing Main Categories

To break the texts in each main category down into sub-categories, I use BERTopic.

## Reusable Functions

In [None]:
# Creating a function to run the initial clustering of the top categories
def run_bertopic_pipeline(
    df,
    top_category,
    embedding_model = None,
    min_topic_size = 3
):
    """Fit a BERTopic model on the documents of one top-level category.

    Args:
        df: DataFrame with a 'category' column and a 'preprocessed_text' column.
        top_category: Value of 'category' whose rows are modelled.
        embedding_model: Embedding model handed to BERTopic. When None, a
            SentenceTransformer("all-MiniLM-L6-v2") is created.
        min_topic_size: Forwarded to BERTopic as ``min_topic_size``.

    Returns:
        dict with the keys 'topic_info', 'topic_distance_map',
        'topic_documents_map', 'topics', 'probabilities', 'bbc_topic_model'
        and 'dataframe'. 'dataframe' is a copy of the category's rows that
        have non-blank text (original index kept), with the columns
        'bertopic_topic' and 'bertopic_topic_label' added. ``df`` itself is
        not modified.

    Raises:
        ValueError: If the category has no non-blank preprocessed text.
    """
    print (f" Running pipeline for {top_category} category")

    # Filtering the specified top category
    category_rows = df[df['category'] == top_category]

    # Drop rows without usable text from the frame itself, so that the documents
    # given to BERTopic and the rows receiving the topic ids stay aligned one-to-one
    has_text = category_rows['preprocessed_text'].map(
        lambda t: isinstance(t, str) and bool(t.strip())
    ).astype(bool)
    # .copy() so the new columns are written to an independent frame rather than a slice of df
    top_category_df = category_rows.loc[has_text].copy()

    # Converting to list for easy processing
    top_category_texts = top_category_df['preprocessed_text'].tolist()
    if not top_category_texts:
        raise ValueError(f"No non-empty preprocessed text found for category {top_category!r}")

    # Specifying embedding model
    if embedding_model is None:
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Creating an instance of BERTopic for my data
    bbc_topic_model = BERTopic(embedding_model=embedding_model,
                           language="english",
                           verbose=True,
                           min_topic_size = min_topic_size)

    # Fit and transform data
    topics, probabilities = bbc_topic_model.fit_transform(top_category_texts)

    # Topic info
    topic_info = bbc_topic_model.get_topic_info()

    # Visualizing document and topic maps
    topic_documents_map = bbc_topic_model.visualize_documents(top_category_texts)
    topic_distance_map = bbc_topic_model.visualize_topics()

    # Look each distinct topic up once instead of once per document
    topic_labels = {topic: bbc_topic_model.get_topic(topic) for topic in set(topics)}

    # Appending topics and labels to dataframe
    top_category_df['bertopic_topic'] = topics
    top_category_df['bertopic_topic_label'] = [topic_labels[topic] for topic in topics]

    return {'topic_info': topic_info,
            'topic_distance_map': topic_distance_map,
            'topic_documents_map': topic_documents_map,
            'topics': topics,
            'probabilities': probabilities,
            'bbc_topic_model': bbc_topic_model,
            'dataframe': top_category_df}

In [None]:
# Creating a function to reduce topics, if needed

def bert_reduce_topics(results, nr_topics=5):
    """Reduce the topics of an already fitted pipeline result to ``nr_topics``.

    Args:
        results: dict as returned by run_bertopic_pipeline; the keys
            'bbc_topic_model' and 'dataframe' are read.
        nr_topics: Forwarded to ``model.reduce_topics``.

    Returns:
        The same ``results`` dict, updated in place with 'reduced_topics',
        'reduced_topic_info' and 'dataframe'. The dataframe is the same
        object that was passed in; it gains the columns
        'bertopic_topic_reduced' and 'bertopic_topic_reduced_label'.

    Raises:
        ValueError: If the number of non-blank documents or dataframe rows
            differs from the number of topic assignments held by the model.
    """
    model = results['bbc_topic_model']
    df = results['dataframe']
    top_category_texts = [
        t for t in df['preprocessed_text'].tolist() if isinstance(t, str) and t.strip()
    ]

    # reduce_topics changes the model in place, so verify alignment BEFORE calling it;
    # otherwise a mismatch would only surface at the column assignment, after the model was altered
    n_assigned = len(model.topics_)
    if len(top_category_texts) != n_assigned or len(df) != n_assigned:
        raise ValueError(
            f"Model holds {n_assigned} topic assignments but the dataframe has "
            f"{len(df)} rows and {len(top_category_texts)} non-empty documents"
        )

    # Reduce the topics
    model.reduce_topics(top_category_texts, nr_topics=nr_topics)
    reduced_topics = model.topics_

    # Computing updated topic info
    reduced_topic_info = model.get_topic_info()

    # Look each distinct reduced topic up once instead of once per document
    reduced_labels = {topic: model.get_topic(topic) for topic in set(reduced_topics)}

    # Add new topics to dataframe (written in place: callers holding this frame see the new columns)
    df['bertopic_topic_reduced'] = reduced_topics
    df['bertopic_topic_reduced_label'] = [reduced_labels[topic] for topic in reduced_topics]

    # Update input dictionary
    results.update({'reduced_topics': reduced_topics,
                    'dataframe': df,
                    'reduced_topic_info': reduced_topic_info})

    return results

## Business Category

In [None]:
business_results = run_bertopic_pipeline(text_df, 'business')

 Running pipeline for business category


2025-07-04 13:39:34,234 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2025-07-04 13:41:09,915 - BERTopic - Embedding - Completed ✓
2025-07-04 13:41:09,934 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-04 13:41:12,507 - BERTopic - Dimensionality - Completed ✓
2025-07-04 13:41:12,512 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-04 13:41:12,549 - BERTopic - Cluster - Completed ✓
2025-07-04 13:41:12,554 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-04 13:41:12,696 - BERTopic - Representation - Completed ✓


In [None]:
# Visualize the intertopic distance map
business_results['topic_distance_map']

In [None]:
# Visualize documents and topics
business_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **business** category

In [None]:
business_df = business_results['dataframe']
business_df.head()

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
897,UK economy facing 'major risks'\n\nThe UK manu...,business,uk economy face major risk uk manufacturing s...,14,"[(golden, 0.03267686805985442), (uk, 0.0309916..."
898,Ask Jeeves tips online ad revival\n\nAsk Jeeve...,business,ask jeeves tip online ad revival ask jeeve th...,8,"[(game, 0.040903569715579256), (share, 0.03685..."
899,US interest rate rise expected\n\nUS interest ...,business,interest rate rise expect interest rate expec...,6,"[(job, 0.0483850499085089), (rate, 0.041937792..."
900,EMI shares hit by profit warning\n\nShares in ...,business,emi share hit profit warn share music giant e...,-1,"[(year, 0.011571701138379064), (company, 0.011..."
901,Germany calls for EU reform\n\nGerman Chancell...,business,germany call eu reform german chancellor gerh...,15,"[(eu, 0.060328448565597916), (european, 0.0463..."


In [None]:
business_df[business_df['bertopic_topic'] == 0]['text'].values

array(["Yukos sues four firms for $20bn\n\nRussian oil firm Yukos has sued four companies for their role in last year's forced state auction of its key oil production unit Yuganskneftegas.\n\nYukos is claiming more than $20bn (£11bn) in damages after Yugansk was sold in December to settle back taxes. The four companies named in the law suit are gas giant Gazprom, its unit Gazpromneft, investment company Baikal, and state oil firm Rosneft. Yukos submitted the suit in Houston, where it filed for bankruptcy. As well as suing for damages, Yukos has asked the US court to send its tax dispute with the Russian government to an international arbitrator. It also has submitted a reorganisation plan as part of its Chapter 11 bankruptcy filing.\n\nThe clash between Yukos and the Kremlin came to a head last year when Yukos was hit with a bill of more than $27bn in back taxes and unpaid fines. To settle the bill, Russia forced Yukos to sell off Yuganskneftegas.\n\nYukos called the sale illegal and h

In [None]:
business_topic_info = business_results['topic_info']
business_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,71,-1_year_company_world_country,"[year, company, world, country, business, eu, ...",[business fear sluggish eu economy european l...
1,0,28,0_yukos_russian_gazprom_court,"[yukos, russian, gazprom, court, rosneft, auct...",[yukos bankruptcy matter russian authority ab...
2,1,28,1_car_gm_fiat_vehicle,"[car, gm, fiat, vehicle, bmw, sale, model, mot...",[saab build cadillacs sweden general motors w...
3,2,27,2_airline_air_flight_passenger,"[airline, air, flight, passenger, plane, carri...",[probe airline travel chaos government invest...
4,3,17,3_oil_crude_barrel_price,"[oil, crude, barrel, price, cairn, energy, gas...",[oil price reach threemonth low oil price fal...
5,4,16,4_marsh_sec_insurance_firm,"[marsh, sec, insurance, firm, listing, fsa, br...",[sec rethink postenron rule stock market watc...
6,5,16,5_sri_disaster_damage_lanka,"[sri, disaster, damage, lanka, indonesia, tour...",[asia share defy postquake gloom indonesian i...
7,6,16,6_job_rate_fed_growth,"[job, rate, fed, growth, economy, economist, c...",[interest rate increase 2 interest rate rise ...
8,7,16,7_japans_japanese_japan_economy,"[japans, japanese, japan, economy, yen, recess...",[business confidence dip japan business confi...
9,8,15,8_game_share_corp_video,"[game, share, corp, video, profit, nasdaq, new...",[ad sale boost time warner profit quarterly p...


In [None]:
business_topic_info['Representative_Docs'][1]

['yukos bankruptcy matter  russian authority abide court decision take regard troubled oil giant yukos houston court tell  legal expert william butler treaty russia recognise legal ruling mean moscow adhere ruling yukos case yukos court entitle declare bankrupt before yugansk unit sell subsidiary local bank account  yukos surprise chapter 11 bankruptcy filing houston december unsuccessful attempt halt auction yugansk main oil produce unit russian authority yugansk sell help pay 275bn 145bn back tax bill buy 94bn previously unknown group turn buy statecontrolled oil company rosneft court jurisdiction challenge deutsche bank gazpromneft former unit russian gas monopoly gazprom due merge rosneft deutsche bank maintain case place court yukos asset apart two bank account house houston chief finance officer bruce misamore deutsche bank involve case sue yukos agree loan gazpromneft money bid yugansk bankruptcy judge letitia clark issue injunction december try prevent yugansk sale rule pretty 

In [None]:
business_results = bert_reduce_topics(business_results, nr_topics=10)

2025-07-04 13:42:49,038 - BERTopic - Topic reduction - Reducing number of topics
2025-07-04 13:42:49,059 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-04 13:42:49,416 - BERTopic - Representation - Completed ✓
2025-07-04 13:42:49,425 - BERTopic - Topic reduction - Reduced number of topics from 45 to 10


In [None]:
business_results['dataframe'].loc[1402].text

'US to rule on Yukos refuge call\n\nYukos has said a US bankruptcy court will decide whether to block Russia\'s impending auction of its main production arm on Thursday.\n\nThe Russian oil firm has filed for bankruptcy protection in the US in an attempt to halt the forced sale. However, Judge Letitia Clark said the hearing would continue on Thursday when arguments in the case would be heard. Russian authorities are due to auction off Yuganskneftegas on 19 December to pay a huge tax bill sent to Yukos.\n\nRussian prosecutors are forcing the sale of the firm\'s most lucrative asset Yuganskneftegas to help pay a $27bn (£14bn) back tax bill, which they claim is owed by Yukos.\n\nFiling for bankruptcy protection in the US was "a last resort to preserve the rights of our shareholders, employees and customers," said Yukos chief executive Steven Theede. The company added it had opted to take action through American courts as US bankruptcy law gives worldwide jurisdiction over a debtor company\

In [None]:
business_results['reduced_topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,71,-1_year_company_world_firm,"[year, company, world, firm, country, market, ...",[business fear sluggish eu economy european l...
1,0,155,0_firm_company_share_year,"[firm, company, share, year, profit, market, e...",[gm issue 2005 profit warn general motors war...
2,1,97,1_rate_rise_growth_economy,"[rate, rise, growth, economy, year, dollar, fi...",[interest rate increase 2 interest rate rise ...
3,2,75,2_bank_economy_year_economic,"[bank, economy, year, economic, government, gr...",[asia share defy postquake gloom indonesian i...
4,3,49,3_yukos_oil_russian_gazprom,"[yukos, oil, russian, gazprom, court, company,...",[yukos unit fetch 9bn auction littleknown rus...
5,4,17,4_wine_fosters_beer_southcorp,"[wine, fosters, beer, southcorp, brewer, mcdon...",[french wine get 70 m euro topup french gover...
6,5,16,5_drug_tobacco_company_firm,"[drug, tobacco, company, firm, patient, govern...",[seek 280bn smoker ruling justice department ...
7,6,14,6_argentina_president_government_venezuela,"[argentina, president, government, venezuela, ...",[water firm suez argentina row conflict argen...
8,7,10,7_club_glazer_united_manchester,"[club, glazer, united, manchester, proposal, b...",[qa malcolm glazer man utd battle control man...
9,8,6,8_pension_age_scheme_retirement,"[pension, age, scheme, retirement, employer, e...",[pension hitch longlive man male life expecta...


In [None]:
business_df

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label,bertopic_topic_reduced,bertopic_topic_reduced_label
897,UK economy facing 'major risks'\n\nThe UK manu...,business,uk economy face major risk uk manufacturing s...,14,"[(golden, 0.03267686805985442), (uk, 0.0309916...",1,"[(rate, 0.039091706031254586), (rise, 0.036179..."
898,Ask Jeeves tips online ad revival\n\nAsk Jeeve...,business,ask jeeves tip online ad revival ask jeeve th...,8,"[(game, 0.040903569715579256), (share, 0.03685...",0,"[(firm, 0.025520280785799532), (company, 0.024..."
899,US interest rate rise expected\n\nUS interest ...,business,interest rate rise expect interest rate expec...,6,"[(job, 0.0483850499085089), (rate, 0.041937792...",1,"[(rate, 0.039091706031254586), (rise, 0.036179..."
900,EMI shares hit by profit warning\n\nShares in ...,business,emi share hit profit warn share music giant e...,-1,"[(year, 0.011571701138379064), (company, 0.011...",-1,"[(year, 0.02352824894933455), (company, 0.0214..."
901,Germany calls for EU reform\n\nGerman Chancell...,business,germany call eu reform german chancellor gerh...,15,"[(eu, 0.060328448565597916), (european, 0.0463...",1,"[(rate, 0.039091706031254586), (rise, 0.036179..."
...,...,...,...,...,...,...,...
1402,US to rule on Yukos refuge call\n\nYukos has s...,business,rule yukos refuge call yukos bankruptcy court...,0,"[(yukos, 0.07161888393620272), (russian, 0.043...",3,"[(yukos, 0.07819187791826837), (oil, 0.0699304..."
1403,US budget deficit to reach $368bn\n\nThe US bu...,business,budget deficit reach 368bn budget deficit set...,26,"[(budget, 0.07855532975566334), (deficit, 0.05...",1,"[(rate, 0.039091706031254586), (rise, 0.036179..."
1404,Building giant in asbestos payout\n\nAustralia...,business,build giant asbestos payout australian buildi...,-1,"[(year, 0.011571701138379064), (company, 0.011...",-1,"[(year, 0.02352824894933455), (company, 0.0214..."
1405,India power shares jump on debut\n\nShares in ...,business,india power share jump debut share indias lar...,27,"[(reliance, 0.08362528758609787), (ambani, 0.0...",0,"[(firm, 0.025520280785799532), (company, 0.024..."


In [None]:
business_df[business_df['bertopic_topic_reduced'] == 0]['text'].values

array(["Ask Jeeves tips online ad revival\n\nAsk Jeeves has become the third leading online search firm this week to thank a revival in internet advertising for improving fortunes.\n\nThe firm's revenue nearly tripled in the fourth quarter of 2004, exceeding $86m (£46m). Ask Jeeves, once among the best-known names on the web, is now a relatively modest player. Its $17m profit for the quarter was dwarfed by the $204m announced by rival Google earlier in the week. During the same quarter, Yahoo earned $187m, again tipping a resurgence in online advertising.\n\nThe trend has taken hold relatively quickly. Late last year, marketing company Doubleclick, one of the leading providers of online advertising, warned that some or all of its business would have to be put up for sale. But on Thursday, it announced that a sharp turnaround had brought about an unexpected increase in profits. Neither Ask Jeeves nor Doubleclick thrilled investors with their profit news, however. In both cases, their sh

## Entertainment Category

In [None]:
ent_results = run_bertopic_pipeline(text_df, 'entertainment')

 Running pipeline for entertainment category


2025-07-04 13:42:51,197 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2025-07-04 13:43:56,747 - BERTopic - Embedding - Completed ✓
2025-07-04 13:43:56,785 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-04 13:44:00,084 - BERTopic - Dimensionality - Completed ✓
2025-07-04 13:44:00,085 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-04 13:44:00,106 - BERTopic - Cluster - Completed ✓
2025-07-04 13:44:00,111 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-04 13:44:00,233 - BERTopic - Representation - Completed ✓


In [None]:
ent_results['topic_distance_map']

In [None]:
ent_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **entertainment** category

In [None]:
ent_df = ent_results['dataframe']
ent_df.head()

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
0,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment,rapper snoop dogg sue rape rapper snoop dogg ...,8,"[(doherty, 0.034063577980696785), (assault, 0...."
1,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment,vera drake bafta triumph hope bafta film awar...,1,"[(film, 0.034495926093734725), (vera, 0.027973..."
2,God cut from Dark Materials film\n\nThe direct...,entertainment,god cut dark materials film director screenwr...,-1,"[(film, 0.017289247473067523), (show, 0.012136..."
3,Films on war triumph at Sundance\n\nA study of...,entertainment,film war triumph sundance study united states...,16,"[(festival, 0.0522282532518597), (sundance, 0...."
4,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment,ray dvd beats box office oscarnominate film b...,0,"[(box, 0.04871906092440725), (office, 0.047048..."


In [None]:
ent_df[ent_df['bertopic_topic'] == 5]

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
9,Musicians to tackle US red tape\n\nMusicians' ...,entertainment,musician tackle red tape musicians group tack...,5,"[(download, 0.06876196415840392), (music, 0.05..."
71,The Sound of Music is coming home\n\nThe origi...,entertainment,sound music home original stage production so...,5,"[(download, 0.06876196415840392), (music, 0.05..."
94,Music mogul Fuller sells company\n\nPop Idol s...,entertainment,music mogul fuller sell company pop idol supr...,5,"[(download, 0.06876196415840392), (music, 0.05..."
110,Downloads enter US singles chart\n\nDigital mu...,entertainment,download enter single chart digital music dow...,5,"[(download, 0.06876196415840392), (music, 0.05..."
130,Pupils to get anti-piracy lessons\n\nLessons o...,entertainment,pupil get antipiracy lesson lessons music pir...,5,"[(download, 0.06876196415840392), (music, 0.05..."
264,Abba queen enters music rich list\n\nThe woman...,entertainment,abba queen enter music rich list woman behind...,5,"[(download, 0.06876196415840392), (music, 0.05..."
265,Downloads enter US singles chart\n\nDigital mu...,entertainment,download enter single chart digital music dow...,5,"[(download, 0.06876196415840392), (music, 0.05..."
271,Help for indies in download sales\n\nA campaig...,entertainment,help indie download sale campaign launch help...,5,"[(download, 0.06876196415840392), (music, 0.05..."
282,US composer recreates Bach score\n\nA US music...,entertainment,composer recreate bach score musicologist rec...,5,"[(download, 0.06876196415840392), (music, 0.05..."
301,Download chart debut is delayed\n\nThe inclusi...,entertainment,download chart debut delay inclusion download...,5,"[(download, 0.06876196415840392), (music, 0.05..."


In [None]:
# Topic overview of the entertainment model (taken from ent_results, not business_results)
entertainment_topic_info = ent_results['topic_info']
entertainment_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,71,-1_year_company_world_country,"[year, company, world, country, business, eu, ...",[business fear sluggish eu economy european l...
1,0,28,0_yukos_russian_gazprom_court,"[yukos, russian, gazprom, court, rosneft, auct...",[yukos bankruptcy matter russian authority ab...
2,1,28,1_car_gm_fiat_vehicle,"[car, gm, fiat, vehicle, bmw, sale, model, mot...",[saab build cadillacs sweden general motors w...
3,2,27,2_airline_air_flight_passenger,"[airline, air, flight, passenger, plane, carri...",[probe airline travel chaos government invest...
4,3,17,3_oil_crude_barrel_price,"[oil, crude, barrel, price, cairn, energy, gas...",[oil price reach threemonth low oil price fal...
5,4,16,4_marsh_sec_insurance_firm,"[marsh, sec, insurance, firm, listing, fsa, br...",[sec rethink postenron rule stock market watc...
6,5,16,5_sri_disaster_damage_lanka,"[sri, disaster, damage, lanka, indonesia, tour...",[asia share defy postquake gloom indonesian i...
7,6,16,6_job_rate_fed_growth,"[job, rate, fed, growth, economy, economist, c...",[interest rate increase 2 interest rate rise ...
8,7,16,7_japans_japanese_japan_economy,"[japans, japanese, japan, economy, yen, recess...",[business confidence dip japan business confi...
9,8,15,8_game_share_corp_video,"[game, share, corp, video, profit, nasdaq, new...",[ad sale boost time warner profit quarterly p...


In [None]:
entertainment_topic_info['Representative_Docs'][1]

['yukos bankruptcy matter  russian authority abide court decision take regard troubled oil giant yukos houston court tell  legal expert william butler treaty russia recognise legal ruling mean moscow adhere ruling yukos case yukos court entitle declare bankrupt before yugansk unit sell subsidiary local bank account  yukos surprise chapter 11 bankruptcy filing houston december unsuccessful attempt halt auction yugansk main oil produce unit russian authority yugansk sell help pay 275bn 145bn back tax bill buy 94bn previously unknown group turn buy statecontrolled oil company rosneft court jurisdiction challenge deutsche bank gazpromneft former unit russian gas monopoly gazprom due merge rosneft deutsche bank maintain case place court yukos asset apart two bank account house houston chief finance officer bruce misamore deutsche bank involve case sue yukos agree loan gazpromneft money bid yugansk bankruptcy judge letitia clark issue injunction december try prevent yugansk sale rule pretty 

In [None]:
# bert_reduce_topics updates ent_results in place and returns that same dict, so `x` and ent_results are one object
x = bert_reduce_topics(ent_results, nr_topics=7)

2025-07-04 13:45:10,478 - BERTopic - Topic reduction - Reducing number of topics
2025-07-04 13:45:10,485 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-04 13:45:10,621 - BERTopic - Representation - Completed ✓
2025-07-04 13:45:10,629 - BERTopic - Topic reduction - Reduced number of topics from 36 to 7


In [None]:
x['dataframe']

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label,bertopic_topic_reduced,bertopic_topic_reduced_label
0,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment,rapper snoop dogg sue rape rapper snoop dogg ...,8,"[(doherty, 0.034063577980696785), (assault, 0....",0,"[(music, 0.050835205913303705), (song, 0.03355..."
1,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment,vera drake bafta triumph hope bafta film awar...,1,"[(film, 0.034495926093734725), (vera, 0.027973...",1,"[(film, 0.07582598232154815), (good, 0.0388149..."
2,God cut from Dark Materials film\n\nThe direct...,entertainment,god cut dark materials film director screenwr...,-1,"[(film, 0.017289247473067523), (show, 0.012136...",-1,"[(film, 0.03802824988933821), (year, 0.0238889..."
3,Films on war triumph at Sundance\n\nA study of...,entertainment,film war triumph sundance study united states...,16,"[(festival, 0.0522282532518597), (sundance, 0....",1,"[(film, 0.07582598232154815), (good, 0.0388149..."
4,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment,ray dvd beats box office oscarnominate film b...,0,"[(box, 0.04871906092440725), (office, 0.047048...",1,"[(film, 0.07582598232154815), (good, 0.0388149..."
...,...,...,...,...,...,...,...
381,Spears seeks aborted tour payment\n\nSinger Br...,entertainment,spear seeks abort tour payment singer britney...,27,"[(walmart, 0.07192126471442438), (spector, 0.0...",0,"[(music, 0.050835205913303705), (song, 0.03355..."
382,Arnold congratulated on Oscar win\n\nOscar-win...,entertainment,arnold congratulate oscar win oscarwinner and...,4,"[(aviator, 0.042282102788831835), (swank, 0.04...",1,"[(film, 0.07582598232154815), (good, 0.0388149..."
383,Rock star sued by ex-girlfriend\n\nMotley Crue...,entertainment,rock star sue exgirlfriend motley crue guitar...,-1,"[(film, 0.017289247473067523), (show, 0.012136...",-1,"[(film, 0.03802824988933821), (year, 0.0238889..."
384,Vera Drake scoops film award\n\nOscar hopefuls...,entertainment,vera drake scoop film award oscar hopeful mik...,1,"[(film, 0.034495926093734725), (vera, 0.027973...",1,"[(film, 0.07582598232154815), (good, 0.0388149..."


## Politics Category

## Sports Category

## Tech Category