# Overview

# Importing Necessary Libraries

In [None]:
import pandas as pd
import os
import spacy
import re
import numpy as np
import random

# !python -m spacy download en_core_web_md

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans

# Importing Topic modelling libraries
# !pip install bertopic[all] sentence-transformers
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Loading Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
dataset_path = '/content/drive/MyDrive/bbc'
dataset = []

Firstly, I would combine the various text files into a list, and then a dataframe, to make processing easier

In [None]:
for category in os.listdir(dataset_path):       #loops through the items in root dataset folder
    category_path = os.path.join(dataset_path, category)       #constructs the path for each item
    if os.path.isdir(category_path):       #checks what items are directories
        for filename in os.listdir(category_path):       #loops through directories
            file_path = os.path.join(category_path, filename)       #constructs the path for each filee
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read().strip()
                dataset.append({'text':text, 'category':category})

In [None]:
text_df = pd.DataFrame(dataset)      # converting the resulting list to a dataframe

In [None]:
text_df

Unnamed: 0,text,category
0,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment
1,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment
2,God cut from Dark Materials film\n\nThe direct...,entertainment
3,Films on war triumph at Sundance\n\nA study of...,entertainment
4,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment
...,...,...
2220,Guantanamo pair's passport ban\n\nThe governme...,politics
2221,Will Tory tax cuts lift spirits?\n\nMichael Ho...,politics
2222,Job cuts 'false economy' - TUC\n\nPlans to sh...,politics
2223,Labour in constituency race row\n\nLabour's ch...,politics


# Preprocessing Text Data

In [None]:
nlp = spacy.load('en_core_web_sm')

#Using the stopword list from http://mlg.ucd.ie/files/datasets/stopwords.txt to preprocess the dataset

custom_stopwords = []
with open('/content/drive/MyDrive/bbc/stopwords.txt', 'r', encoding='utf-8') as file:
    for line in file:
        custom_stopwords.append(line.strip())

print(custom_stopwords)

['a', 'about', 'above', 'according', 'across', 'actually', 'adj', 'after', 'afterwards', 'again', 'all', 'almost', 'along', 'already', 'also', 'although', 'always', 'among', 'amongst', 'an', 'am', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anywhere', 'are', 'aren', "aren't", 'around', 'as', 'at', 'be', 'became', 'because', 'become', 'becomes', 'been', 'beforehand', 'begin', 'being', 'below', 'beside', 'besides', 'between', 'both', 'but', 'by', 'can', 'cannot', "can't", 'caption', 'co', 'come', 'could', 'couldn', "couldn't", 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'don', "don't", 'down', 'during', 'each', 'early', 'eg', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'etc', 'even', 'ever', 'every', 'everywhere', 'except', 'few', 'for', 'found', 'from', 'further', 'had', 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'him', 'his', 'how', 'however', 'ie', 'i

In [None]:
#Creating a function for preprocessing - lowercasing, lemmatization, stopword removal

def preprocess(text):

    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Processing with Spacy
    doc = nlp(text)

    # Lemmatization and removing stopwords using thee custom list
    tokens = [token.lemma_.lower() for token in doc if token.lemma_.lower() not in custom_stopwords]
    return ' '.join(tokens)

text_df['preprocessed_text'] = text_df['text'].apply(preprocess)

In [None]:
# Remove newline characters
text_df['preprocessed_text'] = text_df['preprocessed_text'].str.replace('\n', '')
text_df.head()

Unnamed: 0,text,category,preprocessed_text
0,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment,rapper snoop dogg sue rape rapper snoop dogg ...
1,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment,vera drake bafta triumph hope bafta film awar...
2,God cut from Dark Materials film\n\nThe direct...,entertainment,god cut dark materials film director screenwr...
3,Films on war triumph at Sundance\n\nA study of...,entertainment,film war triumph sundance study united states...
4,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment,ray dvd beats box office oscarnominate film b...


# Exploratory Data Analysis

# Sub-categorizing Main Categories

To break down the texts into sub-categories, I would make use of BERTopic

## Reusable Functions

In [None]:
# Creating a function to run the initial clustering of the top categories
def run_bertopic_pipeline(
    df,
    top_category,
    embedding_model = None,
    min_topic_size = 5
):
    print (f" Running pipeline for {top_category} category")

    #Filtering the specified top category
    top_category_df = df[df['category'] == top_category]

    # Converting to list for easy processing
    top_category_texts = top_category_df['preprocessed_text'].tolist()
    top_category_texts = [t for t in top_category_texts if isinstance(t, str) and t.strip()]

    # Specifying embedding model
    if embedding_model is None:
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Creating an instance of BERTopic for my data
    bbc_topic_model = BERTopic(embedding_model=embedding_model,
                           language="english",
                           verbose=True,
                           min_topic_size = min_topic_size)

    # Fit and transform data
    topics, probabilities = bbc_topic_model.fit_transform(top_category_texts)

    # Topic info
    topic_info = bbc_topic_model.get_topic_info()

    # Visualizing document and topic maps
    topic_documents_map = bbc_topic_model.visualize_documents(top_category_texts)
    topic_distance_map = bbc_topic_model.visualize_topics()

    # Appending topics and labels to dataframe
    top_category_df['bertopic_topic'] = topics
    top_category_df['bertopic_topic_label'] = top_category_df['bertopic_topic'].apply(lambda x: bbc_topic_model.get_topic(x))

    return {'topic_info': topic_info,
            'topic_distance_map': topic_distance_map,
            'topic_documents_map': topic_documents_map,
            'topics': topics,
            'probabilities': probabilities,
            'bbc_topic_model': bbc_topic_model,
            'dataframe': top_category_df}

In [None]:
# Creating a function to reduce topics, if needed

def bert_reduce_topics(results, nr_topics=5):
    model = results['bbc_topic_model']
    df = results['dataframe']
    top_category_texts = df['preprocessed_text'].tolist()
    top_category_texts = [t for t in top_category_texts if isinstance(t, str) and t.strip()]

    # Reduce the topics
    model.reduce_topics(top_category_texts, nr_topics=nr_topics)
    reduced_topics = model.topics_

    # Add new topics to dataframe
    df['bertopic_topic_reduced'] = reduced_topics
    df['bertopic_topic_reduced_label'] = df['bertopic_topic_reduced'].apply(lambda x: model.get_topic(x))

    #Update input dictionary
    results.update({'reduced_topics': reduced_topics,
                    'dataframe': df})

    return results

## Business Category

In [None]:
business_results = run_bertopic_pipeline(text_df, 'business')

 Running pipeline for business category


2025-07-04 00:37:09,957 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2025-07-04 00:37:11,854 - BERTopic - Embedding - Completed ✓
2025-07-04 00:37:11,855 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-04 00:37:21,170 - BERTopic - Dimensionality - Completed ✓
2025-07-04 00:37:21,171 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-04 00:37:21,191 - BERTopic - Cluster - Completed ✓
2025-07-04 00:37:21,195 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-04 00:37:21,298 - BERTopic - Representation - Completed ✓


In [None]:
business_results['topic_distance_map']

In [None]:
business_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **business** category

In [None]:
business_df = business_results['dataframe']
business_df.head()

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
897,UK economy facing 'major risks'\n\nThe UK manu...,business,uk economy face major risk uk manufacturing s...,10,"[(rate, 0.059081433546730115), (interest, 0.03..."
898,Ask Jeeves tips online ad revival\n\nAsk Jeeve...,business,ask jeeves tip online ad revival ask jeeve th...,2,"[(profit, 0.03630716508192444), (share, 0.0319..."
899,US interest rate rise expected\n\nUS interest ...,business,interest rate rise expect interest rate expec...,22,"[(fed, 0.06523099578976589), (rate, 0.06477807..."
900,EMI shares hit by profit warning\n\nShares in ...,business,emi share hit profit warn share music giant e...,2,"[(profit, 0.03630716508192444), (share, 0.0319..."
901,Germany calls for EU reform\n\nGerman Chancell...,business,germany call eu reform german chancellor gerh...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."


In [None]:
business_df[business_df['bertopic_topic'] == 5]

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
901,Germany calls for EU reform\n\nGerman Chancell...,business,germany call eu reform german chancellor gerh...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."
948,EU 'too slow' on economic reforms\n\nMost EU c...,business,eu slow economic reform eu country fail put p...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."
970,German jobless rate at new record\n\nMore than...,business,german jobless rate record 52 million germans...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."
974,Sluggish economy hits German jobs\n\nThe numbe...,business,sluggish economy hit german job number people...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."
998,Further rise in UK jobless total\n\nThe UK's j...,business,far rise uk jobless total uks jobless total r...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."
1007,German business confidence slides\n\nGerman bu...,business,german business confidence slide german busin...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."
1075,German growth goes into reverse\n\nGermany's e...,business,german growth go reverse germanys economy shr...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."
1079,Italy to get economic action plan\n\nItalian P...,business,italy get economic action plan italian prime ...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."
1099,Germany nears 1990 jobless level\n\nGerman une...,business,germany 1990 jobless level german unemploymen...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."
1118,EU-US seeking deal on air dispute\n\nThe EU an...,business,euus seek deal air dispute eu agree talk subs...,5,"[(eu, 0.049268889792449685), (european, 0.0288..."


In [None]:
business_topic_info = business_results['topic_info']
business_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,88,-1_firm_year_company_bank,"[firm, year, company, bank, uk, market, growth...",[turkey turn economic charm three year gruell...
1,0,29,0_yukos_russian_gazprom_court,"[yukos, russian, gazprom, court, rosneft, yuga...",[yukos accuse lie court russian oil firm yuko...
2,1,28,1_car_gm_fiat_sale,"[car, gm, fiat, sale, vehicle, bmw, model, yea...",[saab build cadillacs sweden general motors w...
3,2,27,2_profit_share_game_news,"[profit, share, game, news, company, sale, cor...",[news corp eye video game market news corp me...
4,3,27,3_airline_air_flight_passenger,"[airline, air, flight, passenger, plane, carri...",[probe airline travel chaos government invest...
5,4,22,4_dollar_deficit_budget_bush,"[dollar, deficit, budget, bush, euro, currency...",[dollar gain greenspan speech dollar hit high...
6,5,21,5_eu_european_german_economy,"[eu, european, german, economy, lisbon, econom...",[newest eu member underpin growth european un...
7,6,21,6_oil_crude_price_barrel,"[oil, crude, price, barrel, cairn, field, prod...",[oil price reach threemonth low oil price fal...
8,7,17,7_country_south_government_card,"[country, south, government, card, trade, tax,...",[india unveil antipoverty budget india boost ...
9,8,17,8_drug_tobacco_patient_company,"[drug, tobacco, patient, company, firm, produc...",[seek 280bn smoker ruling justice department ...


In [None]:
business_topic_info['Representative_Docs'][1]

['yukos accuse lie court  russian oil firm yukos lie court attempt russian government sell key production unit court hear  unit yugansk sell pay 275bn 145bn back tax bill yukos argue subsidiary local bank account court declare bankrupt auction yugansk deutsche bank   target yukos lawsuit   document backdate strengthen case  deutsche banks evidence first day twoday hearing houston lawyer hugh ray tell court yukos claim transfer 27 m two texas bank account open subsidiary firm intend reinforce presence   chance get case hear court paper document transaction draw till week yukos bankruptcy application 14 december backdate  yukos chief financial officer bruce misamore move december set yukos usa acknowledge point discrepancy paperwork money transfer 14 december tell court 480000 account day rest arrive day  deutsche bank involve case sue yukos agree loan arm russian state gas firm gazprom money bid yuganskneftegaz yukos unit formally know sale go ahead despite order bankruptcy court order 

In [None]:
business_results2 = bert_reduce_topics(business_results, nr_topics=7)

2025-07-04 00:55:10,249 - BERTopic - Topic reduction - Reducing number of topics
2025-07-04 00:55:10,250 - BERTopic - Topic reduction - Number of topics (7) is equal or higher than the clustered topics(7).
2025-07-04 00:55:10,251 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-04 00:55:10,350 - BERTopic - Representation - Completed ✓


In [None]:
business_results2['dataframe']

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label,bertopic_topic_reduced,bertopic_topic_reduced_label
897,UK economy facing 'major risks'\n\nThe UK manu...,business,uk economy face major risk uk manufacturing s...,10,"[(rate, 0.059081433546730115), (interest, 0.03...",1,"[(economy, 0.03013473707140259), (year, 0.0270..."
898,Ask Jeeves tips online ad revival\n\nAsk Jeeve...,business,ask jeeves tip online ad revival ask jeeve th...,2,"[(profit, 0.03630716508192444), (share, 0.0319...",0,"[(sale, 0.03159972734591114), (year, 0.0302101..."
899,US interest rate rise expected\n\nUS interest ...,business,interest rate rise expect interest rate expec...,22,"[(fed, 0.06523099578976589), (rate, 0.06477807...",1,"[(economy, 0.03013473707140259), (year, 0.0270..."
900,EMI shares hit by profit warning\n\nShares in ...,business,emi share hit profit warn share music giant e...,2,"[(profit, 0.03630716508192444), (share, 0.0319...",0,"[(sale, 0.03159972734591114), (year, 0.0302101..."
901,Germany calls for EU reform\n\nGerman Chancell...,business,germany call eu reform german chancellor gerh...,5,"[(eu, 0.049268889792449685), (european, 0.0288...",1,"[(economy, 0.03013473707140259), (year, 0.0270..."
...,...,...,...,...,...,...,...
1402,US to rule on Yukos refuge call\n\nYukos has s...,business,rule yukos refuge call yukos bankruptcy court...,0,"[(yukos, 0.07934583058611158), (russian, 0.048...",2,"[(yukos, 0.0713127342835255), (russian, 0.0415..."
1403,US budget deficit to reach $368bn\n\nThe US bu...,business,budget deficit reach 368bn budget deficit set...,4,"[(dollar, 0.06710847634711041), (deficit, 0.05...",1,"[(economy, 0.03013473707140259), (year, 0.0270..."
1404,Building giant in asbestos payout\n\nAustralia...,business,build giant asbestos payout australian buildi...,-1,"[(firm, 0.01549142873006365), (year, 0.0145254...",-1,"[(year, 0.02775148522795105), (firm, 0.0273070..."
1405,India power shares jump on debut\n\nShares in ...,business,india power share jump debut share indias lar...,27,"[(reliance, 0.10257189045707317), (ambani, 0.0...",0,"[(sale, 0.03159972734591114), (year, 0.0302101..."


## Entertainment Category

In [None]:
ent_results = run_bertopic_pipeline(text_df, 'entertainment')

 Running pipeline for entertainment category


2025-07-04 00:37:27,297 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2025-07-04 00:37:28,265 - BERTopic - Embedding - Completed ✓
2025-07-04 00:37:28,267 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-04 00:37:28,877 - BERTopic - Dimensionality - Completed ✓
2025-07-04 00:37:28,879 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-04 00:37:28,894 - BERTopic - Cluster - Completed ✓
2025-07-04 00:37:28,898 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-04 00:37:28,975 - BERTopic - Representation - Completed ✓


In [None]:
ent_results['topic_distance_map']

In [None]:
ent_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **entertainment** category

In [None]:
ent_df = ent_results['dataframe']
ent_df.head()

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label
0,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment,rapper snoop dogg sue rape rapper snoop dogg ...,0,"[(music, 0.03747222697557896), (band, 0.027625..."
1,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment,vera drake bafta triumph hope bafta film awar...,3,"[(film, 0.05752488574685298), (british, 0.0442..."
2,God cut from Dark Materials film\n\nThe direct...,entertainment,god cut dark materials film director screenwr...,-1,"[(film, 0.053536841421826006), (director, 0.01..."
3,Films on war triumph at Sundance\n\nA study of...,entertainment,film war triumph sundance study united states...,9,"[(festival, 0.06421650835496213), (film, 0.059..."
4,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment,ray dvd beats box office oscarnominate film b...,-1,"[(film, 0.053536841421826006), (director, 0.01..."


In [None]:
business_df[business_df['bertopic_topic'] == 5]

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label,bertopic_topic_reduced,bertopic_topic_reduced_label
901,Germany calls for EU reform\n\nGerman Chancell...,business,germany call eu reform german chancellor gerh...,5,"[(eu, 0.049268889792449685), (european, 0.0288...","BERTopic(calculate_probabilities=False, ctfidf...",False
948,EU 'too slow' on economic reforms\n\nMost EU c...,business,eu slow economic reform eu country fail put p...,5,"[(eu, 0.049268889792449685), (european, 0.0288...","BERTopic(calculate_probabilities=False, ctfidf...",False
970,German jobless rate at new record\n\nMore than...,business,german jobless rate record 52 million germans...,5,"[(eu, 0.049268889792449685), (european, 0.0288...","BERTopic(calculate_probabilities=False, ctfidf...",False
974,Sluggish economy hits German jobs\n\nThe numbe...,business,sluggish economy hit german job number people...,5,"[(eu, 0.049268889792449685), (european, 0.0288...","BERTopic(calculate_probabilities=False, ctfidf...",False
998,Further rise in UK jobless total\n\nThe UK's j...,business,far rise uk jobless total uks jobless total r...,5,"[(eu, 0.049268889792449685), (european, 0.0288...","BERTopic(calculate_probabilities=False, ctfidf...",False
1007,German business confidence slides\n\nGerman bu...,business,german business confidence slide german busin...,5,"[(eu, 0.049268889792449685), (european, 0.0288...","BERTopic(calculate_probabilities=False, ctfidf...",False
1075,German growth goes into reverse\n\nGermany's e...,business,german growth go reverse germanys economy shr...,5,"[(eu, 0.049268889792449685), (european, 0.0288...","BERTopic(calculate_probabilities=False, ctfidf...",False
1079,Italy to get economic action plan\n\nItalian P...,business,italy get economic action plan italian prime ...,5,"[(eu, 0.049268889792449685), (european, 0.0288...","BERTopic(calculate_probabilities=False, ctfidf...",False
1099,Germany nears 1990 jobless level\n\nGerman une...,business,germany 1990 jobless level german unemploymen...,5,"[(eu, 0.049268889792449685), (european, 0.0288...","BERTopic(calculate_probabilities=False, ctfidf...",False
1118,EU-US seeking deal on air dispute\n\nThe EU an...,business,euus seek deal air dispute eu agree talk subs...,5,"[(eu, 0.049268889792449685), (european, 0.0288...","BERTopic(calculate_probabilities=False, ctfidf...",False


In [None]:
entertainment_topic_info = business_results['topic_info']
entertainment_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,88,-1_firm_year_company_bank,"[firm, year, company, bank, uk, market, growth...",[turkey turn economic charm three year gruell...
1,0,29,0_yukos_russian_gazprom_court,"[yukos, russian, gazprom, court, rosneft, yuga...",[yukos accuse lie court russian oil firm yuko...
2,1,28,1_car_gm_fiat_sale,"[car, gm, fiat, sale, vehicle, bmw, model, yea...",[saab build cadillacs sweden general motors w...
3,2,27,2_profit_share_game_news,"[profit, share, game, news, company, sale, cor...",[news corp eye video game market news corp me...
4,3,27,3_airline_air_flight_passenger,"[airline, air, flight, passenger, plane, carri...",[probe airline travel chaos government invest...
5,4,22,4_dollar_deficit_budget_bush,"[dollar, deficit, budget, bush, euro, currency...",[dollar gain greenspan speech dollar hit high...
6,5,21,5_eu_european_german_economy,"[eu, european, german, economy, lisbon, econom...",[newest eu member underpin growth european un...
7,6,21,6_oil_crude_price_barrel,"[oil, crude, price, barrel, cairn, field, prod...",[oil price reach threemonth low oil price fal...
8,7,17,7_country_south_government_card,"[country, south, government, card, trade, tax,...",[india unveil antipoverty budget india boost ...
9,8,17,8_drug_tobacco_patient_company,"[drug, tobacco, patient, company, firm, produc...",[seek 280bn smoker ruling justice department ...


In [None]:
entertainment_topic_info['Representative_Docs'][1]

['yukos accuse lie court  russian oil firm yukos lie court attempt russian government sell key production unit court hear  unit yugansk sell pay 275bn 145bn back tax bill yukos argue subsidiary local bank account court declare bankrupt auction yugansk deutsche bank   target yukos lawsuit   document backdate strengthen case  deutsche banks evidence first day twoday hearing houston lawyer hugh ray tell court yukos claim transfer 27 m two texas bank account open subsidiary firm intend reinforce presence   chance get case hear court paper document transaction draw till week yukos bankruptcy application 14 december backdate  yukos chief financial officer bruce misamore move december set yukos usa acknowledge point discrepancy paperwork money transfer 14 december tell court 480000 account day rest arrive day  deutsche bank involve case sue yukos agree loan arm russian state gas firm gazprom money bid yuganskneftegaz yukos unit formally know sale go ahead despite order bankruptcy court order 

In [None]:
x = bert_reduce_topics(ent_results, nr_topics=7)

2025-07-04 00:37:30,689 - BERTopic - Topic reduction - Reducing number of topics
2025-07-04 00:37:30,699 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-04 00:37:30,770 - BERTopic - Representation - Completed ✓
2025-07-04 00:37:30,771 - BERTopic - Topic reduction - Reduced number of topics from 14 to 7


In [None]:
x['dataframe']

Unnamed: 0,text,category,preprocessed_text,bertopic_topic,bertopic_topic_label,bertopic_topic_reduced,bertopic_topic_reduced_label
0,Rapper Snoop Dogg sued for 'rape'\n\nUS rapper...,entertainment,rapper snoop dogg sue rape rapper snoop dogg ...,0,"[(music, 0.03747222697557896), (band, 0.027625...","BERTopic(calculate_probabilities=False, ctfidf...",False
1,Vera Drake's Bafta triumph hope\n\nAt the Baft...,entertainment,vera drake bafta triumph hope bafta film awar...,3,"[(film, 0.05752488574685298), (british, 0.0442...","BERTopic(calculate_probabilities=False, ctfidf...",False
2,God cut from Dark Materials film\n\nThe direct...,entertainment,god cut dark materials film director screenwr...,-1,"[(film, 0.053536841421826006), (director, 0.01...","BERTopic(calculate_probabilities=False, ctfidf...",False
3,Films on war triumph at Sundance\n\nA study of...,entertainment,film war triumph sundance study united states...,9,"[(festival, 0.06421650835496213), (film, 0.059...","BERTopic(calculate_probabilities=False, ctfidf...",False
4,Ray DVD beats box office takings\n\nOscar-nomi...,entertainment,ray dvd beats box office oscarnominate film b...,-1,"[(film, 0.053536841421826006), (director, 0.01...","BERTopic(calculate_probabilities=False, ctfidf...",False
...,...,...,...,...,...,...,...
381,Spears seeks aborted tour payment\n\nSinger Br...,entertainment,spear seeks abort tour payment singer britney...,0,"[(music, 0.03747222697557896), (band, 0.027625...","BERTopic(calculate_probabilities=False, ctfidf...",False
382,Arnold congratulated on Oscar win\n\nOscar-win...,entertainment,arnold congratulate oscar win oscarwinner and...,-1,"[(film, 0.053536841421826006), (director, 0.01...","BERTopic(calculate_probabilities=False, ctfidf...",False
383,Rock star sued by ex-girlfriend\n\nMotley Crue...,entertainment,rock star sue exgirlfriend motley crue guitar...,0,"[(music, 0.03747222697557896), (band, 0.027625...","BERTopic(calculate_probabilities=False, ctfidf...",False
384,Vera Drake scoops film award\n\nOscar hopefuls...,entertainment,vera drake scoop film award oscar hopeful mik...,3,"[(film, 0.05752488574685298), (british, 0.0442...","BERTopic(calculate_probabilities=False, ctfidf...",False
