# Overview

# Importing Necessary Libraries & Requirements

In [2]:
!pip install spacy
!pip install wikipedia
!pip install wikidata
!pip install SPARQLWrapper



In [3]:
# Installing packages
!pip install 'bertopic[all]' sentence-transformers

Collecting numpy>=1.20.0 (from bertopic[all])
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m136.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4


In [4]:
!pip install dateparser
!pip install transformers[torch]
!pip install sentencepiece

zsh:1: no matches found: transformers[torch]


In [5]:
import pandas as pd
import os
import spacy
import re
import numpy as np
import random
import umap
import torch
import math
import wikipedia

from typing import List, Tuple, Optional

# Importing Topic modelling libraries
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise_distances

#Importing NER libraries

#Importing Date parsing libraries
import dateparser
from transformers import pipeline

In [6]:
# Setting all seeds
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# Loading Dataset

In [7]:
# from google.colab import drive
# drive.mount('/content/drive')

In [8]:
dataset_path = 'bbc'
dataset = []

Firstly, I would combine the various text files into a list, and then a dataframe, to make processing easier

In [9]:
for category in os.listdir(dataset_path):       #loops through the items in root dataset folder
    category_path = os.path.join(dataset_path, category)       #constructs the path for each item
    if os.path.isdir(category_path):       #checks what items are directories
        for filename in os.listdir(category_path):       #loops through directories
            file_path = os.path.join(category_path, filename)       #constructs the path for each filee
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read().strip()
                dataset.append({'id':category[:3]+'_'+filename[:-4], 'text':text, 'category':category})  #Adding an identifier to each text consisting of the category and file name

In [10]:
text_df = pd.DataFrame(dataset)      # converting the resulting list to a dataframe

In [11]:
text_df

Unnamed: 0,id,text,category
0,bus_001,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,bus_003,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,bus_004,High fuel prices hit BA's profits\n\nBritish A...,business
4,bus_005,Pernod takeover talk lifts Domecq\n\nShares in...,business
...,...,...,...
2220,tec_397,BT program to beat dialler scams\n\nBT is intr...,tech
2221,tec_398,Spam e-mails tempt net shoppers\n\nComputer us...,tech
2222,tec_399,Be careful how you code\n\nA new European dire...,tech
2223,tec_400,US cyber security chief resigns\n\nThe man mak...,tech


In [12]:
text_df['category'].value_counts()

category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

# Preprocessing Text Data

First step, check raw texts for duplicated texts

In [13]:
# Identify duplicate texts
duplicate_ind = text_df[text_df.duplicated(subset=['text'], keep=False)].index.tolist()
print(duplicate_ind)

[6, 213, 214, 239, 252, 255, 257, 264, 291, 332, 355, 370, 415, 493, 512, 532, 548, 552, 555, 558, 577, 582, 583, 591, 595, 597, 605, 612, 614, 636, 638, 642, 647, 654, 667, 671, 744, 759, 770, 781, 782, 817, 837, 843, 844, 847, 859, 865, 900, 902, 916, 954, 964, 978, 1002, 1003, 1012, 1018, 1036, 1112, 1113, 1118, 1148, 1180, 1182, 1186, 1193, 1195, 1205, 1206, 1216, 1236, 1237, 1246, 1251, 1254, 1324, 1332, 1364, 1396, 1564, 1567, 1569, 1570, 1647, 1720, 1742, 1748, 1826, 1827, 1831, 1835, 1837, 1844, 1848, 1850, 1859, 1862, 1863, 1864, 1865, 1867, 1870, 1874, 1878, 1882, 1885, 1886, 1889, 1890, 1898, 1902, 1904, 1906, 1912, 1913, 1917, 1919, 1921, 1926, 1941, 1949, 1951, 1952, 1954, 1955, 1956, 1970, 1972, 1979, 1982, 1985, 1986, 1988, 1989, 1990, 1992, 1994, 1998, 1999, 2000, 2002, 2003, 2016, 2034, 2038, 2049, 2050, 2051, 2056, 2075, 2083, 2086, 2093, 2102, 2108, 2112, 2114, 2115, 2116, 2118, 2119, 2120, 2121, 2122, 2125, 2126, 2127, 2131, 2133, 2135, 2136, 2137, 2138, 2140, 2141,

In [14]:
text_df.loc[duplicate_ind].sort_values(by='text')

Unnamed: 0,id,text,category
1989,tec_166,'Brainwave' cap controls computer\n\nA team of...,tech
1988,tec_165,'Brainwave' cap controls computer\n\nA team of...,tech
954,pol_059,'Debate needed' on donations cap\n\nA cap on d...,politics
1193,pol_298,'Debate needed' on donations cap\n\nA cap on d...,politics
1003,pol_108,'Super union' merger plan touted\n\nTwo of Bri...,politics
...,...,...,...
1850,tec_027,Warning over tsunami aid website\n\nNet users ...,tech
2135,tec_312,Web radio takes Spanish rap global\n\nSpin the...,tech
1913,tec_090,Web radio takes Spanish rap global\n\nSpin the...,tech
2136,tec_313,What high-definition will do to DVDs\n\nFirst ...,tech


Now that the duplicated rows have been identified, they have to be removed

In [15]:
text_df.drop_duplicates(subset=['text'], inplace=True)

In [16]:
text_df.shape

(2127, 3)

In [17]:
nlp = spacy.load('en_core_web_sm')

#Using the stopword list from http://mlg.ucd.ie/files/datasets/stopwords.txt to preprocess the dataset

custom_stopwords = []
with open('bbc/stopwords.txt', 'r', encoding='utf-8') as file:
    for line in file:
        custom_stopwords.append(line.strip())

print(custom_stopwords)

['a', 'about', 'above', 'according', 'across', 'actually', 'adj', 'after', 'afterwards', 'again', 'all', 'almost', 'along', 'already', 'also', 'although', 'always', 'among', 'amongst', 'an', 'am', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anywhere', 'are', 'aren', "aren't", 'around', 'as', 'at', 'be', 'became', 'because', 'become', 'becomes', 'been', 'beforehand', 'begin', 'being', 'below', 'beside', 'besides', 'between', 'both', 'but', 'by', 'can', 'cannot', "can't", 'caption', 'co', 'come', 'could', 'couldn', "couldn't", 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'don', "don't", 'down', 'during', 'each', 'early', 'eg', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'etc', 'even', 'ever', 'every', 'everywhere', 'except', 'few', 'for', 'found', 'from', 'further', 'had', 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'him', 'his', 'how', 'however', 'ie', 'i

In [18]:
#Creating a function for preprocessing - lowercasing, lemmatization, stopword removal

def preprocess(text):

    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Processing with Spacy
    doc = nlp(text)

    # Lemmatization and removing stopwords using thee custom list
    tokens = [token.lemma_.lower() for token in doc if token.lemma_.lower() not in custom_stopwords]
    return ' '.join(tokens)

text_df['preprocessed_text'] = text_df['text'].apply(preprocess)

In [19]:
# Remove newline characters
text_df['preprocessed_text'] = text_df['preprocessed_text'].str.replace('\n', '')
text_df.head()

Unnamed: 0,id,text,category,preprocessed_text
0,bus_001,Ad sales boost Time Warner profit\n\nQuarterly...,business,ad sale boost time warner profit quarterly pr...
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gain greenspan speech dollar hit high ...
2,bus_003,Yukos unit buyer faces loan claim\n\nThe owner...,business,yukos unit buyer face loan claim owner embatt...
3,bus_004,High fuel prices hit BA's profits\n\nBritish A...,business,high fuel price hit bas profit british airway...
4,bus_005,Pernod takeover talk lifts Domecq\n\nShares in...,business,pernod takeover talk lift domecq share uk dri...


Checking the preprocessed text after preprocessing to remove any duplicates

In [20]:
preprocessed_ind = text_df[text_df.duplicated(subset=['preprocessed_text'], keep=False)].index.tolist()

In [21]:
text_df.loc[preprocessed_ind].sort_values(by='preprocessed_text')

Unnamed: 0,id,text,category,preprocessed_text
2042,tec_219,Disney backs Sony DVD technology\n\nA next gen...,tech,disney back sony dvd technology generation dv...
2047,tec_224,Disney backs Sony DVD technology\n\nA next gen...,tech,disney back sony dvd technology generation dv...
1978,tec_155,Games win for Blu-ray DVD format\n\nThe next-g...,tech,game win bluray dvd format nextgeneration dvd...
2117,tec_294,Games win for Blu-ray DVD format\n\nThe next-g...,tech,game win bluray dvd format nextgeneration dvd...
1644,spo_332,Harinordoquy suffers France axe\n\nNumber eigh...,sport,harinordoquy suffer france axe number eight i...
1656,spo_344,Harinordoquy suffers France axe\n\nNumber eigh...,sport,harinordoquy suffer france axe number eight i...
1781,spo_469,Moya emotional after Davis Cup win\n\nCarlos M...,sport,moya emotional davis cup win carlos moya desc...
1782,spo_470,Moya emotional at Davis Cup win\n\nCarlos Moya...,sport,moya emotional davis cup win carlos moya desc...
1871,tec_048,'No re-draft' for EU patent law\n\nA proposed ...,tech,redraft eu patent law propose european law so...
2165,tec_342,'No re-draft' for EU patent law\n\nA proposed ...,tech,redraft eu patent law propose european law so...


In [22]:
text_df.drop_duplicates(subset=['preprocessed_text'], inplace=True)

# Exploratory Data Analysis

In [23]:
# Count the number of tokens per text and assign this to
text_df['token_count'] = text_df['preprocessed_text'].apply(lambda x: len(x.split()))

In [24]:
text_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
token_count,2120.0,204.92783,117.906142,48.0,135.0,181.0,247.0,2007.0


# Sub-categorizing Main Categories

To break down the texts into sub-categories, I would make use of BERTopic

## Reusable Functions

In [25]:
def estimate_optimal_k(n_samples, min_k=10, max_k=50):
    estimated_k = int(math.sqrt(n_samples))
    return max(min_k, min(max_k, estimated_k))


In [26]:
# Creating a function to run the initial clustering of the top categories
def run_bertopic_pipeline(
    df,
    top_category = None,
    embedding_model = None,
    n_clusters = None
):
    '''
    Processes dataframe and returns 
    '''
    #Filtering the specified top category
    if top_category is not None:
        print (f" Running pipeline for {top_category} category\n\n")
        top_category_df = df[df['category'] == top_category]
    else:
        top_category_df = df

    # Converting to list for easy processing
    top_category_texts = top_category_df['preprocessed_text'].tolist()
    top_category_texts = [t for t in top_category_texts if isinstance(t, str) and t.strip()]

    # Specifying embedding model
    if embedding_model is None:
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    embeddings = embedding_model.encode(top_category_texts, batch_size=128, show_progress_bar=True)

    # Clustering (fast + scalable)
    n_samples = len(top_category_texts)
    best_k = estimate_optimal_k(n_samples)
    cluster_model = MiniBatchKMeans(n_clusters=best_k, batch_size=1024, random_state=SEED)
    initial_clusters = cluster_model.fit_predict(embeddings)
    
    # Setting up outlier detection (based on distance to assigned cluster centroid)
    centroids = cluster_model.cluster_centers_
    distances = pairwise_distances(embeddings, centroids, metric='cosine')
    assigned_distances = distances[np.arange(len(embeddings)), initial_clusters]
    
    # Mark top 5% as outliers
    threshold = np.percentile(assigned_distances, 95)
    is_outlier = assigned_distances > threshold

    # Creating an instance of BERTopic for my data
    bbc_topic_model = BERTopic(embedding_model=embedding_model,
                               language="english",
                               umap_model=None,
                               hdbscan_model=cluster_model,
                               calculate_probabilities=True,
                               verbose=True)

    # Fit data
    bbc_topic_model.fit(top_category_texts, embeddings=embeddings, y=initial_clusters)
    topics, probabilities = bbc_topic_model.transform(top_category_texts, embeddings=embeddings)

    #Reinserting outliers
    final_topics = np.where(is_outlier, -1, topics)

    # Save the model for reuse
    bbc_topic_model.save(f"{top_category}_bertopic_model", save_embedding_model=True)

    # Topic info
    topic_info = bbc_topic_model.get_topic_info()

    # Visualizing document and topic maps
    topic_documents_map = bbc_topic_model.visualize_documents(top_category_texts)
    topic_distance_map = bbc_topic_model.visualize_topics()


    # Appending topics and labels to dataframe
    top_category_df['bertopic_topic'] = final_topics
    top_category_df['bertopic_topic_label'] = top_category_df['bertopic_topic'].apply(lambda x: bbc_topic_model.get_topic(x)if x != -1 else ["Outlier"])

    return {'topic_info': topic_info,
            'topic_distance_map': topic_distance_map,
            'topic_documents_map': topic_documents_map,
            'topics': topics,
            'probabilities': probabilities,
            'bbc_topic_model': bbc_topic_model,
            'dataframe': top_category_df}

In [27]:
# Creating a function that prints the top terms and their scores for each topic

def print_top_terms(topic_info, model):
    for id in topic_info['Topic']:
        if id == -1:   # Skip the outliers
            continue
        print(f"\n--- Topic {id} ---")
        print(f" Number of docs: {model.get_topic_freq(id)}")
        print(model.get_topic(id))

In [28]:
# Creating a function that prints the representative documents for each topic to get a sense of the theme

def print_docs(topic_info, model):
    for id in topic_info['Topic']:
        if id == -1:
            continue
        print(f"\nSample docs for Topic {id}")
        docs = model.get_representative_docs(id)
        for doc in docs:
            print("-", doc[:500])  # Limit to first 500 characters


In [29]:
# Assign defined labels to outliers
def best_label(doc, keyword_map):
    text = doc.lower()
    scores = {}
    for label, kws in keyword_map.items():
        count = 0
        for kw in kws:
            # word-boundary match to avoid substrings
            if re.search(rf'\b{re.escape(kw)}\b', text):
                count += 1
        scores[label] = count
    # pick the label with the highest score
    best, best_score = max(scores.items(), key=lambda kv: kv[1])
    # Assign "Other" if no keywords matched at all
    return best if best_score > 0 else "Other"

## Business Category

In [30]:
# Run bertopic on business top category
business_results = run_bertopic_pipeline(text_df, top_category='business')

 Running pipeline for business category




Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-07-08 18:39:37,738 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-08 18:39:48,420 - BERTopic - Dimensionality - Completed ✓
2025-07-08 18:39:48,421 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-08 18:39:48,433 - BERTopic - Cluster - Completed ✓
2025-07-08 18:39:48,437 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-08 18:39:48,522 - BERTopic - Representation - Completed ✓
2025-07-08 18:39:48,627 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-07-08 18:39:48,629 - BERTopic - Dimensionality - Completed ✓
2025-07-08 18:39:48,630 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-07-08 18:39:48,631 - BERTopic - Cluster - Completed ✓


In [31]:
# Visualize intertopic distance map
business_results['topic_distance_map']

In [32]:
# Visualize documents and topics
business_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **business** category


---



In [33]:
business_df = business_results['dataframe']
business_df.head()

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label
0,bus_001,Ad sales boost Time Warner profit\n\nQuarterly...,business,ad sale boost time warner profit quarterly pr...,250,2,"[(firm, 0.01952933512924711), (company, 0.0181..."
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gain greenspan speech dollar hit high ...,212,13,"[(dollar, 0.0625803835958628), (eu, 0.05398407..."
2,bus_003,Yukos unit buyer faces loan claim\n\nThe owner...,business,yukos unit buyer face loan claim owner embatt...,158,6,"[(yukos, 0.09460043305628853), (russian, 0.056..."
3,bus_004,High fuel prices hit BA's profits\n\nBritish A...,business,high fuel price hit bas profit british airway...,249,7,"[(airline, 0.07247632342689005), (air, 0.05528..."
4,bus_005,Pernod takeover talk lifts Domecq\n\nShares in...,business,pernod takeover talk lift domecq share uk dri...,162,3,"[(drug, 0.0374912132123681), (tobacco, 0.02791..."


In [34]:
business_topic_info = business_results['topic_info']
business_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,65,0_economic_budget_economy_year,"[economic, budget, economy, year, us, governme...",[tick budget face us budget proposal lay admi...
1,1,39,1_profit_share_year_sale,"[profit, share, year, sale, barclays, 2004, eu...",[barclay share merger talk share uk banking g...
2,2,37,2_firm_company_sec_share,"[firm, company, sec, share, marsh, business, m...",[troubled marsh sec scrutiny us stock market ...
3,3,37,3_drug_tobacco_firm_company,"[drug, tobacco, firm, company, product, share,...",[us seek 280bn smoker ruling us justice depar...
4,4,31,4_share_company_firm_reliance,"[share, company, firm, reliance, bank, market,...",[nasdaq plan 100mshare sale owner technologyd...
5,5,30,5_unemployment_economy_growth_job,"[unemployment, economy, growth, job, million, ...",[business fear sluggish eu economy european l...
6,6,26,6_yukos_russian_gazprom_court,"[yukos, russian, gazprom, court, rosneft, auct...",[yukos bankruptcy us matter russian authority...
7,7,24,7_airline_air_flight_plane,"[airline, air, flight, plane, passenger, airbu...",[us probe airline travel chaos us government ...
8,8,22,8_car_sale_bmw_vehicle,"[car, sale, bmw, vehicle, year, model, gm, for...",[bmw cash fuel mini production four year mini...
9,9,22,9_rate_price_house_housing,"[rate, price, house, housing, rise, mortgage, ...",[uk interest rate hold 475 bank england leave...


In [35]:
# Printing topic top terms and their score for each topic
print_top_terms(business_topic_info, business_results['bbc_topic_model'])


--- Topic 0 ---
 Number of docs: 65
[('economic', 0.022030816663943093), ('budget', 0.021956121427491742), ('economy', 0.01825640195413451), ('year', 0.01806756981001962), ('us', 0.01749563309701419), ('government', 0.017190450537257502), ('spending', 0.014857246314365255), ('growth', 0.01443815653510424), ('debt', 0.014282905850275246), ('country', 0.013724033961836902)]

--- Topic 1 ---
 Number of docs: 39
[('profit', 0.047707926194823634), ('share', 0.031791304409214), ('year', 0.025027488464011403), ('sale', 0.023593368422629157), ('barclays', 0.023139636096857617), ('2004', 0.022307181113210513), ('euro', 0.021797307220994495), ('business', 0.01718497123182079), ('club', 0.016741887859795333), ('firm', 0.016283208070480677)]

--- Topic 2 ---
 Number of docs: 37
[('firm', 0.01952933512924711), ('company', 0.018165078263775496), ('sec', 0.01785655939587956), ('share', 0.01617028869571581), ('marsh', 0.015462174757653022), ('business', 0.01543054762861081), ('market', 0.014791429789

In [36]:
# Displaying the representative documents for each topic to get a sense of the theme
print_docs(business_topic_info, business_results['bbc_topic_model'])


Sample docs for Topic 0
- tick budget face us  budget proposal lay administration us president george w bush highly controversial washingtonbased economic policy institute tend critical president look possible fault line us politician citizen political persuasion dose shock therapy major change current policy political prejudice federal budget simply hold news coverage bush budget dominate debate spending cut fact large cut small program standpoint big fiscal trend cut gratuitous big budget train wreck direct threat fede
- giant wave damage s asia economy  government aid agency insurer travel firm count cost massive earthquake wave hammer southern asia  worsthit area sri lanka india indonesia thailand least 23000 people kill estimate world bank put amount aid need 5bn 26bn similar cash offer central america hurricane mitch mitch kill 10000 people cause damage 10bn 1998 world bank spokesman damien milverton tell wall street journal expect aid package financing debt relief  tourism vita

From the sample docs above, there is an overlap between several topics. Custom labels would be assigned

### Adding Custom Labels

Based on the representative docs above, I will manually assign sub-categories to each topic_id. 
* Topic 0,5,12, are heavily centered around budgets and economic news, but in different continents
* Topic 1 is more about company news i.e. profits, losses, etc
* Topic 2,3 ar focused on financial regulations and litigations
* Topic 3 is heavy on the tobacco industry
* Topics 6,10 are centered around the oil sector
* Topic 7
* Topic 8
* Topic 14 focuses on retail
* Topic 15,19,21 are focused on mergers and acquisitions across telecoms and sports-clubs
* Topic 16 is more on Global policies


In [37]:
topic_labels = {
    0: "Economic News", 
    1: "Company News", 
    2: "Corporate Regulation & Litigation", 
    3: "Corporate Regulation & Litigation",
    4: "Forex & Stock Markets",            
    5: "Economic News",                   
    6: "Oil Sector",                     
    7: "Airline Industry",                    
    8: "Automotive Industry",                   
    9: "Housing Market",                  
    10: "Oil Sector",                 
    11: "Corporate Regulation & Litigation",                 
    12: "Economic News",    
    13: "Forex & Stock Markets",
    14: "Retail & Consumer Spending",
    15: "Mergers and Acquisitions",
    16: "Global Policy",
    17: "Manufacturing Industry",
    18: "Economic News",
    19: "Mergers and Acquisitions",
    20: "Automotive Industry",
    21: "Mergers and Acquisitions"
}

In [38]:
# Assigning the topic labels to the topic IDs
business_results['bbc_topic_model'].set_topic_labels(topic_labels)

In [39]:
# Adding sub categories
business_df['Sub-category'] = business_df['bertopic_topic'].map(topic_labels)

In [40]:
# Visualizing top topics
business_results['bbc_topic_model'].visualize_barchart(top_n_topics=10)

### Dealing with Outliers

The pipeline had an outlier component which would need to bedealt with in this section

In [41]:
business_df.head()

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label,Sub-category
0,bus_001,Ad sales boost Time Warner profit\n\nQuarterly...,business,ad sale boost time warner profit quarterly pr...,250,2,"[(firm, 0.01952933512924711), (company, 0.0181...",Corporate Regulation & Litigation
1,bus_002,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gain greenspan speech dollar hit high ...,212,13,"[(dollar, 0.0625803835958628), (eu, 0.05398407...",Forex & Stock Markets
2,bus_003,Yukos unit buyer faces loan claim\n\nThe owner...,business,yukos unit buyer face loan claim owner embatt...,158,6,"[(yukos, 0.09460043305628853), (russian, 0.056...",Oil Sector
3,bus_004,High fuel prices hit BA's profits\n\nBritish A...,business,high fuel price hit bas profit british airway...,249,7,"[(airline, 0.07247632342689005), (air, 0.05528...",Airline Industry
4,bus_005,Pernod takeover talk lifts Domecq\n\nShares in...,business,pernod takeover talk lift domecq share uk dri...,162,3,"[(drug, 0.0374912132123681), (tobacco, 0.02791...",Corporate Regulation & Litigation


In [42]:
# Fitering out the texts with topic, -1
outlier_df = business_df[business_df['bertopic_topic'] == -1]
outlier_df.sort_values(by='token_count', ascending=False)

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label,Sub-category
242,bus_243,Making your office work for you\n\nOur mission...,business,office work mission brighten work life contin...,422,-1,[Outlier],
267,bus_268,Giving financial gifts to children\n\nYour chi...,business,give financial gift child child grandchild wa...,400,-1,[Outlier],
293,bus_294,Wall Street cheers Bush victory\n\nThe US stoc...,business,wall street cheer bush victory us stock marke...,289,-1,[Outlier],
270,bus_271,Arsenal 'may seek full share listing'\n\nArsen...,business,arsenal seek full share list arsenal vicechai...,229,-1,[Outlier],
89,bus_090,French wine gets 70m euro top-up\n\nThe French...,business,french wine get 70 m euro topup french govern...,219,-1,[Outlier],
134,bus_135,Feta cheese battle reaches court\n\nA row over...,business,feta cheese battle reach court row greece all...,205,-1,[Outlier],
399,bus_400,Monsanto fined $1.5m for bribery\n\nThe US agr...,business,monsanto fine 15 m bribery us agrochemical gi...,189,-1,[Outlier],
441,bus_442,Lesotho textile workers lose jobs\n\nSix forei...,business,lesotho textile worker lose job six foreignow...,186,-1,[Outlier],
198,bus_199,Georgia plans hidden asset pardon\n\nGeorgia i...,business,georgia plan hide asset pardon georgia offer ...,186,-1,[Outlier],
281,bus_282,Chinese dam firm 'defies Beijing'\n\nThe China...,business,chinese dam firm defy beijing china three gor...,184,-1,[Outlier],


In [43]:
for i, row in outlier_df.iterrows():
    print(f"\nDoc {i}:\n{row['text']}")


Doc 57:
Electrolux to export Europe jobs

Electrolux saw its shares rise 14% on Tuesday after it said it would be shifting more of its manufacturing to low-cost countries.

The Swedish firm, the world's largest maker of home appliances, said it is to relocate about 10 of its 27 plants in western Europe and North America. It did not say which facilities would be affected, but intends moving them to Asia, eastern Europe and Mexico. The company has two manufacturing sites in County Durham. It makes lawn and garden products in Newton Aycliffe, and cookers and ovens in Spennymoor. The Newton Aycliffe plant could also be affected by Electrolux's separate announcement that it is to spin-off its outdoor products unit into a new separate company.

Electrolux's subsidiary brands include AEG, Zanussi and Frigidaire. The company said it was speeding up its restructuring programme, which aims to save between £190m and £265m annually from 2009. "We see that about half the plants in high-cost countr

---

After reviewing the uncategorized texts, below are a few reasons why they were uncategorized

- Rare and unusual words e.g bus_373
- Bad preprocessing with all stopwords removed leaving nothing meaningful e.g bus_510
- Not thematically related e.g bus_243, bus_268

The outliers will be handled using keyword heuristics and merged into the existing set of topic labels

In [44]:
# Setting up keywords
final_keywords = {
    "Economic News": ['unemployment', 'GDP', 'economic crisis', 'fiscal', 'tax amnesty', 'interest rate', 'inflation'],
    # "Company News": ['profit', 'loss'],
    "Corporate Regulation & Litigation": ['lawsuit', 'SEC', 'fraud', 'bribe', 'bribery', 'payout', 'settlement'],
    "Forex & Stock Markets": ['currency', 'shares', 'stock exchange', 'share price'],          
    "Oil Sector": ['oil', 'crude', 'gas'],                  
    "Airline & Transport Industry": ['flight', 'rail', 'underground'],                    
    "Automotive Industry": ['car', 'automotive', 'Peugeot', 'Mitsubishi', 'minivan', 'electric vehicle'],                   
    "Housing Market": [],              
    "Retail & Consumer Spending": ['retail', 'store', 'shop', 'sale', 'discount', 'consumer spending'],
    "Mergers and Acquisitions": ['merger', 'acquisition', 'takeover', 'buyout', 'stake'],
    "Global Policy": [],
    "Manufacturing Industry": ['manufacturing', 'factory', 'plant', 'production', 'assembly']
}

In [45]:
# Assign labels to outlier texts
outlier_df['labels']  = outlier_df['preprocessed_text'].apply(lambda t: best_label(t, final_keywords))

In [46]:
outlier_df

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label,Sub-category,labels
57,bus_058,Electrolux to export Europe jobs\n\nElectrolux...,business,electrolux export europe job electrolux see s...,107,-1,[Outlier],,Manufacturing Industry
89,bus_090,French wine gets 70m euro top-up\n\nThe French...,business,french wine get 70 m euro topup french govern...,219,-1,[Outlier],,Retail & Consumer Spending
119,bus_120,Nigeria to boost cocoa production\n\nThe gover...,business,nigeria boost cocoa production government nig...,169,-1,[Outlier],,Manufacturing Industry
122,bus_123,Train strike grips Buenos Aires\n\nA strike on...,business,train strike grip buenos aires strike buenos ...,100,-1,[Outlier],,Airline & Transport Industry
134,bus_135,Feta cheese battle reaches court\n\nA row over...,business,feta cheese battle reach court row greece all...,205,-1,[Outlier],,Manufacturing Industry
166,bus_167,Hariri killing hits Beirut shares\n\nShares in...,business,hariri killing hit beirut share share solider...,122,-1,[Outlier],,Forex & Stock Markets
198,bus_199,Georgia plans hidden asset pardon\n\nGeorgia i...,business,georgia plan hide asset pardon georgia offer ...,186,-1,[Outlier],,Economic News
206,bus_207,EMI shares hit by profit warning\n\nShares in ...,business,emi share hit profit warn share music giant e...,183,-1,[Outlier],,Forex & Stock Markets
234,bus_235,Pension hitch for long-living men\n\nMale life...,business,pension hitch longlive man male life expectan...,124,-1,[Outlier],,Other
242,bus_243,Making your office work for you\n\nOur mission...,business,office work mission brighten work life contin...,422,-1,[Outlier],,Other


In [47]:
# Update the main business_df
business_df.loc[outlier_df.index, 'Sub-category'] = outlier_df['labels']

## Entertainment Category

In [48]:
# Run bertopic on Entertainment top category
ent_results = run_bertopic_pipeline(text_df, top_category='entertainment')

 Running pipeline for entertainment category




Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2025-07-08 18:40:27,483 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-08 18:40:27,846 - BERTopic - Dimensionality - Completed ✓
2025-07-08 18:40:27,847 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-08 18:40:27,859 - BERTopic - Cluster - Completed ✓
2025-07-08 18:40:27,862 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-08 18:40:27,929 - BERTopic - Representation - Completed ✓
2025-07-08 18:40:28,007 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-07-08 18:40:28,009 - BERTopic - Dimensionality - Completed ✓
2025-07-08 18:40:28,009 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-07-08 18:40:28,011 - BERTopic - Cluster - Completed ✓


In [49]:
# Visualize intertopic distance map
ent_results['topic_distance_map']

In [50]:
# Visualize documents and topics
ent_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **entertainment** category

---

In [51]:
ent_df = ent_results['dataframe']
ent_df.head()

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label
510,ent_001,Gallery unveils interactive tree\n\nA Christma...,entertainment,gallery unveil interactive tree christmas tre...,104,15,"[(van, 0.05534471669753969), (gallery, 0.05176..."
511,ent_002,Jarre joins fairytale celebration\n\nFrench mu...,entertainment,jarre join fairytale celebration french music...,151,-1,[Outlier]
512,ent_003,Musical treatment for Capra film\n\nThe classi...,entertainment,musical treatment capra film classic film won...,104,18,"[(broadway, 0.10063500771711625), (musical, 0...."
513,ent_004,Richard and Judy choose top books\n\nThe 10 au...,entertainment,richard judy choose top book 10 author shortl...,126,12,"[(book, 0.07654611930571634), (novel, 0.039680..."
514,ent_005,Poppins musical gets flying start\n\nThe stage...,entertainment,poppin musical get fly start stage adaptation...,113,17,"[(ballet, 0.12936490629717246), (poppins, 0.06..."


In [52]:
ent_topic_info = ent_results['topic_info']
ent_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,40,0_film_good_award_win,"[film, good, award, win, director, actor, avia...",[aviator win top globe accolade aviator name ...
1,1,38,1_show_tv_bbc_series,"[show, tv, bbc, series, comedy, channel, telev...",[jungle tv show rating drop 4 m finale itv1s ...
2,2,29,2_box_film_office_take,"[box, film, office, take, movie, weekend, year...",[horror film head us box office lowbudget hor...
3,3,27,3_album_music_urban_number,"[album, music, urban, number, good, artist, bl...",[outkast win mtv europe awards us hiphop duo ...
4,4,26,4_song_music_band_good,"[song, music, band, good, radio, year, 25, rec...",[york rocker top talent poll york electrorock...
5,5,25,5_show_tv_us_jackson,"[show, tv, us, jackson, forensic, programme, f...",[us tv cut nudity bbc film us tv network edit...
6,6,24,6_film_star_french_direct,"[film, star, french, direct, role, play, tauto...",[johnny depp act outlaw johnny depp role pete...
7,7,22,7_music_download_chart_sale,"[music, download, chart, sale, industry, us, u...",[help indie download sale campaign launch hel...
8,8,20,8_ticket_band_concert_festival,"[ticket, band, concert, festival, hendrix, roc...",[rock band u2 break ticket record u2 smash ir...
9,9,18,9_court_police_singer_claim,"[court, police, singer, claim, assault, jail, ...",[rapper snoop dogg sue rape us rapper snoop d...


In [53]:
# Printing topic top terms and their score for each topic
print_top_terms(ent_topic_info, ent_results['bbc_topic_model'])


--- Topic 0 ---
 Number of docs: 40
[('film', 0.03443928942447291), ('good', 0.033391653031985095), ('award', 0.031132840393119372), ('win', 0.028871798733363933), ('director', 0.02742335014825059), ('actor', 0.02690578092377688), ('aviator', 0.022879160195087518), ('oscars', 0.022514032415833815), ('oscar', 0.020545902821335647), ('swank', 0.01875171802715664)]

--- Topic 1 ---
 Number of docs: 38
[('show', 0.05432537324656398), ('tv', 0.0369572736869857), ('bbc', 0.033861958441640604), ('series', 0.03151540848432888), ('comedy', 0.026789774495162438), ('channel', 0.024170592547017693), ('television', 0.020945420351011212), ('celebrity', 0.020596929320804924), ('star', 0.01862033846532932), ('gervais', 0.01834851408162288)]

--- Topic 2 ---
 Number of docs: 29
[('box', 0.046756163217816775), ('film', 0.04584879034974272), ('office', 0.04526624902086461), ('take', 0.03145722079149226), ('movie', 0.025175828501582127), ('weekend', 0.025165619985450416), ('year', 0.024461361521762866), 

In [54]:
# Displaying the representative documents for each topic to get a sense of the theme
print_docs(ent_topic_info, ent_results['bbc_topic_model'])


Sample docs for Topic 0
- aviator win top globe accolade  aviator name good film golden globe awards star leonardo dicaprio name good actor  hollywood veteran clint eastwood take good director prize million dollar baby star hilary swank good actress quirky comedy sideways name good screenplay good comedy ray star jamie foxx good actor musicalcomedy briton clive owen natalie portman win prize good supporting role close  aviator dicaprio play millionaire howard hughes edge ahead rival beverly hills ceremony win good origin
- critic back aviator oscars  martin scorseses aviator win good film oscars accord uks lead movie critic  survey bbc news website think veteran filmmaker lose good director prize clint eastwood critic tip jamie foxx hilary swank scoop good actor actress ray million dollar baby respectively jury comprise expert critic top uk film publication panel reveal nominee personally prefer win  expect aviator win good film think close race scorseses howard hughes biopic eastwood

### Adding Custom Labels

In [55]:
ent_labels = {
    0: "Cinema", 
    1: "Television", 
    2: "Cinema", 
    3: "Music",
    4: "Music",            
    5: "Television",                   
    6: "Cinema",                     
    7: "Music",                    
    8: "Music",                   
    9: "Celebrity News",                  
    10: "Music",                 
    11: "Cinema",                 
    12: "Literature",    
    13: "Cinema",
    14: "Cinema",
    15: "Cinema",
    16: "Theatre & Dance",
    17: "Theatre & Dance",
    18: "Music"
}

In [56]:
# Assigning the topic labels to the topic IDs
ent_results['bbc_topic_model'].set_topic_labels(ent_labels)

In [57]:
# Adding sub categories
ent_df['Sub-category'] = ent_df['bertopic_topic'].map(ent_labels)

In [58]:
# Visualizing top topics
ent_results['bbc_topic_model'].visualize_barchart(top_n_topics=10)

### Dealing with Outliers

In [59]:
#Check to see if there are any outliers
ent_outliers = ent_df[ent_df['bertopic_topic'] == -1]
ent_outliers.sort_values(by='token_count', ascending=False)

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label,Sub-category
862,ent_353,Roundabout continues nostalgia trip\n\nThe new...,entertainment,roundabout continue nostalgia trip bigscreen ...,1155,-1,[Outlier],
722,ent_213,TV show unites Angolan families\n\nAngolan fam...,entertainment,tv show unite angolan family angolan family a...,289,-1,[Outlier],
723,ent_214,Volcano drama erupts on BBC One\n\nSupervolcan...,entertainment,volcano drama erupt bbc one supervolcano docu...,288,-1,[Outlier],
885,ent_376,France set for new Da Vinci novel\n\nFrench bo...,entertainment,france set da vinci novel french bookseller b...,174,-1,[Outlier],
519,ent_010,Uganda bans Vagina Monologues\n\nUganda's auth...,entertainment,uganda ban vagina monologues ugandas authorit...,169,-1,[Outlier],
737,ent_228,Joy Division story to become film\n\nThe life ...,entertainment,joy division story film life joy division sin...,168,-1,[Outlier],
511,ent_002,Jarre joins fairytale celebration\n\nFrench mu...,entertainment,jarre join fairytale celebration french music...,151,-1,[Outlier],
563,ent_054,Mumbai bombs movie postponed\n\nThe release of...,entertainment,mumbai bomb movie postpone release film mumba...,146,-1,[Outlier],
681,ent_172,Eurovision 'greats' to do battle\n\nStars of t...,entertainment,eurovision great battle star eurovision song ...,126,-1,[Outlier],
678,ent_169,US actor 'found with gun residue'\n\nActor Rob...,entertainment,us actor find gun residue actor robert blake ...,126,-1,[Outlier],


In [60]:
for i, row in ent_outliers.iterrows():
    print(f"\nDoc {i}:\n{row['text']}")


Doc 511:
Jarre joins fairytale celebration

French musician Jean-Michel Jarre is to perform at a concert in Copenhagen to mark the bicentennial of the birth of writer Hans Christian Andersen.

Denmark is holding a three-day celebration of the life of the fairy-tale author, with a concert at Parken stadium on 2 April. Other stars are expected to join the line-up in the coming months, and the Danish royal family will attend. "Christian Andersen's fairy tales are timeless and universal," said Jarre. "For all of us, at any age there is always - beyond the pure enjoyment of the tale - a message to learn." There are year-long celebrations planned across the world to celebrate Andersen and his work, which includes The Emperor's New Clothes and The Little Mermaid. Denmark's Crown Prince Frederik and Crown Princess Mary visited New York on Monday to help promote the festivities. The pair were at a Manhattan library to honour US literary critic Harold Bloom "the international icon we thought we

In [61]:
ent_keywords = {
    'Cinema': ['screened', 'movie', 'film'], 
    'Theatre & Dance': ['play', 'book', 'stage'], 
    'Literature':['author', 'novel'], 
    'Television': ['tv show', 'television', 'series', 'tv'],
    'Celebrity News': ['trial', 'crime', 'divorce', 'news', 'controversy'], 
    'Music': ['jazz', 'musician', 'tape recordings', 'saxophone', 'song', 'sing', 'music', 'composition']
}

In [62]:
# Assign labels to outlier texts
ent_outliers['labels']  = ent_outliers['preprocessed_text'].apply(lambda t: best_label(t, ent_keywords))

In [63]:
ent_outliers

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label,Sub-category,labels
511,ent_002,Jarre joins fairytale celebration\n\nFrench mu...,entertainment,jarre join fairytale celebration french music...,151,-1,[Outlier],,Literature
519,ent_010,Uganda bans Vagina Monologues\n\nUganda's auth...,entertainment,uganda ban vagina monologues ugandas authorit...,169,-1,[Outlier],,Theatre & Dance
563,ent_054,Mumbai bombs movie postponed\n\nThe release of...,entertainment,mumbai bomb movie postpone release film mumba...,146,-1,[Outlier],,Cinema
673,ent_164,Parker's saxophone heads auction\n\nA saxophon...,entertainment,parker saxophone head auction saxophone belon...,111,-1,[Outlier],,Music
678,ent_169,US actor 'found with gun residue'\n\nActor Rob...,entertainment,us actor find gun residue actor robert blake ...,126,-1,[Outlier],,Celebrity News
681,ent_172,Eurovision 'greats' to do battle\n\nStars of t...,entertainment,eurovision great battle star eurovision song ...,126,-1,[Outlier],,Music
686,ent_177,Hillbillies singer Scoggins dies\n\nCountry an...,entertainment,hillbilly singer scoggin die country western ...,91,-1,[Outlier],,Music
722,ent_213,TV show unites Angolan families\n\nAngolan fam...,entertainment,tv show unite angolan family angolan family a...,289,-1,[Outlier],,Television
723,ent_214,Volcano drama erupts on BBC One\n\nSupervolcan...,entertainment,volcano drama erupt bbc one supervolcano docu...,288,-1,[Outlier],,Cinema
737,ent_228,Joy Division story to become film\n\nThe life ...,entertainment,joy division story film life joy division sin...,168,-1,[Outlier],,Cinema


In [64]:
# Update the main business_df
ent_df.loc[ent_outliers.index, 'Sub-category'] = ent_outliers['labels']
ent_df

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label,Sub-category
510,ent_001,Gallery unveils interactive tree\n\nA Christma...,entertainment,gallery unveil interactive tree christmas tre...,104,15,"[(van, 0.05534471669753969), (gallery, 0.05176...",Cinema
511,ent_002,Jarre joins fairytale celebration\n\nFrench mu...,entertainment,jarre join fairytale celebration french music...,151,-1,[Outlier],Literature
512,ent_003,Musical treatment for Capra film\n\nThe classi...,entertainment,musical treatment capra film classic film won...,104,18,"[(broadway, 0.10063500771711625), (musical, 0....",Music
513,ent_004,Richard and Judy choose top books\n\nThe 10 au...,entertainment,richard judy choose top book 10 author shortl...,126,12,"[(book, 0.07654611930571634), (novel, 0.039680...",Literature
514,ent_005,Poppins musical gets flying start\n\nThe stage...,entertainment,poppin musical get fly start stage adaptation...,113,17,"[(ballet, 0.12936490629717246), (poppins, 0.06...",Theatre & Dance
...,...,...,...,...,...,...,...,...
891,ent_382,Last Star Wars 'not for children'\n\nThe sixth...,entertainment,star wars child sixth final star wars movie s...,117,6,"[(film, 0.05982710128499842), (star, 0.0322897...",Cinema
892,ent_383,French honour for director Parker\n\nBritish f...,entertainment,french honour director parker british film di...,123,14,"[(film, 0.07724205032802749), (festival, 0.071...",Cinema
893,ent_384,Robots march to US cinema summit\n\nAnimated m...,entertainment,robot march us cinema summit animate movie ro...,173,2,"[(box, 0.046756163217816775), (film, 0.0458487...",Cinema
894,ent_385,Hobbit picture 'four years away'\n\nLord of th...,entertainment,hobbit picture four year away lord rings dire...,142,2,"[(box, 0.046756163217816775), (film, 0.0458487...",Cinema


## Politics Category

In [65]:
# Run bertopic on Politics top category
politics_results = run_bertopic_pipeline(text_df, 'politics')

 Running pipeline for politics category




Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-07-08 18:41:03,007 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-08 18:41:03,337 - BERTopic - Dimensionality - Completed ✓
2025-07-08 18:41:03,339 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-08 18:41:03,349 - BERTopic - Cluster - Completed ✓
2025-07-08 18:41:03,351 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-08 18:41:03,428 - BERTopic - Representation - Completed ✓
2025-07-08 18:41:03,531 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-07-08 18:41:03,533 - BERTopic - Dimensionality - Completed ✓
2025-07-08 18:41:03,534 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-07-08 18:41:03,535 - BERTopic - Cluster - Completed ✓


In [66]:
# Visualize intertopic distance map
politics_results['topic_distance_map']

In [67]:
# Visualize documents and topics
politics_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **politics** category

---

In [68]:
politics_df = politics_results['dataframe']
politics_df.head()

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label
896,pol_001,Labour plans maternity pay rise\n\nMaternity p...,politics,labour plan maternity pay rise maternity pay ...,231,3,"[(wage, 0.029054999250886494), (pay, 0.0277003..."
897,pol_002,Watchdog probes e-mail deletions\n\nThe inform...,politics,watchdog probe email deletion information com...,188,1,"[(howard, 0.038541672129872036), (tory, 0.0261..."
898,pol_003,Hewitt decries 'career sexism'\n\nPlans to ext...,politics,hewitt decry career sexism plan extend pay ma...,280,3,"[(wage, 0.029054999250886494), (pay, 0.0277003..."
899,pol_004,Labour chooses Manchester\n\nThe Labour Party ...,politics,labour choose manchester labour party hold 20...,130,2,"[(election, 0.03181659734051623), (party, 0.03..."
900,pol_005,Brown ally rejects Budget spree\n\nChancellor ...,politics,brown ally reject budget spree chancellor gor...,255,0,"[(blair, 0.03954442833347343), (brown, 0.03573..."


In [69]:
politics_topic_info = politics_results['topic_info']
politics_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,56,0_blair_brown_labour_prime,"[blair, brown, labour, prime, election, minist...",[labour seek quell feud talk labours leadersh...
1,1,49,1_howard_tory_labour_government,"[howard, tory, labour, government, tories, tax...",[tory leader unveil spending plan tory leader...
2,2,38,2_election_party_labour_blair,"[election, party, labour, blair, blackpool, mi...",[tory expert deny defeat warn conservatives c...
3,3,36,3_wage_pay_increase_work,"[wage, pay, increase, work, government, job, m...",[talk aim avert pension strike talks aim aver...
4,4,30,4_blair_terrorist_forsyth_threat,"[blair, terrorist, forsyth, threat, arrest, go...",[election terror target terrorist try target ...
5,5,27,5_party_lib_kennedy_tax,"[party, lib, kennedy, tax, dems, election, lab...",[taxis trust kennedy public trust taxis bre...
6,6,20,6_school_sport_education_university,"[school, sport, education, university, ballot,...",[school sport back pm tony blair promise spor...
7,7,19,7_aid_world_poverty_africa,"[aid, world, poverty, africa, country, brown, ...",[brown call 55bn aids fund gordon brown call ...
8,8,18,8_lord_lords_answer_advice,"[lord, lords, answer, advice, reform, attorney...",[iraq advice claim spark row tories minister ...
9,9,15,9_asylum_health_immigration_uk,"[asylum, health, immigration, uk, people, refu...",[tory plan migrant health check noneuropean u...


In [70]:
# Printing topic top terms and their score for each topic
print_top_terms(politics_topic_info, politics_results['bbc_topic_model'])


--- Topic 0 ---
 Number of docs: 56
[('blair', 0.03954442833347343), ('brown', 0.03573684657264444), ('labour', 0.032526886173224634), ('prime', 0.031568208836281954), ('election', 0.029535750283013706), ('minister', 0.027025120609136918), ('chancellor', 0.025825568794953138), ('tell', 0.018547294650560823), ('campaign', 0.018017686663369534), ('claim', 0.016105226617050675)]

--- Topic 1 ---
 Number of docs: 49
[('howard', 0.038541672129872036), ('tory', 0.026159120815791977), ('labour', 0.024977230940056536), ('government', 0.021460449251533134), ('tories', 0.020337875237645422), ('tax', 0.0203150811819474), ('election', 0.018528998331585587), ('party', 0.018002236852680605), ('plan', 0.017915354620144826), ('people', 0.01754329159715265)]

--- Topic 2 ---
 Number of docs: 38
[('election', 0.03181659734051623), ('party', 0.03157685825284511), ('labour', 0.02387959845950664), ('blair', 0.021425869384096425), ('blackpool', 0.0208216896011875), ('minister', 0.018551821196726513), ('tor

In [71]:
# Displaying the representative documents for each topic to get a sense of the theme
print_docs(politics_topic_info, politics_results['bbc_topic_model'])


Sample docs for Topic 0
- labour seek quell feud talk  labours leadership put show unity campaign poster launch mps criticise tony blair gordon brown report rift  brown join launch john prescott alan milburn man controversially put charge election planning blair private meeting monday see normally loyal mp warn feuding jeopardise election hope follow book chart dispute prime minister chancellor  event first time milburn share platform chancellor take browns traditional poll planning role pair chat amicably brown insist h
- blair face mp amid feud talk  tony blair face first prime minister question 2005 week renew speculation relationship gordon brown  chancellor leave britain highprofile tour africa highlight poverty issue before insist trust blair despite claim contrary book labour mp warn against disunity tory leader michael howard take theme commons tories accuse prime minister chancellor behave schoolboy squabble playground  michael howard want capitalise far spat go headtohead pr

### Adding Custom Labels

In [72]:
pol_labels = {
    0: "Domestic Politics",
    1: "Economic Issues",
    2: "Domestic Politics",
    3: "Security Measures",
    4: "Domestic Politics",
    5: "Economic Issues",
    6: "Domestic Politics",
    7: "Social Policy",
    8: "Foreign Affairs",
    9: "Foreign Affairs",
    10: "Immigration",
    11: "Legal Reform",
    12: "Security Measures",
    13: "Social Policy",
    14: "Domestic Politics",
    15: "Legal Reform",
    16: "Legal Reform",
    17: "Security Measures",
    18: "Social Policy",
    19: "Foreign Affairs"
}

In [73]:
# Assigning the topic labels to the topic IDs
politics_results['bbc_topic_model'].set_topic_labels(pol_labels)

In [74]:
# Adding sub categories
politics_df['Sub-category'] = politics_df['bertopic_topic'].map(pol_labels)

In [75]:
# Visualizing top topics
politics_results['bbc_topic_model'].visualize_barchart(top_n_topics=10)

### Dealing with Outliers

In [76]:
#Check to see if there are any outliers
pol_outliers = politics_df[politics_df['bertopic_topic'] == -1]
pol_outliers.sort_values(by='token_count', ascending=False)

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label,Sub-category
1139,pol_244,Research fears over Kelly's views\n\nScientist...,politics,research fear kellys view scientist express c...,245,-1,[Outlier],
928,pol_033,Protesters plan airport challenge\n\nCampaigne...,politics,protester plan airport challenge campaigner a...,234,-1,[Outlier],
1033,pol_138,EU fraud clampdown urged\n\nEU member states a...,politics,eu fraud clampdown urge eu member state fail ...,228,-1,[Outlier],
1095,pol_200,Mallon wades into NE vote battle\n\nMiddlesbro...,politics,mallon wade ne vote battle middlesbrough mayo...,223,-1,[Outlier],
955,pol_060,Cardinal criticises Iraq war cost\n\nBillions ...,politics,cardinal criticise iraq war cost billion poun...,200,-1,[Outlier],
981,pol_086,Howard backs stem cell research\n\nMichael How...,politics,howard back stem cell research michael howard...,200,-1,[Outlier],
1264,pol_369,BAA support ahead of court battle\n\nUK airpor...,politics,baa support ahead court battle uk airport ope...,196,-1,[Outlier],
1041,pol_146,'Nuclear dumpsite' plan attacked\n\nPlans to a...,politics,nuclear dumpsite plan attack plan allow forei...,194,-1,[Outlier],
918,pol_023,E-University 'disgraceful waste'\n\nA failed g...,politics,euniversity disgraceful waste fail government...,178,-1,[Outlier],
1057,pol_162,"Protect whistleblowers, TUC says\n\nThe govern...",politics,protect whistleblower tuc government change l...,167,-1,[Outlier],


In [77]:
politics_df['Sub-category'].unique()

array(['Security Measures', 'Economic Issues', 'Domestic Politics',
       'Foreign Affairs', nan, 'Social Policy', 'Legal Reform',
       'Immigration'], dtype=object)

In [78]:
for i, row in pol_outliers.iterrows():
    print(f"\nDoc {i}:\n{row['text']}")


Doc 910:
Talks held on Gibraltar's future

Two days of talks on the future of Gibraltar begin at Jack Straw's country residence later on Wednesday.

Officials at the two-day summit at the foreign secretary's official Kent house, Chevening, will plan a new forum on the Rock's future. In October, Mr Straw and his Spanish counterpart Miguel Moratinos agreed to establish a body that would give Gibraltarians a voice in their future. Most Gibraltarians said in a referendum they wanted to remain British.

Gibraltar's Chief Minister Peter Caruana will represent the British citizens living on the Rock, while Britain's Europe Director Dominick Chilcott will represent the UK. Madrid is being represented by Spain's director general for Europe, Jose Maria Pons. The initiative follows Spain's socialist government's decision to put its long-standing sovereignty ambitions on hold. Gibraltarians rejected plans for the Rock's sovereignty to be shared between Britain and Spain in a referendum organised 

In [79]:
pol_keywords = {
    'Economic Issues': ['budget', 'fiscal'], 
    'Social Policy': ['religious', 'inequality', 'protest', 'children', 'union', 'education'], 
    'Domestic Politics': ['campaign', 'manifesto', 'commons', 'democrats'],
    'Foreign Affairs': ['foreign secretary', 'eu', 'treaty'], 
    'Security Measures': ['terrorist', 'anti-terror'], 
    'Legal Reform': ['legislation'],
    'Immigration': ['migrant']
}

In [80]:
# Assign labels to outlier texts
pol_outliers['labels']  = pol_outliers['preprocessed_text'].apply(lambda t: best_label(t, pol_keywords))

In [81]:
pol_outliers

Unnamed: 0,id,text,category,preprocessed_text,token_count,bertopic_topic,bertopic_topic_label,Sub-category,labels
910,pol_015,Talks held on Gibraltar's future\n\nTwo days o...,politics,talk hold gibraltars future two day talk futu...,94,-1,[Outlier],,Other
913,pol_018,Straw to attend Auschwitz service\n\nForeign S...,politics,straw attend auschwitz service foreign secret...,149,-1,[Outlier],,Foreign Affairs
918,pol_023,E-University 'disgraceful waste'\n\nA failed g...,politics,euniversity disgraceful waste fail government...,178,-1,[Outlier],,Social Policy
928,pol_033,Protesters plan airport challenge\n\nCampaigne...,politics,protester plan airport challenge campaigner a...,234,-1,[Outlier],,Legal Reform
952,pol_057,England children's tsar appointed\n\nThe first...,politics,england children tsar appoint first children ...,116,-1,[Outlier],,Social Policy
955,pol_060,Cardinal criticises Iraq war cost\n\nBillions ...,politics,cardinal criticise iraq war cost billion poun...,200,-1,[Outlier],,Domestic Politics
981,pol_086,Howard backs stem cell research\n\nMichael How...,politics,howard back stem cell research michael howard...,200,-1,[Outlier],,Other
1003,pol_108,'Super union' merger plan touted\n\nTwo of Bri...,politics,super union merger plan tout two britains big...,95,-1,[Outlier],,Social Policy
1018,pol_123,MPs issued with Blackberry threat\n\nMPs will ...,politics,mp issue blackberry threat mp throw commons b...,77,-1,[Outlier],,Domestic Politics
1033,pol_138,EU fraud clampdown urged\n\nEU member states a...,politics,eu fraud clampdown urge eu member state fail ...,228,-1,[Outlier],,Domestic Politics


In [82]:
# Update the main business_df
pol_df.loc[pol_outliers.index, 'Sub-category'] = pol_outliers['labels']
pol_df

NameError: name 'pol_df' is not defined

## Sports Category

In [None]:
# Run bertopic on Sports top category
sport_results = run_bertopic_pipeline(text_df, 'sport')

In [None]:
# Visualize intertopoc distance map
sport_results['topic_distance_map']

In [None]:
# Visualize documents and topics
sport_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **sport** category

---

In [None]:
sport_df = sport_results['dataframe']
sport_df.head()

In [None]:
sport_topic_info = sport_results['topic_info']
sport_topic_info

In [None]:
# Printing topic top terms and their score for each topic
print_top_terms(sport_topic_info, sport_results['bbc_topic_model'])

In [None]:
# Displaying the representative documents for each topic to get a sense of the theme
print_docs(sport_topic_info, sport_results['bbc_topic_model'])

### Adding Custom Labels

In [None]:
sport_labels = {
    0: 'Football',
    1: 'Athletics (Track & Field)',
    2: 'Rugby',
    3: 'Football',
    4: 'Tennis',
    5: 'Athletics (Track & Field)',
    6: 'Athletics (Track & Field)',
    7: 'Tennis',
    8: 'Rugby',
    9: 'Football',
    10: 'Rugby',
    11: 'Football',
    12: 'Tennis',
    13: 'Rugby',
    14: 'Football',
    15: 'Football',
    16: 'Rugby',
    17: 'Football',
    18: 'Rugby',
    19: 'Rugby',
    20: 'Football',
    21: 'Tennis'
}

In [None]:
# Assigning the topic labels to the topic IDs
sport_results['bbc_topic_model'].set_topic_labels(sport_labels)

In [None]:
# Adding sub categories
sport_df['Sub-category'] = sport_df['bertopic_topic'].map(sport_labels)

In [None]:
# Visualizing top topics
sport_results['bbc_topic_model'].visualize_barchart(top_n_topics=10)

### Dealing with Outliers

In [None]:
#Check to see if there are any outliers
sport_outliers = sport_df[sport_df['bertopic_topic'] == -1]
sport_outliers.sort_values(by='token_count', ascending=False)

In [None]:
for i, row in sport_outliers.iterrows():
    print(f"\nDoc {i}:\n{row['text']}")

In [None]:
# Setting up keywords
sport_keywords = {
    'Rugby': ['six nation', 'lions'],
    'Football': ['uefa', 'midfielder', 'arsenal', 'manchester united', 'football'],
    'Athletics (Track & Field)': ['marathon', 'athletics', ],
    'Tennis': ['grand slam', 'tennis']
}

In [None]:
# Assign labels to outlier texts
sport_outliers['labels']  = sport_outliers['preprocessed_text'].apply(lambda t: best_label(t, sport_keywords))

In [None]:
sport_outliers

In [None]:
# Update the main business_df
sport_df.loc[sport_outliers.index, 'Sub-category'] = sport_outliers['labels']

## Tech Category

In [None]:
# Run bertopic on business top category
tech_results = run_bertopic_pipeline(text_df, 'tech')

In [None]:
# Visualize intertopoc distance map
tech_results['topic_distance_map']

In [None]:
# Visualize documents and topics
tech_results['topic_documents_map']

Now let us inspect the resulting topics and labels for the **tech** category

---

In [None]:
tech_df = tech_results['dataframe']
tech_df.head()

In [None]:
tech_topic_info = tech_results['topic_info']
tech_topic_info

In [None]:
# Printing topic top terms and their score for each topic
print_top_terms(tech_topic_info, tech_results['bbc_topic_model'])

In [None]:
# Displaying the representative documents for each topic to get a sense of the theme
print_docs(tech_topic_info, tech_results['bbc_topic_model'])

### Adding Custom Labels

In [None]:
tech_labels = {
    0: "Mobile Devices",
    1: "Gaming",
    2: "Legal",
    3: "Gadgets",
    4: "Web",
    5: "Media",
    6: "Cybersecurity",
    7: "Web",
    8: "Gaming",
    9: "Networking",
    10: "Cybersecurity",
    11: "Cybersecurity",
    12: "Software",
    13: "Software",
    14: "Networking",
    15: "Hardware",
    16: "Legal",
    17: "Gadgets",  
}

In [None]:
# Assigning the topic labels to the topic IDs
tech_results['bbc_topic_model'].set_topic_labels(tech_labels)

In [None]:
# Adding sub categories
tech_df['Sub-category'] = tech_df['bertopic_topic'].map(tech_labels)

In [None]:
# Visualizing top topics
tech_results['bbc_topic_model'].visualize_barchart(top_n_topics=10)

### Dealing with Outliers

In [None]:
#Check to see if there are any outliers
tech_outliers = tech_df[tech_df['bertopic_topic'] == -1]
tech_outliers.sort_values(by='token_count', ascending=False)

In [None]:
for i, row in tech_outliers.iterrows():
    print(f"\nDoc {i}:\n{row['text']}")

In [None]:
tech_keywords = {
    'Cybersecurity': ['phish', 'cyber attack', 'malware', 'virus'], 
    'Gadgets': ['power cable'],
    'Web': ['search', 'engine', 'blog'], 
    'Media': ['television', 'radio'], 
    'Software': ['software'], 
    'Legal': ['sue', 'court', 'legal action'],
    'Mobile Devices': ['mobile', 'phone'], 
    'Hardware': ['chip', 'intel', 'micro processor'], 
    'Networking': ['broaband', 'wifi'], 
    'Gaming': ['console', 'psp', 'playstation']
}

In [None]:
# Assign labels to outlier texts
tech_outliers['labels']  = tech_outliers['preprocessed_text'].apply(lambda t: best_label(t, tech_keywords))

In [None]:
tech_outliers

In [None]:
# Update the main business_df
tech_df.loc[tech_outliers.index, 'Sub-category'] = tech_outliers['labels']

# Named Entity Recognition

The purpose of this section is to identify named entities in the documents and identify their jobs

In [None]:
# Define jb categories and common indicative keywords
job_keywords = {
    "Politician": ["politician", "senator", "president", "minister", "governor", "mp", "congress"],
    "TV/Film Personality": ["actor", "actress", "director", "producer", "television", "host", "filmmaker"],
    "Musician": ["singer", "songwriter", "musician", "composer", "band", "rapper", "dj"]
}

In [None]:
# Setup function to extract the entities
def extract_persons(text: str, nlp) -> List[str]:
    """Extract PERSON entities from text using spaCy model."""
    doc = nlp(text)
    return list({ent.text for ent in doc.ents if ent.label_ == "PERSON"})

In [None]:
# Set up function to classify the jobs of the entities
def classify_person(name: str) -> Optional[str]:
    """Classify entities using Wikipedia."""
    search_results = wikipedia.search(name)
    try:
        if search_results:
            summary = wikipedia.summary(search_results[0], sentences=1, auto_suggest=False)
        else:
            return None
    except Exception:
        return None
        
    summary_lower = summary.lower()
    for category, keywords in job_keywords.items():
        if any(kw in summary_lower for kw in keywords):
            return category
        return summary_lower

In [None]:
xx = text_df['text'].apply(lambda x: extract_persons(x, nlp))

In [None]:
from wikidataclient import Client

client = Client()

def occupations_from_name(name: str) -> list[str]:
    entity = client.get(name, load=True)  # fuzzy matches the title
    # entity['claims']['P106'] is a list of Claim objects
    return [claim.target.text for claim in entity.claims.get('P106', [])]

print(occupations_from_name("Lady Gaga"))

In [None]:
[classify_person(x) for x in xx[1]]

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

def fetch_occupations_sparql(names: list[str]) -> dict[str, list[str]]:
    """
    Given a list of person names, find their Wikidata Q-IDs via SPARQL
    and return a mapping name → [occupation labels].
    """
    # Build a VALUES block matching on the rdfs:label (case-insensitive)
    values = "\n".join(
        f'  ( "{name}"@en )' for name in names
    )
    query = f"""
    SELECT ?name ?occupationLabel WHERE {{
      VALUES (?name) {{ 
    {values}
      }}
      ?person rdfs:label ?name ;
              wdt:P106 ?occupation .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()["results"]["bindings"]

    occ_map = {name: [] for name in names}
    for row in results:
        name = row["name"]["value"]
        occ  = row["occupationLabel"]["value"]
        occ_map[name].append(occ)
    return occ_map

# Example usage:
names = ["Barack Obama", "Lady Gaga", "Michael Jackson"]
occ_map = fetch_occupations_sparql(names)
for person, occs in occ_map.items():
    print(f"{person}: {occs or ['Unknown']}")


In [None]:
name = wikipedia.search('Michael Jackson')

In [None]:
def analyze_documents(docs: List[str]) -> List[Tuple[str, Optional[str]]]:
    """
    Process list of documents, extract and classify media personalities.
    Returns list of (name, category) tuples.
    """
    nlp = spacy.load('en_core_web_sm')
    results = []
    for text in docs:
        persons = extract_persons(text, nlp)
        for person in persons:
            category = classify_person(person)
            results.append((person, category))
    return results


def main() -> None:
    """
    Demonstrate NER and classification on example documents.
    """
    documents = [
        "Barack Obama spoke at the United Nations.",
        "Lady Gaga will release her new album next month.",
        "Christopher Nolan is directing a new film."
    ]

    entries = analyze_documents(documents)
    for name, category in entries:
        label = category if category else "Unknown"
        print(f"{name}: {label}")


if __name__ == '__main__':
    main()


In [None]:
def extract_entities(text: str) -> list[tuple[str, str]]:
    """
    Process `text` and return a list of (entity_text, label) tuples.

    :param text: Input string to analyze
    :return: List of named entities and their labels
    """
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]


def main() -> None:
    """
    Demonstrate NER on example sentences.
    """
    samples = [
        "Apple is looking at buying U.K. startup for $1 billion.",
        "Barack Obama was born in Hawaii.",
        "Tesla's stock price surged after the earnings report."
    ]

    for sentence in samples:
        entities = extract_entities(sentence)
        print(f"Input: {sentence}")
        print("Entities:")
        for text, label in entities:
            print(f"  - {text} ({label})")
        print()


if __name__ == '__main__':
    # Load the English model with NER capability
    nlp = spacy.load('en_core_web_sm')
    main()

# Date Parsing

This section is for extracting events that took place in April

In [None]:
# Define summarizer
summarizer = pipeline('summarization')

In [None]:
# Creating function

def extract_text_with_dates(text: str):
    doc = nlp(text)
    for sentence in doc.sents:
        # Find DATE entities in the sentence
        dates = [ent.text for ent in sentence.ents if ent.label_ == 'DATE']
        for raw in dates:
            if not re.search(r"\b\d{4}\b", raw):
                continue
            dt = dateparser.parse(raw, settings={'REQUIRE_PARTS': ['month', 'year']}) #Only accepting parses with a date and a year
            if dt:
                yield sentence.text, dt

In [None]:
# Function to filter out april dtaes and summarize events

def filter_summarize_april(text: str, max_events=10):
    april_events = []
    for sentence, dt in extract_text_with_dates(text):
        if dt.month ==4:
            april_events.append(sentence)

    summaries = []
    for sentence in april_events:
        summary = summarizer(sentence, max_length=50, min_length=15, do_sample=False)[0]["summary_text"]
        summaries.append({"date_sentence": sentence, "summary": summary})
    return summaries

In [None]:
date_matches = text_df['text'].apply(lambda t: list(extract_text_with_dates(t)))

In [None]:
date_table = text_df.assign(matches=m).explode('matches').dropna(subset=['matches'])

In [None]:
date_table[['sentence', 'date']] = pd.DataFrame(date_table['matches'].tolist(), index=date_table.index)

In [None]:
date_table = date_table.drop(columns=['matches'])

In [None]:
# date_table['sentence'].apply(filter_summarize_april)

In [None]:
date_table

In [None]:
mm