In [1]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

pd.set_option("display.max_rows", None)
from bertopic.representation import KeyBERTInspired
import string

from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the scraped pages and keep only rows that actually have page text.
raw_pages = pd.read_csv("merged_uk.csv")
df = raw_pages.dropna(subset=["text"])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22207 entries, 0 to 22375
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      22207 non-null  object
 1   website  22207 non-null  object
 2   title    22196 non-null  object
 3   text     22207 non-null  object
dtypes: object(4)
memory usage: 867.5+ KB


In [3]:
# Phrases and words to strip from the page text before topic modelling.
#
# A later cell replaces punctuation with spaces, splits every entry on
# whitespace and lowercases the pieces, so a multi-word entry contributes each
# of its words as a separate stopword (e.g. "University of London" adds
# "university", "of" and "london").
#
# Repeated entries (e.g. "mba", "msc", "menu") are harmless because this is a
# set literal.
#
# NOTE(review): names such as "King’s College London" use a typographic
# apostrophe (’), which is not part of string.punctuation and therefore is not
# replaced by a space when the entries are split — confirm this matches how
# apostrophes appear in the scraped text.
custom_stopwords = {
    # --- Institution names ---
    "Arden University",
    "Arts University Bournemouth",
    "Courtauld Institute of Art",
    "University of Leicester",
    "Norwich University of the Arts",
    "University of East London",
    "University of Warwick",
    "Glyndŵr University",
    "Writtle University College",
    "University of Aberdeen",
    "Aberystwyth University",
    "Abertay University",
    "Anglo-European College of Chiropractic",
    "University of the Arts London",
    "Anglia Ruskin University",
    "Aston University",
    "Bangor University",
    "University of Bath",
    "Bath Spa University",
    "Birkbeck, Uiversity of London",  # NOTE(review): "Uiversity" looks like a typo for "University" — confirm
    "Birmingham City University",
    "University of Bedfordshire",
    "University of Birmingham",
    "University of Bolton",
    "Bournemouth University",
    "BPP University",
    "University of Bradford",
    "University of Brighton",
    "University of Bristol",
    "Oxford Brookes University",
    "Rose Bruford College of Theatre and Performance",
    "Brunel University London",
    "University of Buckingham",
    "Buckinghamshire New University",
    "University of Cambridge",
    "Canterbury Christ Church University",
    "Cardiff university",
    "Cardiff Metropolitan University",
    "University of Chichester",
    "City University London",
    "Coventry University",
    "Cranfield University",
    "Royal Central School of Speech and Drama",
    "University of Cumbria",
    "University of Derby",
    "De Montfort University",
    "University of Dundee",
    "Durham University",
    "University of Edinburgh",
    "Edge Hill University",
    "University of Essex",
    "University of Exeter",
    "Falmouth University",
    "Glasgow Caledonian University",
    "University of Gloucestershire",
    "Goldsmiths, University of London",
    "University of Greenwich",
    "Guildhall School of Music and Drama",
    "Harper Adams University",
    "Hartpury College",
    "University of Hertfordshire",
    "Liverpool Hope University",
    "University of Huddersfield",
    "University of Hull",
    "Ashridge Business School",
    "Heriot-Watt University",
    "Institute of Cancer Research",
    "Imperial College of Science, Technology and Medicine",
    "King’s College London",
    "Keele University",
    "University of Kent",
    "Kingston University",
    "Lancaster University",
    "University of Law",
    "Loughborough University",
    "Leeds College of Art",
    "University of Leeds",
    "Leeds Becket University",  # NOTE(review): "Becket" vs. the separate "beckett" entry below — possible typo, confirm
    "Leeds Trinity University",
    "London Institute of Banking and Finance",
    "University of Lincoln",
    "University of Liverpool",
    "Liverpool John Moores University",
    "University of London",
    "London Business School",
    "London Metropolitan University",
    "London South Bank University",
    "London School of Economics and Political Science",
    "London School of Hygiene and Tropical Medicine",
    "Liverpool School of Tropical Medicine",
    "University of Manchester",
    "University of St Mark and St John, Plymouth",
    "Middlesex University",
    "Manchester Metropolitan University",
    "Edinburgh Napier University",
    "NCG",
    "Newcastle University",
    "Newman University, Birmingham",
    "University of Northampton",
    "Northumbria University Newcastle",
    "University of Nottingham",
    "Nottingham Trent University",
    "Open University",
    "University of Oxford",
    "Plymouth University",
    "Portsmouth University",
    "Queen Margaret University, Edinburgh",
    "Queen Mary, University of London",
    "Queen’s University Belfast",
    "Royal Academy of Music",
    "Royal Agricultural University",
    "Ravensbourne",
    "Royal College of Art",
    "Royal College of Music",
    "Royal College of Nursing",
    "Royal Conservatoire of Scotland",
    "University of Reading",
    "Regent’s University London",
    "Robert Gordon University, Aberdeen",
    "Royal Northern College of Music",
    "University of Roehampton",
    "Royal Holloway, University of London",
    "Royal Veterinary College",
    "University of Salford",
    "St George’s, University of London",
    "University of Sheffield",
    "Sheffield Hallam University",
    "School of Oriental and African Studies (SOAS), University of London",
    "Solent University",
    "University of Southampton",
    "University of South Wales",
    "University of St Andrews",
    "Staffordshire University",
    "University of Stirling",
    "St Mary’s University, Twickenham",
    "University of Strathclyde",
    "University of Sunderland",
    "University of Surrey",
    "University of Sussex",
    "Swansea University",
    "Teesside University",
    "Trinity Laban Conservatoire of Music and Dance",
    "University for the Creative Arts",
    "University College of Estate Management",
    "University College London",
    "Institute of Education, University of London",
    "University of Central Lancashire",
    "University College of Osteopathy",
    "University of East Anglia",
    "University of the Highlands and Islands",
    "University of Ulster",
    "University of Suffolk",
    "University of the West of England, Bristol",
    "University of West London",
    "University of the West of Scotland",
    "University of Wales Trinity Saint David",
    "University of Wales",
    "University of Westminster",
    "University of Winchester",
    "University of Wolverhampton",
    "University of Worcester",
    "University of York",
    "York St John University",
    "University of Chester",
    # --- Mostly single-word entries: site navigation, generic academic and
    # subject vocabulary, abbreviations, place names, error-page terms ---
    "next",
    "previous",
    "university",
    "professor",
    "undergraduate",
    "postgraduate",
    "cookies",
    "decline",
    "accept",
    "course",
    "studies",
    "year",
    "study",
    "education",
    "years",
    "engineering",
    "physics",
    "mathematics",
    "please",
    "alumni",
    "school",
    "contact",
    "philosophy",
    "law",
    "economics",
    "literature",
    "medicine",
    "computer",
    "political",
    "science",
    "chemistry",
    "biology",
    "geography",
    "history",
    "classics",
    "engineering",
    "academics",
    "academia",
    "centre",
    "center",
    "program",
    "campus",
    "degree",
    "pages",
    "page",
    "information",
    "find",
    "msc",
    "hons",
    "bsc",
    "apply",
    "learning",
    "curriculum",
    "instruction",
    "project",
    "projects",
    "publication",
    "publications",
    "laboratory",
    "laboratories",
    "lab",
    "labs",
    "grant",
    "grants",
    "activities",
    "scholars",
    "scholar",
    "researcher",
    "researchers",
    "programs",
    "program",
    "major",
    "majors",
    "minor",
    "minors",
    "courses",
    "specialization",
    "specializations",
    "field",
    "fields",
    "facilities",
    "buildings",
    "building",
    "bachelor",
    "master",
    "doctorate",
    "graduate",
    "phd",
    "diploma",
    "library",
    "libraries",
    "technology",
    "reject",
    "save",
    "business",
    "information",
    "level",
    "events",
    "management",
    "music",
    "sciences",
    "english",
    "online",
    "art",
    "arts",
    "professional",
    "new",
    "media",
    "days",
    "day",
    "design",
    "team",
    "people",
    "menu",
    "programme",
    "programmes",
    "finance",
    "city",
    "foundation",
    "data",
    "email",
    "department",
    "departments",
    "mail",
    "address",
    "view",
    "back",
    "click",
    "please",
    "privacy",
    "visit",
    "college",
    "terms",
    "conditions",
    "institute",
    "institutes",
    "institution",
    "institutions",
    "visa",
    "marketing",
    "content",
    "mba",
    "login",
    "password",
    "passwords",
    "aru",
    "marys",
    "lstm",
    "bcu",
    "bnu",
    "beckett",
    "bangor",
    "mba",
    "msc",
    "bsc",
    "mphil",
    "phd",
    "dphil",
    "sign in",
    "liverpool",
    "semester",
    "lsbu",
    "nhs",
    "uea",
    "dmu",
    "qmu",
    "rcm",
    "marjon",
    "wroclaw",
    "wrocław",
    "menu",
    "submenu",
    "semester",
    "bachelors",
    "masters",
    "javascript",
    "malaysia",
    "china",
    "nottingham",
    "nottinghams",
    "uea",
    "mail",
    "mailing",
    "logging",
    "log",
    "reset",
    "permission",
    "unauthorized",
    "skip",
    "cefs",
    "aber",
    "uco",
    "uwtsd",
    "az",
    "wlv",
    "liverpoolit",
    "virtual",
    "tour",
    "username",
    "logon",
    "term",
    "dates",
    "terminalfour",
    "redirect",
    "bus",
    "entrance",
    "corridor",
    "lse",
    "welcome",
    "payment",
    "portal",
    "scarborough",
    "uganda",
    "malawi",
    "ucl",
    "error",
    "403",
    "404",
    "nginx",
    "cookies",
    "browser",
    "denied",
    "terminalfour",
    "redirecting"
}

In [4]:
# Flatten the phrase set into individual lowercase stopwords.
# Punctuation is mapped to spaces before splitting, so hyphenated or
# comma-separated names break into their component words.
punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
distinct_stopwords = {
    token.lower()
    for entry in custom_stopwords
    for token in entry.translate(punct_to_space).split()
}

len(distinct_stopwords)

389

In [5]:
# distinct_stopwords

In [6]:
def remove_distinct_stopwords(text, stopwords=None):
    """Return ``text`` with every whitespace-separated stopword token removed.

    The comparison is an exact, case-sensitive membership test on each token,
    so callers are expected to lowercase and strip punctuation from ``text``
    beforehand. Surviving tokens are re-joined with single spaces, which also
    collapses runs of whitespace and trims the ends.

    Args:
        text: The string to filter.
        stopwords: Optional container of tokens to drop. When omitted, the
            notebook-level ``distinct_stopwords`` set is used, which keeps
            existing one-argument calls (e.g. ``Series.apply``) working.

    Returns:
        The filtered string; empty if every token was a stopword.
    """
    if stopwords is None:
        # Looked up at call time so the notebook global can be rebuilt freely.
        stopwords = distinct_stopwords
    return " ".join(word for word in text.split() if word not in stopwords)

In [7]:
# Normalise the page text: lowercase it and replace every ASCII punctuation
# character with a space.
translator = str.maketrans(string.punctuation, " " * len(string.punctuation))
df = df.assign(
    text=df["text"].astype(str).str.lower().str.translate(translator)
)

# Drop rows whose normalised text contains the substring "no data"
# (presumably a scraper placeholder — confirm). The explicit .copy() makes the
# filtered frame independent, so the column assignment below does not write
# into a slice of the previous frame (SettingWithCopyWarning).
df = df[~df["text"].str.contains("no data")].copy()

# Remove the custom stopwords, then collapse whitespace and trim the ends.
df["text"] = (
    df["text"]
    .apply(remove_distinct_stopwords)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)
df.head()

Unnamed: 0,url,website,title,text
0,http://bear.buckingham.ac.uk/,buckingham.ac.uk,Welcome to the Buckingham E-Archive of Researc...,to e archive bear e archive home about browse ...
1,http://blogs.cranfield.ac.uk/,cranfield.ac.uk,Cranfield University Blogs | The latest blogs ...,blogs latest blogs from staff students at to s...
2,http://bone.uco.ac.uk/,uco.ac.uk,Bone-UCO's Virtual Learning Environment: Log i...,bone s environment to site to main to bone s e...
3,http://classes.myplace.strath.ac.uk/,strath.ac.uk,myplace.strath.ac.uk,myplace strath ac uk to main toggle navigation...
4,http://connected.qmul.ac.uk/,qmul.ac.uk,Queen Mary University Authentication,authentication identity check this is restrict...


In [8]:
# Encode every cleaned document once up front, so the vectors can be handed
# to the topic model's fit step further down.
embedding_model = SentenceTransformer("all-mpnet-base-v2")
documents = df["text"].to_numpy()
embeddings = embedding_model.encode(documents, show_progress_bar=True)

Batches:   0%|          | 0/690 [00:00<?, ?it/s]

In [9]:
# Reduce the sentence embeddings to 5 dimensions before clustering.
# UMAP is stochastic: without a fixed random_state the reduced space, and with
# it the clusters and topics, differ on every run. The seed makes a
# "Restart & Run All" reproducible; per the UMAP docs, fixing it makes UMAP
# run single-threaded, so this step may be slower.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

In [10]:
# Bag-of-words settings for the topic representations: drop English
# stopwords, require a term to occur in at least two documents, and count
# both unigrams and bigrams.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    stop_words="english",
)

In [11]:
# Density-based clustering model that the topic model applies to the reduced
# embeddings. All other HDBSCAN settings are left at the library defaults.
hdbscan_model = HDBSCAN(
    prediction_data=True,
    metric="euclidean",
)

In [12]:
# Topic labels are refined with the KeyBERT-inspired representation.
representation_model = KeyBERTInspired()

# Assemble the pipeline from the components configured in the cells above.
# nr_topics="auto" asks BERTopic to merge similar topics after clustering.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    nr_topics="auto",
    top_n_words=20,
    verbose=True,
)

# Fit on the cleaned text, passing the embeddings computed earlier.
topics, probs = topic_model.fit_transform(df["text"], embeddings)

2024-02-21 21:35:16,018 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2024-02-21 21:35:18,068 - BERTopic - Clustered reduced embeddings
2024-02-21 22:00:00,875 - BERTopic - Reduced number of topics from 648 to 394


In [13]:
# Overview table of the fitted topics: one row per topic with its id, document
# count, generated name, top terms and representative documents.
# Topic -1 is BERTopic's outlier bucket.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4168,-1_governance_global health_funding_policy,"[governance, global health, funding, policy, h...",[procurement strategy informizely customer fee...
1,0,947,0_interdisciplinary_interdisciplinary ma_speci...,"[interdisciplinary, interdisciplinary ma, spec...",[home staff student website search at at under...
2,1,537,1_environmental humanities_secondary teaching_...,"[environmental humanities, secondary teaching,...",[to main main index a z types interest areas a...
3,2,511,2_campuses_campuses severn_academic_honours st...,"[campuses, campuses severn, academic, honours ...",[search to we use to enhance your experience t...
4,3,489,3_application gateway_azure_microsoft azure_ga...,"[application gateway, azure, microsoft azure, ...",[forbidden forbidden microsoft azure applicati...
5,4,414,4_student enrolment_ual insights_ual staff_ual...,"[student enrolment, ual insights, ual staff, u...",[stories ual to main subjects colleges accessi...
6,5,358,5_conservatoires_ba acting_modern ballet_teaching,"[conservatoires, ba acting, modern ballet, tea...",[not found to main to home to on verge share s...
7,6,271,6_therapy auditions_auditions_productions_audi...,"[therapy auditions, auditions, productions, au...",[unavailable icon arrow downicon arrow rightic...
8,7,269,7_helpdesk academic_application guide_midwifer...,"[helpdesk academic, application guide, midwife...",[not found to main explore order prospectus at...
9,8,267,8_international recruitment_international cons...,"[international recruitment, international cons...",[access your prospectus cinematics games maths...


In [16]:
# Look up the 10 topics closest to the search term "diversity".
# Returns a pair: (list of topic ids, list of matching similarity scores).
topic_model.find_topics("diversity", top_n=10)

([88, 137, 332, 339, 158, 321, 29, 268, 12, 210],
 [0.5337648148882712,
  0.5102447333044345,
  0.47725656703665986,
  0.4675097797380581,
  0.4467239984746658,
  0.4194907805622158,
  0.41591734832337424,
  0.4027074964484265,
  0.39686289369090255,
  0.39293978873481955])

In [81]:
# Inspect topic 8: a list of (term, score) pairs describing the topic.
topic_model.get_topic(8)

[('international recruitment', 0.49681804),
 ('international consultants', 0.47625363),
 ('life uni', 0.46547326),
 ('recruitment development', 0.44513243),
 ('social work', 0.42174786),
 ('academic partnerships', 0.41763884),
 ('wellbeing inclusive', 0.4162608),
 ('consultancy', 0.40402254),
 ('choose academic', 0.3673463),
 ('working students', 0.3663531)]

In [15]:
# Export the topic overview table to an Excel workbook in the working directory.
# NOTE(review): the date in the file name (02022024) differs from the run date
# shown in the fit log (2024-02-21) — confirm the intended file name.
topic_model.get_topic_info().to_excel("topics-uk-02022024.xlsx")