In [8]:
import pandas as pd
import gensim

from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.models.ldamodel import LdaModel




In [2]:
import pickle

with open('filtered_tokenised_abstracts.pkl', 'rb') as file:
    filtered_tokenised_abstracts = pickle.load(file)

print(filtered_tokenised_abstracts[:5])

0    [india, event, mark, conclusion, funded, pilot...
1    [action, based, brain, provocation, philosophy...
2    [imprint, forensic, historical, investigation,...
3    [structural, typological, variation, dialect, ...
4    [judging, image, making, management, consumpti...
Name: AllText, dtype: object


In [4]:
dictionary = Dictionary(filtered_tokenised_abstracts)
len(dictionary)

30035

In [5]:
# Create a Document-Term Matrix (DTM) using the dictionary
dtm = [dictionary.doc2bow(doc) for doc in filtered_tokenised_abstracts]



In [None]:
def set_priors(eta, topic, words, p=.8):
    word_indexes = [word2id[w] for w in words]
    eta[topic, word_indexes] *=1000

In [None]:
n = 15
eta = np.full((n, len(dictionary)), 1/(len(dictionary)*n))

In [16]:
# Define your predefined keywords for each topic
digital_keywords = ["technology", "data", "digital", "internet", "information", "digitise"]
urban_keywords = ["urban", "city", "planning", "space", "access", "culture", "design", "civic", "transport"]
local_keywords = ["local", "region", "regional", "heritage", "history", "identity", "site"]
health_keywords = ["health", "wellbeing", "mental_health", "care", "social_care", "disability", "inclusion", "access", "community", "support"]
healthcare_keywords = ["hospital", "nhs", "disease", "infection", "death", "anti_microbial", "pandemic", "virus"]
cooperation_keywords = ["network", "cooperation", "partnership", "partner", "collaborate", "collaboration", "impact"]
international_keywords = ["international", "global", "transnational", "world_wide", "world"]
race_keywords = ["race", "colonial", "slavery", "slave_trade", "black", "ethnic", "african", "indian", "empire", "indigenous", "native"]
justice_keywords = ["justice", "equality", "diversity", "social", "reparative", "reparation", "repatriation"]
conflict_keywords = ["conflict", "violence", "genocide", "terrorism", "war", "displacement"]
learning_keywords = ["resource", "learning", "teaching", "school", "activity", "youth", "education", "student", "teacher", "classroom"]
public_keywords = ["engagement", "public", "engage", "audience", "participation", "participant", "community", "experience", "visitor"]
climate_keywords = ["climate", "climate_change", "environment", "nature", "natural", "animal", "plant", "resource", "waste", "future", "food", "science","global_warming", "argiculture", "risk","environmental", "emergency"]
religion_keywords = ["religion", "faith", "church", "belief", "value", "worldview"]
migration_keywords = ["migration", "migrant", "refugee"]

seed_topics = [digital_keywords, urban_keywords, local_keywords, health_keywords, healthcare_keywords, cooperation_keywords, 
               international_keywords, race_keywords, justice_keywords, conflict_keywords, learning_keywords, public_keywords,
               climate_keywords, religion_keywords, migration_keywords]


In [19]:
# Create a topic-term matrix where the seed words have high probabilities
num_topics = 20
alpha = 10
eta = 0.01
iterations = 200
passes = 50

lda_model = LdaModel(dtm, num_topics=num_topics, id2word=dictionary, alpha=alpha, eta=eta, iterations=iterations, passes = passes)

# Assign the seed words to the topics
for topic_id, topic_words in enumerate(seed_topics):
    topic_words_ids = [dictionary.token2id[word] for word in topic_words if word in dictionary.token2id]
    lda_model.get_topic_terms(topic_id, len(topic_words_ids))
    for word_id, weight in lda_model.get_topic_terms(topic_id, len(topic_words_ids)):
        lda_model.state.get_lambda()[topic_id, word_id] = 25.0

# Train the LDA model with the seeded topics
lda_model.update(dtm)

# Print the topics with seeded words
for i in range(num_topics):
    print(f"Topic {i}: {lda_model.print_topic(i)}")


Topic 0: 0.108*"community" + 0.028*"local" + 0.015*"partner" + 0.014*"group" + 0.013*"city" + 0.013*"organisation" + 0.010*"engagement" + 0.010*"working" + 0.009*"urban" + 0.008*"practice"
Topic 1: 0.023*"science" + 0.013*"nature" + 0.012*"question" + 0.010*"idea" + 0.010*"humanity" + 0.009*"public" + 0.009*"field" + 0.009*"communication" + 0.008*"form" + 0.008*"researcher"
Topic 2: 0.030*"music" + 0.028*"digital" + 0.018*"film" + 0.016*"performance" + 0.012*"medium" + 0.012*"technology" + 0.011*"industry" + 0.010*"form" + 0.010*"audience" + 0.008*"sound"
Topic 3: 0.020*"public" + 0.014*"future" + 0.012*"child" + 0.011*"social" + 0.011*"human" + 0.009*"practice" + 0.009*"family" + 0.008*"space" + 0.008*"issue" + 0.008*"environment"
Topic 4: 0.020*"international" + 0.015*"political" + 0.012*"case" + 0.012*"religious" + 0.011*"context" + 0.011*"social" + 0.011*"state" + 0.010*"finding" + 0.010*"academic" + 0.009*"concept"
Topic 5: 0.019*"health" + 0.019*"value" + 0.018*"impact" + 0.015*"

In [20]:
LdaModel?

[1;31mInit signature:[0m
[0mLdaModel[0m[1;33m([0m[1;33m
[0m    [0mcorpus[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mnum_topics[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [0mid2word[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mdistributed[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mchunksize[0m[1;33m=[0m[1;36m2000[0m[1;33m,[0m[1;33m
[0m    [0mpasses[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mupdate_every[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0malpha[0m[1;33m=[0m[1;34m'symmetric'[0m[1;33m,[0m[1;33m
[0m    [0meta[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mdecay[0m[1;33m=[0m[1;36m0.5[0m[1;33m,[0m[1;33m
[0m    [0moffset[0m[1;33m=[0m[1;36m1.0[0m[1;33m,[0m[1;33m
[0m    [0meval_every[0m[1;33m=[0m[1;36m10[0m[1;33m,[0m[1;33m
[0m    [0miterations[0m[1;33m=[0m[1;36m50[0m[1;33m,[0m[1;33m
[0m    [0mgamma_thres

In [21]:
import guidedlda

ModuleNotFoundError: No module named 'guidedlda'