In [1]:
import pandas as pd

from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.models.coherencemodel import CoherenceModel
from wordcloud import WordCloud

from PIL  import Image
import numpy as np
import random
import re
import matplotlib.pyplot as plt

In [9]:
import pickle

with open('filtered_tokenized_abstracts.pkl', 'rb') as file:
    filtered_tokenized_abstracts = pickle.load(file)

print(filtered_tokenized_abstracts[:5])

[['india', 'event', 'mark', 'conclusion', 'ahrcfunded', 'pilot', 'century', 'indian', 'print', 'cip', 'anniversary', 'indian', 'independence', 'important', 'national', 'event', 'south', 'asia', 'propose', 'showcase', 'cips', 'cataloguing', 'digitisation', 'output', 'celebrate', 'india', 'print', 'film', 'literary', 'history', 'dedicated', 'event', 'programme', 'aim', 'maximise', 'programme', 'demonstrate', 'role', 'documenting', 'collection', 'making', 'available', 'engaging', 'user', 'diversifying', 'british', 'library', 'audience', 'cip', 'largely', 'focused', 'east', 'india', 'east', 'indian', 'language', 'bengali', 'assamese', 'sylheti', 'result', 'cip', 'event', 'october', 'bangladesh', 'british', 'library', 'often', 'oriented', 'towards', 'british', 'bangladeshi', 'visitor', 'arranging', 'event', 'aimed', 'multiple', 'british', 'south', 'asian', 'proposed', 'india', 'event', 'able', 'sustain', 'existing', 'relationship', 'british', 'bangladeshi', 'form', 'connection', 'british', 

In [11]:
dictionary = Dictionary(filtered_tokenized_abstracts)
len(dictionary)

38939

In [18]:
# Create a Document-Term Matrix (DTM) using the dictionary
dtm = [dictionary.doc2bow(doc) for doc in filtered_tokenized_abstracts]



In [19]:
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(dtm, test_size=0.5, random_state=53)


np.random.seed(1)
k_range = range(10,25)
scores = []
for k in k_range:
    lda_model = ldamodel.LdaModel(train_df, num_topics=k, id2word=dictionary, passes=20)
    cm = CoherenceModel(model = lda_model, corpus = train_df, dictionary = dictionary, coherence = "u_mass")
    print(cm.get_coherence())
    scores.append(cm.get_coherence())
plt.figure()
plt.plot(k_range, scores)

-1.4336631321665174
-1.6314613119918482
-1.8343368939140203
-1.798354485474716
-2.8235847359791593
-2.4506788102466617
-2.2956888166459803


KeyboardInterrupt: 

In [14]:
# Specify the number of topics you want to extract
num_topics = 10  # Adjust the number of topics as needed

# Initialize LDA model
lda_model = ldamodel.LdaModel(corpus=train_df, id2word=dictionary, num_topics=num_topics, iterations=100)


# Iterate over a range of iterations, and check coherence every 5th iteration
for iteration in range(10, 100):  # Adjust the range based on your total number of iterations
    lda_model.update(train_df)  # Perform one iteration of LDA
    if iteration % 5 == 0:
        cm = CoherenceModel(model=lda_model, corpus=valid_df, dictionary=dictionary, coherence = "u_mass")
        coherence_score = cm.get_coherence()
        print(f"Iteration {iteration}: Coherence Score = {coherence_score:.4f}")



Iteration 10: Coherence Score = -0.9103
Iteration 15: Coherence Score = -1.3645
Iteration 20: Coherence Score = -1.2033
Iteration 25: Coherence Score = -1.1778
Iteration 30: Coherence Score = -1.1845
Iteration 35: Coherence Score = -1.1850
Iteration 40: Coherence Score = -1.1875
Iteration 45: Coherence Score = -1.1849
Iteration 50: Coherence Score = -1.2292
Iteration 55: Coherence Score = -1.2272
Iteration 60: Coherence Score = -1.2426
Iteration 65: Coherence Score = -1.2205
Iteration 70: Coherence Score = -1.2266
Iteration 75: Coherence Score = -1.2404
Iteration 80: Coherence Score = -1.2381
Iteration 85: Coherence Score = -1.2347
Iteration 90: Coherence Score = -1.2384
Iteration 95: Coherence Score = -1.2393


In [20]:
# Specify the number of topics you want to extract
num_topics = 10  # Adjust the number of topics as needed

# Initialize LDA model
lda_model10 = ldamodel.LdaModel(corpus=dtm, id2word=dictionary, num_topics=num_topics, iterations=10)


num_topics = lda_model10.num_topics  # Get the total number of topics in the model

# Print all identified topics with their top terms
for topic_id in range(num_topics):
    print(f"Topic {topic_id + 1}:")
    print(lda_model10.show_topic(topic_id, topn = 20))
    print()

Topic 1:
[('practice', 0.0046650437), ('local', 0.004457202), ('digital', 0.004138977), ('museum', 0.00409837), ('workshop', 0.0040068016), ('history', 0.0038713403), ('academic', 0.0037856821), ('social', 0.0037805932), ('people', 0.0036734154), ('benefit', 0.0036277284), ('group', 0.0035610246), ('art', 0.0035296606), ('policy', 0.0033837424), ('knowledge', 0.0032751202), ('way', 0.0032631152), ('heritage', 0.003247942), ('music', 0.003126008), ('development', 0.003101195), ('national', 0.0030381132), ('different', 0.0029144494)]

Topic 2:
[('art', 0.005549456), ('social', 0.0050637596), ('heritage', 0.0047210352), ('international', 0.0046223304), ('history', 0.0043842774), ('knowledge', 0.004173488), ('practice', 0.0041119438), ('understanding', 0.004091939), ('people', 0.004035173), ('benefit', 0.0039005687), ('local', 0.0037247608), ('policy', 0.0036475318), ('aim', 0.0033892812), ('group', 0.0032955136), ('creative', 0.0032798909), ('academic', 0.0032629995), ('partner', 0.003093

In [21]:
# Specify the number of topics you want to extract
num_topics = 10  # Adjust the number of topics as needed

# Initialize LDA model
lda_model25 = ldamodel.LdaModel(corpus=dtm, id2word=dictionary, num_topics=num_topics, iterations=25)


num_topics = lda_model25.num_topics  # Get the total number of topics in the model

# Print all identified topics with their top terms
for topic_id in range(num_topics):
    print(f"Topic {topic_id + 1}:")
    print(lda_model25.show_topic(topic_id, topn = 20))
    print()

Topic 1:
[('museum', 0.0059046755), ('benefit', 0.0048836195), ('collection', 0.004646591), ('social', 0.0045608184), ('practice', 0.004538641), ('history', 0.0044864165), ('art', 0.004307728), ('music', 0.0042779613), ('group', 0.0041255015), ('heritage', 0.0041112253), ('local', 0.004038574), ('academic', 0.0039820443), ('knowledge', 0.00397716), ('engagement', 0.0039771274), ('digital', 0.003405263), ('policy', 0.0032749076), ('experience', 0.0032406708), ('development', 0.0032093432), ('workshop', 0.0032002227), ('activity', 0.0031279773)]

Topic 2:
[('group', 0.0050332593), ('creative', 0.0047435462), ('social', 0.004666353), ('local', 0.004552101), ('people', 0.0045367274), ('organisation', 0.004513502), ('practice', 0.0044953777), ('data', 0.004339397), ('knowledge', 0.004141969), ('health', 0.0041097756), ('development', 0.004048634), ('national', 0.0039483802), ('benefit', 0.0037310135), ('heritage', 0.003675266), ('support', 0.0036523978), ('art', 0.0035445972), ('policy', 0.

In [23]:
# Specify the number of topics you want to extract
num_topics = 8  # Adjust the number of topics as needed

# Initialize LDA model
lda_model300 = ldamodel.LdaModel(corpus=dtm, id2word=dictionary, num_topics=num_topics, iterations=300)

# Print all identified topics with their top terms
for topic_id in range(num_topics):
    print(f"Topic {topic_id + 1}:")
    print(lda_model300.show_topic(topic_id, topn = 20))
    print()

Topic 1:
[('heritage', 0.009813383), ('local', 0.0076543405), ('art', 0.0076155933), ('partner', 0.0066566924), ('practice', 0.0060110474), ('creative', 0.0058965357), ('knowledge', 0.0055530816), ('activity', 0.005314126), ('organisation', 0.0052511906), ('engagement', 0.0050156903), ('benefit', 0.004643099), ('group', 0.0044111544), ('people', 0.0040410943), ('value', 0.0039029457), ('academic', 0.003895448), ('resource', 0.003638145), ('social', 0.0034383135), ('develop', 0.003308507), ('aim', 0.0032855817), ('development', 0.0032588337)]

Topic 2:
[('creative', 0.008438696), ('psm', 0.0060220854), ('design', 0.0047755884), ('support', 0.0044842395), ('university', 0.0044120206), ('industry', 0.004177546), ('value', 0.0040866258), ('knowledge', 0.0039216005), ('economy', 0.0038603297), ('benefit', 0.0037707342), ('academic', 0.0035065578), ('practice', 0.0034694031), ('digital', 0.0033978503), ('sector', 0.0033080694), ('culture', 0.0032884309), ('innovation', 0.0031722884), ('polic