In [1]:
import pandas as pd
import spacy
import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import os
import pickle
import pyLDAvis.gensim
import pyLDAvis
from datetime import datetime
from gensim.models.coherencemodel import CoherenceModel
from utils import sent_to_words

In [2]:
# The first CSV column looks like a previously saved row index (it surfaces as
# "Unnamed: 0" when read as data), so load it as the index rather than as a
# stray feature column.
df = pd.read_csv('cleaned_text.csv', index_col=0)
df.head()

Unnamed: 0,Sitting Date,Section Name,MPs Speaking,Content
0,2017-02-07,oral answers questions,"[The Minister for Manpower (Mr Lim Swee Say), ...",dr tan wu mengaskedthe minister manpower wheth...
1,2020-03-03,budget,"[Mr Douglas Foo (Nominated Member), Mr Patric...",chairman head ministry manpower patrick tay fa...
2,2020-02-28,budget,[The Senior Minister of State for Law (Mr Edwi...,chairman head r ministry law christopher de so...
3,2019-03-06,oral answers questions,[The Senior Parliamentary Secretary to the Min...,following question stood name ofms anthea ong ...
4,2018-03-06,budget,"[Mr Pritam Singh (Aljunied), Mr Melvin Yong Yi...",growing pains transportmr sitoh yih pin potong...


In [3]:
# Pull the cleaned speech text out of the frame as a plain Python list
content_column = df['Content']
data = content_column.values.tolist()

In [4]:
# Tokenise every document with the project helper (one token list per document)
data_words = [tokens for tokens in sent_to_words(data)]

In [5]:
# Sanity check: show the first 30 tokens of the first tokenised document
first_doc_tokens = data_words[0]
print(first_doc_tokens[:30])

['dr', 'tan', 'wu', 'mengaskedthe', 'minister', 'manpower', 'whether', 'ministry', 'provide', 'update', 'investigations', 'recent', 'termination', 'multiple', 'employees', 'surbana', 'jurong', 'assoc', 'prof', 'daniel', 'goh', 'pei', 'siongaskedthe', 'minister', 'manpower', 'constitutes', 'due', 'fair', 'process', 'terminating']


In [6]:
# Build the token -> integer id mapping from the tokenised documents
id2word = corpora.Dictionary(data_words)

# Bag-of-words corpus: one list of (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, data_words))

# Show the first 30 (token_id, count) pairs of the first document
print(corpus[0][:30])

[(0, 2), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 3), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 2), (24, 2), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1)]


In [7]:
# Number of topics
num_topics = 6

# LDA is randomly initialised; pin the seed so the topics, the coherence score
# and the saved artefacts can be reproduced after Restart & Run All.
# NOTE(review): with several workers, chunk scheduling may still cause small
# run-to-run differences — confirm, and use workers=1 if exact repeatability matters.
random_seed = 42

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       passes=10,
                                       workers=2,
                                       random_state=random_seed)

# Print the top keywords of every topic. Passing num_topics explicitly keeps
# all topics in the listing even if num_topics is raised above the default
# display cap of print_topics().
print(lda_model.print_topics(num_topics=num_topics))

# Transform the corpus using the LDA model (per-document topic distributions)
doc_lda = lda_model[corpus]

[(0, '0.011*"workers" + 0.007*"support" + 0.006*"work" + 0.005*"would" + 0.005*"help" + 0.005*"employment" + 0.005*"one" + 0.005*"companies" + 0.005*"jobs" + 0.004*"many"'), (1, '0.005*"would" + 0.005*"new" + 0.005*"public" + 0.004*"transport" + 0.004*"minister" + 0.004*"year" + 0.004*"like" + 0.004*"one" + 0.003*"singaporeans" + 0.003*"years"'), (2, '0.007*"students" + 0.005*"support" + 0.005*"school" + 0.005*"children" + 0.005*"one" + 0.005*"public" + 0.005*"would" + 0.004*"education" + 0.004*"many" + 0.004*"work"'), (3, '0.007*"would" + 0.005*"town" + 0.005*"minister" + 0.004*"public" + 0.004*"hdb" + 0.004*"one" + 0.004*"new" + 0.003*"companies" + 0.003*"many" + 0.003*"like"'), (4, '0.007*"community" + 0.007*"care" + 0.006*"support" + 0.005*"singaporeans" + 0.005*"year" + 0.004*"healthcare" + 0.004*"health" + 0.004*"one" + 0.004*"would" + 0.004*"like"'), (5, '0.009*"defence" + 0.007*"saf" + 0.005*"training" + 0.005*"us" + 0.005*"ns" + 0.004*"security" + 0.004*"minister" + 0.004*"yea

In [8]:
# Score the fitted topics with the c_v coherence measure
coherence_model = CoherenceModel(
    model=lda_model,
    texts=data_words,
    dictionary=id2word,
    coherence='c_v',
)
res = coherence_model.get_coherence()
print(res)

0.3114987660942129


In [9]:

# Timestamp (format YYYYMMDD_HHMMSS) that tags every artefact written by this run
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")

# Create the output directory; exist_ok=True replaces the racy
# "check, then create" pattern and is a no-op when the directory exists.
os.makedirs('./results', exist_ok=True)

# Single source of truth for the artefact path; the HTML export below reuses it.
LDAvis_data_filepath = f'./results/ldavis_prepared_{num_topics}_{timestamp}'

# Prepare the pyLDAvis visualization data
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

# Save locally
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

# Load locally (round trip through the file that was just written).
# pickle.load can execute arbitrary code: only ever point this at files
# produced by this notebook, never at untrusted input.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# Save visualization as a standalone HTML page next to the pickle
pyLDAvis.save_html(LDAvis_prepared, f'{LDAvis_data_filepath}.html')

# Render the interactive visualisation inline as the cell output
pyLDAvis.enable_notebook()
LDAvis_prepared


In [10]:
# Output locations for this run, all tagged with the topic count and timestamp
model_path = f'./results/lda_{num_topics}_{timestamp}'
dictionary_path = f'./results/dictionary_{num_topics}_{timestamp}'
corpus_path = f'./results/corpus_{num_topics}_{timestamp}.mm'

# Persist the trained model, the dictionary and the bag-of-words corpus
lda_model.save(model_path)
id2word.save(dictionary_path)
corpora.MmCorpus.serialize(corpus_path, corpus)

In [12]:
# Echo where the LDA model of this run was written
saved_model_path = f'./results/lda_{num_topics}_{timestamp}'
print(saved_model_path)

./results/lda_6_20230917_001039


In [13]:
# Echo where the dictionary of this run was written
saved_dictionary_path = f'./results/dictionary_{num_topics}_{timestamp}'
print(saved_dictionary_path)

./results/dictionary_6_20230917_001039


In [14]:
# Echo where the serialized corpus of this run was written
saved_corpus_path = f'./results/corpus_{num_topics}_{timestamp}.mm'
print(saved_corpus_path)

./results/corpus_6_20230917_001039.mm
