<a href="https://colab.research.google.com/github/jeremychia/singapore-parliament-speeches/blob/main/Parliamentary_Data_Topic_Modelling_(NMF).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install bigquery client library
!pip install google-cloud-bigquery --quiet

In [2]:
# authenticate to GCP
from google.colab import auth
auth.authenticate_user()

In [3]:
# set up project id and dataset
project_id = "singapore-parliament-speeches"  # Replace with your GCP project ID
dataset_id = "singapore-parliament-speeches.prod_mart.mart_speeches"

In [4]:
# create a bigquery client
from google.cloud import bigquery
client = bigquery.Client(project=project_id)

In [47]:
from datetime import datetime

today = datetime.today()

date_yyyymmdd = today.strftime('%Y%m%d')
date_yyyymmdd = str(date_yyyymmdd)

In [5]:
# define query
query = f"""
  select
    topic_id,
    speech_text
  from {dataset_id}
  order by date desc, speech_id
"""

In [6]:
# run query
import pandas as pd
query_job = client.query(query)
df = query_job.to_dataframe()

In [7]:
# compbine text by topic

def combine_text_by_topic(df):

  combined_text = {}
  for index, row in df.iterrows():
    topic_id = row['topic_id']
    text = row['speech_text']
    if topic_id not in combined_text:
      combined_text[topic_id] = ""
    combined_text[topic_id] += ' ' + text

  result_df = pd.DataFrame.from_dict(combined_text, orient='index', columns=['speech_text'])
  result_df.reset_index(inplace=True)
  result_df.rename(columns={'index': 'topic_id'}, inplace=True)

  return result_df

# execute
ct_df = combine_text_by_topic(df)

In [8]:
ct_df.head()

Unnamed: 0,topic_id,speech_text
0,2024-03-07-T-001,"Order. Sir, can I ask you to exercise your po..."
1,2024-03-07-T-002,asked the Minister for Defence (a) what effor...
2,2024-03-07-T-003,asked the Minister for Health (a) what is the...
3,2024-03-07-T-004,asked the Minister for Health (a) how many ne...
4,2024-03-07-T-005,asked the Minister for Manpower in view that ...


In [9]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# text preprocessing functions
def lowercase_text(text):
  return text.lower()

def remove_punctuation(text):
  punctuation = string.punctuation
  return "".join([char for char in text if char not in punctuation])

def remove_stopwords(text, custom_stopwords=[]):
  stopword_list = stopwords.words('english')
  stopword_list.extend(custom_stopwords)

  words = [word for word in text.split() if word not in stopword_list]
  return " ".join(words)

def lemmatize_text(text):
  lemmatizer = WordNetLemmatizer()
  words = [lemmatizer.lemmatize(word) for word in text.split()]
  return " ".join(words)

def clean_text(text):
  cleaned_text = re.sub(r"[^\x00-\x7F]+", "", text)  # Remove non-ASCII characters
  cleaned_text = re.sub(r"\d+", "", cleaned_text)  # Remove numbers
  cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Replace multiple spaces with a single space
  cleaned_text = re.sub(r"\n|\t|\r|\f", "", cleaned_text)  # Remove newlines, tabs, carriage returns, and form feeds
  return cleaned_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


The following stop words are removed.

Stopwords, comprising common words like "and," "the," and "is," are typically removed in text analysis tasks for their high frequency and low semantic value. This removal reduces data noise, lowers dimensionality, and improves model performance by focusing on more meaningful terms. Moreover, excluding stopwords enhances interpretability and normalization of text data, aiding in more efficient and effective text analysis and modeling processes.

| Category                     | Stopwords                                         |
|------------------------------|---------------------------------------------------|
| Government and Political     | parliament, ministry, minister, parliamentary,   |
|                              | mr, speaker, member, deputy, indranee, rajah,    |
|                              | zaqy, mohamad, yien, hai, fu, grace, leader,    |
|                              | house                                             |
| Terms                        |                                                   |
| General Stopwords            | also, year, time, need, new, would, one, may,   |
|                              | many, like, whether, u, make, public, take,      |
|                              | well, even, example, text, sitting, act,         |
|                              | edition, read, printed, adjourned, adjourn,      |
|                              | resolved, order, assent, proceeding, chapter,     |
|                              | revised, amend, presented, second, available,    |
|                              | amendment, consequential, state, debate,        |
|                              | tomorrow, resumption, day, beg, fixed, stand,    |
|                              | date, today, accordingly, sit, exempted,         |
|                              | provision, present, general, paper, item,        |
|                              | today, allotted, supply, committee,              |
|                              | consideration, th, nd, rd, pursuant, minute,     |
|                              | pm, rising, speech, respect, discussion,         |
|                              | agreed, january, february, march, april,        |
|                              | may, june, july, august, september, october,    |
|                              | november, december, fy, leave, progress,        |
|                              | chair, head, said                                |
| Specific Terms and Names     | bill, first, question, continue, must, ensure,   |
|                              | proc, amendment, consequential, debate,         |
|                              | according, thursday, friday, saturday, sunday,  |
|                              | kim, gan, desmond, lee, yong, hon, provision,   |
|                              | mdm                                               |

In [10]:
custom_stopwords = ['parliament', 'ministry', 'minister',
                    'parliamentary', 'mr', 'speaker', 'asked', 'sir', 'thank',
                    'also', 'year', 'time', 'need', 'new', 'government',
                    'would', 'one', 'may', 'many', 'year', 'member',
                    'like', 'whether', 'u', 'make', 'public', 'take',
                    'bill', 'first', 'question', 'well', 'continue', 'must',
                    'ensure', 'even', 'example', 'proc', 'text', 'sitting',
                    'act', 'edition', 'read', 'printed', 'adjourned', 'adjourn',
                    'resolved', 'order', 'assent', 'standing', 'consent', 'proceeding',
                    'chapter', 'revised', 'amend', 'presented', 'second', 'available',
                    'amendment', 'consequential', 'state', 'debate', 'tomorrow',
                    'resumption', 'day', 'deputy',  'beg', 'indranee', 'rajah',
                    'zaqy', 'mohamad', 'yien', 'hai', 'fu', 'grace', 'fixed',
                    'stand', 'date', 'today', 'accordingly', 'sit', 'exempted',
                    'member', 'provision', 'present', 'general', 'paper', 'item',
                    'today', 'allotted', 'supply', 'committee', 'consideration',
                    'th', 'nd', 'rd', 'pursuant', 'minute', 'pm',
                    'hen', 'eng', 'ng', 'dr', 'monday', 'tuesday', 'wednesday',
                    'thursday', 'friday', 'saturday', 'sunday', 'rising',
                    'kim', 'gan', 'desmond', 'lee', 'yong', 'member', 'hon',
                    'speech', 'proceeding', 'respect', 'discussion', 'leader',
                    'provision', 'agreed', 'house', 'january', 'february', 'march',
                    'april', 'may', 'june', 'july', 'august', 'september',
                    'october', 'november', 'december', 'fy', 'leave', 'progress',
                    'chair', 'head', 'mdm', 'said', 'interruption', 'minute',
                    'propose', 'extend', 'moment', 'day', 'period', 'completion',
                    'business', 'days', 'today', 'facilitate', 'singapore',
                    'singaporean', 'ma', 'rgb', 'stylecolor', 'span', 'u']

ct_df['speech_text'] = ct_df['speech_text'].apply(lowercase_text)
ct_df['speech_text'] = ct_df['speech_text'].apply(remove_punctuation)
ct_df['speech_text'] = ct_df['speech_text'].apply(remove_stopwords, custom_stopwords=custom_stopwords)
ct_df['speech_text'] = ct_df['speech_text'].apply(lemmatize_text)
ct_df['speech_text'] = ct_df['speech_text'].apply(clean_text)

In [11]:
ct_df.iloc[4,1]

'manpower view senior live mature estate whose hdb flat shorter remaining lease lower resale value consider reassessing use hdb flat type eligibility criterion silver support scheme especially regard ownership room larger hdb flat disqualifies senior receiving silver support silver support scheme targeted senior lower income working year little family support resource retirement property ownership indicative seniors resource remains relevant ensuring silver support scheme targeted senior particular senior room larger housing development board hdb flat likely resource access additional retirement fund hence eligible silver support nonetheless senior face unique circumstance including room larger hdb flat short remaining lease low annual value write central provident fund cpf board review eligibility silver support consider merit appeal casebycase basis strengthen support retirement adequacy announced enhancement silver support scheme budget includes increasing qualifying per caput house

In [12]:
import time
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
from gensim.corpora import Dictionary
from gensim.models import Nmf
import matplotlib.pyplot as plt

texts = ct_df['speech_text']

# Preprocess data
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(texts)

# Create a Gensim Dictionary
dictionary = Dictionary(texts.apply(str.split))

# Convert sparse TF-IDF matrix to Gensim corpus
corpus = [dictionary.doc2bow(doc.split()) for doc in texts]

# Define a number of topics
num_topics = 25

In [14]:
nmf_model = Nmf(corpus=corpus, num_topics=num_topics, id2word=dictionary, random_state=42)

In [15]:
for i in range(num_topics):
  print(f"Topic {i}")
  print(nmf_model.print_topic(i, topn=20))

Topic 0
0.017*"council" + 0.017*"town" + 0.012*"think" + 0.009*"member" + 0.009*"point" + 0.007*"data" + 0.007*"party" + 0.007*"say" + 0.006*"fact" + 0.006*"m" + 0.006*"case" + 0.006*"system" + 0.005*"made" + 0.005*"statement" + 0.005*"want" + 0.005*"let" + 0.005*"information" + 0.004*"know" + 0.004*"issue" + 0.004*"put"
Topic 1
0.068*"health" + 0.047*"mental" + 0.015*"support" + 0.013*"healthcare" + 0.013*"wellbeing" + 0.012*"care" + 0.009*"condition" + 0.008*"workplace" + 0.007*"parent" + 0.006*"service" + 0.006*"person" + 0.005*"strategy" + 0.005*"family" + 0.005*"intervention" + 0.005*"treatment" + 0.005*"professional" + 0.004*"individual" + 0.004*"child" + 0.004*"physical" + 0.004*"employee"
Topic 2
0.016*"law" + 0.013*"officer" + 0.012*"security" + 0.011*"drug" + 0.009*"legal" + 0.008*"international" + 0.008*"conflict" + 0.007*"israel" + 0.006*"foreign" + 0.006*"country" + 0.006*"palestinian" + 0.006*"lawyer" + 0.006*"civilian" + 0.005*"police" + 0.005*"humanitarian" + 0.005*"pea

In [16]:
from gensim.utils import simple_preprocess

def get_topic_distribution(text):

    tokens = text.split()
    bow_vector = dictionary.doc2bow(tokens)
    topic_distribution = nmf_model.get_document_topics(bow_vector)

    return topic_distribution

# Apply the function to each row of the DataFrame
ct_df['topic_distribution'] = ct_df['speech_text'].apply(get_topic_distribution)

In [17]:
ct_df

Unnamed: 0,topic_id,speech_text,topic_distribution
0,2024-03-07-T-001,ask exercise power reinstate half hour questio...,"[(0, 0.3352175513963566), (3, 0.03944268953929..."
1,2024-03-07-T-002,defence effort taken address mental health iss...,"[(1, 0.5466514050034106), (2, 0.03945898922081..."
2,2024-03-07-T-003,health planned capacity increase patient seeki...,"[(1, 0.36261444491240097), (3, 0.0785749825004..."
3,2024-03-07-T-004,health active ageing centre set next five year...,"[(0, 0.14379564324129757), (3, 0.0244223978221..."
4,2024-03-07-T-005,manpower view senior live mature estate whose ...,"[(1, 0.041883489166909975), (4, 0.200667339745..."
...,...,...,...
17457,2012-09-10-T-067,education total amount partnership related fee...,"[(2, 0.023994995020700453), (3, 0.019303055505..."
17458,2012-09-10-T-068,acting community development youth sport past ...,"[(3, 0.020818916210223058), (4, 0.115768057109..."
17459,2012-09-10-T-069,acting manpower year number employer warned fi...,"[(4, 0.01297885457231009), (15, 0.174787769095..."
17460,2012-09-10-T-070,acting manpower gathering feedback smes impact...,"[(8, 0.16504749798832583), (10, 0.382534101444..."


In [18]:
ct_df.iloc[2,1]

'health planned capacity increase patient seeking mental health specialist care imh ii alexandra hospital b step taken support necessary resource required alexandra hospital provide outpatient inpatient mental health specialist service including acute bed psychiatric care rehabilitation institute mental health imh recently refurbished acute psychiatric ward enhance inpatient care expanding capacity meet future demand support increase demand mental health service planning increase number psychiatrist increase number healthcare worker training psychology expand mental health service primary community setting support national mental health wellbeing strategy m nadia quick followup supplementary senior sharing training place learning opportunity think student interested pursue route could senior give little bit detail training opportunity trickle learning institution understand m nadia samdins training opportunity made within learning institution number institution already provide training

In [19]:
ct_df.iloc[2,2]

[(1, 0.36261444491240097),
 (3, 0.07857498250046385),
 (7, 0.012059774477916556),
 (8, 0.10251446598542281),
 (9, 0.1398903405667462),
 (14, 0.06278710762071317),
 (17, 0.02482372113816609),
 (20, 0.059880371079569235),
 (21, 0.053058259662708586),
 (22, 0.09606004713599951)]

In [20]:
def extract_topic_info(row, num_topics):
    topic_info = dict(row['topic_distribution'])
    for i in range(num_topics):
        if i in topic_info:
            row[f'topic_{i}_distribution'] = topic_info[i]
        else:
            row[f'topic_{i}_distribution'] = 0
    return row

In [21]:
def extract_highest_topic_info(topic_distribution):
    if not topic_distribution:
        return None, None
    topic_info = dict(topic_distribution)
    max_topic = max(topic_info, key=topic_info.get)
    return int(max_topic), topic_info.get(max_topic, 0)

In [22]:
ct_df = ct_df.apply(lambda row: extract_topic_info(row, num_topics), axis=1)

In [23]:
ct_df['highest_topic'], ct_df['highest_topic_distribution'] = zip(*ct_df['topic_distribution'].apply(extract_highest_topic_info))

In [24]:
ct_df.head()

Unnamed: 0,topic_id,speech_text,topic_distribution,topic_0_distribution,topic_1_distribution,topic_2_distribution,topic_3_distribution,topic_4_distribution,topic_5_distribution,topic_6_distribution,...,topic_17_distribution,topic_18_distribution,topic_19_distribution,topic_20_distribution,topic_21_distribution,topic_22_distribution,topic_23_distribution,topic_24_distribution,highest_topic,highest_topic_distribution
0,2024-03-07-T-001,ask exercise power reinstate half hour questio...,"[(0, 0.3352175513963566), (3, 0.03944268953929...",0.335218,0.0,0.0,0.039443,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.083569,0.022766,0.11792,16.0,0.349401
1,2024-03-07-T-002,defence effort taken address mental health iss...,"[(1, 0.5466514050034106), (2, 0.03945898922081...",0.0,0.546651,0.039459,0.0,0.0,0.0,0.0,...,0.037361,0.0,0.0,0.010403,0.0,0.0,0.0,0.0,1.0,0.546651
2,2024-03-07-T-003,health planned capacity increase patient seeki...,"[(1, 0.36261444491240097), (3, 0.0785749825004...",0.0,0.362614,0.0,0.078575,0.0,0.0,0.0,...,0.024824,0.0,0.0,0.05988,0.053058,0.09606,0.0,0.0,1.0,0.362614
3,2024-03-07-T-004,health active ageing centre set next five year...,"[(0, 0.14379564324129757), (3, 0.0244223978221...",0.143796,0.0,0.0,0.024422,0.0,0.0,0.0,...,0.179686,0.011138,0.0,0.064252,0.0,0.012329,0.014103,0.097103,14.0,0.194433
4,2024-03-07-T-005,manpower view senior live mature estate whose ...,"[(1, 0.041883489166909975), (4, 0.200667339745...",0.0,0.041883,0.0,0.0,0.200667,0.015246,0.09129,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.281981


In [25]:
pivoted_distributions = ct_df[["topic_id"]+[f"topic_{i}_distribution" for i in range(num_topics)]]
unpivoted = pivoted_distributions.melt(id_vars = ['topic_id'],
                                       var_name = 'topic',
                                       value_name = 'distribution')
unpivoted['topic'] = unpivoted['topic'].str.extract('(\d+)').astype(int)
unpivoted = unpivoted[unpivoted['distribution'] != 0]
unpivoted = unpivoted.sort_values(by=['topic_id', 'topic'])
unpivoted = unpivoted.reset_index(drop=True)
unpivoted['date'] = unpivoted['topic_id'].apply(lambda x: pd.to_datetime(x[:10]))

In [26]:
unpivoted

Unnamed: 0,topic_id,topic,distribution,date
0,2012-09-10-T-001,0,0.343569,2012-09-10
1,2012-09-10-T-001,2,0.053899,2012-09-10
2,2012-09-10-T-001,6,0.017128,2012-09-10
3,2012-09-10-T-001,9,0.096178,2012-09-10
4,2012-09-10-T-001,14,0.124965,2012-09-10
...,...,...,...,...
129259,2024-03-07-T-041,22,0.013568,2024-03-07
129260,2024-03-07-T-042,0,0.075074,2024-03-07
129261,2024-03-07-T-042,7,0.420817,2024-03-07
129262,2024-03-07-T-042,9,0.134863,2024-03-07


In [48]:
dataset_id = "topic_modelling"
table_id = f"topic_distribution_{num_topics}_nmf_{date_yyyymmdd}"

unpivoted.to_gbq(destination_table=f"{dataset_id}.{table_id}",
          project_id=project_id,
          if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 7013.89it/s]


In [28]:
highest_topic = ct_df[["topic_id", "highest_topic", "highest_topic_distribution"]]
highest_topic['date'] = highest_topic['topic_id'].apply(lambda x: pd.to_datetime(x[:10]))
highest_topic = highest_topic[~highest_topic['highest_topic'].isna()]
highest_topic['highest_topic'] = highest_topic['highest_topic'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highest_topic['date'] = highest_topic['topic_id'].apply(lambda x: pd.to_datetime(x[:10]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highest_topic['highest_topic'] = highest_topic['highest_topic'].astype(int)


In [29]:
highest_topic

Unnamed: 0,topic_id,highest_topic,highest_topic_distribution,date
0,2024-03-07-T-001,16,0.349401,2024-03-07
1,2024-03-07-T-002,1,0.546651,2024-03-07
2,2024-03-07-T-003,1,0.362614,2024-03-07
3,2024-03-07-T-004,14,0.194433,2024-03-07
4,2024-03-07-T-005,14,0.281981,2024-03-07
...,...,...,...,...
17457,2012-09-10-T-067,22,0.301020,2012-09-10
17458,2012-09-10-T-068,7,0.380711,2012-09-10
17459,2012-09-10-T-069,21,0.598795,2012-09-10
17460,2012-09-10-T-070,21,0.389710,2012-09-10


In [49]:
dataset_id = "topic_modelling"
table_id = f"highest_topic_{num_topics}_nmf_{date_yyyymmdd}"

highest_topic.to_gbq(destination_table=f"{dataset_id}.{table_id}",
          project_id=project_id,
          if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 1414.13it/s]


In [31]:
!pip install openai --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.9/262.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [53]:
from openai import OpenAI
from google.colab import userdata

client = OpenAI(
    api_key = userdata.get('OPENAI_API_KEY')
)

In [33]:
# Function to get summarization from ChatGPT
def get_summarization(topic_top_n_words):
    response = client.chat.completions.create(
      model="gpt-3.5-turbo-0125",
      messages=[
        {"role": "system", "content": "You're a political text analyst focusing on Singapore Parliament Speeches. Given top words, provide a 1-2 word summary of the topic."},
        {"role": "user", "content": f"The words are: {topic_top_n_words}. What's a summary in 1-2 words?"}
      ]
    )
    return response.choices[0].message.content

def extract_topic_words(topic_repr):
    # Split the representation by "+" to get individual terms
    terms = topic_repr.split('+')
    topic_words = []
    for term in terms:
        # Extract the word (text between quotes) using regular expression
        word = re.findall(r'"([^"]*)"', term)
        # Append the extracted word to the list of topic words
        if word:
            topic_words.append(word[0])
    return topic_words

In [34]:
topic_words_list = []

for i in range(num_topics):
    topic_repr = nmf_model.print_topic(i, topn=15)
    topic_words = extract_topic_words(topic_repr)
    topic_words_list.append(topic_words)

In [35]:
for i in range(num_topics):
  print(' '.join(topic_words_list[i]))

council town think member point data party say fact m case system made statement want
health mental support healthcare wellbeing care condition workplace parent service person strategy family intervention treatment
law officer security drug legal international conflict israel foreign country palestinian lawyer civilian police humanitarian
healthcare patient care doctor moh hospital medical healthier health system sg community disease nurse healthy
flat hdb housing price resident home cost bto estate rental owner household resale market room
digital data technology company job support opportunity business work economy skill programme access talent child
family woman support social child care society work u community issue men marriage caregiver mother
sport programme support community art youth child student school work development centre family chairman opportunity
sector service help programme last industry year work plan area technology agency development chairman centre
national spo

In [36]:
topic_summaries = []
for topic_words in topic_words_list:
    topic_top_n_words = ' '.join(topic_words)
    summary = get_summarization(topic_top_n_words)
    topic_summaries.append(summary)

In [37]:
topic_summaries = [topic.strip('"').title() for topic in topic_summaries]

In [38]:
for i, summary in enumerate(topic_summaries):
    print(f"Topic {i}: {summary}")

Topic 0: Debates In Parliament
Topic 1: Healthcare Support
Topic 2: Foreign Relations
Topic 3: Summary: Healthcare System
Topic 4: Housing Affordability
Topic 5: Skills Development
Topic 6: Family Support
Topic 7: Community Engagement
Topic 8: Government Programs
Topic 9: Sports Development
Topic 10: Green Economy
Topic 11: Family Issues
Topic 12: Economic Policies
Topic 13: Transportation Infrastructure
Topic 14: Elderly Care
Topic 15: National Security
Topic 16: National Development
Topic 17: Interfaith Relations
Topic 18: Environmental Policy
Topic 19: Economic Policies
Topic 20: Cybersecurity Measures
Topic 21: Labor Issues
Topic 22: Education Reform
Topic 23: Legal Reforms
Topic 24: Policy Analysis


In [39]:
topic_names = pd.DataFrame({
    'topic': [i for i in range(num_topics)],
    'topic_summary': topic_summaries,
    'top_n_words': [' '.join(topic_words) for topic_words in topic_words_list]
})

In [50]:
dataset_id = "topic_modelling"
table_id = f"topic_names_{num_topics}_nmf_{date_yyyymmdd}"

topic_names.to_gbq(destination_table=f"{dataset_id}.{table_id}",
          project_id=project_id,
          if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 3809.54it/s]


In [52]:
# Save NMF Model

from google.colab import drive
import os
drive.mount('/content/drive')

# Define the directory path
directory = "/content/drive/My Drive/singapore-parliament-speeches/"

# Create the directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)

nmf_model.save(f"{directory}model_{num_topics}_nmf_{date_yyyymmdd}")


# Load the NMF model from the 'singapore-parliament-speeches' folder in Google Drive
# nmf_model_path = "f/content/drive/My Drive/singapore-parliament-speeches/nmf_model_{num_topics}_topics"
# loaded_nmf_model = Nmf.load(nmf_model_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
