In [1]:
import os
import glob
import nltk
import sklearn
import pandas as pd
import plotly.express as px
from bs4 import BeautifulSoup
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Fetch the NLTK resources used further down: the stopword list, the Punkt
# tokenizer data behind word_tokenize, and the WordNet data behind
# WordNetLemmatizer.
# 'punkt_tab' is requested alongside 'punkt' because NLTK 3.8.2+ loads the
# tokenizer from 'punkt_tab', while older releases still read 'punkt'.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeanettepoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeanettepoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeanettepoh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jeanettepoh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
file_pattern = 'scraped_data/*.txt'

# CSS selector for each field extracted from a scraped page, keyed by the
# DataFrame column the value ends up in.
field_selectors = {
    'Sitting Date': '#right1-1 > div > table > tr:nth-child(5) > td:nth-child(2) > span',
    'Section Name': '#right1-1 > div > table > tr:nth-child(6) > td:nth-child(2) > span',
    'Title': '#right1-1 > div > table > tr:nth-child(7) > td:nth-child(2) > span',
    'Text': '#showTopic > div.hansardContent > div',
}

data = []
files_with_no_elements = []

# NOTE(review): glob.glob returns paths in a filesystem-dependent order, so the
# row order of `df` can differ between machines. Wrapping this in sorted()
# would make it reproducible, but it would also reorder the rows relative to
# the current run, so the hand-written topic labels later in the notebook
# would need re-checking.
matched_files = glob.glob(file_pattern)

for file in matched_files:
    try:
        with open(file, 'r', encoding='utf-8') as f:
            # Read the entire content of the file
            content = f.read()

        # Parse the HTML content
        soup = BeautifulSoup(content, 'html.parser')

        # Look up every field; select_one returns None when nothing matches
        elements = {
            field: soup.select_one(selector)
            for field, selector in field_selectors.items()
        }

        # Skip (and remember) files where any of the fields is missing
        if not all(elements.values()):
            files_with_no_elements.append(file)
            continue

        # Every element is present here, so the text can be read unconditionally
        data.append({
            field: element.text.strip()
            for field, element in elements.items()
        })

    except Exception as e:
        # Best effort: report the problem and carry on with the remaining files
        print(f"Error processing file '{file}': {e}")

# Create a DataFrame from the collected data. Passing the column names keeps
# the expected columns in place even when no file could be parsed.
df = pd.DataFrame(data, columns=list(field_selectors))

# Drop rows with any missing values
df = df.dropna()

# Report on the outcome of the scan
if not matched_files:
    print(f"\nNo files matched pattern '{file_pattern}'.")
elif files_with_no_elements:
    print("\nFiles with no matching elements:")
    for file in files_with_no_elements:
        print(file)
else:
    print("\nAll files had matching elements.")


Files with no matching elements:
scraped_data/sprs3topic_reportid=budget-1314.txt
scraped_data/sprs3topic_reportid=president-address-27.txt
scraped_data/sprs3topic_reportid=budget-1277.txt
scraped_data/sprs3topic_reportid=budget-820.txt
scraped_data/sprs3topic_reportid=budget-1099.txt
scraped_data/sprs3topic_reportid=budget-1338.txt
scraped_data/sprs3topic_reportid=budget-840.txt
scraped_data/sprs3topic_reportid=budget-1160.txt
scraped_data/sprs3topic_reportid=budget-688.txt
scraped_data/sprs3topic_reportid=budget-1142.txt
scraped_data/sprs3topic_reportid=budget-867.txt
scraped_data/sprs3topic_reportid=budget-696.txt
scraped_data/sprs3topic_reportid=budget-938.txt
scraped_data/sprs3topic_reportid=motion-902.txt
scraped_data/sprs3topic_reportid=budget-870.txt
scraped_data/sprs3topic_reportid=budget-1081.txt
scraped_data/sprs3topic_reportid=budget-1279.txt


In [4]:
# Show the DataFrame built from the scraped files.
df

Unnamed: 0,Sitting Date,Section Name,Title,Text
0,7-2-2017,Oral Answers to Questions,Fair Process for Termination Due to Poor Perfo...,1 Dr Tan Wu Meng asked\tthe Minister for Manpo...
1,9-7-2018,Written Answers to Questions for Oral Answer N...,Details and Framework of Workings of The Speci...,22 Assoc Prof Fatimah Lateef asked the Ministe...
2,28-2-2020,Budget,Committee of Supply – Head R (Ministry of Law),"The Chairman: Head R, Ministry of Law. Mr Chri..."
3,6-3-2019,Oral Answers to Questions,Female Representation in Statutory Boards,The following question stood in the name of Ms...
4,6-3-2018,Budget,Committee of Supply – Head W (Ministry of Tran...,Growing Pains in TransportMr Sitoh Yih Pin (Po...
...,...,...,...,...
159,11-9-2017,Oral Answers to Questions,Complaints on Lapses in Real-time Updates for ...,1 Er Dr Lee Bee Wah asked\tthe Minister for Tr...
160,4-11-2019,Second Reading Bills,Central Provident Fund (Amendment) Bill,Order for Second Reading read.4.51 pmThe Minis...
161,5-3-2020,Budget,Committee of Supply – Head I (Ministry of Soci...,"The Chairman: Head I, Ministry of Social and F..."
162,4-6-2020,Ministerial Statements,Government's Plans in our Continued Fight agai...,Debate resumed.Mr Deputy Speaker: Minister Isw...


In [5]:
# Tally how many debates fall under each section name
section_name_counts = (
    df['Section Name']
    .value_counts()
    .rename_axis('Section Name')
    .reset_index(name='Count')
)

# Bar chart of the tally, with the count printed above every bar
fig = px.bar(
    section_name_counts,
    x='Section Name',
    y='Count',
    text='Count',
    title='Distribution of Section Name',
    labels={'Count': 'Frequency'},
)
fig.update_traces(texttemplate='%{text}', textposition='outside')

# Overall layout: axis titles, slanted tick labels, label sizing, figure size,
# margins and a transparent plot background
layout_options = {
    'xaxis_title': 'Section Name',
    'yaxis_title': 'Frequency',
    'xaxis_tickangle': -45,
    'uniformtext_minsize': 8,
    'uniformtext_mode': 'hide',
    'height': 600,
    'width': 1000,
    'margin': {'l': 50, 'r': 50, 'b': 100, 't': 100},
    'plot_bgcolor': 'rgba(0,0,0,0)',
}
fig.update_layout(**layout_options)

# Tick font size on both axes, plus a fixed y-axis range
fig.update_xaxes(tickfont={'size': 12})
fig.update_yaxes(tickfont={'size': 12}, range=[0, 120])

fig.show()

In [6]:
# Words to ignore: NLTK's English stopwords plus terms that show up in nearly
# every title
custom_stopwords = {'head', 'committee', 'ministry', 'debate', 'supply', 'annual', 'statement', 'singapore', 'measures', 'chair', 'prime', 'minister'}
stop_words = set(stopwords.words('english')) | custom_stopwords

# Concatenate every non-missing title into one space-separated string
title_strings = df['Title'].dropna().astype(str)
titles_text = ' '.join(title_strings)

# Build the word cloud from the titles, leaving the stopwords out
cloud_builder = WordCloud(
    width=800,
    height=400,
    background_color='white',
    stopwords=stop_words,
)
wordcloud = cloud_builder.generate(titles_text)

# Render the cloud as an image with Plotly, without the color scale
fig = px.imshow(wordcloud, title='Word Cloud for Titles')
fig.update_layout(coloraxis_showscale=False)
fig.show()

In [7]:
# Extend the custom stopwords with terms specific to the debate texts
extra_stopwords = ["parliament", "session", "government", "mr", "year", "singaporeans", "member"]
custom_stopwords = [*custom_stopwords, *extra_stopwords]

# Merge them with scikit-learn's built-in English stopwords
sklearn_stopwords = sklearn.feature_extraction.text.ENGLISH_STOP_WORDS
combined_stopwords = list(sklearn_stopwords | set(custom_stopwords))

# Lemmatizer shared by the text-cleaning step
lemmatizer = WordNetLemmatizer()

# Function to lemmatize the text
def lemmatize_text(text):
    """Tokenize ``text``, drop stopwords, and lemmatize what remains.

    Each token produced by ``word_tokenize`` is lowercased. Tokens found in
    ``combined_stopwords`` are discarded and the rest are passed through
    ``lemmatizer.lemmatize``.

    Args:
        text: The string to process.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Build a set once per call: testing membership against the module-level
    # list would scan that list for every single token.
    stopword_lookup = frozenset(combined_stopwords)

    # Lowercase each token once and reuse it for both the filter and the lemma
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in lowered_tokens
        if token not in stopword_lookup
    )

# Replace every document's text with its stopword-free, lemmatized form
df['Text'] = df['Text'].map(lemmatize_text)

# Corpus handed to the topic model
texts = df['Text']

# Build the document-term count matrix
vectorizer = CountVectorizer(stop_words=combined_stopwords)
X = vectorizer.fit_transform(texts)

# Train a 10-topic LDA model; fit() hands back the fitted estimator
lda = LatentDirichletAllocation(n_components=10, random_state=25).fit(X)

# Display the most important words for each topic identified by the LDA model
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, iterated one row per topic.
        feature_names: Indexable collection giving the word for a column position.
        n_top_words: How many words to list per topic.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending; reverse it and keep the leading entries so the
        # heaviest words come first
        ranked_positions = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[position] for position in ranked_positions]
        print(f"Topic #{topic_number}:")
        print(" ".join(top_words))
    # One blank line closes the whole listing
    print()

# List the ten strongest words of every topic
n_top_words = 10
vocabulary = vectorizer.get_feature_names_out()
print_top_words(lda, vocabulary, n_top_words)

# Label each debate with the topic that has the largest weight in its row
document_topic_weights = lda.transform(X)
df['Topic'] = document_topic_weights.argmax(axis=1)

Topic #0:
school student education need child learning support programme moe skill
Topic #1:
business company industry sector new digital help technology need smes
Topic #2:
drug lift police team people family offender building home work
Topic #3:
public defence service time saf people family need country threat
Topic #4:
energy smoking smoker product tobacco water carbon cigarettes cigarette emission
Topic #5:
covid 19 support need community people help time art budget
Topic #6:
transport public new bus waste food like vehicle lta need
Topic #7:
worker job need budget support work company time help employer
Topic #8:
care need community service health flat family healthcare support senior
Topic #9:
council town speaker claim cpf law act case amendment time



In [8]:
# Hand-written label for each LDA topic index, based on the top words printed
# for that topic.
topics = {
    0: "Education and Support Programmes",
    1: "Digital Transformation in Industry",
    2: "Law Enforcement and Rehabilitation",
    3: "National Defence and Security",
    4: "Smoking and Environmental Impact",
    5: "Community Support during COVID-19",
    6: "Public Transport and Waste Management",
    7: "Workforce Support and Employment",
    8: "Community Care and Services",
    9: "Legislative Matters and Amendments"
}

# Swap the numeric topic index for its label. Values that are not a key of
# `topics` (for example labels left behind by an earlier run of this cell) are
# kept as they are instead of becoming NaN, so the cell can be re-run safely.
df["Topic"] = df["Topic"].map(lambda topic: topics.get(topic, topic))

In [9]:
# Show the DataFrame again, now with the lemmatized 'Text' and the labelled 'Topic' column.
df

Unnamed: 0,Sitting Date,Section Name,Title,Text,Topic
0,7-2-2017,Oral Answers to Questions,Fair Process for Termination Due to Poor Perfo...,1 dr tan wu meng asked manpower provide update...,Workforce Support and Employment
1,9-7-2018,Written Answers to Questions for Oral Answer N...,Details and Framework of Workings of The Speci...,22 assoc prof fatimah lateef asked social fami...,Community Care and Services
2,28-2-2020,Budget,Committee of Supply – Head R (Ministry of Law),"chairman : r , law . christopher souza , cut ....",Legislative Matters and Amendments
3,6-3-2019,Oral Answers to Questions,Female Representation in Statutory Boards,following question stood m anthea ong –1 ask s...,Community Care and Services
4,6-3-2018,Budget,Committee of Supply – Head W (Ministry of Tran...,growing pain transportmr sitoh yih pin ( poton...,Punlic Transport and Waste Management
...,...,...,...,...,...
159,11-9-2017,Oral Answers to Questions,Complaints on Lapses in Real-time Updates for ...,1 er dr lee bee wah asked transport ( ) lta aw...,Punlic Transport and Waste Management
160,4-11-2019,Second Reading Bills,Central Provident Fund (Amendment) Bill,order second reading read.4.51 pmthe manpower ...,Legislative Matters and Amendments
161,5-3-2020,Budget,Committee of Supply – Head I (Ministry of Soci...,"chairman : , social family development . seah ...",Community Care and Services
162,4-6-2020,Ministerial Statements,Government's Plans in our Continued Fight agai...,resumed.mr deputy speaker : iswaran.6.49 pmthe...,Digital Transformation in Industry


In [10]:
# Number of debates assigned to each topic label.
df["Topic"].value_counts()

Topic
Workforce Support and Employment         27
Digital Transformation in Industry       26
Community Care and Services              22
Punlic Transport and Waste Management    20
Legislative Matters and Amendments       18
Education and Support Programmes         16
National Defence and Security            14
Community Support during COVID-19        13
Law Enforcement and Rehabilitation        6
Smoking and Environmental Impact          2
Name: count, dtype: int64

In [16]:
# Plotting the distribution of dominant topics using Plotly
topic_counts = df["Topic"].value_counts().reset_index()
topic_counts.columns = ['Topic', 'Count']

# 'Topic' is a categorical column, so the palette is passed as a discrete
# sequence; color_continuous_scale only applies when the color column is
# numeric. The figure title is set once, in update_layout below.
fig = px.bar(topic_counts, x='Topic', y='Count',
             labels={'Topic': 'Topic', 'Count': 'Number of Debates'},
             text='Count',
             color='Topic',
             color_discrete_sequence=px.colors.qualitative.Plotly)

# Print each bar's count above it
fig.update_traces(texttemplate='%{text}', textposition='outside')

fig.update_layout(
    title={'text': 'Distribution of Common Topics in Parliamentary Debates', 'x': 0.5, 'xanchor': 'center'},
    xaxis_title='Topic',
    yaxis_title='Count',
    uniformtext_minsize=8, uniformtext_mode='hide',
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(size=12),
    yaxis=dict(showgrid=True, zeroline=False, range=[0, 45]),  # Set the y-axis range to 0-45
    xaxis_tickangle=-45,  # Rotate x-axis labels for better readability
    bargap=0.4,  # Gap between bars
    margin=dict(l=50, r=50, b=100, t=100),  # Adjust margins for better layout
)

fig.show()