In [8]:
import os
import re
import glob
import nltk
import sklearn
import pandas as pd
import plotly.express as px
from bs4 import BeautifulSoup
from collections import Counter
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [9]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeanettepoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeanettepoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeanettepoh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jeanettepoh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [10]:
file_pattern = 'scraped_data/*.txt'
data = []
files_with_no_elements = []

for file in glob.glob(file_pattern):
    try:
        with open(file, 'r', encoding='utf-8') as f:
            # Read the entire content of the file
            content = f.read()

            # Parse the HTML content
            soup = BeautifulSoup(content, 'html.parser')

            # Extract data from specified CSS selectors
            # sitting_date = soup.select_one('#right1-1 > div > table > tr:nth-child(5) > td:nth-child(2) > span')
            section_name = soup.select_one('#right1-1 > div > table > tr:nth-child(6) > td:nth-child(2) > span')
            title = soup.select_one('#right1-1 > div > table > tr:nth-child(7) > td:nth-child(2) > span')
            text = soup.select_one('#showTopic > div.hansardContent > div')
            speakers = soup.select_one('#right1-1 > div > table > tr:nth-child(8) > td:nth-child(2) > span')

            # Check if any element is missing
            if not (section_name and title and text and speakers):
                files_with_no_elements.append(file)
                continue

            # Initialize a dictionary to store data for this file
            file_data = {
                'Section Name': section_name.text.strip() if section_name else None,
                'Title': title.text.strip() if title else None,
                'Text': text.text.strip() if text else None,
                'Speakers': speakers.text.strip() if speakers else None
            }

            # Append the dictionary to the list of data
            data.append(file_data)

    except Exception as e:
        print(f"Error processing file '{file}': {str(e)}")

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Drop rows with any missing values
df.dropna(inplace=True)

# Print files with no matching elements
if files_with_no_elements:
    print("\nFiles with no matching elements:")
    for file in files_with_no_elements:
        print(file)
else:
    print("\nAll files had matching elements.")


Files with no matching elements:
scraped_data/sprs3topic_reportid=budget-1314.txt
scraped_data/sprs3topic_reportid=president-address-27.txt
scraped_data/sprs3topic_reportid=budget-1277.txt
scraped_data/sprs3topic_reportid=budget-820.txt
scraped_data/sprs3topic_reportid=budget-1099.txt
scraped_data/sprs3topic_reportid=budget-1338.txt
scraped_data/sprs3topic_reportid=budget-840.txt
scraped_data/sprs3topic_reportid=budget-1160.txt
scraped_data/sprs3topic_reportid=budget-688.txt
scraped_data/sprs3topic_reportid=budget-1142.txt
scraped_data/sprs3topic_reportid=budget-867.txt
scraped_data/sprs3topic_reportid=budget-696.txt
scraped_data/sprs3topic_reportid=budget-938.txt
scraped_data/sprs3topic_reportid=motion-902.txt
scraped_data/sprs3topic_reportid=budget-870.txt
scraped_data/sprs3topic_reportid=budget-1081.txt
scraped_data/sprs3topic_reportid=budget-1279.txt


In [11]:
df

Unnamed: 0,Section Name,Title,Text,Speakers
0,Oral Answers to Questions,Fair Process for Termination Due to Poor Perfo...,1 Dr Tan Wu Meng asked\tthe Minister for Manpo...,"Er Dr Lee Bee Wah (Nee Soon),Mr Zainal Sapari ..."
1,Written Answers to Questions for Oral Answer N...,Details and Framework of Workings of The Speci...,22 Assoc Prof Fatimah Lateef asked the Ministe...,"[Prof Fatimah Lateef, Mr Desmond Lee]"
2,Budget,Committee of Supply – Head R (Ministry of Law),"The Chairman: Head R, Ministry of Law. Mr Chri...",[The Senior Minister of State for Law (Mr Edwi...
3,Oral Answers to Questions,Female Representation in Statutory Boards,The following question stood in the name of Ms...,[The Senior Parliamentary Secretary to the Min...
4,Budget,Committee of Supply – Head W (Ministry of Tran...,Growing Pains in TransportMr Sitoh Yih Pin (Po...,"[Mr Pritam Singh (Aljunied), Mr Melvin Yong Yi..."
...,...,...,...,...
159,Oral Answers to Questions,Complaints on Lapses in Real-time Updates for ...,1 Er Dr Lee Bee Wah asked\tthe Minister for Tr...,"Er Dr Lee Bee Wah (Nee Soon),Er Dr Lee Bee Wah..."
160,Second Reading Bills,Central Provident Fund (Amendment) Bill,Order for Second Reading read.4.51 pmThe Minis...,"[Mrs Josephine Teo, Assoc Prof Daniel Goh Pei ..."
161,Budget,Committee of Supply – Head I (Ministry of Soci...,"The Chairman: Head I, Ministry of Social and F...","[The Chairman, Mr Sam Tan Chin Siong, Ms Denis..."
162,Ministerial Statements,Government's Plans in our Continued Fight agai...,Debate resumed.Mr Deputy Speaker: Minister Isw...,"[Mr Deputy Speaker, Mr Deputy Speaker, Miss Ch..."


In [12]:
# Directory to save figures
save_dir = "figures"
os.makedirs(save_dir, exist_ok=True)

In [13]:
# Group by Section Name and count occurrences
section_name_counts = df['Section Name'].value_counts().reset_index()
section_name_counts.columns = ['Section Name', 'Count']

# Plotting the distribution with Plotly
fig = px.bar(section_name_counts, x='Section Name', y='Count', text='Count', 
             title='Distribution of Section Name', labels={'Count': 'Frequency'})
fig.update_traces(texttemplate='%{text}', textposition='outside')  # Display count above each bar
fig.update_layout(xaxis_title='Section Name', yaxis_title='Frequency', xaxis_tickangle=-45,
                  uniformtext_minsize=8, uniformtext_mode='hide',  # Adjust text size and visibility
                  height=600, width=1000,  # Adjust plot size
                  margin=dict(l=50, r=50, b=100, t=100),  # Adjust margins for better layout
                  plot_bgcolor='rgba(0,0,0,0)')  # Set plot background color to transparent

fig.update_xaxes(tickfont=dict(size=12))  # Adjust x-axis tick font size
fig.update_yaxes(tickfont=dict(size=12), range=[0, 120])  # Adjust y-axis tick font size and range

# Save the plot as an HTML file
fig.write_html(os.path.join(save_dir, "distribution_of_section_name.html"))

fig.show()

In [14]:
# Assuming df['Title'] contains the titles
titles_text = ' '.join(df['Title'].dropna().astype(str))

# Set NLTK English stopwords
stop_words = set(stopwords.words('english'))

# Add custom stopwords
custom_stopwords = {'head', 'committee', 'ministry', 'debate', 'supply', 'annual', 'statement', 'singapore', 'measures', 'chair', 'prime', 'minister'}
stop_words.update(custom_stopwords)

# Generate the word cloud excluding stopwords
wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=stop_words).generate(titles_text)

# Plotting the word cloud using Plotly
fig = px.imshow(wordcloud, title='Word Cloud for Titles')
fig.update_layout(coloraxis_showscale=False)  # Hide the color axis for cleaner display

# Save the plot as an HTML file
fig.write_html(os.path.join(save_dir, "distribution_of_section_nameword_cloud_for_titles.html"))

fig.show()

In [15]:
# Add custom stopwords
custom_stopwords = list(custom_stopwords) + ["parliament", "session", "government", "mr", "year", "singaporeans", "member"]

# Combine custom stopwords with the default English stopwords
combined_stopwords = list(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS.union(custom_stopwords))

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize the text
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in combined_stopwords]
    return ' '.join(lemmatized_tokens)

# Apply lemmatization to each text in the dataframe
df['Lemmatized Text'] = df['Text'].apply(lemmatize_text)

# Extract all texts for topic modeling
texts = df['Lemmatized Text']

# Convert texts to a matrix of token counts
vectorizer = CountVectorizer(stop_words=combined_stopwords)
X = vectorizer.fit_transform(texts)

# Fit LDA model
lda = LatentDirichletAllocation(n_components=10, random_state=25)  
lda.fit(X)

# Display the most important words for each topic identified by the LDA model
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx}:")
        # Extract and print the top words for a given topic in an LDA model
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

n_top_words = 10
print_top_words(lda, vectorizer.get_feature_names_out(), n_top_words)

# Assign topics to the debates
df['Topic'] = lda.transform(X).argmax(axis=1)

Topic #0:
school student education need child learning support programme moe skill
Topic #1:
business company industry sector new digital help technology need smes
Topic #2:
drug lift police team people family offender building home work
Topic #3:
public defence service time saf people family need country threat
Topic #4:
energy smoking smoker product tobacco water carbon cigarettes cigarette emission
Topic #5:
covid 19 support need community people help time art budget
Topic #6:
transport public new bus waste food like vehicle lta need
Topic #7:
worker job need budget support work company time help employer
Topic #8:
care need community service health flat family healthcare support senior
Topic #9:
council town speaker claim cpf law act case amendment time



In [16]:
topics = {
    0: "Education and Support Programmes",
    1: "Digital Transformation in Industry",
    2: "Law Enforcement and Rehabilitation",
    3: "National Defence and Security",
    4: "Smoking and Environmental Impact",
    5: "Community Support during COVID-19",
    6: "Punlic Transport and Waste Management",
    7: "Workforce Support and Employment",
    8: "Community Care and Services",
    9: "Legislative Matters and Amendments"
}

df["Topic"] = df["Topic"].map(topics)

In [17]:
df

Unnamed: 0,Section Name,Title,Text,Speakers,Lemmatized Text,Topic
0,Oral Answers to Questions,Fair Process for Termination Due to Poor Perfo...,1 Dr Tan Wu Meng asked\tthe Minister for Manpo...,"Er Dr Lee Bee Wah (Nee Soon),Mr Zainal Sapari ...",1 dr tan wu meng asked manpower provide update...,Workforce Support and Employment
1,Written Answers to Questions for Oral Answer N...,Details and Framework of Workings of The Speci...,22 Assoc Prof Fatimah Lateef asked the Ministe...,"[Prof Fatimah Lateef, Mr Desmond Lee]",22 assoc prof fatimah lateef asked social fami...,Community Care and Services
2,Budget,Committee of Supply – Head R (Ministry of Law),"The Chairman: Head R, Ministry of Law. Mr Chri...",[The Senior Minister of State for Law (Mr Edwi...,"chairman : r , law . christopher souza , cut ....",Legislative Matters and Amendments
3,Oral Answers to Questions,Female Representation in Statutory Boards,The following question stood in the name of Ms...,[The Senior Parliamentary Secretary to the Min...,following question stood m anthea ong –1 ask s...,Community Care and Services
4,Budget,Committee of Supply – Head W (Ministry of Tran...,Growing Pains in TransportMr Sitoh Yih Pin (Po...,"[Mr Pritam Singh (Aljunied), Mr Melvin Yong Yi...",growing pain transportmr sitoh yih pin ( poton...,Punlic Transport and Waste Management
...,...,...,...,...,...,...
159,Oral Answers to Questions,Complaints on Lapses in Real-time Updates for ...,1 Er Dr Lee Bee Wah asked\tthe Minister for Tr...,"Er Dr Lee Bee Wah (Nee Soon),Er Dr Lee Bee Wah...",1 er dr lee bee wah asked transport ( ) lta aw...,Punlic Transport and Waste Management
160,Second Reading Bills,Central Provident Fund (Amendment) Bill,Order for Second Reading read.4.51 pmThe Minis...,"[Mrs Josephine Teo, Assoc Prof Daniel Goh Pei ...",order second reading read.4.51 pmthe manpower ...,Legislative Matters and Amendments
161,Budget,Committee of Supply – Head I (Ministry of Soci...,"The Chairman: Head I, Ministry of Social and F...","[The Chairman, Mr Sam Tan Chin Siong, Ms Denis...","chairman : , social family development . seah ...",Community Care and Services
162,Ministerial Statements,Government's Plans in our Continued Fight agai...,Debate resumed.Mr Deputy Speaker: Minister Isw...,"[Mr Deputy Speaker, Mr Deputy Speaker, Miss Ch...",resumed.mr deputy speaker : iswaran.6.49 pmthe...,Digital Transformation in Industry


In [18]:
df["Topic"].value_counts()

Topic
Workforce Support and Employment         27
Digital Transformation in Industry       26
Community Care and Services              22
Punlic Transport and Waste Management    20
Legislative Matters and Amendments       18
Education and Support Programmes         16
National Defence and Security            14
Community Support during COVID-19        13
Law Enforcement and Rehabilitation        6
Smoking and Environmental Impact          2
Name: count, dtype: int64

In [19]:
# Plotting the distribution of dominant topics using Plotly
topic_counts = df["Topic"].value_counts().reset_index()
topic_counts.columns = ['Topic', 'Count']

fig = px.bar(topic_counts, x='Topic', y='Count', title='Distribution of Dominant Topics',
             labels={'Topic': 'Topic', 'Count': 'Number of Debates'},
             text='Count',
             color='Topic',
             color_continuous_scale=px.colors.qualitative.Plotly)  # Use Plotly's qualitative color scale

fig.update_traces(texttemplate='%{text}', textposition='outside')

fig.update_layout(
    title={'text': 'Distribution of Common Topics in Parliamentary Debates', 'x': 0.5, 'xanchor': 'center'},
    xaxis_title='Topic',
    yaxis_title='Count',
    uniformtext_minsize=8, uniformtext_mode='hide',
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(size=12),
    yaxis=dict(showgrid=True, zeroline=False, range=[0, 45]),  # Set the y-axis range to 0-45
    xaxis_tickangle=-45,  # Rotate x-axis labels for better readability
    bargap=0.4,  # Gap between bars
    margin=dict(l=50, r=50, b=100, t=100),  # Adjust margins for better layout
)

# Save the plot as an HTML file
fig.write_html(os.path.join(save_dir, "distribution_of_common_topics.html"))

fig.show()

In [20]:
df_questions = df[df["Section Name"].str.contains("Questions")]
df_questions

Unnamed: 0,Section Name,Title,Text,Speakers,Lemmatized Text,Topic
0,Oral Answers to Questions,Fair Process for Termination Due to Poor Perfo...,1 Dr Tan Wu Meng asked\tthe Minister for Manpo...,"Er Dr Lee Bee Wah (Nee Soon),Mr Zainal Sapari ...",1 dr tan wu meng asked manpower provide update...,Workforce Support and Employment
1,Written Answers to Questions for Oral Answer N...,Details and Framework of Workings of The Speci...,22 Assoc Prof Fatimah Lateef asked the Ministe...,"[Prof Fatimah Lateef, Mr Desmond Lee]",22 assoc prof fatimah lateef asked social fami...,Community Care and Services
3,Oral Answers to Questions,Female Representation in Statutory Boards,The following question stood in the name of Ms...,[The Senior Parliamentary Secretary to the Min...,following question stood m anthea ong –1 ask s...,Community Care and Services
13,Oral Answers to Questions,Wage Growth Moving in Tandem with Productivity...,1 Mr Saktiandi Supaat asked the Minister for T...,"Mr Saktiandi Supaat,The Senior Minister of Sta...",1 saktiandi supaat asked trade industry ( ) wa...,Digital Transformation in Industry
14,Oral Answers to Questions,Impact of Recent Fires in Australia on Singapo...,29 Mr Seah Kian Peng asked the Minister for th...,[The Minister for the Environment and Water Re...,29 seah kian peng asked environment water reso...,Punlic Transport and Waste Management
15,Oral Answers to Questions,Capital Reserve in PUB's Accounts,8 Mr Liang Eng Hwa asked the Minister for Fina...,"[Mr Pritam Singh, Mr Liang Eng Hwa, Mr Low Thi...",8 liang eng hwa asked finance clarify $ 5.3 bi...,Workforce Support and Employment
24,Written Answers to Questions for Oral Answer N...,Challenges in Applying for Assistance under Le...,41 Mr Thomas Chua Kee Seng asked the Minister ...,"Mr Thomas Chua Kee Seng,Mr Lim Swee Say",41 thomas chua kee seng asked manpower ( ) key...,Digital Transformation in Industry
30,Oral Answers to Questions,Waiting Time for Childcare Centre Places,5 Er Dr Lee Bee Wah asked\tthe Minister for So...,[The Minister for Social and Family Developmen...,5 er dr lee bee wah asked social family develo...,Community Care and Services
32,Written Answers to Questions for Oral Answer N...,Update on Local Enterprise and Association Dev...,40 Mr Thomas Chua Kee Seng asked the Minister ...,"[Mr Thomas Chua Kee Seng, Mr Chan Chun Sing]",40 thomas chua kee seng asked trade industry p...,Digital Transformation in Industry
34,Oral Answers to Questions,Measures to Mitigate Increasing Cost of Living,2 Mr Liang Eng Hwa asked the Minister for Trad...,[The Minister for Trade and Industry (Mr Chan ...,2 liang eng hwa asked trade industry ( ) gener...,Workforce Support and Employment


In [21]:
len(df_questions)

34

In [22]:
# Remove the square brackets from the Speakers column
df_questions['Speakers'] = df_questions['Speakers'].str.replace('[\[\]]', '', regex=True)

# Function to check if the speaker asked a question
def asked_question(text, speaker):
    parts = speaker.split()
    for part in parts:
        # Regex pattern to capture name after "The following question stood in the name of"
        pattern1 = re.compile(r'The\s+following\s+question\s+stood\s+in\s+the\s+name\s+of\s+([\w\s\-]+)[^\w\s]*')

        # Regex pattern to capture name before "asked"
        pattern2 = re.compile(r'([\w\s\-]+)\s+asked', re.IGNORECASE)

        # Find all matches for both patterns
        matches1 = pattern1.findall(text)
        matches2 = pattern2.findall(text)

        # Check if any part of the speaker's name matches
        if any(re.search(r'\b{}\b'.format(re.escape(part)), match, re.IGNORECASE) for match in matches1 + matches2):
            return True

    return False

# Apply the function to determine if each speaker asked a question
def get_mps_who_asked_questions(row):
    speakers = [s.strip() for s in row['Speakers'].split(',')]
    mps_asked = [s for s in speakers if asked_question(row['Text'], s)]
    return mps_asked

df_questions['MPs_Asked'] = df_questions.apply(get_mps_who_asked_questions, axis=1)


invalid escape sequence '\['


invalid escape sequence '\['


invalid escape sequence '\['



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
df_questions.loc[3]["Text"]

"The following question stood in the name of Ms Anthea Ong\xa0\t–1 To ask\xa0the Minister for Social and Family Development (a) how we are doing with our target for 20% female representation in Statutory Boards and listed companies by 2020; (b) what steps are being taken to close the gender salary gap of about 20%; and (c) in view of the upcoming International Women's Day theme of #BalanceforBetter, what are existing imbalances in our gender equality pursuit that we must balance for a better Singapore.\xa0 \tAssoc Prof Walter Theseira (Nominated Member): Question No 1, Sir.The Senior Parliamentary Secretary to the Minister for Social and Family Development (Assoc Prof Dr Muhammad Faishal Ibrahim) (for the Minister for Social and Family Development): Mr Speaker, a few of the issues being asked in this Parliamentary Question (PQ) are similar to the issues being raised by Members Prof Fatimah Lateef and Ms Rahayu Mahzam in the COS cuts for MSF. May I have your permission to address the is

In [24]:
df_questions

Unnamed: 0,Section Name,Title,Text,Speakers,Lemmatized Text,Topic,MPs_Asked
0,Oral Answers to Questions,Fair Process for Termination Due to Poor Perfo...,1 Dr Tan Wu Meng asked\tthe Minister for Manpo...,"Er Dr Lee Bee Wah (Nee Soon),Mr Zainal Sapari ...",1 dr tan wu meng asked manpower provide update...,Workforce Support and Employment,"[Er Dr Lee Bee Wah (Nee Soon), Dr Tan Wu Meng ..."
1,Written Answers to Questions for Oral Answer N...,Details and Framework of Workings of The Speci...,22 Assoc Prof Fatimah Lateef asked the Ministe...,"Prof Fatimah Lateef, Mr Desmond Lee",22 assoc prof fatimah lateef asked social fami...,Community Care and Services,[Prof Fatimah Lateef]
3,Oral Answers to Questions,Female Representation in Statutory Boards,The following question stood in the name of Ms...,The Senior Parliamentary Secretary to the Mini...,following question stood m anthea ong –1 ask s...,Community Care and Services,[The Senior Parliamentary Secretary to the Min...
13,Oral Answers to Questions,Wage Growth Moving in Tandem with Productivity...,1 Mr Saktiandi Supaat asked the Minister for T...,"Mr Saktiandi Supaat,The Senior Minister of Sta...",1 saktiandi supaat asked trade industry ( ) wa...,Digital Transformation in Industry,"[Mr Saktiandi Supaat, Saktiandi Supaat, Mr Sak..."
14,Oral Answers to Questions,Impact of Recent Fires in Australia on Singapo...,29 Mr Seah Kian Peng asked the Minister for th...,The Minister for the Environment and Water Res...,29 seah kian peng asked environment water reso...,Punlic Transport and Waste Management,"[Mr Seah Kian Peng, Mr Seah Kian Peng (Marine ..."
15,Oral Answers to Questions,Capital Reserve in PUB's Accounts,8 Mr Liang Eng Hwa asked the Minister for Fina...,"Mr Pritam Singh, Mr Liang Eng Hwa, Mr Low Thia...",8 liang eng hwa asked finance clarify $ 5.3 bi...,Workforce Support and Employment,"[Mr Pritam Singh, Mr Liang Eng Hwa, Mr Low Thi..."
24,Written Answers to Questions for Oral Answer N...,Challenges in Applying for Assistance under Le...,41 Mr Thomas Chua Kee Seng asked the Minister ...,"Mr Thomas Chua Kee Seng,Mr Lim Swee Say",41 thomas chua kee seng asked manpower ( ) key...,Digital Transformation in Industry,"[Mr Thomas Chua Kee Seng, Mr Lim Swee Say]"
30,Oral Answers to Questions,Waiting Time for Childcare Centre Places,5 Er Dr Lee Bee Wah asked\tthe Minister for So...,The Minister for Social and Family Development...,5 er dr lee bee wah asked social family develo...,Community Care and Services,"[Er Dr Lee Bee Wah, Er Dr Lee Bee Wah (Nee Soo..."
32,Written Answers to Questions for Oral Answer N...,Update on Local Enterprise and Association Dev...,40 Mr Thomas Chua Kee Seng asked the Minister ...,"Mr Thomas Chua Kee Seng, Mr Chan Chun Sing",40 thomas chua kee seng asked trade industry p...,Digital Transformation in Industry,"[Mr Thomas Chua Kee Seng, Mr Chan Chun Sing]"
34,Oral Answers to Questions,Measures to Mitigate Increasing Cost of Living,2 Mr Liang Eng Hwa asked the Minister for Trad...,The Minister for Trade and Industry (Mr Chan C...,2 liang eng hwa asked trade industry ( ) gener...,Workforce Support and Employment,[The Minister for Trade and Industry (Mr Chan ...


In [26]:
df_questions["MPs_Asked"].value_counts()

MPs_Asked
[Dr Lim Wee Kiak (Sembawang), Dr Lim Wee Kiak]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              2
[Assoc Prof Walter Theseira, Assoc Prof Walter Theseira (Nominated Member)]                                                                                                                                                                                                                                                                                                                                                                                     

In [27]:
# Function to clean and standardize names
def clean_names(names):
    # Convert list of names to a single string for processing
    names_str = ", ".join(names)
    
    # Define terms to replace and their replacements
    replacements = {
        'The Minister for Manpower (Mrs Josephine Teo)': 'Mrs Josephine Teo',
        'The Minister for Manpower and Second Minister for Home Affairs (Mrs Josephine Teo)': 'Mrs Josephine Teo',
        'The Deputy Prime Minister and Coordinating Minister for National Security (Mr Teo Chee Hean)': 'Mr Teo Chee Hean',
        'The Minister for Trade and Industry (Mr Chan Chun Sing)': 'Mr Chan Chun Sing',
        'The Senior Parliamentary Secretary to the Minister for Social and Family Development (Assoc Prof Dr Muhammad Faishal Ibrahim)': 'Assoc Prof Dr Muhammad Faishal Ibrahim',
        'Liang Eng Hwa': 'Mr Liang Eng Hwa',
        'Saktiandi Supaat': 'Mr Saktiandi Supaat'
    }
    
    # Replace specified terms in each name
    cleaned_names = []
    for name in names:
        for term, replacement in replacements.items():
            if term in name:
                name = replacement
        cleaned_names.append(name.strip())
    
    # Remove content in parentheses for each cleaned name
    names_cleaned = []
    for name in cleaned_names:
        cleaned_name = re.sub(r'\s*\([^()]*\)', '', name).strip()
        names_cleaned.append(cleaned_name)

    # Remove duplicate names while preserving order
    names_cleaned = list(dict.fromkeys(names_cleaned))
    
    return names_cleaned
    
# Apply the cleaning function to the 'MPs_Asked' column
df_questions['MPs_Asked'] = df_questions['MPs_Asked'].apply(clean_names)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [28]:
df_questions["MPs_Asked"]

0      [Er Dr Lee Bee Wah, Dr Tan Wu Meng, Assoc Prof...
1                                  [Prof Fatimah Lateef]
3      [Assoc Prof Dr Muhammad Faishal Ibrahim, Ms An...
13                                 [Mr Saktiandi Supaat]
14        [Mr Seah Kian Peng, Mr Masagos Zulkifli B M M]
15     [Mr Pritam Singh, Mr Liang Eng Hwa, Mr Low Thi...
24            [Mr Thomas Chua Kee Seng, Mr Lim Swee Say]
30                                   [Er Dr Lee Bee Wah]
32          [Mr Thomas Chua Kee Seng, Mr Chan Chun Sing]
34     [Mr Chan Chun Sing, Mr Liang Eng Hwa, Mr Sakti...
40                          [Assoc Prof Walter Theseira]
41                            [Ms Irene Quay Siew Ching]
52                                     [Dr Lim Wee Kiak]
53     [Mr Teo Chee Hean, Mr Leon Perera, Mr Alex Yam...
54     [Mrs Josephine Teo, Mr Chong Kee Hiong, Mr Pat...
56     [Mr Chong Kee Hiong, Mr Liang Eng Hwa, Mr Patr...
59                                     [Dr Lim Wee Kiak]
64                         [Mr 

In [29]:
# Initialize a dictionary to store MP counts for each topic
topic_mp_counts = {}

# Iterate through each topic
for topic, group in df_questions.groupby('Topic'):
    # Initialize Counter to count MPs
    mp_counts = Counter()
    
    # Count MPs for the current topic
    for _, row in group.iterrows():
        mp_counts += Counter(row['MPs_Asked'])
    
    # Sort MPs by number of questions in descending order
    sorted_mp_counts = dict(sorted(mp_counts.items(), key=lambda item: item[1], reverse=True))
    
    # Include all MPs who asked one question or more
    ranked_mps = []
    for mp, count in sorted_mp_counts.items():
        ranked_mps.append(f"{mp}: {count} question{'s' if count > 1 else ''}")
    
    # Store results in topic_mp_counts
    topic_mp_counts[topic] = ranked_mps

# Print results for each topic
for topic, mp_counts in topic_mp_counts.items():
    print(f"Topic: {topic}")
    for mp_count in mp_counts:
        print(f"- {mp_count}")
    print()

Topic: Community Care and Services
- Prof Fatimah Lateef: 1 question
- Assoc Prof Dr Muhammad Faishal Ibrahim: 1 question
- Ms Anthea Ong: 1 question
- Er Dr Lee Bee Wah: 1 question
- Ms Joan Pereira: 1 question

Topic: Digital Transformation in Industry
- Mr Saktiandi Supaat: 2 questions
- Mr Thomas Chua Kee Seng: 2 questions
- Mr Lim Swee Say: 1 question
- Mr Chan Chun Sing: 1 question
- Dr Lim Wee Kiak: 1 question
- Mr Kwek Hian Chuan Henry: 1 question
- Mr Masagos Zulkifli B M M: 1 question
- Mr S Iswaran: 1 question

Topic: Education and Support Programmes
- Dr Lim Wee Kiak: 2 questions
- Mr Darryl David: 2 questions
- Ms Irene Quay Siew Ching: 1 question
- Mr Speaker: 1 question
- Mr Leon Perera: 1 question
- Mr Ong Ye Kung: 1 question

Topic: Law Enforcement and Rehabilitation
- Mr Teo Chee Hean: 1 question
- Mr Leon Perera: 1 question
- Mr Alex Yam: 1 question
- Mr Pritam Singh: 1 question

Topic: Legislative Matters and Amendments
- Mr Chong Kee Hiong: 1 question
- Mr Liang En