In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, json
import datetime
import re
from unidecode import unidecode
from pylab import rcParams
import string
import nltk
from nltk.corpus import stopwords

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Data Cleaning

In [526]:
def format_date(date):
    day, month, year = date.split("-")
    return datetime.datetime(int(year), int(month), int(day)).strftime("%Y-%m-%d")
                                  
def speeches_from_json(json_file):
    with open(json_file, "r") as f:
        data = json.load(f)
    return [{
        "report_id": json_file.split("\\")[-1], 
        "section": data["section"], 
        "date": format_date(data["date"]),
        "title": data["title"],
        "name": unidecode(speech["name"] or ''),
        "speech": unidecode(speech["speech"] or ''),
    } for speech in data["speeches"]]

First, load all json files

In [527]:
DIR = ".\\parliament"

In [697]:
speeches = [speech for parliament_number in os.listdir(DIR) if not parliament_number.endswith(".txt")
                   for f in os.listdir(f"{DIR}\\{parliament_number}") if f.lower().endswith(".json") 
                   for speech in speeches_from_json(f"{DIR}\\{parliament_number}\\{f}")]

Create a data frame

In [698]:
df = pd.DataFrame.from_records(speeches)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203062 entries, 0 to 203061
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   report_id  203062 non-null  object
 1   section    203062 non-null  object
 2   date       203062 non-null  object
 3   title      203062 non-null  object
 4   name       203062 non-null  object
 5   speech     203062 non-null  object
dtypes: object(6)
memory usage: 9.3+ MB


In [699]:
df.head()

Unnamed: 0,report_id,section,date,title,name,speech
0,002_19651213_S0002_T0003.json,BILLS,1965-12-13,CONSTITUTION (AMENDMENT) BILL,CONSTITUTION (AMENDMENT) BILL,
1,002_19651213_S0002_T0003.json,BILLS,1965-12-13,CONSTITUTION (AMENDMENT) BILL,First Reading,
2,002_19651213_S0002_T0003.json,BILLS,1965-12-13,CONSTITUTION (AMENDMENT) BILL,The Prime Minister:,"Mr Speaker, Sir, I have His Excellency the Yan..."
3,002_19651213_S0002_T0003.json,BILLS,1965-12-13,CONSTITUTION (AMENDMENT) BILL,Second Reading,
4,002_19651213_S0002_T0003.json,BILLS,1965-12-13,CONSTITUTION (AMENDMENT) BILL,Mr Speaker:,"Second Reading, what day?"


Drop rows with empty names and speeches

In [700]:
df = df.dropna()

### Convert 'date' from string to datetime

In [701]:
df['date'] = pd.to_datetime(df['date'])

### Normalize case

In [702]:
def normalize_case(text):
    return text.lower()

In [703]:
df['section'] = df['section'].map(normalize_case)
df['title'] = df['title'].map(normalize_case)
df['name'] = df['name'].map(normalize_case)
df['speech'] = df['speech'].map(normalize_case)

### Remove Punctuations

In [704]:
def remove_punctuations(text):
    return re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)

In [705]:
#df['title'] = df['title'].map(lambda text: re.sub(r"[^a-z. ]", "", text))
df['name'] = df['name'].map(lambda text: re.sub(r"[^a-z. ]", "", text))
#df['speech'] = df['speech'].map(remove_punctuations)

In [706]:
df.head()

Unnamed: 0,report_id,section,date,title,name,speech
0,002_19651213_S0002_T0003.json,bills,1965-12-13,constitution (amendment) bill,constitution amendment bill,
1,002_19651213_S0002_T0003.json,bills,1965-12-13,constitution (amendment) bill,first reading,
2,002_19651213_S0002_T0003.json,bills,1965-12-13,constitution (amendment) bill,the prime minister,"mr speaker, sir, i have his excellency the yan..."
3,002_19651213_S0002_T0003.json,bills,1965-12-13,constitution (amendment) bill,second reading,
4,002_19651213_S0002_T0003.json,bills,1965-12-13,constitution (amendment) bill,mr speaker,"second reading, what day?"


### Remove stopwords

In [707]:
def remove_stopwords(text):
    stop = stopwords.words('english')
    return " ".join([word for word in text.split() if word not in (stop)])

In [708]:
#df['speech'] = df['speech'].map(remove_stopwords)

In [709]:
df.head()

Unnamed: 0,report_id,section,date,title,name,speech
0,002_19651213_S0002_T0003.json,bills,1965-12-13,constitution (amendment) bill,constitution amendment bill,
1,002_19651213_S0002_T0003.json,bills,1965-12-13,constitution (amendment) bill,first reading,
2,002_19651213_S0002_T0003.json,bills,1965-12-13,constitution (amendment) bill,the prime minister,"mr speaker, sir, i have his excellency the yan..."
3,002_19651213_S0002_T0003.json,bills,1965-12-13,constitution (amendment) bill,second reading,
4,002_19651213_S0002_T0003.json,bills,1965-12-13,constitution (amendment) bill,mr speaker,"second reading, what day?"


# 13th Parliament

In [710]:
df = df[(df['date'] >= pd.Timestamp(2016, 1, 15)) & (df['date'] <= pd.Timestamp(2020, 6, 23))]
df.shape

(21616, 6)

In [711]:
df = df[~(df['name'] == '')]
df = df[~(df['speech'] == '')]
df.shape

(21205, 6)

### Map MPs' names

How many MPs in the data frame?

In [712]:
df['name'].unique().shape

(624,)

How many actual MPs ?

In [713]:
with open(f"{DIR}\\mp.txt") as f:
    MP = np.array([re.sub(r"[^a-z. ]", "", line.replace("\n","").lower().strip()) for line in f.readlines() if line.strip()])
    print("Number of actual MPs:", len(MP))

Number of actual MPs: 524


The number of MPs in the dataframe is greater than the actual number. Let's correct this.

Prime Ministers was sometimes referred to as "the prime minister" instead of their names. We have to replace these by their names:
* lee kuan yew: 1959-06-05 to 1990-11-28
* goh chok tong: 1990-11-28 to 2004-08-12
* lee hsien loong: 2004-08-12 to present

In [714]:
df.loc[(df['name'] == "the prime minister") & (df['date'] < pd.Timestamp(1990, 11, 28)), "name"] = 'lee kuan yew'
df.loc[(df['name'] == "the prime minister") & (df['date'] >= pd.Timestamp(1990, 11, 28)) & (df['date'] < pd.Timestamp(2004, 8, 12)), "name"] = 'goh chok tong'
df.loc[(df['name'] == "the prime minister") & (df['date'] >= pd.Timestamp(2004, 8, 12)), "name"] = 'lee hsien loong'

Map names to known MPs

In [715]:
def map_text_to_names(text, names):
    for name in names:
        if name in text:
            return name
    return ""

In [716]:
df['name'] = df['name'].map(lambda text: map_text_to_names(text, MP))

Remove rows with empty names

In [717]:
df = df[~(df['name'] == '')]

Reset index

In [721]:
df = df.reset_index(drop=True)
df.shape

(17188, 6)

In [31]:
temp[temp['name'] == '']

Unnamed: 0,report_id,section,date,title,name,speech
69342,bill-11.json,second reading bills,2018-08-06,transport safety investigations bill,,makes clear that tsib may investigate incident...
69343,bill-11.json,second reading bills,2018-08-06,transport safety investigations bill,,clause 16 of the bill sets out the scope of ts...
69365,bill-12.json,second reading bills,2018-09-10,land transport (enforcement measures) bill,,asked about circumstances in which pmds can be...
69366,bill-12.json,second reading bills,2018-09-10,land transport (enforcement measures) bill,,asked whether the proposed penalties for causi...
69377,bill-12.json,second reading bills,2018-09-10,land transport (enforcement measures) bill,,
...,...,...,...,...,...,...
111161,written-answer-na-9353.json,written answers to questions for oral answer n...,2021-11-01,utilisation rate for category d coes and propo...,,
111240,written-answer-na-9470.json,written answers to questions for oral answer n...,2022-01-10,number of students admitted to secondary schoo...,,
111401,written-answer-na-9609.json,written answers to questions for oral answer n...,2022-01-10,factors for higher-than-expected inflation in ...,,
111649,written-statement-1551.json,correction by written statement,2021-01-05,clarification by minister for foreign affairs ...,,of the cpc.


In [32]:
unique_new_names = temp['name'].unique()
unique_new_names[~np.isin(unique_new_names, MP)]

array(['', 'nominated member', 'clarification by member', 'third reading',
       'west coast', 'sgtogether or', 'debate on annual budget statement',
       'senior parliamentary secretary', '.', 'staystrong and', 'assoc',
       'chairma', 'debate on budget statement', 'annual budget statement',
       'cedric foo do you wish to withdraw your amendment', 'thank you.',
       'adjournment motion',
       'preserving green spaces and heritage in jalan kayu constituency',
       'future of national service',
       'maximising every students potential in classrooms of future',
       'arts as integral part of our people and strategic national resource',
       'putting commuters first in public transport',
       'national service  balancing operational readiness safety and will to fight',
       'achieving arts excellence in singapore',
       'taking busking to next level',
       'liberal education and corruption of youth of singapore',
       'providing housing for single unwed paren

In [217]:
#temp[temp["name"] == "don wee chua chu kang"]
map_name_to_mp(np.array(["don wee", "goh choon kang"]), np.array(["don wee chua chu kang", "???"]))

[0 0]


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
df[df["name"].str.contains("speaker")]["name"].unique()

In [None]:
df = df[~df["name"].str.contains("speaker")]

In [None]:
from utils import lcs
def filter_name(name, namelist):
    for v in namelist:
        if v in name:
            return v
    return name
df['name'] = df['name'].map(lambda name: filter_name(name, mp_names))

In [42]:
#df['name'].unique().tolist()

In [None]:
def extract_name(s, samples):
    def append_dot_if_single(w):
        if len(w) == 1:
            return w + "."
        return w
    name = " ".join([append_dot_if_single(w) for w in s.split()])
    for sample in samples:
        if sample in name:
            return sample.strip()
    return np.nan
df['name'] = df['name'].map(lambda c: extract_name(c, mp))

In [None]:
df = df.dropna()
df['name'].unique().shape

then convert it to categorical data

In [None]:
df["name"] = df["name"].astype("category")
df.shape

### Merge similar sections

How many types of sections?

In [None]:
df['section'].unique()

Reduce the number of sections by merging similar ones

Merge all "..answer to question.." into "answers to questions"

In [None]:
df['section'] = df['section'].map(lambda c: 'answers to questions' if re.search(r"answer(s?) to question(s?)", c) else c)

Merge all "..motion.." into "motions"

In [None]:
df['section'] = df['section'].map(lambda c: 'motions' if re.search(r"motion(s?)", c) else c)

Merge "bills" and "second reading bills" into "bills"

In [None]:
df['section'] = df['section'].map(lambda c: 'bills' if re.search(r"bill(s?)", c) else c)

Convert it to categorical data

In [None]:
df["section"] = df["section"].astype("category")

Sort the data frame by descending date

In [None]:
df = df.sort_values('date', ascending=False).reset_index(drop=True)
df.head()

# Text statistics

Calculate length of speech in terms of characters and words

In [None]:
stat_df = df.copy()
stat_df['speech_length_characters'] = stat_df['speech'].str.len()
stat_df['speech_length_words'] = stat_df['speech'].str.split().map(lambda x: len(x))
stat_df.head()

Distribution of the number of characters

In [None]:
bins = np.arange(0,stat_df['speech_length_characters'].max(),10)
plt.hist(stat_df['speech_length_characters'], bins=bins, density=True)
#plt.xscale('log')
#plt.yscale('log')
plt.show()

Distribution of the number of words

In [None]:
bins = np.arange(0,stat_df['speech_length_words'].max(),10)
plt.hist(stat_df['speech_length_words'], bins=bins, density=True)
plt.xscale('log')
plt.yscale('log')
plt.show()

Number of speeches per MP

In [None]:
speech_dist = stat_df[['name','speech', 'speech_length_words', 'speech_length_characters']].groupby(['name']).agg({
    "speech": ["count"], 
    "speech_length_words": ["max", "min", "mean", "median", "std"], 
    "speech_length_characters": ["max", "min", "mean", "median", "std"]})
speech_dist = speech_dist.sort_values([('speech','count')], ascending=False).reset_index()
speech_dist

In [None]:
rcParams['figure.figsize'] = 10, 5 
bins = np.arange(0, max(speech_dist[('speech','count')]), 50)
plt.hist(speech_dist[('speech','count')], bins=bins)
plt.xscale('log')
plt.yscale('log')
plt.xlabel("Number of speeches per MP")
plt.ylabel("Number of MPs")
plt.show()

# 14th parliament

In [None]:
parliament14_df = df[df['date'] >= '2020-08-24']
parliament14_df