In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, json
import datetime
from unidecode import unidecode

In [11]:
def format_date(date):
    day, month, year = date.split("-")
    return datetime.datetime(int(year), int(month), int(day)).strftime("%Y-%m-%d")
                                  
def speeches_from_json(json_file):
    with open(json_file, "r") as f:
        data = json.load(f)
    return [{
        "id": json_file.split("\\")[-1],
        "section": data["section"],
        "date": format_date(data["date"]),
        "title": data["title"],
        "name": speech["name"] or '',
        "speech": unidecode(speech["speech"]) or '',
    } for speech in data["speeches"]]

First, read all json files

In [12]:
DIR = ".\\parliament"
speeches = [speech for parliament_number in os.listdir(DIR) if not parliament_number.endswith(".txt")
                   for f in os.listdir(f"{DIR}\\{parliament_number}") if f.lower().endswith(".json") 
                   for speech in speeches_from_json(f"{DIR}\\{parliament_number}\\{f}")]

Create a data frame

In [17]:
df = pd.DataFrame.from_records(speeches)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169014 entries, 0 to 169013
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       169014 non-null  object
 1   section  169014 non-null  object
 2   date     169014 non-null  object
 3   title    169014 non-null  object
 4   name     169014 non-null  object
 5   speech   169014 non-null  object
dtypes: object(6)
memory usage: 7.7+ MB


In [18]:
df.head()

Unnamed: 0,id,section,date,title,name,speech
0,002_19651213_S0002_T0003.json,BILLS,1965-12-13,CONSTITUTION (AMENDMENT) BILL,The Prime Minister:,"Mr Speaker, Sir, I have His Excellency the Yan..."
1,002_19651213_S0002_T0003.json,BILLS,1965-12-13,CONSTITUTION (AMENDMENT) BILL,Mr Speaker:,"Second Reading, what day?"
2,002_19651213_S0002_T0003.json,BILLS,1965-12-13,CONSTITUTION (AMENDMENT) BILL,The Prime Minister:,"22nd of December, 1965."
3,002_19651214_S0002_T0002.json,BILLS,1965-12-14,SINGAPORE TELEPHONE BOARD (AMENDMENT) BILL,,"""to amend the Singapore Telephone Board Ordina..."
4,002_19651215_S0002_T0002.json,BILLS,1965-12-15,PEOPLE'S DEFENCE FORCE BILL,The Minister of Defence (Dr Goh Keng Swee):,"Mr Speaker, Sir, I have the recommendation of ..."


Remove rows with empty speech

In [19]:
df[df['speech'] == '']

Unnamed: 0,id,section,date,title,name,speech
1025,007_19651222_S0002_T0003.json,BILLS,1965-12-22,INTERPRETATION BILL,Mr Lim Cheng Lock (River Valley) rose --- The ...,
3322,014_19661216_S0002_T0010.json,BUDGET,1966-12-16,"BUDGET, MINISTRY OF LABOUR",The Chairman (Mr Tang): Mr Lim. Mr Lim Cheng L...,
5317,026_19660622_S0003_T0004.json,BILLS,1966-06-22,BRETTON WOODS AGREEMENTS BILL,Committee of Supply.,
6231,001_20040105_S0005_T0002.json,ORAL ANSWERS TO QUESTIONS,2004-01-05,BRIDGE CONNECTING JOHORE TO SINGAPORE,,
6232,001_20040106_S0005_T0004.json,ORAL ANSWERS TO QUESTIONS,2004-01-06,HOUSING AND DEVELOPMENT BOARD FLATS (NEW CONCE...,,
...,...,...,...,...,...,...
165966,018_19980601_S0007_T0047.json,ORAL ANSWERS TO QUESTIONS,1998-06-01,CYCLISTS KILLED OR INJURED BY MOTOR VEHICLES (...,Assoc. Prof. Ho Peng Kee (for the Minister for...,
166211,019_19980420_S0003_T0009.json,BILLS,1998-04-20,TELECOMMUNICATION AUTHORITY OF SINGAPORE (AMEN...,The Minister of State for Communications (Dr J...,
166243,019_19981123_S0006_T0029.json,ORAL ANSWERS TO QUESTIONS,1998-11-23,REVAMPING OF EDUCATION SYSTEM (PROGRESS),The Senior Minister of State for Education (Dr...,
166484,020_19990211_S0003_T0003.json,BILLS,1999-02-11,CARRIAGE BY AIR (AMENDMENT) BILL,The Minister of State for Communications (Dr J...,


In [152]:
df = df[df['speech'] != '']

In [153]:
no_name = df[df['name'] == ''].reset_index()
no_name

Unnamed: 0,index,date,title,name,speech,id
0,3,1965-12-14,SINGAPORE TELEPHONE BOARD (AMENDMENT) BILL,,"""to amend the Singapore Telephone Board Ordina...",002_19651214_S0002_T0002.json
1,51,1965-12-13,REPUBLIC OF SINGAPORE INDEPENDENCE BILL,,"""to make provision for the government of Singa...",003_19651213_S0002_T0015.json
2,68,1965-12-17,"MEDICAL (THERAPY, EDUCATION AND RESEARCH) BILL",,"""to make provisions for the use of parts of bo...",003_19651217_S0002_T0002.json
3,103,1966-12-21,LAND TITLES (STRATA) BILL,,"""to facilitate the subdivision of land into st...",003_19661221_S0002_T0007.json
4,227,1965-12-13,INTERPRETATION BILL,,"""to define certain terms and expressions used ...",004_19651213_S0002_T0010.json
...,...,...,...,...,...,...
3234,167876,1998-07-31,APPENDIX - FUNDING OF SOCIAL SECURITY SYSTEM,,ANNEX III - ASSISTANCE TO PEOPLE WITH FINANCIA...,050_19980731_S0009_T0034.json
3235,167883,2000-10-09,COMPANIES (AMENDMENT) BILL,,"""to amend the Companies Act (Chapter 50 of the...",050_20001009_S0003_T0005.json
3236,167898,2000-10-09,ESTATE DUTY (AMENDMENT) BILL,,"""to amend the Estate Duty Act (Chapter 96 of t...",051_20001009_S0003_T0007.json
3237,167913,2000-10-09,PUBLIC ENTERTAINMENTS (AMENDMENT) BILL,,"""to amend the Public Entertainments Act (Chapt...",052_20001009_S0003_T0009.json


In [157]:
no_name[no_name['id']]

KeyError: "None of [Index(['002_19651214_S0002_T0002.json', '003_19651213_S0002_T0015.json',\n       '003_19651217_S0002_T0002.json', '003_19661221_S0002_T0007.json',\n       '004_19651213_S0002_T0010.json', '004_19651221_S0003_T0003.json',\n       '004_19651224_S0005_T0018.json', '004_19651229_S0004_T0016.json',\n       '004_19651231_S0003_T0004.json', '004_19661221_S0004_T0014.json',\n       ...\n       '048_19980731_S0009_T0036.json', '048_20001009_S0003_T0004.json',\n       '049_19980731_S0009_T0035.json', '049_19980904_S0005_T0027.json',\n       '049_20001009_S0003_T0003.json', '050_19980731_S0009_T0034.json',\n       '050_20001009_S0003_T0005.json', '051_20001009_S0003_T0007.json',\n       '052_20001009_S0003_T0009.json', '053_19980629_S0003_T0015.json'],\n      dtype='object', length=3239)] are in the [columns]"

In [160]:
no_name['id'].to_csv("id.csv")