# Hansard Data Exploration and Cleaning

This notebook explores a subset of the Canadian Hansard parliamentary debate dataset available at https://www.lipad.ca/data/. The pandas dataframe combines CSV transcripts of Parliamentary sessions from January 2016 to June 2019, roughly the period following the last federal election.

In [None]:
# Import modules
import pandas as pd
import numpy as np
from gensim.summarization.summarizer import summarize
import matplotlib.pyplot as plt
import re, math
from collections import Counter
import itertools

In [None]:
# Load the data
# The same pickle is read twice so that one frame can be cleaned while the other stays untouched.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
df_original = pd.read_pickle(filepath_or_buffer='../../data/hansard.pkl')  # Reference copy, left as-is for later use
df = pd.read_pickle(filepath_or_buffer='../../data/hansard.pkl')  # Working copy that the cells below clean


In [None]:
# Preview the first three rows to see which columns are available
df.head(n=3)

In [None]:
# Every subtopic is NaN in the first few rows. Look at rows labelled 110-112 to see whether that still holds later in the debate.
df.loc[110:112]

In [None]:
# Define functions for inspecting speakers

def get_num_unique_speakers():
    """Print how many distinct values the 'speakername' column of ``df`` holds.

    Reads the notebook-level dataframe ``df``; takes no arguments and
    returns nothing.
    """
    unique_count = len(df['speakername'].unique())
    print('Number of unique speakers: %s' % unique_count)

def inspect_row(row_num):
    """Print the uncleaned record for each requested row.

    row_num: iterable of index labels; each one is looked up in
    ``df_original`` (the copy of the data that is never modified here).
    """
    for label in row_num:
        record = df_original.loc[label, :]
        print('\nInspecting row %d:\n%s\n' % (label, record))

def inspect_speaker(speaker):
    """Print every row of ``df`` whose 'speakername' matches ``speaker``.

    speaker: regular expression searched for anywhere in the speaker name.

    Rows with a missing speaker name are treated as non-matching.
    """
    # na=False keeps the mask strictly boolean; without it a missing name
    # yields NaN in the mask and the selection below raises.
    mask = df['speakername'].str.contains(speaker, regex=True, na=False)
    # Select with the mask itself rather than via the matching index labels,
    # so a repeated index label cannot pull in rows that did not match.
    print('\nInspecting speaker %s:\n%s\n' % (speaker, df.loc[mask, :]))

def show_names_containing(name):
    """Print the distinct 'speakername' values in ``df`` that match ``name``.

    name: regular expression searched for anywhere in the speaker name.

    Rows with a missing speaker name are treated as non-matching.
    """
    # na=False keeps the mask strictly boolean even when names are missing.
    mask = df['speakername'].str.contains(name, regex=True, na=False)
    # Boolean selection returns exactly the matching rows, also when index
    # labels repeat.
    matching_names = df.loc[mask, 'speakername'].unique()
    print('\nUnique occurences of %s:\n%s\n' % (name, matching_names))
    
def overwrite_speaker(old_name, new_name, row_to_check):
    """Rename speakers listed exactly as ``old_name`` and print the result.

    old_name: regular expression for the name to replace; it must match
        the whole 'speakername' value.
    new_name: replacement written to the 'speakername' column of ``df``.
    row_to_check: iterable of index labels whose rows are printed after the
        replacement so the change can be verified.
    """
    # Anchor both ends: with only a leading '^', any name that merely starts
    # with old_name (e.g. 'Miller Jones') would be rewritten as well.
    whole_name_pattern = '^(?:' + old_name + ')$'
    df['speakername'] = df['speakername'].replace(whole_name_pattern, new_name, regex=True)
    for row in row_to_check:
        print('Row after overwriting %s with %s:\n%s\n' % (old_name, new_name, df.loc[row, :]))
    
def get_key_from_value(dictionary, val):
    """Return every key of ``dictionary`` whose value equals ``val``.

    dictionary: mapping to search.
    val: value to look for.

    Returns a list of matching keys in the mapping's iteration order; the
    list is empty when nothing matches.
    """
    # Search the mapping that was passed in, not a notebook-level global.
    return [key for key, value in dictionary.items() if val == value]

def replace_string(target, replacement):
    """Replace text in the 'speakername' column of ``df``.

    target: regular expression (or list of them) to search for.
    replacement: text substituted for each match.
    """
    # Use the arguments; the previous body ignored them and always applied
    # one hard-coded pattern.
    df['speakername'] = df['speakername'].replace(target, replacement, regex=True)


In [None]:
# How many unique speakers spoke during the debates?
get_num_unique_speakers()

# Overwrite NaNs with spaces.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form works on an intermediate object, which pandas
# warns about and which does not update df under copy-on-write.
df['speakername'] = df['speakername'].fillna(' ')

# Remove all information in speakername column that is not first or last name.
# Patterns are raw strings so that sequences such as \s and \( reach the regex
# engine unchanged instead of being treated as (invalid) string escapes.
text_to_remove = [r'^Hon.', r'Mr. ', r'Mrs. ', r'Ms. ', r'Miss. ', r'\s+\(.+\)', r'^\s+']
df['speakername'] = df['speakername'].replace(text_to_remove, '', regex=True)

# Remove additional anomalies
df['speakername'] = df['speakername'].replace(r'\s+\(.+\)*', '', regex=True) # Parentheses containing a speaker's riding and/or party
df['speakername'] = df['speakername'].replace('The ', '', regex=True) # Drop the article, e.g. in front of 'Speaker'
df['speakername'] = df['speakername'].replace('Speaker Speaker', 'Speaker', regex=True)
# NOTE(review): 'Som' also matches inside a correctly spelled 'Some'; the final
# 'Somee+' pattern appears to be what collapses the resulting 'Somee' back to
# 'Some'. This assumes the patterns are applied in list order - confirm before
# reordering or merging them.
df['speakername'] = df['speakername'].replace(['Soem', 'Som', 'Somee+'], 'Some', regex=True) # Typos in transcription of 'some'
df['speakername'] = df['speakername'].replace(r'Hon\s', 'Hon.', regex=True)
df['speakername'] = df['speakername'].replace(r'hon\s', 'hon.', regex=True)
df['speakername'] = df['speakername'].replace('An hon. members', 'An hon. member', regex=True)
df['speakername'] = df['speakername'].replace('^members$', 'Members', regex=True)

# After additional cleaning, how many unique speakers spoke during the debates?
get_num_unique_speakers()

# Check for rows that contain only one name
is_single_word = df['speakername'].str.contains(r'^\w[\w]*$', regex=True)
is_not_speaker = ~df['speakername'].str.contains('^Speaker$', regex=True)
is_not_members = ~df['speakername'].str.contains('^Members$', regex=True)
bool_ind = is_single_word & is_not_speaker & is_not_members
# Select with the boolean mask directly rather than re-indexing by label
standalone_names = df.loc[bool_ind, 'speakername']
print('Speakers with only one name listed (i.e. ambiguous cases):\n%s'% standalone_names)

# Store standalone names in python dict (row label -> name)
standalone_names_dict = standalone_names.to_dict()


In [None]:
# Inspect row with standalone 'Miller' in original dataframe
key = get_key_from_value(standalone_names_dict, 'Miller')
inspect_row(key)


In [None]:
# Show all related names
show_names_containing('Miller')


In [None]:
# Raw data shows standalone Miller's riding is Bruce-Grey-Owen-Sound - cross-reference with other Millers
# Check entries for Larry Miller
inspect_speaker('Larry Miller')


In [None]:
# Larry Miller's riding is Bruce-Grey-Owen-Sound, same as standalone Miller
overwrite_speaker('Miller', 'Larry Miller', key)


In [None]:
# Inspect row with standalone 'Warkentin' in original dataframe
key = get_key_from_value(standalone_names_dict, 'Warkentin')
inspect_row(key)


In [None]:
# Show all related names
show_names_containing('Warkentin')


In [None]:
# Check entries for Chris Warkentin
inspect_speaker('Chris Warkentin')


In [None]:
# Standalone Warkentin seems to be Chris Warkentin
overwrite_speaker('Warkentin', 'Chris Warkentin', key)


In [None]:
# Inspect row with standalone 'MacKenzie' in original dataframe
key = get_key_from_value(standalone_names_dict, 'MacKenzie')
inspect_row(key)


In [None]:
# Show all related names
show_names_containing('MacKenzie')


In [None]:
# Check entries for Dave MacKenzie
inspect_speaker('Dave MacKenzie')


In [None]:
# Riding (Oxford) overlaps
# Standalone MacKenzie seems to be Dave MacKenzie
overwrite_speaker('MacKenzie', 'Dave MacKenzie', key)


In [None]:
# Inspect row with standalone 'Eglinski' in original dataframe
key = get_key_from_value(standalone_names_dict, 'Eglinski')
inspect_row(key)


In [None]:
# Show all related names
show_names_containing('Eglinski')


In [None]:
# Check entries for Jim Eglinski
inspect_speaker('Jim Eglinski')


In [None]:
# Riding (Yellowhead) overlaps
# Standalone Eglinski seems to be Jim Eglinski
overwrite_speaker('Eglinski', 'Jim Eglinski', key)


In [None]:
# Inspect row with standalone 'McCauley' in original dataframe
key = get_key_from_value(standalone_names_dict, 'McCauley')
inspect_row(key)


In [None]:
# Show all related names
show_names_containing('McCauley')


In [None]:
# Check entries for Kelly McCauley
inspect_speaker('Kelly McCauley')


In [None]:
# Riding (Edmonston West) overlaps
# Standalone McCauley seems to be Kelly McCauley
overwrite_speaker('McCauley', 'Kelly McCauley', key)


In [None]:
# Write the cleaned dataframe to its own pickle file, separate from the original data
df.to_pickle(path='../../data/hansard_cleaned.pkl')