# Hansard Data Exploration and Cleaning

This notebook explores a subset of the Canadian Hansard parliamentary debate dataset available at https://www.lipad.ca/data/. The pandas dataframe contains combines CSV transcripts of Parliamentary sessions from January 2016 to June 2019, roughly the period following the last federal election.

In [8]:
# Import modules
import pandas as pd
import numpy as np
from gensim.summarization.summarizer import summarize
import matplotlib.pyplot as plt
import re, math
from collections import Counter
import itertools

In [9]:
# Load the data
df = pd.read_pickle('../data/hansard.pkl')


In [10]:
# Inspect available columns
df.head(3)

Unnamed: 0.1,basepk,hid,speechdate,pid,opid,speakeroldname,speakerposition,maintopic,subtopic,subsubtopic,speechtext,speakerparty,speakerriding,speakername,speakerurl,Unnamed: 0
0,4648606.0,ca.proc.d.2016-01-25.17043.0,2016-01-25,,,,stagedirection,Speech From The Throne,Resumption of Debate on Address in Reply,,The House resumed from December 11 considerati...,,,,,
1,4648607.0,ca.proc.d.2016-01-25.17043.1,2016-01-25,f82c2265-7622-4abb-b866-59cba1fa52ee,10348.0,"Mr. Rhéal Fortin (Rivière-du-Nord, BQ)",,Speech From The Throne,Resumption of Debate on Address in Reply,,"Mr. Speaker, I would like to begin by thanking...",Bloc Québécois,Rivière-du-Nord,Rhéal Fortin,,
2,4648608.0,ca.proc.d.2016-01-25.17043.2,2016-01-25,07cf5767-802c-406c-92a7-7dc92af79b40,4264.0,Mr. Kevin Lamoureux (Parliamentary Secretary t...,,Speech From The Throne,Resumption of Debate on Address in Reply,,"Mr. Speaker, I appreciate the comments by the ...",Liberal,Winnipeg North,Kevin Lamoureux,http://www.parl.gc.ca/parlinfo/Files/Parliamen...,


In [11]:
# In the first few rows, all subtopics appear as NaN. Check if this is still the case later in the debate.
df.loc[110:112, :]

Unnamed: 0.1,basepk,hid,speechdate,pid,opid,speakeroldname,speakerposition,maintopic,subtopic,subsubtopic,speechtext,speakerparty,speakerriding,speakername,speakerurl,Unnamed: 0
110,4648716.0,ca.proc.d.2016-01-25.17043.110,2016-01-25,cf6f39df-d304-45e1-a559-70a0cf20a720,6.0,"Hon. Rona Ambrose (Leader of the Opposition, CPC)",,Oral Questions,The Economy,,"Mr. Speaker, the fact is that we left this gov...",Conservative,Sturgeon River--Parkland,Rona Ambrose,http://www.parl.gc.ca/parlinfo/Files/Parliamen...,
111,4648717.0,ca.proc.d.2016-01-25.17043.111,2016-01-25,b8c04eca-f237-48a8-8975-374ccd40d1a9,567.0,"Right Hon. Justin Trudeau (Prime Minister, Lib.)",,Oral Questions,The Economy,,"Mr. Speaker, it is quite something to hear tha...",Liberal,Papineau,Justin Trudeau,http://www.parl.gc.ca/parlinfo/Files/Parliamen...,
112,4648718.0,ca.proc.d.2016-01-25.17043.112,2016-01-25,cf6f39df-d304-45e1-a559-70a0cf20a720,6.0,"Hon. Rona Ambrose (Leader of the Opposition, CPC)",,Oral Questions,Natural Resources,,"Mr. Speaker, maybe the Prime Minister should s...",Conservative,Sturgeon River--Parkland,Rona Ambrose,http://www.parl.gc.ca/parlinfo/Files/Parliamen...,


In [15]:
# How many unique speakers spoke during the debates?
speakers = df['speakername'].unique()
print(len(speakers))

534


In [16]:
# This number is larger than expected, based on the available MP positions (as of the 2015 election). Why?
# Write functions to check for variations in the name of a given speaker

def text_2_vector(text):
    """
    Function that vectorizes text
    """
    
    words = re.findall(r"\w+", text)
    return Counter(words)

def compute_cosine(v1, v2):
    """
    Function that computes cosine similarity between two 
    vectorized sentences/ phrases
    """

    intersection = set(v1.keys()) & set(v2.keys())
    numerator = sum([v1[x] * v2[x] for x in intersection])
    sum1 = sum([v1[x]**2 for x in v1.keys()])
    sum2 = sum([v2[x]**2 for x in v2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
       return 0.0
    else:
       return float(numerator) / denominator

def compute_text_similarity(text1, text2):
    """
    Function that bundles vectorization and similarity
    scoring in one place
    """
    
    dict1 = text_2_vector(text1)
    dict2 = text_2_vector(text2)
    cosine_score = compute_cosine(dict1, dict2)
    return cosine_score

# Overwrite NaNs with spaces
df['speakername'].fillna(' ', inplace=True) 

# Compute similarity scores for all combinations of speakers
speakers = [x for x in speakers if (x != [] and x!= "" and x!=" ")]
perms_of_speakers = list(itertools.product(*[speakers, speakers]))
speaker_count = 0
similarity_score = []
for speaker_pair in perms_of_speakers:
    similarity_score.append(compute_text_similarity(speaker_pair[0], speaker_pair[1]))
    speaker_count = speaker_count + 1    

# Determine possible duplicate names
possible_duplicates = list(itertools.compress(perms_of_speakers, [i>0 for i in similarity_score]))
print(possible_duplicates)

[('Rhéal Fortin', 'Rhéal Fortin'), ('Kevin Lamoureux', 'Kevin Lamoureux'), ('Kevin Lamoureux', 'Kevin Waugh'), ('Kevin Lamoureux', 'Kevin Sorenson'), ('Kevin Lamoureux', 'Hon. Kevin Sorenson'), ('Kevin Lamoureux', 'Mr. Kevin Lamoureux'), ('Kevin Lamoureux', 'Mr. Kevin Lamoureux (Parliamentary Secretary to the Leader of the Government in the House of Commons, Lib.)'), ('Pierre Nantel', 'Pierre Nantel'), ('Pierre Nantel', 'Pierre Breton'), ('Pierre Nantel', 'Pierre-Luc Dusseault'), ('Pierre Nantel', 'Pierre Poilievre'), ('Pierre Nantel', 'Pierre Paul-Hus'), ('Pierre Nantel', 'Mr. Pierre Nantel'), ('Louis Plamondon', 'Louis Plamondon'), ('Fayçal El-Khoury', 'Fayçal El-Khoury'), ('Adam Vaughan', 'Adam Vaughan'), ('Adam Vaughan', 'Mr. Adam Vaughan'), ('Adam Vaughan', 'Mr. Adam Vaughan (Parliamentary Secretary to the Minister of Families, Children and Social Development, Lib.)'), ('Alex Nuttall', 'Alex Nuttall'), ('Bruce Stanton', 'Bruce Stanton'), ('Bruce Stanton', 'Mr. Miller (Bruce—Grey—O