# Congressional Speech Analysis

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
pd.options.display.max_columns = 1200  
pd.options.display.max_rows = 1200 

import unicodedata
import itertools
import datetime
import datefinder
import operator
import string
import re

import nltk
from nltk.tokenize import sent_tokenize

def listdir_nohidden(path):
    return glob.glob(os.path.join(path, '*'))

### Get files

The first step is to generate the data frame of all 114th Congress hearings. The files are contained in directories following this structure:

Data ---> Congress ---> Office ---> Department ---> Text files in txt format.

Each text file is imported into the documents column of the data frame and assigned a document ID.

In [21]:
from IPython.display import clear_output

FILES_PATH = '/Users/yc00027/Documents/GitHub/congress_speech_analysis/Data'

congresses = sorted(listdir_nohidden(FILES_PATH))

congressional_hearings = []

for congress in congresses:
    clear_output()
    print(f'Now loading congress {congress}')
    offices = listdir_nohidden(congress)
    for office in offices:
        clear_output()
        print(f'Now loading office {office}')
        congress = os.path.split(os.path.dirname(office))[1]
        departments = listdir_nohidden(office)
        for department in departments:
            clear_output()
            office = os.path.split(os.path.dirname(department))[1]
            try:
                session_files = listdir_nohidden(department)
            except:
                print(f'No txt files for session {session}')
                continue

            txt_files = [f for f in session_files if f.endswith('.txt')]
            for file in txt_files:
                with open(file, errors='replace') as f:
                    document = f.read().lower() 
                    department = os.path.split(os.path.dirname(file))[1]
                    name = os.path.basename(file)
                    congressional_hearings.append([document, congress, office, department, name])
            
clear_output()
print('Done')

congressional_hearings_df = pd.DataFrame(congressional_hearings).reset_index().rename(columns={'index': 'document_id'})
congressional_hearings_df.columns = ['document_id', 'document', 'congress','office','department','name']
congressional_hearings_df = congressional_hearings_df.sort_values(by=['congress', 'office','department','name'])

print(f'Congressional Hearings DF shape: {congressional_hearings_df.shape}')
congressional_hearings_df.head()

Done
Congressional Hearings DF shape: (2535, 6)


Unnamed: 0,document_id,document,congress,office,department,name
1837,1837,\n - eastern mediterranean energy: challenges ...,114th,House,Ad Hoc Committee on Energy,1.txt
1659,1659,\n - [errata] manipulation and fraud in the re...,114th,House,Commission on Security and Cooperation in Europe,1.txt
1658,1658,\n - the rule of law and civil society in azer...,114th,House,Commission on Security and Cooperation in Europe,2.txt
1657,1657,\n - human rights violations in russian-occupi...,114th,House,Commission on Security and Cooperation in Europe,3.txt
1656,1656,\n - nato's warsaw summit and the future of eu...,114th,House,Commission on Security and Cooperation in Europe,4.txt


Inspect the data frame of bills for the 114th congress and extract full list of speaker names, number of bills and bill types.

In [31]:
bills93_114_df = pd.read_csv('bills93-114.csv', error_bad_lines=False, encoding='ISO-8859-1', sep = ';')
bills_114 = bills93_114_df[bills93_114_df['Cong'] == 114]

print(f'There were {bills_114.shape[0]} bills in Congress 114th.')

speaker_names = sorted(bills_114['NameFull'].str.lower().unique())
print(f'There were {len(speaker_names)} speakers in Congress 114th.')
print('')

print('Number of bills according to bill type:')
print(bills_114['BillType'].value_counts())
print('')

print('Bill types: "hr" (House Bill); "s" (Senate Bill); "hres" (House Resolution); "sres" (Senate Resolution); \
                   "hcon" (House Concurrent Resolution); "scon" (Senate Concurrent Resolution); \
                   "hjres" (House Joint Resolution); "sjres" (Senate Joint Resolution).')


There were 12043 bills in Congress 114th.
There were 542 speakers in Congress 114th.

Number of bills according to bill type:
hr         6508
s          3547
hres        956
sres        642
hconres     183
hjres       108
sconres      58
sjres        41
Name: BillType, dtype: int64

Bill types: "hr" (House Bill); "s" (Senate Bill); "hres" (House Resolution); "sres" (Senate Resolution);                    "hcon" (House Concurrent Resolution); "scon" (Senate Concurrent Resolution);                    "hjres" (House Joint Resolution); "sjres" (Senate Joint Resolution).


Inspect the data frame of chairmen for the 114th congress and extract full list of chairmen names.

In [647]:
chairmen_114 = pd.read_excel('chairmen.xlsx', 'all')
chairmen_114 = chairmen_114[chairmen_114['Congress'] == 114]
chairmen_114 = chairmen_114[(chairmen_114['Chair Committee'].astype(int) == 1)] # | (chairmen_114['Chair Subcommittee'].astype(int) == 1)] # | (chairmen_114['Vice Chair Committee'].astype(int) == 1) | (chairmen_114['Vice Chair subcommittee'].astype(int) == 1)]
chairmen_114['Committee Name'] = chairmen_114['Committee Name'].str.replace("Veterans’ Affairs", 'Veterans Affairs')

chairmen_names = sorted(chairmen_114['Name'].str.lower().unique())
print(f'There were {len(chairmen_names)} chairmen of Committees and Subcommittees in Congress 114th.')

There were 36 chairmen of Committees and Subcommittees in Congress 114th.


### Get sentences

Extract the data frame of sentences for each document with respective sentence ID and speaker name. Match speaker names extracted from the hearings documents with the complete list of speakers extracted from the bills data frame.

In [423]:
starting_points = []
sentences = []
keywords = ['senator', 'secretary', 'chairman', 'mr.', 'ms.', 'mrs.', 'doc.']
all_names = chairmen_names + speaker_names

for i, (document_id, document) in enumerate(zip(congressional_hearings_df['document_id'], congressional_hearings_df['document'])):
    sent_text = sent_tokenize(document)
        
    for sentence_id, sentence in enumerate(sent_text):
        sentence_is_appendix_start = 'a p p e n d i x' in sentence
        if sentence_is_appendix_start:
            break
            
        sentences.append({
            'document_id': document_id,
            'sentence_id': sentence_id,
            'sentence': sentence,
        })

        sentence_split = sentence.split(' ')
        sentence_length = len(sentence_split) 
        if sentence_length < 2:
            continue
            
        first_word = sentence_split[0]
        sentence_is_start_sentence = sentence_length == 2 and first_word in keywords
        sentence_is_start_statement = 'statement of' in sentence
        if sentence_is_start_sentence:
            speaker_name = sentence_split[1].strip('.?')
            starting_points.append({
                'document_id': document_id,
                'speaker_name': speaker_name,
                'speech_id': sentence_id,
                'prefix': first_word
            })
        elif sentence_is_start_statement:
            try: 
                speaker_name = sent_text[sentence_id+1].split(', ')[0]
                starting_points.append({
                    'document_id': document_id,
                    'speaker_name': speaker_name,
                    'speech_id': sentence_id,
                    'prefix': None
                })
            except IndexError:
                continue

starting_points = pd.DataFrame(starting_points)
sentences = pd.DataFrame(sentences)

sentences_df = pd.merge(sentences, starting_points, left_on=['document_id', 'sentence_id'], \
                     right_on=['document_id', 'speech_id'], how='left')

sentences_df = sentences_df.groupby('document_id').ffill()
sentences_df = sentences_df.fillna(0)

f = lambda x: next(iter(name for name in chairmen_names if str(x) in name), 'no match')
sentences_df['clean_speaker_name'] = sentences_df['speaker_name'].apply(f)

sentences_df.drop(columns=['speaker_name'], inplace=True)

print(f'Sentences DF shape: {sentences_df.shape}')
sentences_df.head()

Sentences DF shape: (3609654, 6)


Unnamed: 0,document_id,sentence,sentence_id,prefix,speech_id,clean_speaker_name
0,1837,\n - eastern mediterranean energy: challenges ...,0,0,0.0,no match
1,1837,114-220\n\n (committee on ...,1,0,0.0,no match
2,1837,"114-90\n\n (committee on science, ...",2,0,0.0,no match
3,1837,"for more information, contact the gpo customer...",3,0,0.0,no match
4,1837,"phone 202-512-1800, or 866-512-1800 (toll-free).",4,0,0.0,no match


### Get speeches

Group sentences into speeches and extract title, date and chairman name from first block of each document.

In [713]:
blocks_df = sentences_df.groupby(['document_id', 'speech_id', 'clean_speaker_name', 'prefix'])['sentence'].apply(lambda x: ','.join(x)).reset_index()
blocks_df.columns = ['document_id', 'speech_id', 'clean_speaker_name', 'prefix', 'speech']

document_title = []
hearing_date = []

for i, (speech_id, speech) in enumerate(zip(blocks_df['speech_id'], blocks_df['speech'])):
    if speech_id == 0:
        title = speech.split('\n')[1].replace(' - ', '')
        regex_date = re.compile(r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s+\d{4}')
        date = regex_date.findall(speech)
        document_title.append(title)
        hearing_date.append(date)
    else:
        document_title.append(np.nan)
        hearing_date.append(np.nan)

chairmen = []

for index, (speech_id, speech) in enumerate(zip(blocks_df['speech_id'], blocks_df['speech'])):
    if speech_id != 0:
        chairmen.append(np.nan)
    elif speech_id == 0:
        split_speech = speech.splitlines()
        chairman_names = []
        for i, line in enumerate(split_speech):
            try:
                line_plus_2_long = split_speech[i+2] + split_speech[i+3] + split_speech[i+4] + split_speech[i+5]
                
                if 'committee on' in line:
                        
                    if ' chairman' in line_plus_2_long:
                        chairman_name = line_plus_2_long.split(',')[0].lstrip() 
                        chairman_names.append(chairman_name)
                        
                    elif ' chair' in line_plus_2_long:
                        chairman_name = line_plus_2_long.split(',')[0].lstrip() 
                        chairman_names.append(chairman_name)
                        
                    elif '(chairman)' in line_plus_2_long:
                        line_plus_2_long = line_plus_2_long.split('(chairman)')[0]
                        chairman_name = line_plus_2_long.split('hon.,')[-1].lstrip() 
                        chairman_names.append(chairman_name)
                        
                    elif '(chairwoman)' in line_plus_2_long:
                        line_plus_2_long = line_plus_2_long.split('(chairwoman)')[0]
                        chairman_name = line_plus_2_long.split('hon.,')[-1].lstrip() 
                        chairman_names.append(chairman_name)
                        
                    else:
                        continue
                
                if 'first session' in line:
                    
                    if ' chairman' in line_plus_2_long:
                        chairman_name = line_plus_2_long.split(',')[0].lstrip() 
                        chairman_names.append(chairman_name)
                    else:
                        continue
                        
                if 'second session' in line:
                    
                    if ' chairman' in line_plus_2_long:
                        chairman_name = line_plus_2_long.split(',')[0].lstrip() 
                        chairman_names.append(chairman_name)
                    else:
                        continue
                        
                elif '              house                                 senate' in line:
                    
                    if 'chairman' in line_plus_2_long:
                        chairman_name = line_plus_2_long.split(', ')[0].lstrip()
                        chairman_names.append(chairman_name)
                    else:
                        continue
                    
                else:
                    continue
                        
            except IndexError:
                continue
                        
        chairmen.append(chairman_names)
        
blocks_df['document_title'] = document_title

blocks_df['hearing_date'] = hearing_date
blocks_df['hearing_date'] = blocks_df['hearing_date'].str[0]

blocks_df['chairman'] = chairmen
blocks_df['chairman'] = blocks_df['chairman'].str[0]

chairman_dict = {
                 'jerry moran ' : 'jerry moran',
                 'john thune' : 'john r. thune',
                 'lamar alexander ' : 'lamar alexander',
                 'tom cole' : 'thomas cole',
                 'richard c. shelby ' : 'richard c. shelby', 
                 'lindsey graham' : 'lindsey o. graham',
                 'lindsey graham ': 'lindsey o. graham', 
                 'roy blunt ': 'roy blunt',
                 'susan collins, ': 'susan collins', 
                 'john hoeven ': 'john hoeven',
                 'enzi' : 'michael b. enzi', 
                 'one hundred fourteenth congress                              ----------                                                 david vitter' : 'david vitter',
                 '----------                                                 david vitter' : 'david vitter',
                 'one hundred fourteenth congress                             first session                  james m. inhofe' : 'james m. inhofe',
                 'one hundred fourteenth congress                             second session                  james m. inhofe' : 'james m. inhofe',
                 'one hundred fourteenth congress                             first session                  barbara boxer': 'barbara boxer',
                 'subcommittee on energy and water development                   michael k. simpson' : 'michael k. simpson',
                 'related agencies appropriations                mario diaz-balart' : 'mario diaz-balart',
                 'subcommittee on defense              rodney p. frelinghuysen': 'rodney p. frelinghuysen',
                 'subcommittee on homeland security                     john r. carter': 'john r. carter',
                 'subcommittee on defense       rodney p. frelinghuysen': 'rodney p. frelinghuysen',
                 'subcommittee on financial services and general government                    ander crenshaw' : 'ander crenshaw',
                 'subcommittee on energy and water development                                                                       michael k. simpson': 'michael k. simpson',
                 'related agencies                       ken calvert' : 'ken calvert',
                 "william m. ``mac'' thornberry" : 'william m. thornberry',
                 "one hundred fourteenth congress             william m. ``mac'' thornberry": 'william m. thornberry',
                 'subcommittee on interior': 'ken calvert', 
                 'hon.': 'lamar s. smith', 
                 'chairman                                  co-chairman        alcee l. hastings': 'christopher h. smith',
                 'alcee l. hastings': 'christopher h. smith',
                 'subcommittee on commerce': 'john abney culberson', 
                 'drug administration': 'robert b. aderholt', 
                 '[established by s. res.': 'richard burr', 
                 'washington': 'jeff flake'
               }

blocks_df= blocks_df.replace({'chairman': chairman_dict})

blocks_df['speech_len'] = blocks_df['speech'].str.split().apply(len)

blocks_df = blocks_df.groupby('document_id').ffill()

speaker_name_dict = {'john isakson': 'johnny isakson',
                     'richard shelby' :  'richard c. shelby', 
                     'lindsey graham' : 'lindsey o. graham', 
                     'michael enzi' : 'michael b. enzi', 
                     'orrin hatch' : 'orrin g. hatch', 
                     'john thune' : 'john r. thune', 
                     'james ihnofe' : 'james m. inhofe', 
                     'michael mccaul' : 'michael t. mccaul', 
                     'lamar smith' : 'lamar s. smith', 
                     'edward royce' : 'edward r. royce', 
                     'j. forbes' : 'j. randy forbes', 
                     'randy forbes' : 'j. randy forbes',
                     'candice miller' : 'candice s. miller', 
                     'k. conaway' : 'k. michael conaway', 
                     'michael conaway' : 'k. michael conaway',
                     'michael simpson' : 'michael k. simpson',
                     'robert aderholt' : 'robert b. aderholt',
                     'charles dent' : 'charles w. dent', 
                     'rodney frelinghuysen' : 'rodney p. frelinghuysen', 
                     'john culberson' : 'john abney culberson',
                     'john carter' : 'john r. carter', 
                     'christopher smith' : 'christopher h. smith', 
                     'william thornberry' : 'william m. thornberry', 
                     'robert wittman' : 'robert j. wittman',
                     'joseph heck' : 'joseph j. heck', 
                     'michael turner' : 'michael r. turner'}

blocks_df = blocks_df.replace({'clean_speaker_name': speaker_name_dict})
blocks_df['chairman'] = blocks_df['chairman'].fillna(0)

blocks_df = blocks_df[blocks_df['speech_id'] != 0]

male_speaker_names = ['johnny isakson', 'james m. inhofe', 'ron johnson', 'john mccain', 'lamar s. smith',  
                      'thad cochran', 'richard c. shelby', 'roy blunt', 'lamar alexander', 'charles w. dent', 
                      'bill shuster', 'orrin g. hatch', 'harold rogers', 'pat roberts', 'michael b. enzi', 
                      'william m. thornberry', 'john r. thune', 'chuck grassley', 'tom price', 'bob corker', 
                      'jason chaffetz', 'edward r. royce', 'pete sessions', 'kevin brady', 'rob bishop',
                      'k. michael conaway', 'david vitter', 'john kline', 'michael t. mccaul', 'fred upton', 
                      'steve chabot', 'jeb hensarling', 'bob goodlatte', 'jeff miller', 'jerry moran']

female_speaker_names = ['candice s. miller', 'lisa murkowski', 'lindsey o. graham', 'susan collins']


mask_male = (blocks_df['clean_speaker_name'].isin(male_speaker_names)) & (blocks_df['prefix'].isin(['ms.', 'mrs.']))
mask_female = (blocks_df['clean_speaker_name'].isin(female_speaker_names)) & (blocks_df['prefix'] == 'mr.')

#print(blocks_df['document_id'].nunique())


blocks_df['clean_speaker_name'] = np.where(mask_male, 'no match', blocks_df['clean_speaker_name'])
blocks_df['clean_speaker_name'] = np.where(mask_female, 'no match', blocks_df['clean_speaker_name']) 

#print(blocks_df['document_id'].nunique())


mask_1 = (blocks_df['chairman'] == 0) & (blocks_df['prefix'] == 'chairman') & (blocks_df['clean_speaker_name'] != 'no match')
mask_2 = (blocks_df['chairman'] != 0) & (blocks_df['prefix'] == 'chairman')

blocks_df['chairman'] = np.where(mask_1, blocks_df['clean_speaker_name'], blocks_df['chairman'])
blocks_df['clean_speaker_name'] = np.where(mask_2, blocks_df['chairman'], blocks_df['clean_speaker_name'])

blocks_df = pd.merge(blocks_df, congressional_hearings_df.drop(columns=['document']), on='document_id', how='left').reset_index().rename(columns={'index': 'unique_speech_id'})
blocks_df['department'] = blocks_df['department'].str.replace('Committee on ', '')
blocks_df['department'] = blocks_df['department'].str.replace('the ', '')

committee_chairman_dict = dict(zip(chairmen_114['Committee Name'], chairmen_114['Name'].str.lower()))
blocks_df['chairman_from_department'] = blocks_df['department'].map(committee_chairman_dict)

def combine_chairman(x):
    if isinstance(x['chairman_from_department'], float):
        return x['chairman']
    elif x['chairman_from_department'] != x['chairman']:
        return x['chairman_from_department']
    else:
        return x['chairman']

blocks_df['chairman'] = blocks_df.apply(combine_chairman, axis=1)
blocks_df = blocks_df.drop(columns=['chairman_from_department'])

blocks_2_df = blocks_df

blocks_df = blocks_df[blocks_df['clean_speaker_name'] != 'no match']
blocks_df = blocks_df[blocks_df['chairman'].isin(chairmen_names)]

print(f'Blocks DF shape: {blocks_df.shape}')
blocks_df.head()

Blocks DF shape: (47911, 14)


Unnamed: 0,unique_speech_id,document_id,speech_id,clean_speaker_name,prefix,speech,document_title,hearing_date,chairman,speech_len,congress,office,department,name
0,0,0,33.0,johnny isakson,0,"opening statement of hon.,johnny isakson, chai...",ending veteran homelessness,"july 29, 2015",johnny isakson,245,114th,Senate,Veterans Affairs,15.txt
2,2,0,55.0,johnny isakson,chairman,"chairman isakson.,when i spoke to the american...",ending veteran homelessness,"july 29, 2015",johnny isakson,354,114th,Senate,Veterans Affairs,15.txt
3,3,0,67.0,johnny isakson,chairman,"statement of hon.,richard blumenthal, \n ...",ending veteran homelessness,"july 29, 2015",johnny isakson,590,114th,Senate,Veterans Affairs,15.txt
4,4,0,100.0,johnny isakson,chairman,"chairman isakson.,it is now my privilege to in...",ending veteran homelessness,"july 29, 2015",johnny isakson,95,114th,Senate,Veterans Affairs,15.txt
8,8,0,207.0,johnny isakson,chairman,"chairman isakson.,thank you very much for your...",ending veteran homelessness,"july 29, 2015",johnny isakson,8,114th,Senate,Veterans Affairs,15.txt


### Get paragraphs

Break speeches into paragraphs of consistent length.

In [698]:
TOO_SMALL_LENGTH = 100
TOO_LARGE_LENGTH = 200

def split_paragraph(paragraph):
    import copy
    list_of_paragraphs = []
    remaining_paragraph = copy.copy(paragraph)
    paragraph_split = remaining_paragraph.split('. ')
    i = 0
    while len(remaining_paragraph.split()) > TOO_LARGE_LENGTH:
        paragraph_piece = ''
        while len(paragraph_piece.split()) < TOO_SMALL_LENGTH:
            paragraph_piece += paragraph_split[i]
            i += 1
            remaining_paragraph = '. '.join(paragraph_split[i:])
            
        list_of_paragraphs.append(paragraph_piece)
    
    if len(remaining_paragraph.split()) > TOO_SMALL_LENGTH:
        list_of_paragraphs.append(remaining_paragraph)
        
    return list_of_paragraphs

characters = ["-", "...", "''", "``", "@", "#",  
              '--', '=', '_', '..', '|', "/",
              '~', '—', '•', '“', '–', '>', '*']

def clean_text(text):
    '''Remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = text.replace(' \n', ' ')
    for character in characters:
        text = text.replace(character, " ")
    text = ' '.join(text.split())
    return text

In [700]:
skip_iterations = []
paragraphs = []

for index, speech in zip(blocks_df['unique_speech_id'], blocks_df['speech']):
    paragraph_text = re.split('[.?!]\n    ', speech)
    for i, paragraph in enumerate(paragraph_text):
        if i in skip_iterations:
            continue
            
        paragraph_length = len(paragraph.split())
        paragraph_too_small = paragraph_length < TOO_SMALL_LENGTH
        paragraph_too_large = paragraph_length > TOO_LARGE_LENGTH
        if paragraph_too_small:
            try:
                i_plus = 1
                while len(paragraph.split()) < TOO_SMALL_LENGTH:
                    paragraph += paragraph_text[i+i_plus]
                    if len(paragraph.split()) > TOO_LARGE_LENGTH:
                        raise IndexError
                    skip_iterations.append(i+i_plus)                    
                    i_plus += 1
                    
            except IndexError:
                continue
                
        elif paragraph_too_large:
            smaller_paragraphs = split_paragraph(paragraph)
            for p in smaller_paragraphs:
                paragraphs.append({
                    'unique_speech_id': index,
                    'paragraph': re.sub(' +', ' ', p),
                })
                
            continue
        
        if len(paragraph.split()) < TOO_SMALL_LENGTH:
            break
            assert False
        paragraphs.append({
            'unique_speech_id': index,
            'paragraph': re.sub(' +', ' ', paragraph),
        })
            
            
paragraphs_df = pd.DataFrame(paragraphs).reset_index().rename(columns={'index': 'paragraph_id'})

paragraphs_df['paragraph_len'] = paragraphs_df['paragraph'].str.split().apply(len)
paragraphs_df['paragraph'] = [clean_text(paragraph) for paragraph in paragraphs_df['paragraph']]

paragraphs_df = paragraphs_df[paragraphs_df['paragraph_len'] <= TOO_LARGE_LENGTH]

print(f'Paragraphs DF shape: {paragraphs_df.shape}')

congress_114_paragraphs_df = pd.merge(paragraphs_df, blocks_df.drop(columns=['speech']), on='unique_speech_id', how='left')

print(f'Full 114th Congress Paragraphs DF shape: {congress_114_paragraphs_df.shape}')
congress_114_paragraphs_df.head()

Paragraphs DF shape: (9113, 4)
Full 114th Congress Paragraphs DF shape: (9113, 16)


Unnamed: 0,paragraph_id,paragraph,unique_speech_id,paragraph_len,document_id,speech_id,clean_speaker_name,prefix,document_title,hearing_date,chairman,speech_len,congress,office,department,name
0,2,ho is going to talk today about the city of ho...,1,121,0,55.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,354,114th,Senate,Veterans Affairs,15.txt
1,4,"chairman isakson.,i have been told by advocate...",11,109,0,392.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,109,114th,Senate,Veterans Affairs,15.txt
2,5,"chairman isakson.,senator tillis, followed by ...",13,192,0,457.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,192,114th,Senate,Veterans Affairs,15.txt
3,6,"chairman isakson.,let me amplify on that for 1...",15,111,0,585.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,258,114th,Senate,Veterans Affairs,15.txt
4,7,"o'toole.,absolutely, sir.,one of the things we...",15,147,0,585.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,258,114th,Senate,Veterans Affairs,15.txt


In [701]:
chairman_paragraphs_df = congress_114_paragraphs_df[congress_114_paragraphs_df['clean_speaker_name'].isin(chairmen_names)]

def FormatParagraph(paragraph):
    punc_filter = re.compile('([.!?]\s*)')
    split_with_punctuation = punc_filter.split(paragraph)
    final = ''.join([i.capitalize() for i in split_with_punctuation])+'.'
    return final

chairman_paragraphs_df['paragraph'] = chairman_paragraphs_df['paragraph'].apply(lambda x: FormatParagraph(x))
chairman_paragraphs_df['percentage_of_speech'] = [round(row['paragraph_len']/row['speech_len']*100, 2) for i, row in chairman_paragraphs_df.iterrows()]

mask_speaker_smith_not_chairman = (chairman_paragraphs_df['clean_speaker_name'] == 'lamar s. smith') & (~chairman_paragraphs_df['prefix'].isin(['senator', 'chairman']))
chairman_paragraphs_df = chairman_paragraphs_df[~mask_speaker_smith_not_chairman]

chairman_paragraphs_df.to_excel('chairman_paragraphs_df.xlsx', index=False)
print(f'114th Congress Speaker Paragraphs DF shape: {chairman_paragraphs_df.shape}')
chairman_paragraphs_df.head()

114th Congress Speaker Paragraphs DF shape: (8477, 17)


Unnamed: 0,paragraph_id,paragraph,unique_speech_id,paragraph_len,document_id,speech_id,clean_speaker_name,prefix,document_title,hearing_date,chairman,speech_len,congress,office,department,name,percentage_of_speech
0,2,Ho is going to talk today about the city of ho...,1,121,0,55.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,354,114th,Senate,Veterans Affairs,15.txt,34.18
1,4,"Chairman isakson.,i have been told by advocate...",11,109,0,392.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,109,114th,Senate,Veterans Affairs,15.txt,100.0
2,5,"Chairman isakson.,senator tillis, followed by ...",13,192,0,457.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,192,114th,Senate,Veterans Affairs,15.txt,100.0
3,6,"Chairman isakson.,let me amplify on that for 1...",15,111,0,585.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,258,114th,Senate,Veterans Affairs,15.txt,43.02
4,7,"O'toole.,absolutely, sir.,one of the things we...",15,147,0,585.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,258,114th,Senate,Veterans Affairs,15.txt,56.98


In [721]:
chairman_paragraphs_df[chairman_paragraphs_df['clean_speaker_name'] == 'johnny isakson']

Unnamed: 0,paragraph_id,paragraph,unique_speech_id,paragraph_len,document_id,speech_id,clean_speaker_name,prefix,document_title,hearing_date,chairman,speech_len,congress,office,department,name,percentage_of_speech
0,2,Ho is going to talk today about the city of ho...,1,121,0,55.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,354,114th,Senate,Veterans Affairs,15.txt,34.18
1,4,"Chairman isakson.,i have been told by advocate...",11,109,0,392.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,109,114th,Senate,Veterans Affairs,15.txt,100.0
2,5,"Chairman isakson.,senator tillis, followed by ...",13,192,0,457.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,192,114th,Senate,Veterans Affairs,15.txt,100.0
3,6,"Chairman isakson.,let me amplify on that for 1...",15,111,0,585.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,258,114th,Senate,Veterans Affairs,15.txt,43.02
4,7,"O'toole.,absolutely, sir.,one of the things we...",15,147,0,585.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,258,114th,Senate,Veterans Affairs,15.txt,56.98
5,14,"1731, the homeless veterans services protectio...",28,161,0,1820.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,4996,114th,Senate,Veterans Affairs,15.txt,3.22
6,18,"Chairman isakson.,thank you very much, mr. Ste...",31,110,0,2217.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,110,114th,Senate,Veterans Affairs,15.txt,100.0
7,20,"Chairman isakson.,the committee is adjourned.,...",45,106,0,2439.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,3705,114th,Senate,Veterans Affairs,15.txt,2.86
8,24,Smith (name changed) is a 65 year old army vet...,45,113,0,2439.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,3705,114th,Senate,Veterans Affairs,15.txt,3.05
9,25,Smith's appointment with the property manager ...,45,114,0,2439.0,johnny isakson,chairman,ending veteran homelessness,"july 29, 2015",johnny isakson,3705,114th,Senate,Veterans Affairs,15.txt,3.08


In [702]:
print(f'There are {chairman_paragraphs_df.clean_speaker_name.nunique()} speakers represented in the data')
chairman_paragraphs_df.clean_speaker_name.value_counts()

There are 36 speakers represented in the data


john mccain              1070
ron johnson               938
lamar s. smith            749
johnny isakson            657
jason chaffetz            592
jeb hensarling            578
edward r. royce           533
steve chabot              496
michael t. mccaul         333
james m. inhofe           317
kevin brady               266
pete sessions             196
richard c. shelby         193
pat roberts               178
david vitter              133
bob goodlatte             131
john kline                108
k. michael conaway        101
michael b. enzi            99
tom price                  76
rob bishop                 76
john r. thune              73
bill shuster               72
lamar alexander            62
fred upton                 57
chuck grassley             56
harold rogers              54
candice s. miller          51
william m. thornberry      48
bob corker                 43
orrin g. hatch             41
roy blunt                  38
lisa murkowski             27
charles w.

In [718]:
speeches_df = pd.merge(sentences_df.drop(columns=['clean_speaker_name', 'prefix', 'contains_appendix']), blocks_2_df, on=['document_id', 'speech_id'], how='left')

speeches_df = speeches_df[speeches_df['speech_id'] != 0]

speeches_df['sentence_len'] = speeches_df['sentence'].str.split().apply(len)
speeches_df = speeches_df[speeches_df['sentence_len'] > 3]

speeches_df.to_csv('speeches_df.csv', index=False)

print(f'Speeches DF shape: {speeches_df.shape}')
speeches_df.head()

Speeches DF shape: (2843773, 17)


Unnamed: 0,document_id,sentence,sentence_id,speech_id,unique_speech_id,clean_speaker_name,prefix,speech,document_title,hearing_date,chairman,speech_len,congress,office,department,name,sentence_len
11,1837,the subcommittees will come to order.,11,10.0,309462.0,no match,ms.,"ms. ros-lehtinen.,the subcommittees will come ...",eastern mediterranean energy: challenges and o...,"september 8, 2016",edward r. royce,857.0,114th,House,Ad Hoc Energy,1.txt,6
12,1837,"after recognizing myself, chairman weber, rank...",12,10.0,309462.0,no match,ms.,"ms. ros-lehtinen.,the subcommittees will come ...",eastern mediterranean energy: challenges and o...,"september 8, 2016",edward r. royce,857.0,114th,House,Ad Hoc Energy,1.txt,31
13,1837,we will then hear from our witnesses.,13,10.0,309462.0,no match,ms.,"ms. ros-lehtinen.,the subcommittees will come ...",eastern mediterranean energy: challenges and o...,"september 8, 2016",edward r. royce,857.0,114th,House,Ad Hoc Energy,1.txt,7
14,1837,"and, without \nobjection, the witnesses' prepa...",14,10.0,309462.0,no match,ms.,"ms. ros-lehtinen.,the subcommittees will come ...",eastern mediterranean energy: challenges and o...,"september 8, 2016",edward r. royce,857.0,114th,House,Ad Hoc Energy,1.txt,35
15,1837,we are also expected to be joined by the chair...,15,10.0,309462.0,no match,ms.,"ms. ros-lehtinen.,the subcommittees will come ...",eastern mediterranean energy: challenges and o...,"september 8, 2016",edward r. royce,857.0,114th,House,Ad Hoc Energy,1.txt,36
