# Congressional Speech Analysis

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
pd.options.display.max_columns = 1200  
pd.options.display.max_rows = 1200 

import unicodedata
import itertools
import datetime
import datefinder
import operator
import string
import re

import nltk
from nltk.tokenize import sent_tokenize

def listdir_nohidden(path):
    return glob.glob(os.path.join(path, '*'))

### Get files

The first step is to generate the data frame of all 114th Congress hearings. The files are contained in directories following this structure:

Data ---> Congress ---> Office ---> Department ---> Text files in txt format.

Each text file is imported into the documents column of the data frame and assigned a document ID.

In [21]:
from IPython.display import clear_output

FILES_PATH = '/Users/yc00027/Documents/GitHub/congress_speech_analysis/Data'

congresses = sorted(listdir_nohidden(FILES_PATH))

congressional_hearings = []

for congress in congresses:
    clear_output()
    print(f'Now loading congress {congress}')
    offices = listdir_nohidden(congress)
    for office in offices:
        clear_output()
        print(f'Now loading office {office}')
        congress = os.path.split(os.path.dirname(office))[1]
        departments = listdir_nohidden(office)
        for department in departments:
            clear_output()
            office = os.path.split(os.path.dirname(department))[1]
            try:
                session_files = listdir_nohidden(department)
            except:
                print(f'No txt files for session {session}')
                continue

            txt_files = [f for f in session_files if f.endswith('.txt')]
            for file in txt_files:
                with open(file, errors='replace') as f:
                    document = f.read().lower() 
                    department = os.path.split(os.path.dirname(file))[1]
                    name = os.path.basename(file)
                    congressional_hearings.append([document, congress, office, department, name])
            
clear_output()
print('Done')

congressional_hearings_df = pd.DataFrame(congressional_hearings).reset_index().rename(columns={'index': 'document_id'})
congressional_hearings_df.columns = ['document_id', 'document', 'congress','office','department','name']
congressional_hearings_df = congressional_hearings_df.sort_values(by=['congress', 'office','department','name'])

print(f'Congressional Hearings DF shape: {congressional_hearings_df.shape}')
congressional_hearings_df.head()

Done
Congressional Hearings DF shape: (2535, 6)


Unnamed: 0,document_id,document,congress,office,department,name
1837,1837,\n - eastern mediterranean energy: challenges ...,114th,House,Ad Hoc Committee on Energy,1.txt
1659,1659,\n - [errata] manipulation and fraud in the re...,114th,House,Commission on Security and Cooperation in Europe,1.txt
1658,1658,\n - the rule of law and civil society in azer...,114th,House,Commission on Security and Cooperation in Europe,2.txt
1657,1657,\n - human rights violations in russian-occupi...,114th,House,Commission on Security and Cooperation in Europe,3.txt
1656,1656,\n - nato's warsaw summit and the future of eu...,114th,House,Commission on Security and Cooperation in Europe,4.txt


Inspect the data frame of bills for the 114th congress and extract full list of speaker names, number of bills and bill types.

In [31]:
bills93_114_df = pd.read_csv('bills93-114.csv', error_bad_lines=False, encoding='ISO-8859-1', sep = ';')
bills_114 = bills93_114_df[bills93_114_df['Cong'] == 114]

print(f'There were {bills_114.shape[0]} bills in Congress 114th.')

speaker_names = sorted(bills_114['NameFull'].str.lower().unique())
print(f'There were {len(speaker_names)} speakers in Congress 114th.')
print('')

print('Number of bills according to bill type:')
print(bills_114['BillType'].value_counts())
print('')

print('Bill types: "hr" (House Bill); "s" (Senate Bill); "hres" (House Resolution); "sres" (Senate Resolution); \
                   "hcon" (House Concurrent Resolution); "scon" (Senate Concurrent Resolution); \
                   "hjres" (House Joint Resolution); "sjres" (Senate Joint Resolution).')


There were 12043 bills in Congress 114th.
There were 542 speakers in Congress 114th.

Number of bills according to bill type:
hr         6508
s          3547
hres        956
sres        642
hconres     183
hjres       108
sconres      58
sjres        41
Name: BillType, dtype: int64

Bill types: "hr" (House Bill); "s" (Senate Bill); "hres" (House Resolution); "sres" (Senate Resolution);                    "hcon" (House Concurrent Resolution); "scon" (Senate Concurrent Resolution);                    "hjres" (House Joint Resolution); "sjres" (Senate Joint Resolution).


Inspect the data frame of chairmen for the 114th congress and extract full list of chairmen names.

In [32]:
chairmen_114 = pd.read_excel('chairmen.xlsx', 'all')
chairmen_114 = chairmen_114[chairmen_114['Congress'] == 114]
chairmen_114 = chairmen_114[(chairmen_114['Chair Committee'].astype(int) == 1) | (chairmen_114['Chair Subcommittee'].astype(int) == 1)] # | (chairmen_114['Vice Chair Committee'].astype(int) == 1) | (chairmen_114['Vice Chair subcommittee'].astype(int) == 1)]

chairmen_names = sorted(chairmen_114['Name'].str.lower().unique())
print(f'There were {len(chairmen_names)} chairmen of Committees and Subcommittees in Congress 114th.')

There were 165 chairmen of Committees and Subcommittees in Congress 114th.


### Get sentences

Extract the data frame of sentences for each document with respective sentence ID and speaker name. Match speaker names extracted from the hearings documents with the complete list of speakers extracted from the bills data frame.

In [33]:
starting_points = []
sentences = []
keywords = ['senator', 'secretary', 'chairman', 'mr.', 'ms.', 'mrs.', 'doc.']

for i, (document_id, document) in enumerate(zip(congressional_hearings_df['document_id'], congressional_hearings_df['document'])):
    sent_text = sent_tokenize(document)
        
    for sentence_id, sentence in enumerate(sent_text):
        sentences.append({
            'document_id': document_id,
            'sentence_id': sentence_id,
            'sentence': sentence,
        })

        sentence_split = sentence.split(' ')
        sentence_length = len(sentence_split) 
        if sentence_length < 2:
            continue
        first_word = sentence_split[0]
        sentence_is_start_sentence = sentence_length == 2 and first_word in keywords
        sentence_is_start_statement = 'statement of'in sentence
        if sentence_is_start_sentence:
            speaker_name = sentence_split[1].strip('.?')
            starting_points.append({
                'document_id': document_id,
                'speaker_name': speaker_name,
                'speech_id': sentence_id,
            })
        elif sentence_is_start_statement:
            try: 
                speaker_name = sent_text[sentence_id+1].split(', ')[0]
                starting_points.append({
                    'document_id': document_id,
                    'speaker_name': speaker_name,
                    'speech_id': sentence_id,
                })
            except IndexError:
                continue

starting_points = pd.DataFrame(starting_points)
sentences = pd.DataFrame(sentences)

sentences_df = pd.merge(sentences, starting_points, left_on=['document_id', 'sentence_id'], \
                     right_on=['document_id', 'speech_id'], how='left')

sentences_df = sentences_df.groupby('document_id').ffill()
sentences_df = sentences_df.fillna(0)

f = lambda x: next(iter(name for name in speaker_names if str(x) in name), 'no match')
sentences_df['clean_speaker_name'] = sentences_df['speaker_name'].apply(f)

sentences_df.drop(columns=['speaker_name'], inplace=True)

print(f'Sentences DF shape: {sentences_df.shape}')
sentences_df.head()

Sentences DF shape: (3849494, 5)


Unnamed: 0,document_id,sentence,sentence_id,speech_id,clean_speaker_name
0,1837,\n - eastern mediterranean energy: challenges ...,0,0.0,no match
1,1837,114-220\n\n (committee on ...,1,0.0,no match
2,1837,"114-90\n\n (committee on science, ...",2,0.0,no match
3,1837,"for more information, contact the gpo customer...",3,0.0,no match
4,1837,"phone 202-512-1800, or 866-512-1800 (toll-free).",4,0.0,no match


### Get speeches

Group sentences into speeches and extract title, date and chairman name from first block of each document.

In [35]:
blocks_df = sentences_df.groupby(['document_id', 'speech_id', 'clean_speaker_name'])['sentence'].apply(lambda x: ','.join(x)).reset_index()
blocks_df.columns = ['document_id', 'speech_id', 'clean_speaker_name', 'speech']

document_title = []
hearing_date = []

for i, (speech_id, speech) in enumerate(zip(blocks_df['speech_id'], blocks_df['speech'])):
    if speech_id == 0:
        title = speech.split('\n')[1].replace(' - ', '')
        regex_date = re.compile(r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s+\d{4}')
        date = regex_date.findall(speech)
        document_title.append(title)
        hearing_date.append(date)
    else:
        document_title.append(np.nan)
        hearing_date.append(np.nan)

chairmen = []

for index, (speech_id, speech) in enumerate(zip(blocks_df['speech_id'], blocks_df['speech'])):
    if speech_id != 0:
        chairmen.append(np.nan)
    elif speech_id == 0:
        split_speech = speech.splitlines()
        chairman_names = []
        for i, line in enumerate(split_speech):
            try:
                line_plus_2_long = split_speech[i+2] + split_speech[i+3] + split_speech[i+4] + split_speech[i+5]
                
                if 'committee on' in line:
                        
                    if ' chairman' in line_plus_2_long:
                        chairman_name = line_plus_2_long.split(',')[0].lstrip() 
                        chairman_names.append(chairman_name)
                        
                    if ' chair' in line_plus_2_long:
                        chairman_name = line_plus_2_long.split(',')[0].lstrip() 
                        chairman_names.append(chairman_name)
                        
                    if '(chairman)' in line_plus_2_long:
                        line_plus_2_long = line_plus_2_long.split('(chairman)')[0]
                        chairman_name = line_plus_2_long.split('hon.,')[-1].lstrip() 
                        chairman_names.append(chairman_name)
                        
                    if '(chairwoman)' in line_plus_2_long:
                        line_plus_2_long = line_plus_2_long.split('(chairwoman)')[0]
                        chairman_name = line_plus_2_long.split('hon.,')[-1].lstrip() 
                        chairman_names.append(chairman_name)
                        
                    else:
                        continue
                
                if 'first session' in line:
                    
                    if ' chairman' in line_plus_2_long:
                        chairman_name = line_plus_2_long.split(',')[0].lstrip() 
                        chairman_names.append(chairman_name)
                    else:
                        continue
                        
                if 'second session' in line:
                    
                    if ' chairman' in line_plus_2_long:
                        chairman_name = line_plus_2_long.split(',')[0].lstrip() 
                        chairman_names.append(chairman_name)
                    else:
                        continue
                        
                elif '              house                                 senate' in line:
                    
                    if 'chairman' in line_plus_2_long:
                        chairman_name = line_plus_2_long.split(', ')[0].lstrip()
                        chairman_names.append(chairman_name)
                    else:
                        continue
                    
                else:
                    continue
                        
            except IndexError:
                continue
                        
        chairmen.append(chairman_names)
        
blocks_df['document_title'] = document_title

blocks_df['hearing_date'] = hearing_date
blocks_df['hearing_date'] = blocks_df['hearing_date'].str[0]

blocks_df['chairman'] = chairmen
blocks_df['chairman'] = blocks_df['chairman'].str[0]

chairman_dict = {
                 'jerry moran ' : 'jerry moran',
                 'john thune' : 'john r. thune',
                 'lamar alexander ' : 'lamar alexander',
                 'tom cole' : 'thomas cole',
                 'richard c. shelby ' : 'richard c. shelby', 
                 'lindsey graham' : 'lindsey o. graham',
                 'lindsey graham ': 'lindsey o. graham', 
                 'roy blunt ': 'roy blunt',
                 'susan collins, ': 'susan collins', 
                 'john hoeven ': 'john hoeven',
                 'enzi' : 'michael b. enzi', 
                 'one hundred fourteenth congress                              ----------                                                 david vitter' : 'david vitter',
                 '----------                                                 david vitter' : 'david vitter',
                 'one hundred fourteenth congress                             first session                  james m. inhofe' : 'james m. inhofe',
                 'one hundred fourteenth congress                             second session                  james m. inhofe' : 'james m. inhofe',
                 'one hundred fourteenth congress                             first session                  barbara boxer': 'barbara boxer',
                 'subcommittee on energy and water development                   michael k. simpson' : 'michael k. simpson',
                 'related agencies appropriations                mario diaz-balart' : 'mario diaz-balart',
                 'subcommittee on defense              rodney p. frelinghuysen': 'rodney p. frelinghuysen',
                 'subcommittee on homeland security                     john r. carter': 'john r. carter',
                 'subcommittee on defense       rodney p. frelinghuysen': 'rodney p. frelinghuysen',
                 'subcommittee on financial services and general government                    ander crenshaw' : 'ander crenshaw',
                 'subcommittee on energy and water development                                                                       michael k. simpson': 'michael k. simpson',
                 'related agencies                       ken calvert' : 'ken calvert',
                 "william m. ``mac'' thornberry" : 'william m. thornberry',
                 "one hundred fourteenth congress             william m. ``mac'' thornberry": 'william m. thornberry',
                 'subcommittee on interior': 'ken calvert', 
                 'hon.': 'lamar s. smith', 
                 'chairman                                  co-chairman        alcee l. hastings': 'christopher h. smith',
                 'alcee l. hastings': 'christopher h. smith',
                 'subcommittee on commerce': 'john abney culberson', 
                 'drug administration': 'robert b. aderholt', 
                 '[established by s. res.': 'richard burr', 
                 'washington': 'jeff flake'
               }

blocks_df= blocks_df.replace({'chairman': chairman_dict})

blocks_df['speech_len'] = blocks_df['speech'].str.split().apply(len)

blocks_df = blocks_df.groupby('document_id').ffill()

blocks_df.to_csv('blocks_df.csv', index=False)

print(f'Blocks DF shape: {blocks_df.shape}')
blocks_df.head()

Blocks DF shape: (431515, 8)


Unnamed: 0,document_id,speech_id,clean_speaker_name,speech,document_title,hearing_date,chairman,speech_len
0,0,0.0,no match,\n - ending veteran homelessness\n[senate hear...,ending veteran homelessness,"july 29, 2015",johnny isakson,578
1,0,33.0,no match,"opening statement of hon.,johnny isakson, chai...",ending veteran homelessness,"july 29, 2015",johnny isakson,245
2,0,51.0,no match,"mr. violante.,thank you, mr. chairman.,i appre...",ending veteran homelessness,"july 29, 2015",johnny isakson,9
3,0,55.0,john isakson,"chairman isakson.,when i spoke to the american...",ending veteran homelessness,"july 29, 2015",johnny isakson,354
4,0,67.0,richard blumenthal,"statement of hon.,richard blumenthal, \n ...",ending veteran homelessness,"july 29, 2015",johnny isakson,590


In [7]:
speeches_df = pd.merge(blocks_df, sentences_df, left_on=['document_id', 'speech_id'], \
                     right_on=['document_id', 'speech_id'], how='left')

speeches_df = speeches_df[~speeches_df['clean_speaker_name'].isin(['no match'])]
speeches_df['sentence_len'] = speeches_df['sentence'].str.split().apply(len)
speeches_df = speeches_df[speeches_df['sentence_len'] > 3]

speeches_df.to_csv('speeches_df.csv', index=False)

print(f'Speeches DF shape: {speeches_df.shape}')
speeches_df.head()

Speeches DF shape: (1471194, 11)


Unnamed: 0,document_id,speech_id,speech,document_title,hearing_date,chairman,speech_len,sentence,sentence_id,clean_speaker_name,sentence_len
56,0,55.0,"chairman isakson.,when i spoke to the american...",ending veteran homelessness,"july 29, 2015",johnny isakson,354,when i spoke to the american legion at \nthe b...,56,john isakson,46
57,0,55.0,"chairman isakson.,when i spoke to the american...",ending veteran homelessness,"july 29, 2015",johnny isakson,354,first and foremost was veterans' \nhomelessnes...,57,john isakson,25
58,0,55.0,"chairman isakson.,when i spoke to the american...",ending veteran homelessness,"july 29, 2015",johnny isakson,354,"while it is improving in terms of lessening, i...",58,john isakson,20
59,0,55.0,"chairman isakson.,when i spoke to the american...",ending veteran homelessness,"july 29, 2015",johnny isakson,354,we have two \ndistinguished panels who will ta...,59,john isakson,19
60,0,55.0,"chairman isakson.,when i spoke to the american...",ending veteran homelessness,"july 29, 2015",johnny isakson,354,"from 2010 to 2014, the number of homeless vete...",60,john isakson,14


### Get paragraphs

Break speeches into paragraphs of consistent length.

In [208]:
path = '/Users/yc00027/Documents/GitHub/congress_speech_analysis/blocks_df.csv'
blocks_df = pd.read_csv(path).reset_index().rename(columns={'index': 'unique_speech_id'})
blocks_df = blocks_df[blocks_df['speech_id'] != 0]
#blocks_df = blocks_df[blocks_df['speech_len'] >= 10]

print(f'Blocks DF shape: {blocks_df.shape}')
blocks_df.head()

Blocks DF shape: (428980, 9)


Unnamed: 0,unique_speech_id,document_id,speech_id,clean_speaker_name,speech,document_title,hearing_date,chairman,speech_len
1,1,0,33.0,no match,"opening statement of hon.,johnny isakson, chai...",ending veteran homelessness,"july 29, 2015",johnny isakson,245
2,2,0,51.0,no match,"mr. violante.,thank you, mr. chairman.,i appre...",ending veteran homelessness,"july 29, 2015",johnny isakson,9
3,3,0,55.0,john isakson,"chairman isakson.,when i spoke to the american...",ending veteran homelessness,"july 29, 2015",johnny isakson,354
4,4,0,67.0,richard blumenthal,"statement of hon.,richard blumenthal, \n ...",ending veteran homelessness,"july 29, 2015",johnny isakson,590
5,5,0,100.0,john isakson,"chairman isakson.,it is now my privilege to in...",ending veteran homelessness,"july 29, 2015",johnny isakson,95


In [219]:
keywords = ['senator', 'secretary', 'chairman', 'mr.', 'ms.', 'mrs.', 'doc.']

speaker = []

for i, row in blocks_df.iterrows():
    if 'no match' in row['clean_speaker_name']:
        try: 
            if ' hon.' in row['speech'].split(',')[0]:
                speaker.append(row['speech'].split(',')[1])
            elif 'statement of ' in row['speech'].split(',')[0]:
                speaker.append(row['speech'].split(',')[0].split('statement of ')[1])
            elif 'prepared statement of ' in row['speech'].split('\n')[0]:
                speaker.append(row['speech'].split('follows:]\n   prepared ')[0].split('prepared statement of ')[1])
            elif any(row['speech'].startswith(w) for w in keywords):
                speaker.append(row['speech'].split(' ')[1].split('.,')[0])
            else: 
                speaker.append('no match')
        except IndexError:
            speaker.append('no match')
    else:
        speaker.append(row['clean_speaker_name'])
        
blocks_df['speaker'] = speaker 

def replace_all(text, dic):
    for i, j in dic.items():
        text = text.replace(i, j)
    return text

repl_dict = {'ms.' : '', 'mr.': '', 'chairman': '', 'follows:]\n prepared': '', 'john isakson': 'johnny isakson', 'aaron schock': 'jennifer ho'}

blocks_df['speaker'] = blocks_df['speaker'].apply(lambda x: replace_all(x, repl_dict))

f = lambda x: next(iter(name for name in speaker_names if str(x) in name), 'no match')
blocks_df['final_speaker'] = blocks_df['speaker'].apply(f)

blocks_df = blocks_df[blocks_df['final_speaker'] != 'no match']
blocks_df.drop(columns=['clean_speaker_name', 'speaker'], inplace=True)

print(f'Blocks DF shape: {blocks_df.shape}')
blocks_df.head()

Blocks DF shape: (273355, 9)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,unique_speech_id,document_id,speech_id,speech,document_title,hearing_date,chairman,speech_len,final_speaker
4,4,0,67.0,"statement of hon.,richard blumenthal, \n ...",ending veteran homelessness,"july 29, 2015",johnny isakson,590,richard blumenthal
29,29,0,409.0,"senator blumenthal.,thank you.,let me ask you,...",ending veteran homelessness,"july 29, 2015",johnny isakson,36,richard blumenthal
31,31,0,415.0,"senator blumenthal.,folks in the community.,so...",ending veteran homelessness,"july 29, 2015",johnny isakson,22,richard blumenthal
34,34,0,428.0,"senator blumenthal.,so, in theory, you would w...",ending veteran homelessness,"july 29, 2015",johnny isakson,24,richard blumenthal
36,36,0,435.0,"senator blumenthal.,in terms of the super-util...",ending veteran homelessness,"july 29, 2015",johnny isakson,246,richard blumenthal


In [220]:
TOO_SMALL_LENGTH = 100
TOO_LARGE_LENGTH = 200

def split_paragraph(paragraph):
    import copy
    list_of_paragraphs = []
    remaining_paragraph = copy.copy(paragraph)
    paragraph_split = remaining_paragraph.split('. ')
    i = 0
    while len(remaining_paragraph.split()) > TOO_LARGE_LENGTH:
        paragraph_piece = ''
        while len(paragraph_piece.split()) < TOO_SMALL_LENGTH:
            paragraph_piece += paragraph_split[i]
            i += 1
            remaining_paragraph = '. '.join(paragraph_split[i:])
            
        list_of_paragraphs.append(paragraph_piece)
    
    if len(remaining_paragraph.split()) > TOO_SMALL_LENGTH:
        list_of_paragraphs.append(remaining_paragraph)
        
    return list_of_paragraphs

characters = ["-", "...", "''", "``", "@", "#",  
              '--', '=', '_', '..', '|', "/",
              '~', '—', '•', '“', '–', '>', '*']

def clean_text(text):
    '''Remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = text.replace(' \n', ' ')
    for character in characters:
        text = text.replace(character, " ")
    text = ' '.join(text.split())
    return text

In [221]:
skip_iterations = []
paragraphs = []

for index, speech in zip(blocks_df['unique_speech_id'], blocks_df['speech']):
    paragraph_text = re.split('[.?!]\n    ', speech)
    for i, paragraph in enumerate(paragraph_text):
        if i in skip_iterations:
            continue
            
        paragraph_length = len(paragraph.split())
        paragraph_too_small = paragraph_length < TOO_SMALL_LENGTH
        paragraph_too_large = paragraph_length > TOO_LARGE_LENGTH
        if paragraph_too_small:
            try:
                i_plus = 1
                while len(paragraph.split()) < TOO_SMALL_LENGTH:
                    paragraph += paragraph_text[i+i_plus]
                    if len(paragraph.split()) > TOO_LARGE_LENGTH:
                        raise IndexError
                    skip_iterations.append(i+i_plus)                    
                    i_plus += 1
                    
            except IndexError:
                continue
                
        elif paragraph_too_large:
            smaller_paragraphs = split_paragraph(paragraph)
            for p in smaller_paragraphs:
                paragraphs.append({
                    'unique_speech_id': index,
                    'paragraph': re.sub(' +', ' ', p),
                })
                
            continue
        
        if len(paragraph.split()) < TOO_SMALL_LENGTH:
            break
            assert False
        paragraphs.append({
            'unique_speech_id': index,
            'paragraph': re.sub(' +', ' ', paragraph),
        })
            
            
paragraphs_df = pd.DataFrame(paragraphs).reset_index().rename(columns={'index': 'paragraph_id'})

paragraphs_df['paragraph_len'] = paragraphs_df['paragraph'].str.split().apply(len)
paragraphs_df['paragraph'] = [clean_text(paragraph) for paragraph in paragraphs_df['paragraph']]

paragraphs_df = paragraphs_df[paragraphs_df['paragraph_len'] <= TOO_LARGE_LENGTH]

print(f'Paragraphs DF shape: {paragraphs_df.shape}')

paragraphs_df.head()

Paragraphs DF shape: (60562, 4)


Unnamed: 0,paragraph_id,paragraph,unique_speech_id,paragraph_len
2,2,"senator tillis.,something i would like to see ...",42,165
4,4,"senator manchin.,hon.,joe manchin, u.s. senato...",45,194
5,5,"senator manchin.,well, i agree.,the thing that...",48,161
6,6,"senator manchin.,they cannot pass a drug test....",50,146
7,7,"senator manchin.,i am sorry.,dr. o'toole.,no, ...",55,101


In [222]:
print(paragraphs_df.shape)

_paragraphs_speeches_df = pd.merge(paragraphs_df, blocks_df.drop(columns=['speech']), on='unique_speech_id', how='left')
print(_paragraphs_speeches_df.shape)

congress_114_paragraphs_df = pd.merge(_paragraphs_speeches_df, congressional_hearings_df.drop(columns=['document']), on='document_id', how='left')
print(f'Full 114th Congress Paragraphs DF shape: {congress_114_paragraphs_df.shape}')
congress_114_paragraphs_df.head()

(60562, 4)
(60562, 11)
Full 114th Congress Paragraphs DF shape: (60562, 15)


Unnamed: 0,paragraph_id,paragraph,unique_speech_id,paragraph_len,document_id,speech_id,document_title,hearing_date,chairman,speech_len,final_speaker,congress,office,department,name
0,2,"senator tillis.,something i would like to see ...",42,165,0,477.0,ending veteran homelessness,"july 29, 2015",johnny isakson,413,thom tillis,114th,Senate,Committee on Veterans Affairs,15.txt
1,4,"senator manchin.,hon.,joe manchin, u.s. senato...",45,194,0,499.0,ending veteran homelessness,"july 29, 2015",johnny isakson,194,joe manchin,114th,Senate,Committee on Veterans Affairs,15.txt
2,5,"senator manchin.,well, i agree.,the thing that...",48,161,0,523.0,ending veteran homelessness,"july 29, 2015",johnny isakson,161,joe manchin,114th,Senate,Committee on Veterans Affairs,15.txt
3,6,"senator manchin.,they cannot pass a drug test....",50,146,0,538.0,ending veteran homelessness,"july 29, 2015",johnny isakson,146,joe manchin,114th,Senate,Committee on Veterans Affairs,15.txt
4,7,"senator manchin.,i am sorry.,dr. o'toole.,no, ...",55,101,0,566.0,ending veteran homelessness,"july 29, 2015",johnny isakson,101,joe manchin,114th,Senate,Committee on Veterans Affairs,15.txt


In [223]:
chairman_paragraphs_df = congress_114_paragraphs_df[congress_114_paragraphs_df['final_speaker'].isin(chairmen_names)]
#chairman_paragraphs_df = congress_114_paragraphs_df[congress_114_paragraphs_df['chairman'] == congress_114_paragraphs_df['final_speaker']]

def FormatParagraph(paragraph):
    punc_filter = re.compile('([.!?]\s*)')
    split_with_punctuation = punc_filter.split(paragraph)
    final = ''.join([i.capitalize() for i in split_with_punctuation])+'.'
    return final

'''percentage_of_speech = []
for i, row in chairm_paragraphs_df.iterrows():
    _percentage_of_speech = row['paragraph_len'] / chairman_paragraphs_df.groupby('speaker')['paragraph_len'].sum()[row['speaker']] * 100
    _percentage_of_speech = round(_percentage_of_speech, 2)
    percentage_of_speech.append(_percentage_of_speech)'''

chairman_paragraphs_df['paragraph'] = chairman_paragraphs_df['paragraph'].apply(lambda x: FormatParagraph(x))
chairman_paragraphs_df['percentage_of_speech'] = [round(row['paragraph_len']/row['speech_len']*100, 2) for i, row in chairman_paragraphs_df.iterrows()]
chairman_paragraphs_df.to_excel('chairman_paragraphs_df.xlsx', index=False)
print(f'114th Congress Speaker Paragraphs DF shape: {chairman_paragraphs_df.shape}')
chairman_paragraphs_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


114th Congress Speaker Paragraphs DF shape: (16613, 16)


Unnamed: 0,paragraph_id,paragraph,unique_speech_id,paragraph_len,document_id,speech_id,document_title,hearing_date,chairman,speech_len,final_speaker,congress,office,department,name,percentage_of_speech
6,9,"Senator boozman.,hon.,john boozman, u.Ssenator...",60,110,0,600.0,ending veteran homelessness,"july 29, 2015",johnny isakson,207,john boozman,114th,Senate,Committee on Veterans Affairs,15.txt,53.14
7,11,"Senator boozman.,mr. Chairman, we really need ...",71,114,0,654.0,ending veteran homelessness,"july 29, 2015",johnny isakson,114,john boozman,114th,Senate,Committee on Veterans Affairs,15.txt,100.0
8,13,"O'toole.,i am happy to jump in on that.,thank ...",73,111,0,664.0,ending veteran homelessness,"july 29, 2015",johnny isakson,324,dan sullivan,114th,Senate,Committee on Veterans Affairs,15.txt,34.26
9,16,"Senator heller.,is there a hard number?,dr. O'...",88,132,0,773.0,ending veteran homelessness,"july 29, 2015",johnny isakson,132,dean heller,114th,Senate,Committee on Veterans Affairs,15.txt,100.0
23,36,"Senator sullivan.,hon.,dan sullivan, u.Ssenato...",300,148,1,690.0,gao's high-risk list and the veterans health a...,"april 29, 2015",johnny isakson,316,dan sullivan,114th,Senate,Committee on Veterans Affairs,14.txt,46.84


In [224]:
print(f'There are {chairman_paragraphs_df.final_speaker.nunique()} speakers represented in the data')
chairman_paragraphs_df.final_speaker.value_counts()

There are 102 speakers represented in the data


kelly ayotte           632
dan sullivan           603
duncan hunter          602
harold rogers          470
jason chaffetz         468
john mccain            457
mark meadows           407
james lankford         402
jerry moran            390
deb fischer            354
steve chabot           340
austin scott           328
dana rohrabacher       293
scott perry            287
bill cassidy           271
barry loudermilk       258
martha mcsally         256
john barrasso          241
mike rounds            240
ileana ros-lehtinen    233
jim bridenstine        229
cory gardner           225
tom cotton             225
mike coffman           213
bob goodlatte          211
joni ernst             204
ron desantis           199
trey gowdy             198
marco rubio            196
virginia foxx          194
daniel coats           194
tom marino             190
john katko             189
john ratcliffe         182
trent franks           179
pat roberts            178
matt salmon            178
v