# Congressional Speech Analysis

In [1]:
import os
import glob
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
pd.options.display.max_columns = 1200  
pd.options.display.max_rows = 1200 

import itertools
import datetime
import datefinder
import operator

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# nltk.download('vader_lexicon')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from metaphone import doublemetaphone
import re
import unicodedata

def listdir_nohidden(path):
    return glob.glob(os.path.join(path, '*'))

### Get files

In [2]:
from IPython.display import clear_output

FILES_PATH = '/Users/yc00027/Documents/GitHub/NLP_project/Data'

congresses = sorted(listdir_nohidden(FILES_PATH))

congressional_hearings = []

for congress in congresses:
    clear_output()
    print(f'Now loading congress {congress}')
    offices = listdir_nohidden(congress)
    for office in offices:
        clear_output()
        print(f'Now loading office {office}')
        congress = os.path.split(os.path.dirname(office))[1]
        departments = listdir_nohidden(office)
        for department in departments:
            clear_output()
            office = os.path.split(os.path.dirname(department))[1]
            try:
                session_files = listdir_nohidden(department)
            except:
                print(f'No txt files for session {session}')
                continue

            txt_files = [f for f in session_files if f.endswith('.txt')]
            for file in txt_files:
                with open(file, errors='replace') as f:
                    document = f.read().lower() 
                    department = os.path.split(os.path.dirname(file))[1]
                    name = os.path.basename(file)
                    congressional_hearings.append([document, congress, office, department, name])
            
clear_output()
print('Done')

congressional_hearings_df = pd.DataFrame(congressional_hearings)
congressional_hearings_df.columns = ['document', 'congress','office','department','name']
congressional_hearings_df = congressional_hearings_df.sort_values(by=['congress', 'office','department','name'])

print(f'Congressional Hearings DF shape: {congressional_hearings_df.shape}')
congressional_hearings_df.head()

Done
Congressional Hearings DF shape: (2535, 5)


Unnamed: 0,document,congress,office,department,name
1837,\n - eastern mediterranean energy: challenges ...,114th,House,Ad Hoc Committee on Energy,1.txt
1659,\n - [errata] manipulation and fraud in the re...,114th,House,Commission on Security and Cooperation in Europe,1.txt
1658,\n - the rule of law and civil society in azer...,114th,House,Commission on Security and Cooperation in Europe,2.txt
1657,\n - human rights violations in russian-occupi...,114th,House,Commission on Security and Cooperation in Europe,3.txt
1656,\n - nato's warsaw summit and the future of eu...,114th,House,Commission on Security and Cooperation in Europe,4.txt


In [11]:
bills93_114_df = pd.read_csv('bills93-114.csv', error_bad_lines=False, encoding='ISO-8859-1', sep = ';')
bills_114 = bills93_114_df[bills93_114_df['Cong'] == 114]

print(f'There were {bills_114.shape[0]} bills in Congress 114th.')

speaker_names = sorted(bills_114['NameFull'].str.lower().unique())
print(f'There were {len(speaker_names)} speakers in Congress 114th.')
print('')

print('Number of bills according to bill type:')
print(bills_114['BillType'].value_counts())
print('')

print('Bill types: "hr" (House Bill); "s" (Senate Bill); "hres" (House Resolution); "sres" (Senate Resolution); "hcon" (House Concurrent Resolution); "scon" (Senate Concurrent Resolution); "hjres" (House Joint Resolution); "sjres" (Senate Joint Resolution).')

There were 12043 bills in Congress 114th.
There were 542 speakers in Congress 114th.

Number of bills according to bill type:
hr         6508
s          3547
hres        956
sres        642
hconres     183
hjres       108
sconres      58
sjres        41
Name: BillType, dtype: int64

Bill types: "hr" (House Bill); "s" (Senate Bill); "hres" (House Resolution); "sres" (Senate Resolution); "hcon" (House Concurrent Resolution); "scon" (Senate Concurrent Resolution); "hjres" (House Joint Resolution); "sjres" (Senate Joint Resolution).


### Get paragraphs

In [4]:
documents = []

for index, document in congressional_hearings_df['document'].items():
    try:
        documents.append({
                'document_id': index,
                'document': document,
                'title': document.split('\n')[1].replace(' - ', ''),
                'chairman': [sentence for sentence in document.split('\n\n') if 'chairman' in sentence][0].split(',')[0].lstrip(),
                'date': document.split('\n\n                               __________\n\n')[1].lstrip().split('\n')[0]
        })
    except Exception as e:
        continue
        
documents_df = pd.DataFrame(documents)

documents_df = documents_df[documents_df['chairman'].str.split().apply(len) <= 3]
documents_df = documents_df[documents_df['chairman'] != 'epstein. great']
documents_df['chairman'] = documents_df['chairman'].apply(lambda x: ' '.join(x.split(' ')[-2:]))

documents_df = documents_df[documents_df['date'].str.contains('\d')]
documents_df['date'] = documents_df['date'].apply(lambda x: ' '.join(x.split(' ')[-3:]))

dates = []
for i, row in documents_df.iterrows():
    regex = re.compile(r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s+\d{4}')
    dates.append(regex.findall(row['date']))
    
documents_df['date'] = dates
documents_df['date'] = documents_df['date'].str[0]

documents_df.dropna(inplace=True)

print(f'Documents DF shape: {documents_df.shape}')
documents_df.head()

Documents DF shape: (1860, 5)


Unnamed: 0,chairman,date,document,document_id,title
0,r. royce,"september 8, 2016",\n - eastern mediterranean energy: challenges ...,1837,eastern mediterranean energy: challenges and o...
1,michael conaway,"february 11, 2015",\n - hearing to review the state of the rural ...,1255,hearing to review the state of the rural economy
2,michael conaway,"april 22, 2015",\n - hearing to review reauthorization of the ...,1220,hearing to review reauthorization of the lives...
3,michael conaway,"april 22, 2015",\n - hearing to review reauthorization of the ...,1223,hearing to review reauthorization of the u.s. ...
4,michael conaway,"april 29, 2015",\n - hearing to review the national forest sys...,1219,hearing to review the national forest system a...


In [5]:
blocks = []
for index, document in zip(documents_df['document_id'], documents_df['document']):
    block_text = re.split('statement of hon.', document)
    for block in block_text:
        try:
            blocks.append({
                'document_id': index,
                'block': block,
                'speaker': block.split(',', 1)[0]
            })
        except Exception as e:
            print(block)
            raise(e)  

blocks_df = pd.DataFrame(blocks).reset_index().rename(columns={'index': 'block_id'})

blocks_df = blocks_df[~blocks_df.speaker.str.contains("\n")]

words_to_remove = ['rable', 'of', 'maryland']
    
blocks_df['speaker'] = [' '.join([y for y in x.split() if y not in words_to_remove]) for x in blocks_df['speaker']]

blocks_df['block_len'] = blocks_df['block'].str.split().apply(len)

print(f'Blocks DF shape: {blocks_df.shape}')

blocks_df.head()

Blocks DF shape: (3806, 5)


Unnamed: 0,block_id,block,document_id,speaker,block_len
2,2,"k. michael conaway, a representative \n ...",1255,k. michael conaway,661
3,3,"k. michael conaway, a representative in \n ...",1255,k. michael conaway,534
4,4,"collin c. peterson, a representative \n ...",1255,collin c. peterson,537
5,5,"thomas ``tom'' j. vilsack, secretary, u.s. \n...",1255,thomas ``tom'' j. vilsack,869
6,6,"thomas ``tom'' j. vilsack, secretary, u.s. \n...",1255,thomas ``tom'' j. vilsack,47825


In [6]:
TOO_SMALL_LENGTH = 100
TOO_LARGE_LENGTH = 300

def split_paragraph(paragraph):
    import copy
    list_of_paragraphs = []
    remaining_paragraph = copy.copy(paragraph)
    paragraph_split = remaining_paragraph.split('. ')
    i = 0
    while len(remaining_paragraph.split()) > TOO_LARGE_LENGTH:
        paragraph_piece = ''
        while len(paragraph_piece.split()) < TOO_SMALL_LENGTH:
            paragraph_piece += paragraph_split[i]
            i += 1
            remaining_paragraph = '. '.join(paragraph_split[i:])
            
        list_of_paragraphs.append(paragraph_piece)
    
    if len(remaining_paragraph.split()) > TOO_SMALL_LENGTH:
        list_of_paragraphs.append(remaining_paragraph)
        
    return list_of_paragraphs

In [7]:
skip_iterations = []
paragraphs = []
unused_paragraphs = []

for index, block in zip(blocks_df['block_id'], blocks_df['block']):
    paragraph_text = block.split('.\n ')
    for i, paragraph in enumerate(paragraph_text):
        if i in skip_iterations:
            continue
            
        paragraph = paragraph.replace('\n', ' ').replace('\t', ' ')
        paragraph_length = len(paragraph.split())
        paragraph_too_small = paragraph_length < TOO_SMALL_LENGTH
        paragraph_too_large = paragraph_length > TOO_LARGE_LENGTH
        if paragraph_too_small:
            try:
                i_plus = 1
                while len(paragraph.split()) < TOO_SMALL_LENGTH:
                    paragraph += paragraph_text[i+i_plus]
                    skip_iterations.append(i+i_plus)                    
                    i_plus += 1
                    
            except IndexError:
                continue
                
        elif paragraph_too_large:
            smaller_paragraphs = split_paragraph(paragraph)
            for p in smaller_paragraphs:
                paragraphs.append({
                    'block_id': index,
                    'paragraph': p
                })
                
            continue
        
        if len(paragraph.split()) < TOO_SMALL_LENGTH:
            break
            assert False
        paragraphs.append({
            'block_id': index,
            'paragraph': paragraph,
        })
            
            
paragraphs_df = pd.DataFrame(paragraphs).reset_index().rename(columns={'index': 'paragraph_id'})

paragraphs_df['paragraph_len'] = paragraphs_df['paragraph'].str.split().apply(len)

print(f'Paragraphs DF shape: {paragraphs_df.shape}')

paragraphs_df.head()

Paragraphs DF shape: (26911, 4)


Unnamed: 0,paragraph_id,block_id,paragraph,paragraph_len
0,0,2,"k. michael conaway, a representative ...",180
1,1,2,saturday marked the 1 year anniversary of t...,129
2,2,2,while the agricultural economy has been tur...,126
3,3,2,while i thank you for your hard work in imp...,129
4,4,3,"k. michael conaway, a representative in ...",145


In [23]:
paragraphs_temp_df = pd.merge(paragraphs_df, blocks_df, on='block_id', how='left')
paragraphs_final_df = pd.merge(paragraphs_temp_df, documents_df, on='document_id', how='left')
congress_114_paragraphs_df = pd.merge(paragraphs_final_df, congressional_hearings_df.reset_index().drop(columns=['document']), left_on='document_id', right_on='index', how='left')

speaker_dict = {'chris smith': 'christopher smith',
                'christopher h. smith': 'christopher smith',
                'h. smith': 'christopher smith', 
                'g. hatch': 'orrin hatch', 
                'orrin g. hatch': 'orrin hatch', 
                'm. inhofe': 'james inhofe', 
                'james m. inhofe': 'james inhofe', 
                'michael conaway': 'k. conaway',
                'k. michael conaway': 'k. conaway',
                'johnny isakson': 'john isakson',
                't. mccaul': 'michael mccaul',
                'jeff sessions': 'jefferson sessions'
               }

congress_114_paragraphs_df = congress_114_paragraphs_df.replace({'chairman': speaker_dict})
congress_114_paragraphs_df = congress_114_paragraphs_df.replace({'speaker': speaker_dict})
congress_114_paragraphs_df.drop(columns=['index'], inplace=True)
print(f'Full 114th Congress Paragraphs DF shape: {congress_114_paragraphs_df.shape}')
congress_114_paragraphs_df.head()

Full 114th Congress Paragraphs DF shape: (26911, 16)


Unnamed: 0,paragraph_id,block_id,paragraph,paragraph_len,block,document_id,speaker,block_len,chairman,date,document,title,congress,office,department,name
0,0,2,"k. michael conaway, a representative ...",180,"k. michael conaway, a representative \n ...",1255,k. conaway,661,k. conaway,"february 11, 2015",\n - hearing to review the state of the rural ...,hearing to review the state of the rural economy,114th,House,Committee on Agriculture,1.txt
1,1,2,saturday marked the 1 year anniversary of t...,129,"k. michael conaway, a representative \n ...",1255,k. conaway,661,k. conaway,"february 11, 2015",\n - hearing to review the state of the rural ...,hearing to review the state of the rural economy,114th,House,Committee on Agriculture,1.txt
2,2,2,while the agricultural economy has been tur...,126,"k. michael conaway, a representative \n ...",1255,k. conaway,661,k. conaway,"february 11, 2015",\n - hearing to review the state of the rural ...,hearing to review the state of the rural economy,114th,House,Committee on Agriculture,1.txt
3,3,2,while i thank you for your hard work in imp...,129,"k. michael conaway, a representative \n ...",1255,k. conaway,661,k. conaway,"february 11, 2015",\n - hearing to review the state of the rural ...,hearing to review the state of the rural economy,114th,House,Committee on Agriculture,1.txt
4,4,3,"k. michael conaway, a representative in ...",145,"k. michael conaway, a representative in \n ...",1255,k. conaway,534,k. conaway,"february 11, 2015",\n - hearing to review the state of the rural ...,hearing to review the state of the rural economy,114th,House,Committee on Agriculture,1.txt


In [33]:
chairman_paragraphs_df = congress_114_paragraphs_df[congress_114_paragraphs_df['chairman'] == congress_114_paragraphs_df['speaker']]
chairman_paragraphs_df['percentage_of_speech'] = round(chairman_paragraphs_df['paragraph_len'] / chairman_paragraphs_df['block_len'] * 100, 2)
chairman_paragraphs_df.drop(columns=['block', 'block_id', 'speaker', 'name', 'block_len', 'document'], inplace=True)

chairman_paragraphs_df.to_csv('congress114_chairman_paragraphs.csv', index=False)
print(f'114th Congress Chairman Paragraphs DF shape: {chairman_paragraphs_df.shape}')
chairman_paragraphs_df.head()

114th Congress Chairman Paragraphs DF shape: (2489, 11)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,paragraph_id,paragraph,paragraph_len,document_id,chairman,date,title,congress,office,department,percentage_of_speech
0,0,"k. michael conaway, a representative ...",180,1255,k. conaway,"february 11, 2015",hearing to review the state of the rural economy,114th,House,Committee on Agriculture,27.23
1,1,saturday marked the 1 year anniversary of t...,129,1255,k. conaway,"february 11, 2015",hearing to review the state of the rural economy,114th,House,Committee on Agriculture,19.52
2,2,while the agricultural economy has been tur...,126,1255,k. conaway,"february 11, 2015",hearing to review the state of the rural economy,114th,House,Committee on Agriculture,19.06
3,3,while i thank you for your hard work in imp...,129,1255,k. conaway,"february 11, 2015",hearing to review the state of the rural economy,114th,House,Committee on Agriculture,19.52
4,4,"k. michael conaway, a representative in ...",145,1255,k. conaway,"february 11, 2015",hearing to review the state of the rural economy,114th,House,Committee on Agriculture,27.15


In [34]:
print(f'There are {chairman_paragraphs_df.chairman.nunique()} speakers represented in the data')
chairman_paragraphs_df.chairman.value_counts()

There are 15 speakers represented in the data


orrin hatch          1124
k. conaway            393
john thune            167
james inhofe          146
lisa murkowski        130
christopher smith     102
john kline             96
david vitter           85
fred upton             63
bob corker             55
john barrasso          54
pat roberts            38
john isakson           28
richard burr            5
roy blunt               3
Name: chairman, dtype: int64

### Get bills

In [None]:
bill_types = [' hr ', ' hr', ' h.r. ', ' h.r.', ' hr. ', ' hr.',
              ' hconres ', ' hconres',  ' hconres. ', ' hconres.', 
              ' hjres ', ' hjres', ' hjres. ', ' hjres.',
              ' hres ', ' hres', ' hres. ', ' hres.',
              ' s. ', ' s.', ' s_ ', ' s_', ' sb ',
              ' sconres ', ' sconres', ' sconres. ', ' sconres.',
              ' sjres ', ' sjres', ' sjres. ', ' sjres.',
              ' sres ', ' sres', ' sres. ', ' sres.']

In [None]:
bills = []

for index, paragraph in zip(paragraphs_df['paragraph_id'], paragraphs_df['paragraph']):
    paragraph = paragraph.replace(' h.r. ', ' hr ').replace(' s. ', ' sb ')
    if any(x in paragraph for x in bill_types):
        try:
            regex = re.compile(r'(?: h.r. | hr | hr. | hconres | hconres. | hjres | hjres. | hres | hres. | \
                                     s. | sb | sconres | sconres. | sjres | sjres. | sres | sres. )+(\d+)+')
            bills.append({'paragraph_id': index,
                          'bill_id': regex.findall(paragraph),
                          'bill_type': [b for b in bill_types if b in paragraph],
                          'paragraph': paragraph
                         })
        except Exception as e:
            print(paragraph)
            raise(e)  

bills_df = pd.DataFrame(bills)
bills_df = bills_df[bills_df.astype(str)['bill_id'] != "[]"]
bills_df['bill_id'] = bills_df['bill_id'].apply(lambda x: list(set(x)))
#bills_df = bills_df[bills_df['bill_id'].map(lambda d: len(d)) == 1]
bills_df['bill_id'] = bills_df['bill_id'].str[0]
bills_df['bill_type'] = bills_df['bill_type'].apply(lambda x: x[0])
bills_df['bill_type'] = bills_df['bill_type'].replace(" sb ", "s").replace(" s.", "s").replace(" hr ", "hr").replace(" hr", "hr")

print(f'Bills DF shape: {bills_df.shape}')
bills_df.head()

In [None]:
print(f'There are {bills_df.bill_id.nunique()} unique bills in the dataframe.')
print('')
print('Number of S. bills and number of H.R. bills:')
print(bills_df.bill_type.value_counts())

### Get final DF

In [None]:
bills_speakers_df = pd.merge(bills_df, blocks_df.drop(columns=['block']), on='block_id', how='left')
print(f'Bills and speakers DF shape: {bills_speakers_df.shape}')
bills_speakers_df.head()

In [None]:
congress_114_full_df = pd.merge(bills_speakers_df, congressional_hearings_df.reset_index(),
                                left_on='document_id',right_on='index', how='left')
print(f'Full 144th Congressional Hearings DF shape: {congress_114_full_df.shape}')
congress_114_full_df.head()

In [None]:
congress_114_full_df['paragraph'].nunique()

In [None]:
congress_114_full_df['sentence'].nunique()

In [None]:
congress_114_full_df['paragraph_len'] = congress_114_full_df['paragraph'].str.split().apply(len)
congress_114_full_df['sentence_len'] = congress_114_full_df['sentence'].str.split().apply(len)
print(congress_114_full_df.paragraph_len.mean())
print(congress_114_full_df.sentence_len.mean())

In [None]:
congress_114_full_df[['paragraph', 'paragraph_len', 'sentence', 'sentence_len']]

In [None]:
len(congress_114_full_df['paragraph'].iloc[0].split())

In [None]:
congress_114_full_df['paragraph'].iloc[0]

In [None]:
def clean_text(text):
    # Step 1. Drop special characters and keep just the alpha
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text) #remove all except A-Z, a-z, 0-9   
    #Step 2. Transform them into lower case
    text = text.lower().split()
    #Step 3. Delete stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]    
    text = " ".join(text)
    return(text)