In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the processed 1999 record and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv -- confirm upstream).
data = (
    pd.read_csv('data/csv/processed/record-1999.csv')
    .drop(columns=['Unnamed: 0'])
)
data.head()

Unnamed: 0,branch,congressID,ideology_score,page,part,speaker,speech,year
0,,106,0.391,1,1999-01-08,Mr. THURMOND.,I suggest the absence of a quorum. The PRESI...,1999
1,,106,0.39,1,1999-01-08,Mr. LOTT.,"Mr. President, to inform all of our colleague...",1999
2,Senate,106,0.331,188,1999-01-14,Mr. BENNETT.,"Your Honor, excuse me, Mr. President, I need ...",1999
3,Senate,106,0.39,2,1999-01-22,Mr. LOTT.,"Mr. Chief Justice, I do send the first questi...",1999
4,Senate,106,-0.326,30,1999-01-22,Mr. ROCKEFELLER.,"Mr. President, I would like to join Senator F...",1999


In [3]:
# Drop rows whose speaker is one of the chair labels listed below.
# .copy() gives an independent frame so the column assignment further down
# does not write into a slice of the original (SettingWithCopyWarning).
remove_speakers = ['The PRESIDING OFFICER.',
                   'The SPEAKER pro tempore.']
data = data[~data.speaker.isin(remove_speakers)].copy()

# Ordered (pattern, replacement) pairs applied to every speech.
# The periods of common abbreviations are removed so that the later
# sentence split on '. ' does not break in the middle of "Mr. X" or "H.R. 12".
# Order matters: whitespace is collapsed *before* the patterns that end in a
# literal space, so "a.m.\n" is first normalised to "a.m. " and then matched.
speech_substitutions = [
    (r'Mr\.', 'Mr'),
    (r'Mrs\.', 'Mrs'),
    (r'Ms\.', 'Ms'),
    (r'\s+', ' '),
    (r'a\.m\. ', 'am '),
    (r'p\.m\. ', 'pm '),
    (r'H\.R\. ', 'HR '),
]

cleaned_speech = data['speech']
for pattern, replacement in speech_substitutions:
    # regex=True is explicit: since pandas 2.0 str.replace defaults to literal
    # matching, which would silently turn every one of these into a no-op.
    cleaned_speech = cleaned_speech.str.replace(pattern, replacement, regex=True)
data['speech'] = cleaned_speech

data.head()

Unnamed: 0,branch,congressID,ideology_score,page,part,speaker,speech,year
0,,106,0.391,1,1999-01-08,Mr. THURMOND.,I suggest the absence of a quorum. The PRESID...,1999
1,,106,0.39,1,1999-01-08,Mr. LOTT.,"Mr President, to inform all of our colleagues...",1999
2,Senate,106,0.331,188,1999-01-14,Mr. BENNETT.,"Your Honor, excuse me, Mr President, I need s...",1999
3,Senate,106,0.39,2,1999-01-22,Mr. LOTT.,"Mr Chief Justice, I do send the first questio...",1999
4,Senate,106,-0.326,30,1999-01-22,Mr. ROCKEFELLER.,"Mr President, I would like to join Senator FR...",1999


# Sentences

In [4]:
import re

# Split every speech into one column per sentence. A multi-character pattern
# is treated as a regular expression by str.split, so r'\. ' means
# "a literal period followed by a space".
sentences = data['speech'].str.split(r'\. ', expand=True)

# Attach the sentence columns to the metadata and drop the full speech text.
df = pd.concat([data, sentences], axis=1).drop('speech', axis=1)

# Wide -> long: one row per (speech, sentence position). Speeches shorter
# than the longest one contribute missing sentences, which are removed by
# the length filter below.
df = (
    pd.melt(df,
            id_vars=['branch', 'congressID', 'ideology_score', 'page', 'part',
                     'speaker', 'year'],
            value_name='sentence')
    .sort_values(by=['year', 'part', 'page'])
    .reset_index(drop=True)
    .drop('variable', axis=1)
)

# Remove 'VerDate' and everything after it within the sentence.
# regex=True is explicit: since pandas 2.0 str.replace matches literally by
# default, which would leave this residue in place.
df['sentence'] = df['sentence'].str.replace(r'VerDate.*', '', regex=True)

# Keep sentences longer than 8 characters in which fewer than 40% of the
# word characters are capital letters A-Z (presumably this removes headings
# and similar all-caps fragments -- confirm against the data). Missing
# sentences compare as False and are dropped here as well.
df = (
    df.assign(
        sent_length=df['sentence'].str.len(),
        caps_length=df['sentence'].str.count('[A-Z]'),
        letters_length=df['sentence'].str.count(r'\w'),
    )
    .assign(caps_prop=lambda frame: frame['caps_length'] / frame['letters_length'])
    .query('sent_length > 8 & caps_prop < .4')
    .drop(['sent_length', 'caps_length', 'letters_length', 'caps_prop'], axis=1)
)

df.head()

Unnamed: 0,branch,congressID,ideology_score,page,part,speaker,year,sentence
0,,106,0.391,1,1999-01-08,Mr. THURMOND.,1999,I suggest the absence of a quorum
1,,106,0.39,1,1999-01-08,Mr. LOTT.,1999,"Mr President, to inform all of our colleagues..."
3,,106,0.39,1,1999-01-08,Mr. LOTT.,1999,"And when a quorum is established, the Senate w..."
4,,106,0.391,1,1999-01-08,Mr. THURMOND.,1999,The clerk will call the roll
6,,106,0.391,1,1999-01-08,Mr. THURMOND.,1999,The legislative clerk proceeded call the roll


In [5]:
# Widen the display limits so long sentences are not truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 100)

# Spot-check sentences that contain four or more digits.
# The group is non-capturing: str.contains emits a UserWarning when the
# pattern has capture groups, and the matched rows are the same either way.
has_four_digits = df['sentence'].str.contains(r'(?:.*[0-9]){4}')

# Print matches 50-99 (positional slice), each followed by a blank line.
for sentence in df.loc[has_four_digits, 'sentence'].iloc[50:100]:
    print(sentence, '\n')



While it is important that the Congress consider again this issue in the 106th Congress, I would also urge the Secretary to consider the facilities I mentioned qualified under Section 29 if they met the Service’s criteria for placed-in-service by June 30, 1998 whether or not such facilities were consistently producing commercial quantities of marketable products on a daily basis 

The Production Tax Credit, section 45 of the Internal Revenue Code was enacted as part of the Energy Policy Act of 1992 

This tax credit currently provides a 1.5 cent per kilowatt hour credit for energy produced from a new facility brought on-line after December 31, 1993 and before July 1, 1999 for the first ten years of the facility’s existence 

1459, currently has 22 cosponsors, including half of the Finance Committee 

 Mr President, today, I am introducing, with Senators BROWNBACK, BAUCUS, and KERREY, the Food and Medicine for the World Act of 1999 

Exports now account for 30 percent of gross cash rece

In [6]:
# Sanity check: list any sentences that still contain the 'VerDate' marker
# after the sentence clean-up; an empty frame means none remain.
df.loc[df['sentence'].str.contains('VerDate')]

Unnamed: 0,branch,congressID,ideology_score,page,part,speaker,year,sentence
