In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('data/csv/processed/record-1983.csv', nrows=200). \
    drop('Unnamed: 0', axis=1)
data.head()

Unnamed: 0,page,part,speaker,speech,year
0,15,3,Mr. GREEN.,MEASURE ORDERED HELD AT DESK The Committee...,1983
1,41,3,Mr. THURMOND.,"Mr. President, I rise today to introduce ...",1983
2,80,3,Mr. HELMS.,"Mr. President, I wonder if Senators noted ...",1983
3,93,3,The PRESIDING OFFICER.,Is there objection? The Chair hears none...,1983
4,93,3,Mr. METZENBAUM.,"Mr. President, I ask unanimous consent th...",1983


In [3]:
data['speech'] = data['speech'].str.replace('Mr\.', 'Mr')
data['speaker'] = data['speaker'].str.replace('  ',' ')
data['congress'] = round(data['year']/2-894)

#remove_speakers = ['The PRESIDING OFFICER',
#                   'The SPEAKER pro tempore']
#data = data[~data.speaker.isin(remove_speakers)]

# Mapping to ideology

In [4]:
pd.set_option('display.max_columns', 30)

In [5]:
ideology_scores = pd.read_csv('data/HSall_members.csv',
    usecols=['congress', 'chamber', 'icpsr', 'bioname', 'nominate_dim1']). \
    query('nominate_dim1 == nominate_dim1 & congress > 92')
ideology_scores['last_name'] = ideology_scores['bioname'].str.split(',').str[0]
ideology_scores['last_name'] = ideology_scores['last_name'].str.upper()
ideology_scores = ideology_scores[ideology_scores['chamber'] != 'President']

ideology_scores.head()

Unnamed: 0,congress,chamber,icpsr,bioname,nominate_dim1,last_name
36261,93,House,5058,"JONES, Robert Emmett, Jr.",-0.358,JONES
36262,93,House,10706,"BUCHANAN, John Hall, Jr.",0.146,BUCHANAN
36263,93,House,10717,"DICKINSON, William Louis",0.398,DICKINSON
36264,93,House,10721,"EDWARDS, William Jackson (Jack)",0.177,EDWARDS
36265,93,House,11000,"BEVILL, Tom",-0.213,BEVILL


In [6]:
speakers = data[['speaker', 'congress']].copy().drop_duplicates()
speakers['speaker'] = speakers['speaker'].str.replace('M[rs]{1,2}\. ','')
speakers['speaker'] = speakers['speaker'].str.replace('\.$','')
speakers['speaker'] = speakers['speaker'].str.replace('\sof.+','')
speakers['congress'] = speakers['congress'].astype(int)

remove_speakers = ['The PRESIDING OFFICER',
                   'The SPEAKER pro tempore']
speakers = speakers[~speakers.speaker.isin(remove_speakers)]

speakers.head()

Unnamed: 0,speaker,congress
0,GREEN,98
1,THURMOND,98
2,HELMS,98
4,METZENBAUM,98
6,MICHEL,98


In [7]:
pd.merge(speakers, ideology_scores[['last_name','nominate_dim1', 'congress']],
        left_on=['speaker','congress'], right_on=['last_name','congress'],
        how='left')

Unnamed: 0,speaker,congress,last_name,nominate_dim1
0,GREEN,98,GREEN,-0.044
1,THURMOND,98,THURMOND,0.391
2,HELMS,98,HELMS,0.631
3,METZENBAUM,98,METZENBAUM,-0.492
4,MICHEL,98,MICHEL,0.372
5,KAPTUR,98,KAPTUR,-0.350
6,COURTER,98,COURTER,0.269
7,CORCORAN,98,CORCORAN,0.319
8,BREAUX,98,BREAUX,-0.123
9,SCHUMER,98,SCHUMER,-0.351


In [8]:
ideology_scores.query('last_name == "FORD" & congress == 98')

Unnamed: 0,congress,chamber,icpsr,bioname,nominate_dim1,last_name
39179,98,House,10727,"FORD, William David",-0.516,FORD
39355,98,House,14224,"FORD, Harold Eugene",-0.438,FORD
39458,98,Senate,14302,"FORD, Wendell Hampton",-0.277,FORD


In [9]:
temp = ideology_scores[ideology_scores['bioname'].str.contains('sam',case=False)]
temp[temp['congress'] == 98]

Unnamed: 0,congress,chamber,icpsr,bioname,nominate_dim1,last_name
39057,98,House,14825,"GEJDENSON, Samuel",-0.416,GEJDENSON
39066,98,House,10588,"GIBBONS, Sam Melville",-0.233,GIBBONS
39186,98,House,14450,"SAWYER, Harold Samuel",0.167,SAWYER
39242,98,House,9017,"STRATTON, Samuel Studdiford",-0.139,STRATTON
39371,98,House,14291,"HALL, Sam Blakeley, Jr.",0.054,HALL
39443,98,Senate,14108,"NUNN, Samuel Augustus",-0.145,NUNN


# Sentences

In [10]:
sentences = pd.concat([data['speech'].str.split('\. ', expand=True)])
df = pd.concat([data, sentences], axis=1). \
    drop('speech', axis=1)
    
df = pd.melt(df, id_vars=['page', 'part', 'speaker', 'year'], value_name='sentence'). \
    sort_values(by=['part', 'page']). \
    reset_index(). \
    drop(['variable', 'index'], axis=1)
    
df['sent_length'] = df['sentence'].str.len()
df = df.query('sent_length > 8'). \
    drop('sent_length', axis=1)
    
df.head()

Unnamed: 0,page,part,speaker,year,sentence
1,15,3,Mr. GREEN.,1983,MEASURE ORDERED HELD AT DESK The Committee...
2,15,3,Mr. GREEN.,1983,1043) to amend section 8(e) of the Small Bus...
3,15,3,Mr. GREEN.,1983,EXECUTIVE AND OTHER COMMUNICATIONS The foll...
4,15,3,Mr. GREEN.,1983,A communication from the Ad ministrator of th...
6,15,3,Mr. GREEN.,1983,A communication from the Vice President of ...
