### Sample 10,000 speeches from each session of congress

In [11]:
def sample_speechids(the_path, seat, sample_size):
    '''function to sample from file of text speeches'''
    import os
    import pandas as pd
    import numpy as np
    
    the_path = the_path
    the_dirs = os.listdir(the_path)

    the_df = pd.DataFrame()
    the_filesnames = []
    #iterate through each directory one by by
    for dir_name in the_dirs:
        if dir_name[0]=="0" or dir_name[0]=="1":
            the_filenames = the_path + "/" + dir_name
            the_filesnames.append(dir_name)

    for line in the_filesnames:
        f = pd.read_csv(filepath_or_buffer = the_path + "/" + line, sep="|", header = None, engine="python")

        df = pd.DataFrame(f)
        df = df.rename(columns=df.iloc[0]).drop(df.index[0])
        df = df[df['chamber']==seat]
        np.random.seed(9)
        sampled_df = df.sample(sample_size, replace=False)
        the_df = the_df.append(sampled_df, ignore_index=True)
    return(the_df)

In [12]:
samples = sample_speechids("./hein-bound", "S", 10000)

In [13]:
samples

Unnamed: 0,speakerid,speech_id,lastname,firstname,chamber,state,gender,party,district,nonvoting
0,103118361,1030189997,MCCONNELL,MITCH,S,KY,M,R,,voting
1,103112231,1030213776,WELLSTONE,PAUL,S,MN,M,D,,voting
2,103112321,1030126026,CAMPBELL,BEN,S,CO,M,R,,voting
3,103112301,1030008162,BREAUX,JOHN,S,LA,M,D,,voting
4,103108471,1030231926,RIEGLE,DONALD,S,MI,M,D,,voting
...,...,...,...,...,...,...,...,...,...,...
179995,109112991,1090005859,FRIST,WILLIAM,S,TN,M,R,,voting
179996,109113441,1090084523,TALENT,JAMES,S,MO,M,R,,voting
179997,109121541,1090180490,THUNE,JOHN,S,SD,M,R,,voting
179998,109113921,1090090920,WARNER,JOHN,S,VA,M,R,,voting


In [14]:
samples['fullname']=samples['firstname']+' '+samples['lastname']

In [None]:
#change altnerative party politicans to D/R party they most closely align with
sampled_df.loc[sampled_df['fullname'] == 'BERNARD SANDERS', 'party'] = "D"
sampled_df.loc[sampled_df['fullname'] == 'JOSEPH LIEBERMAN', 'party'] = "D"
sampled_df.loc[sampled_df['fullname'] == 'HARRY BYRD', 'party'] = "D"
sampled_df.loc[sampled_df['fullname'] == 'JAMES JEFFORDS', 'party'] = "D"
sampled_df.loc[sampled_df['fullname'] == 'DEAN BARKLEYS', 'party'] = "D"
sampled_df.loc[sampled_df['fullname'] == 'JAMES BUCKLEY', 'party'] = "R"
speeches_merged.loc[speeches_merged['fullname'] == 'DEAN BARKLEY', 'party'] = "R"

In [None]:
#Create party label 
party=pd.get_dummies(samples['party'],prefix="party",drop_first=True)
samples[party.columns]=party

In [31]:
def read_speech(the_path):
    '''function to read in speech files'''
    import os
    import pandas as pd
    
    the_path = the_path
    the_dirs = os.listdir(the_path)

    the_df = pd.DataFrame()

    the_filesnames = []

    #iterate through each directory one by by
    for dir_name in the_dirs:
        if dir_name[0]=="s":
            the_filenames = the_path + "/" + dir_name
            the_filesnames.append(dir_name)

    for line in the_filesnames:
        f = open(the_path + "/" + line, "r", encoding= 'ISO-8859-1')
        speech_id = []
        speech = []
        congress = []
        for line in f:
            if line[0]=="9":
                speech_id.append(line[0:9])
                speech.append(line[10:])            
                congress.append(line[0:2])
            if line[0]=="1":
                speech_id.append(line[0:10])
                speech.append(line[11:])
                congress.append(line[0:3])

        df = pd.DataFrame(list(zip(speech_id, speech, congress)), 
                       columns =['speech_id', 'speech', 'congress'])
        df = df.drop([0]) #remove first row
        the_df = the_df.append(df, ignore_index=True)
        
    return(the_df)

In [32]:
speeches = read_speech("/Users/kristenakey/Documents/Documents/QMSS/NLP/Project/hein-bound")

In [33]:
speeches

Unnamed: 0,speech_id,speech,congress
0,940000002,The Chair lays before the Senate letters of re...,94
1,940000003,Mr. President. I ask unanimous consent that th...,94
2,940000004,Is there objection?\n,94
3,940000005,Mr. President. reserving the right to objectan...,94
4,940000006,The credentials will be laid before the Senate.\n,94
...,...,...,...
4640376,990286727,Mr. Speaker. It is my privilege and pleasure t...,99
4640377,990286728,Mr. Speaker. on October 6. 1986. I requested a...,99
4640378,990286729,Mr. Speaker. I want to discuss the moneylaunde...,99
4640379,990286730,Mr. Speaker. I would like to express my suppor...,99


In [34]:
#Merge sampled speech IDs and speeches raw
speech_merged = pd.merge(samples, speeches, on=['speech_id'], how='left')

In [35]:
speech_merged

Unnamed: 0,speakerid,speech_id,lastname,firstname,chamber,state,gender,party,district,nonvoting,speech,congress
0,103118361,1030189997,MCCONNELL,MITCH,S,KY,M,R,,voting,Mr. President. I thank the distinguished Senat...,103
1,103112231,1030213776,WELLSTONE,PAUL,S,MN,M,D,,voting,Madam President. I say to my colleague from So...,103
2,103112321,1030126026,CAMPBELL,BEN,S,CO,M,R,,voting,Mr. President. I am pleased that the bill I in...,103
3,103112301,1030008162,BREAUX,JOHN,S,LA,M,D,,voting,Mr. President. I yield back the remainder of m...,103
4,103108471,1030231926,RIEGLE,DONALD,S,MI,M,D,,voting,Mr. President. I rise to commend my friend and...,103
...,...,...,...,...,...,...,...,...,...,...,...,...
179995,109112991,1090005859,FRIST,WILLIAM,S,TN,M,R,,voting,I ask unanimous consent that the resolution an...,109
179996,109113441,1090084523,TALENT,JAMES,S,MO,M,R,,voting,I thank the Senator for his ingenuous unanimou...,109
179997,109121541,1090180490,THUNE,JOHN,S,SD,M,R,,voting,Mr. President. today I wish to recognize White...,109
179998,109113921,1090090920,WARNER,JOHN,S,VA,M,R,,voting,Mr. President. I now ask further unanimous con...,109


In [245]:
#speech_merged.to_pickle(r'./pickled_data/project-180000_speech_merged.pkl')

In [None]:
with open("./extended_stop_words.txt", 'r') as input_file:
    ext_stop_words = input_file.read().split(',')
ext_stop_words = [str(i.lower()) for i in ext_stop_words]
ext_stop_words.append(',,')
ext_stop_words.append(',')

In [241]:
def speech_clean(df):
    '''function to clean/parse speech files'''
    import re
    import nltk
    from nltk.corpus import stopwords
    stop_words = nltk.corpus.stopwords.words('english')
    stop_words.extend(ext_stop_words)
    from nltk.stem import PorterStemmer
    my_stem = PorterStemmer()
    from nltk.stem import WordNetLemmatizer 
    lemmatizer = WordNetLemmatizer()
    dictionary = set(w.lower() for w in nltk.corpus.words.words())
    from tqdm import tqdm

    the_df = pd.DataFrame()
    speechl = pd.Series.tolist(df[['speech']])
    
    #for check
    congressl = pd.Series.tolist(df[['congress']]) # as check
    congressl = [item for sublist in congressl for item in sublist]

    #for check
    speech_idl = pd.Series.tolist(df[['speech_id']]) 
    speech_idl = [item for sublist in speech_idl for item in sublist]

    for line, idnum, session in tqdm(zip(speechl, speech_idl, congressl), total=170000, position=0, leave=True):
        speech_og = []
        for i in line:
            if type(i)==float:
                speech_og.append("NA")
            else:
                i = i[:-1]
                speech_og.append(i)

        #Clean text
        tmp_read =re.sub("[\.]", ' ', str(line)).lower()
        tmp_read = re.sub("['\n']", '', tmp_read)
        tmp_read = re.sub('[^a-zA-Z]+', ' ', tmp_read)

        #Tokenization and remove stop words
        tmp_read = [word for word in tmp_read.split() if word not in stop_words]

            #dictionary words
        dict_read = [word for word in tmp_read if word in dictionary]

        #Stemming
        tmp_read_stm = [my_stem.stem(word) for word in tmp_read]
        dict_read_stm = [my_stem.stem(word) for word in dict_read]

        #Lemminization 
        tmp_read_lem = [lemmatizer.lemmatize(word) for word in tmp_read]
        dict_read_lem = [lemmatizer.lemmatize(word) for word in dict_read]

        #rejoin speeches
        tmp_read = ' '.join(tmp_read)
        tmp_read_stm = ' '.join(tmp_read_stm)
        tmp_read_lem = ' '.join(tmp_read_lem)

        dict_read = ' '.join(dict_read)
        dict_read_stm = ' '.join(dict_read_stm)
        dict_read_lem = ' '.join(dict_read_lem)

        #df
        tmp = pd.DataFrame([speech_og], columns=['original speech'])
        tmp['body'] = tmp_read
        tmp['body_stem'] = tmp_read_stm
        tmp['body_lem'] = tmp_read_lem
        tmp['body_dict'] = dict_read
        tmp['body_dict_stem'] = dict_read_stm
        tmp['body_dict_lem'] = dict_read_lem
        
        #for check
        tmp['congress'] = session
        tmp['speech_id'] = idnum

        the_df = the_df.append(tmp, ignore_index=True)
    return(the_df)
    #the_df.to_csv(r'/Users/kristenakey/Desktop/hein-bound-111.csv', index=False)

In [242]:
speech_parsed = speech_clean(speech_merged)

100%|██████████| 180000/180000 [8:59:58<00:00,  5.56it/s]       


In [243]:
speech_parsed

Unnamed: 0,original speech,body,body_stem,body_lem,body_dict,body_dict_stem,body_dict_lem,congress,speechid
0,Mr. President. I thank the distinguished Senat...,mr president thank distinguished senator oklah...,mr presid thank distinguish senat oklahoma lea...,mr president thank distinguished senator oklah...,mr president thank distinguished senator oklah...,mr presid thank distinguish senat oklahoma lea...,mr president thank distinguished senator oklah...,103,1030189997
1,Madam President. I say to my colleague from So...,madam president say colleague south dakota abs...,madam presid say colleagu south dakota absolut...,madam president say colleague south dakota abs...,madam president say colleague south dakota abs...,madam presid say colleagu south dakota absolut...,madam president say colleague south dakota abs...,103,1030213776
2,Mr. President. I am pleased that the bill I in...,mr president pleased bill introduced last apri...,mr presid pleas bill introduc last april desig...,mr president pleased bill introduced last apri...,mr president bill last april designate federal...,mr presid bill last april design feder courtho...,mr president bill last april designate federal...,103,1030126026
3,Mr. President. I yield back the remainder of m...,mr president yield back remainder time suggest...,mr presid yield back remaind time suggest prev...,mr president yield back remainder time suggest...,mr president yield back remainder time suggest...,mr presid yield back remaind time suggest prev...,mr president yield back remainder time suggest...,103,1030008162
4,Mr. President. I rise to commend my friend and...,mr president rise commend friend colleague sen...,mr presid rise commend friend colleagu senat l...,mr president rise commend friend colleague sen...,mr president rise commend friend colleague sen...,mr presid rise commend friend colleagu senat l...,mr president rise commend friend colleague sen...,103,1030231926
...,...,...,...,...,...,...,...,...,...
179995,I ask unanimous consent that the resolution an...,ask unanimous consent resolution preamble agre...,ask unanim consent resolut preambl agre en blo...,ask unanimous consent resolution preamble agre...,ask unanimous consent resolution preamble agre...,ask unanim consent resolut preambl agre en blo...,ask unanimous consent resolution preamble agre...,109,1090005859
179996,I thank the Senator for his ingenuous unanimou...,thank senator ingenuous unanimous consent requ...,thank senat ingenu unanim consent request allo...,thank senator ingenuous unanimous consent requ...,thank senator ingenuous unanimous consent requ...,thank senat ingenu unanim consent request go f...,thank senator ingenuous unanimous consent requ...,109,1090084523
179997,Mr. President. today I wish to recognize White...,mr president today wish recognize white lake i...,mr presid today wish recogn white lake indepen...,mr president today wish recognize white lake i...,mr president today wish recognize white lake i...,mr presid today wish recogn white lake indepen...,mr president today wish recognize white lake i...,109,1090180490
179998,Mr. President. I now ask further unanimous con...,mr president ask unanimous consent senate proc...,mr presid ask unanim consent senat proceed imm...,mr president ask unanimous consent senate proc...,mr president ask unanimous consent senate proc...,mr presid ask unanim consent senat proceed imm...,mr president ask unanimous consent senate proc...,109,1090090920


In [244]:
#speech_parsed.to_pickle(r'./pickled_data/project-180000_speeches_parsed.pkl')

In [246]:
#rejoin clean/parsed speeches to speech_merged 
speech_c = pd.merge(speech_merged, speech_parsed, left_on=['speech_id', "congress"], right_on=['speech_id', "congress"], how='left')#[['cfscore','seat','Cand.ID','Name']]

In [250]:
#speech_c.to_pickle(r'./pickled_data/project-180000_speech_c.pkl')

### Split speeches from each session of congress into unique dataframes 

In [55]:
speech_c = pd.read_pickle(r'./pickled_data/project-180000_speech_c.pkl')

i.e df_103 = 103rd congressional speeches