In [1]:
import PyPDF2
import re
import pandas as pd
import json
from nameparser import HumanName

# import json
# from langchain.document_loaders import PyPDFLoader
# from typing import List
# from langchain.llms import OpenAI
# from langchain.output_parsers import PydanticOutputParser
# from langchain.prompts import PromptTemplate
# from langchain.pydantic_v1 import BaseModel, Field, validator

# Text Parser

In [2]:
states = '(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming)'

In [3]:
def special_cases_name_change(committees_df_raw):
    # Catherine Cortez Masto
    cortez_masto_obj = committees_df_raw[committees_df_raw['full_name'] == 'Catherine Cortez Masto']['name_obj']
    if not cortez_masto_obj.empty:
        cortez_masto = cortez_masto_obj.iloc[0]
        cortez_masto['last'] = cortez_masto['middle'] + ' ' + cortez_masto['last']
        cortez_masto['middle'] = ''
        
        # committees_df[committees_df['full_name'] == 'Catherine Cortez Masto'].name_object = cortez_masto
        committees_df_raw[committees_df_raw['full_name'] == 'Catherine Cortez Masto']['name_obj'].iloc[0] = cortez_masto
    return committees_df_raw

In [4]:
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for i, page in enumerate(reader.pages):
            text += page.extract_text()
    return text


def extract_standing_committee_text(text):
    return text.partition('STANDING COMMITTEES')[2].partition('OTHER COMMITTEE')[0]


def extract_standing_committee_list(text):
    # Simplified regular expression to match the text between "Standing Committees:" and "Other Committee:"
    pattern = r'Standing Committees:(.*?)Other Committee:'
    
    # Extracting the committee names
    committees = re.search(pattern, text, re.DOTALL)
    if committees:
        committees = committees.group(1).strip().split('\n')
    
    # Extracting individual committee names from the matched text
    committee_pattern = r'([A-Za-z,’\s]+)'
    individual_committees = re.findall(committee_pattern, committees[0]) if committees else None
    
    # Cleaning up the committee names (remove leading and trailing whitespace and empty strings)
    clean_committees = [committee.strip() for committee in individual_committees if committee.strip()] if individual_committees else None
    # return [committee.upper() for committee in clean_committees]
    return ['AGRICULTURE, NUTRITION, AND FORESTRY', 'APPROPRIATIONS', 'ARMED SERVICES', 'BANKING, HOUSING, AND URBAN AFFAIRS', 
            'BUDGET', 'COMMERCE, SCIENCE, AND TRANSPORTATION', 'ENERGY AND NATURAL RESOURCES', 'ENVIRONMENT AND PUBLIC WORKS', 
            'FINANCE', 'FOREIGN RELATIONS','HEALTH, EDUCATION, LABOR, AND PENSIONS', 'HOMELAND SECURITY AND GOVERNMENTAL AFFAIRS', 
            'JUDICIARY','RULES AND ADMINISTRATION', 'SMALL BUSINESS AND ENTREPRENEURSHIP', 'VETERANS’ AFFAIRS'] 
    

def extract_committees_subcommittees(standing_committees_text, standing_committee_list):
    subcommittee_assignments = {}
    committee_assignments = {}
    lines = standing_committees_text.split('\n')
    
    current_committee = standing_committee_list[0]
    current_subcommittee = None
    is_subcommittee = False
    previous_line_cap = False
    previous_line_state = False
    current_line_state = None
    
    for line in lines:
        # take out white spaces
        line = line.strip()
        
        if previous_line_state:
            previous_line_state = False
            line = current_line_state + ' ' + line

        # random typo cases:
        if "of Mew York" in line:
            line = re.sub(r'of Mew York', 'of New York', line)
        if 'James M. lnhofe' in line:
            line = re.sub(r'James M. lnhofe', 'James M. Inhofe', line)
        if "Luja ´n" in line:
            line = re.sub(r'Luja ´n', 'Lujan', line)
        if 'Merkely' in line:
            line = re.sub(r'Merkely', 'Merkley', line)
        if 'Mr,' in line:
            line = re.sub(r'Mr,', 'Mr.', line)
        if 'Mrs,' in line:
            line = re.sub(r'Mrs,', 'Mrs.', line)
        if 'Ms,' in line:
            line = re.sub(r'Ms,', 'Ms.', line)
            
        if line in standing_committee_list: # new committee
            current_committee = line
            current_subcommittee = None
            is_subcommittee = False
        elif line.startswith('VerDate') or line in('', current_committee + "—Continued", 'No Subcommittees', 'Chairman', 'Chairman,', 'Updates Pending'):
            continue
        elif line == 'Subcommittees':
            is_subcommittee = True
        elif line.isupper() and is_subcommittee and not previous_line_cap:
            current_subcommittee = line
            previous_line_cap = True
        elif line.isupper() and is_subcommittee and previous_line_cap:
            current_subcommittee = current_subcommittee + ' ' + line
        elif re.search(r'(New|North|Rhode|South|West|, of)$', line):
            previous_line_state = True
            current_line_state = line
            continue
        else: # now at the names
            previous_line_cap = False
            if is_subcommittee: # subcommittee members (last name)
                subs = re.sub(r'(Mr\.|Ms\.|Mrs\.)', ',', line)
                subs = re.sub(r',\s', '', subs, 1)
                subs = re.sub(r', (Vice )*Chairman', '', subs)
                subs = re.sub(r' , ', ', ', subs)
                subs = re.sub(r', of', ' of', subs)
                subs = subs.split(', ')
                for senator in subs:
                    if senator not in subcommittee_assignments:
                        subcommittee_assignments[senator] = [current_subcommittee]
                    else:
                        subcommittee_assignments[senator].append(current_subcommittee)
            else: # committee members (full names)
                subs = re.sub(r', (Vice )*Chairman', '', line)
                subs = re.sub(r', Jr.', ' Jr.', subs)
                subs = re.sub(r', (of )*' + states, r' of \2,', subs)
                # subs = re.sub(r', of ' + states, r' of \1,', subs)
                subs = subs.rstrip(',').split(', ')
                for senator in subs:
                    if senator not in committee_assignments:
                        committee_assignments[senator] = [current_committee]
                    else:
                        committee_assignments[senator].append(current_committee)
                
    return subcommittee_assignments, committee_assignments


def extract_committee_assignment_df(pdf_path):
    # Extracting standing committees text from the PDF
    text = extract_text_from_pdf(pdf_path)
    standing_committees_text = extract_standing_committee_text(text)
    # Extract standing committee list of names
    standing_committee_list = extract_standing_committee_list(text)
    
    # Extract assignment dataframes
    subcommittee_assignments, committee_assignments = extract_committees_subcommittees(standing_committees_text, standing_committee_list)
    
    # Committees DF
    committees_df = pd.DataFrame(list(committee_assignments.items()), columns=['full_name', 'committees'])
    committees_df[['full_name','state']] = committees_df['full_name'].str.split(' of ', expand=True)
    committees_df['name_obj'] = committees_df['full_name'].apply(HumanName)
    committees_df = special_cases_name_change(committees_df)
    committees_df['last_name'] = [name['last'] for name in committees_df['name_obj']]
    
    
    # Subcommittees DF
    subcommittees_df = pd.DataFrame(list(subcommittee_assignments.items()), columns=['last_name', 'subcomittees'])
    subcommittees_df['state'] = [last_name.split(' of ')[1] if ' of ' in last_name else '' for last_name in subcommittees_df['last_name']]
    subcommittees_df['last_name'] = [last_name.split(' of ')[0] if ' of ' in last_name else last_name for last_name in subcommittees_df['last_name']]

    # Committee Assignments DF
    committee_assignments_df = committees_df.merge(subcommittees_df, on='last_name', how='outer').fillna('')
    committee_assignments_df = committee_assignments_df[(committee_assignments_df['state_y']=='') | (committee_assignments_df['state_y']==committee_assignments_df['state_x'])]
    committee_assignments_df = committee_assignments_df.drop(columns=['state_y']).reset_index(drop=True)
    return committee_assignments_df[['full_name', 'state_x', 'committees', 'subcomittees']].rename(columns={'state_x': 'state'})

In [5]:
# Loop through each Congress PDF
committee_assignments_dict = {}
for congress in range(109, 119):
    pdf_path = f"./committee_pdfs/GPO-CPUB-{congress}.pdf"
    committee_assignments_dict[congress] = extract_committee_assignment_df(pdf_path).to_json(orient="records")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  committees_df_raw[committees_df_raw['full_name'] == 'Catherine Cortez Masto']['name_obj'].iloc[0] = cortez_masto
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  committees_df_raw[committees_df_raw['full_name'] == 'Catherine Cortez Masto']['name_obj'].iloc[0] = cortez_masto
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  committees_df_raw[committees_df_raw['full_name'] == 'Catherine Cortez Masto']['name_obj'].iloc[0] = cortez_masto
A value is trying to be set on a copy

In [6]:
# Export committee assignments into json file
with open('committee_assignments.json', 'w') as fp:
    json.dump(committee_assignments_dict, fp)

Template:
committee_assignments_dict
- keys: congress integer (109-118)
- values: json of congressional committee assignments, list of records 
    - each record is a dictionary consisting of:
        - full_name (string)
        - state (string)
        - committees (list of strings)
        - subcommittees (list of strings)
     
Example: 
    
committee_assignments_dict = 

{  

    109: [  
           {'full_name':'Jo', 'state':'California', 'committees':['com_a', 'com_b'], 'subcommittees':['subcom_a']},  
           {'full_name':'Bo', 'state':'New York', 'committees':['com_a', 'com_c'], 'subcommittees':[]}, ...  
         ],  
    110: [  
           {'full_name':'Sarah', 'state':'Arizona', 'committees':['com_a', 'com_b', 'com_c'], 'subcommittees':['subcom_a']},  
           {'full_name':'Jo', 'state':'California', 'committees':['com_b'], 'subcommittees':['subcom_a', 'subcom_c']}, 
           {'full_name':'Derek', 'state':'Florida', 'committees':['com_d', 'com_e'], 'subcommittees':['subcom_b']}, ...  
         ],  
    111: ...
    
}

## Notes:

109: Hillary Clinton's state incorrect: "Mew York" instead of "New York"

111: "Merkely" instead of "Merkley", "Mr," instead of "Mr."


113-114: Nevada Senate Harry Reid not in any committees, so 99 rows


117: "James M. lnhofe" instead of "James M. Inhofe"


118: "John Fetterman, Pennsylvania" instead of "John Fetterman, of Pennsylvania"


has Senators will same last name: 109, 110, 111, 112, 113, 117, 118

# Committee Subcommittee Mapping


In [64]:
def extract_committees_subcommittees(standing_committees_text, standing_committee_list):
    mappings = {}
    lines = standing_committees_text.split('\n')
    
    current_committee = standing_committee_list[0]
    is_subcommittee = False
    previous_line_cap = False
    
    for line in lines:
        # take out white spaces
        line = line.strip()
        
        if line in standing_committee_list: # new committee
            mappings[line] = []
            is_subcommittee = False
            current_committee = line
        elif line == 'Subcommittees':
            is_subcommittee = True
        elif line.isupper() and is_subcommittee and not previous_line_cap:
            mappings[current_committee].append(line)
            previous_line_cap = True
        elif line.isupper() and is_subcommittee and previous_line_cap:
            redo = mappings[current_committee][-1] + ' ' + line
            mappings[current_committee][-1] = redo
#             mappings[current_committee].append(redo)
        else: # now at the names
            previous_line_cap = False
                
    return mappings
    

In [65]:
# Loop through each Congress PDF
committee_mappings_dict = {}
for congress in range(109, 119):
    pdf_path = f"./committee_pdfs/GPO-CPUB-{congress}.pdf"
    
    # Extracting standing committees text from the PDF
    text = extract_text_from_pdf(pdf_path)
    standing_committees_text = extract_standing_committee_text(text)
    # Extract standing committee list of names
    standing_committee_list = extract_standing_committee_list(text)

    committee_mappings_dict[congress] = extract_committees_subcommittees(standing_committees_text, standing_committee_list)
    

In [68]:
# Export committee assignments into json file
with open('committee_mappings.json', 'w') as fp:
    json.dump(committee_mappings_dict, fp)