In [1]:

#data sources:
#https://history.house.gov/People/Search


In [2]:

import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides library deprecation/FutureWarning
# messages (e.g. pandas' notice about the `regex` default of `Series.str.replace`,
# which this notebook relies on) - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


In [3]:

%%capture

# Use the %pip magic rather than `!pip`: `!pip` runs whichever pip is first on
# the shell PATH, which may belong to a different Python than this kernel, so
# the later `import requests` could still fail.
%pip install requests


In [4]:

# Search-result URL templates for history.house.gov, all restricted to
# Dates=2000-present. Each template ends in `Command={i}`; get_names() fills
# `{i}` with the 1-based page number. The matching `*_pages` value is the number
# of result pages get_names() requests for that search.
# NOTE(review): the page counts are hard-coded - presumably read off the site
# when the notebook was written; re-check them before re-scraping.

# Search with WomenInCongress=true.
women_url_template = '''https://history.house.gov/People/Search?Term=Search&SearchIn=LastName&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=false&WomenInCongress=true&WomenInCongress=false&HispanicAmericansInCongress=false&AsianPacificAmericansInCongress=false&Dates=2000-present&CongressNumberList=&PreviousSearch=Search%2cLastName%2c%2c%2c%2c%2cFalse%2cFalse%2cTrue%2c2000-present%2c%2cLastName&CurrentPage=1&SortOrder=LastName&ResultType=Grid&Command={i}'''
women_pages = 22
# Search with BlackAmericansInCongress=true.
black_url_template = '''https://history.house.gov/People/Search?Term=Search&SearchIn=LastName&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=true&BlackAmericansInCongress=false&WomenInCongress=false&HispanicAmericansInCongress=false&AsianPacificAmericansInCongress=false&Dates=2000-present&CongressNumberList=&PreviousSearch=Search%2cLastName%2c%2c%2c%2c%2cFalse%2cTrue%2cFalse%2c2000-present%2c%2cLastName&CurrentPage=1&SortOrder=LastName&ResultType=Grid&Command={i}'''
black_pages = 9
# Search with HispanicAmericansInCongress=true.
hispanic_url_template = '''https://history.house.gov/People/Search?Term=Search&SearchIn=LastName&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=false&WomenInCongress=false&HispanicAmericansInCongress=true&HispanicAmericansInCongress=false&AsianPacificAmericansInCongress=false&Dates=2000-present&CongressNumberList=&PreviousSearch=Search%2cLastName%2c%2c%2c%2c%2cFalse%2cFalse%2cFalse%2c2000-present%2c%2cLastName&CurrentPage=1&SortOrder=LastName&ResultType=Grid&Command={i}'''
hispanic_pages = 8
# Search with AsianPacificAmericansInCongress=true ("api" = Asian Pacific American in the URL's wording).
api_url_template = '''https://history.house.gov/People/Search?Term=Search&SearchIn=LastName&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=false&WomenInCongress=false&HispanicAmericansInCongress=false&AsianPacificAmericansInCongress=true&AsianPacificAmericansInCongress=false&Dates=2000-present&CongressNumberList=&PreviousSearch=Search%2cLastName%2c%2c%2c%2c%2cFalse%2cFalse%2cFalse%2c2000-present%2c%2cLastName&CurrentPage=1&SortOrder=LastName&ResultType=Grid&Command={i}'''
api_pages = 4
# Search with none of the group filters enabled.
all_url_template = '''https://history.house.gov/People/Search?Term=Search&SearchIn=LastName&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=false&WomenInCongress=false&HispanicAmericansInCongress=false&AsianPacificAmericansInCongress=false&Dates=2000-present&CongressNumberList=&PreviousSearch=Search%2cLastName%2c%2c%2c%2c%2cFalse%2cFalse%2cFalse%2c2000-present%2c%2cLastName&CurrentPage=1&SortOrder=LastName&ResultType=Grid&Command={i}'''
all_pages = 100


In [5]:

import requests
import re
from time import sleep
import random

def get_names(url_template: str, pages: int) -> list:
    """Scrape member names from every result page of a paginated search URL.

    Args:
        url_template: URL containing an ``{i}`` placeholder that is replaced
            with the 1-based page number.
        pages: Number of result pages to request (pages ``1..pages``).

    Returns:
        The captured name strings in page order, exactly as they appear in the
        HTML (no entity decoding or other clean-up is done here).

    Raises:
        requests.HTTPError: If any page answers with a 4xx/5xx status. Failing
            loudly matters because callers cache the result to CSV; a silently
            short list would otherwise be persisted as if it were complete.
        requests.RequestException: On connection failure or timeout.
    """
    # The pattern is tied to the exact whitespace the site emits after the
    # `class="name">` tag, so it is kept byte-for-byte.
    name_pattern = re.compile(r'class="name">\r\n                    (.*)\r\n')
    names = []

    for i in range(1, pages + 1):
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(url_template.format(i=i), timeout=30)
        r.raise_for_status()
        names.extend(name_pattern.findall(r.text))
        # Random pause between requests; pointless after the final page.
        if i < pages:
            sleep(random.uniform(0, 1))

    return names


In [6]:

import pandas as pd

def load_cached_names(csv_path: str):
    """Return the ``name`` column of a cached CSV, or ``None`` if it is unusable.

    Args:
        csv_path: Path of a CSV file holding a ``name`` column.

    Returns:
        The column values as an array, or ``None`` when the file is missing,
        unreadable, empty/malformed, or lacks a ``name`` column. The reason is
        printed so the reader can see why a re-scrape will happen.
    """
    try:
        return pd.read_csv(csv_path)['name'].values
    except (OSError, KeyError, ValueError) as e:
        # OSError: missing/unreadable file; KeyError: no 'name' column;
        # ValueError: pandas' EmptyDataError / ParserError (both subclass it).
        print(str(e))
        return None


# Each cache is loaded independently so that one missing file does not stop
# the caches that do exist from being used. A `None` result marks that list
# as "still needs to be scraped".
women_names = load_cached_names('2000-present/women_names.csv')
black_names = load_cached_names('2000-present/black_names.csv')
hispanic_names = load_cached_names('2000-present/hispanic_names.csv')
api_names = load_cached_names('2000-present/api_names.csv')
all_names = load_cached_names('2000-present/all_names.csv')
    

In [7]:

import os


def fetch_and_cache_names(url_template: str, pages: int, csv_path: str) -> list:
    """Scrape one name list with get_names() and cache it as a one-column CSV.

    Args:
        url_template: Paginated search URL passed through to get_names().
        pages: Number of result pages passed through to get_names().
        csv_path: Destination CSV; it is written with a single ``name`` column.

    Returns:
        The scraped names, as returned by get_names().
    """
    # Create the cache directory *before* scraping: otherwise a missing
    # directory is only discovered by to_csv() after every page was fetched,
    # and the scraped data is lost.
    os.makedirs(os.path.dirname(csv_path) or '.', exist_ok=True)
    names = get_names(url_template, pages)
    pd.Series(names).rename('name').to_csv(csv_path, index=False)
    return names


# Only lists that could not be loaded from the cache (still None) are scraped.
if women_names is None:
    women_names = fetch_and_cache_names(women_url_template, women_pages, '2000-present/women_names.csv')

if black_names is None:
    black_names = fetch_and_cache_names(black_url_template, black_pages, '2000-present/black_names.csv')

if hispanic_names is None:
    hispanic_names = fetch_and_cache_names(hispanic_url_template, hispanic_pages, '2000-present/hispanic_names.csv')

if api_names is None:
    api_names = fetch_and_cache_names(api_url_template, api_pages, '2000-present/api_names.csv')

if all_names is None:
    all_names = fetch_and_cache_names(all_url_template, all_pages, '2000-present/all_names.csv')


In [8]:

# Sanity check: report how many names each list holds.
for _label, _group in (
    ('women_names length:', women_names),
    ('black_names length:', black_names),
    ('hispanic_names length:', hispanic_names),
    ('api_names length:', api_names),
    ('all_names length:', all_names),
):
    print(_label, len(_group))


women_names length: 261
black_names length: 106
hispanic_names length: 86
api_names length: 39
all_names length: 1190


In [9]:

%%capture

# Use the %pip magic rather than `!pip`: `!pip` runs whichever pip is first on
# the shell PATH, which may belong to a different Python than this kernel, so
# the later `import unidecode` could still fail.
%pip install unidecode


In [10]:

import html
import unidecode

# Clean every list the same way: first decode HTML entities, then
# transliterate the result to plain ASCII. All five lists are rebuilt in one
# unpacking assignment so the cleaning expression is written only once.
women_names, black_names, hispanic_names, api_names, all_names = (
    [unidecode.unidecode(html.unescape(raw_name)) for raw_name in name_group]
    for name_group in (women_names, black_names, hispanic_names, api_names, all_names)
)


In [11]:

# The table has one row per entry of `all_names`, with the full name as the
# row label. (Indexing by the union of all five lists was the alternative
# considered; only `all_names` is used.)
member_index = pd.Index(all_names)
congress_members = pd.DataFrame(index=member_index)
congress_members.head()


"ABERCROMBIE, Neil"
"ABRAHAM, Ralph"
"ACEVEDO-VILA, Anibal"
"ACKERMAN, Gary Leonard"
"ADAMS, Alma"


In [12]:

# Sets give constant-time membership tests for the per-member look-ups below.
women_names_set, black_names_set, hispanic_names_set, api_names_set, all_names_set = map(
    set, (women_names, black_names, hispanic_names, api_names, all_names)
)
# Names that appear in at least two of the black / hispanic / api lists.
# (B & (H | A)) | (H & A) is the same set as (B & H) | (B & A) | (H & A).
duplicated_race_names_set = (
    (black_names_set & (hispanic_names_set | api_names_set))
    | (hispanic_names_set & api_names_set)
)


In [13]:

# Display the names found in more than one of the black/hispanic/api lists;
# get_race() returns None for these.
duplicated_race_names_set


{'CLARKE, Hansen',
 'DELGADO, Antonio',
 'HARRIS, Kamala Devi',
 'SABLAN, Gregorio Kilili Camacho',
 'SAN NICOLAS, Michael',
 'SCOTT, Robert C.',
 'STRICKLAND, Marilyn',
 'TORRES, Ritchie',
 'UNDERWOOD, Robert A.'}

In [14]:

def normalize_last_name(name: str, last_name: str) -> str:
    """Reduce a surname to a lower-case form.

    Args:
        name: Full name, looked up in the module-level ``women_names_set``.
        last_name: The surname part of ``name``.

    Returns:
        For names not in ``women_names_set``: the surname lower-cased with
        hyphens removed. For names in the set: only the first
        whitespace-separated word (hyphens removed) when the surname contains
        a space, otherwise only the part before the first hyphen.
    """
    if name not in women_names_set:
        return last_name.replace('-', '').lower()

    if ' ' in last_name:
        first_word = last_name.split()[0]
        return first_word.replace('-', '').lower()

    # With no hyphen present, partition() hands back the whole surname, so this
    # one line serves both the hyphenated and the plain case.
    return last_name.partition('-')[0].lower()

# Split each "LAST, First ..." row label on the first comma only: the part
# before it becomes `last_name`, everything after it `first_name`.
congress_members[['last_name', 'first_name']] = congress_members.index.to_series().str.split(',', n=1, expand=True)

# Lower-case the given-name part, delete abbreviations such as "j. " (letters,
# a period, optional comma, whitespace) plus any parentheses/hyphens/commas,
# then keep only the first remaining word.
# `regex=True` must be explicit: since pandas 2.0 `Series.str.replace` treats
# the pattern as a literal string by default, which would silently leave every
# name uncleaned (and the notebook suppresses the warning about that change).
congress_members['first_name'] = (
    congress_members['first_name']
    .str.lower()
    .str.replace(r'([a-zA-Z]+\.\,?\s+)|([\(\)\-\,]+)', '', regex=True)
    .str.split(n=1, expand=True)[0]
    .str.strip()
)

# normalize_last_name() needs the full name (the row label) as well as the
# surname, hence the reset_index() to expose the label as the 'index' column.
congress_members['last_name'] = congress_members.reset_index().apply(
    lambda row: normalize_last_name(row['index'], row['last_name']), axis=1
).values
congress_members.head()


Unnamed: 0,last_name,first_name
"ABERCROMBIE, Neil",abercrombie,neil
"ABRAHAM, Ralph",abraham,ralph
"ACEVEDO-VILA, Anibal",acevedovila,anibal
"ACKERMAN, Gary Leonard",ackerman,gary
"ADAMS, Alma",adams,alma


In [15]:

def get_race(name: str) -> str:
    """Label a member using the module-level black/hispanic/api name sets.

    Args:
        name: Full name to look up.

    Returns:
        ``'black'``, ``'hispanic'`` or ``'api'`` for a name found in the
        corresponding set (checked in that order). ``None`` when the name is in
        ``duplicated_race_names_set`` or in none of the three sets.
    """
    if name in duplicated_race_names_set:
        return None

    labelled_sets = (
        ('black', black_names_set),
        ('hispanic', hispanic_names_set),
        ('api', api_names_set),
    )
    for label, members in labelled_sets:
        if name in members:
            return label
    return None

def get_gender(name: str) -> str:
    """Return ``'female'`` for names in ``women_names_set``, else ``'male'``.

    Args:
        name: Full name to look up in the module-level ``women_names_set``.

    Returns:
        ``'female'`` or ``'male'``; any name absent from the set, including an
        unknown one, yields ``'male'``.
    """
    return 'female' if name in women_names_set else 'male'


In [16]:

# Look each row label (the full name) up with get_race(); names that get
# no label are left empty in the new `race` column.
member_full_names = congress_members.index.to_series()
congress_members['race'] = member_full_names.apply(get_race)
congress_members.head()


Unnamed: 0,last_name,first_name,race
"ABERCROMBIE, Neil",abercrombie,neil,
"ABRAHAM, Ralph",abraham,ralph,
"ACEVEDO-VILA, Anibal",acevedovila,anibal,hispanic
"ACKERMAN, Gary Leonard",ackerman,gary,
"ADAMS, Alma",adams,alma,black


In [17]:

# Look each row label (the full name) up with get_gender() to fill the new
# `gender` column.
member_full_names = congress_members.index.to_series()
congress_members['gender'] = member_full_names.apply(get_gender)
congress_members.head()


Unnamed: 0,last_name,first_name,race,gender
"ABERCROMBIE, Neil",abercrombie,neil,,male
"ABRAHAM, Ralph",abraham,ralph,,male
"ACEVEDO-VILA, Anibal",acevedovila,anibal,hispanic,male
"ACKERMAN, Gary Leonard",ackerman,gary,,male
"ADAMS, Alma",adams,alma,black,female


In [18]:

# Number of rows per gender label.
congress_members['gender'].value_counts()


male      958
female    232
Name: gender, dtype: int64

In [19]:

# Number of rows per race label; rows where get_race() returned None are
# excluded, because value_counts() drops missing values by default.
congress_members['race'].value_counts()


black       96
hispanic    76
api         30
Name: race, dtype: int64

In [20]:

# Move the row label (the member's full name) into an ordinary `full_name`
# column, then write the table to the cache directory.
members_out = congress_members.rename_axis('full_name').reset_index()
members_out.to_csv('2000-present/congress_members.csv')
