In [1]:
import bs4; print( 'bs4 ' + bs4.__version__)
from bs4 import BeautifulSoup, SoupStrainer;

import re; print('re ' + re.__version__)

import requests; print('requests ' + requests.__version__)
import ast;
import pandas as pd; print( 'pandas ' + pd.__version__)


bs4 4.6.3
re 2.2.1
requests 2.19.1
pandas 0.22.0


In [2]:
URL_LIST_OF_ALL = 'https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives'

LOCAL_RAW_LIST_FILE_PATH = 'data/raw_wikipedia_list.html'
SAVED_CSV_FILE_PATH = 'data/representatives_list_raw.csv'


In [3]:
def get_text_from_local():
    with open(LOCAL_RAW_LIST_FILE_PATH) as f:
        return f

def get_text_from_url_and_save(url):
    text = requests.get(url).text
    with open(LOCAL_RAW_LIST_FILE_PATH, 'w') as f:
        f.write(text)
    return text


def get_html_list_of_representatives():
    # first try to get from local
    try:
        return get_text_from_local()
    # otherwise pull from URL
    except:
        return get_text_from_url_and_save(URL_LIST_OF_ALL)
text = get_html_list_of_representatives()


In [4]:
# Helper functions


def get_nth_cell(row, num):
    #print('td:nth-of-type({})'.format(num))
    return row.find_all('td')[num]

def get_link_element(cell):
    link_row = get_nth_cell(cell, 1)
    return link_row.find_all(name='a')[-1]

def split_by_br(cell):
    split_by = '--BRRR--'
    cell_string = str(cell).replace('<br/>', split_by)
    b = BeautifulSoup(cell_string, 'lxml')
    text = b.text
    return [line.strip() for line in text.split(split_by)]


In [5]:

def get_district(row):
    return get_nth_cell(row, 0).text.strip()

def get_name(row):
    return get_link_element(row).text.strip()
def get_link(row):
    return get_link_element(row).attrs['href']

def get_party(row):
    return get_nth_cell(row, 3).text.strip()

def get_experience(row):
    cell = get_nth_cell(row, -5)
    return split_by_br(cell)

def get_education(row):
    cell = get_nth_cell(row, -4)
    educations_raw = split_by_br(cell)
    educations_glued = []
    for experience_raw in educations_raw:
        match = re.match('^\(.*\)$', experience_raw)
        if not match:
            # this is as expected
            educations_glued.append(experience_raw)
        else:
            # this shouldn't be by itself and should be added to the previous school mentioned - example (BA) or (MS)
            educations_glued[-1] = educations_glued[-1] + " " + experience_raw
    return educations_glued

def is_vacant(row):
    vacant_row = get_nth_cell(row, 1).text.strip()
    return vacant_row ==   'Vacant'

def get_all_links_and_info(text):
    b = BeautifulSoup(text, 'lxml')
    table = b.find(id = 'votingmembers')
    rows = table.findAll(name='tr')
    output = {
        'district': [],
        'name': [],
        'link': [],
        'party': [],
        'experience': [],
        'education': [],
    }
    i = 0
    for row in rows[1:]:
        i+=1
        if is_vacant(row):
            continue
        output['district'].append(get_district(row))
        output['name'].append(get_name(row))
        output['link'].append(get_link(row))
        output['party'].append(get_party(row))
        output['experience'].append(get_experience(row))
        output['education'].append(get_education(row))
    return pd.DataFrame(output)
all_info = get_all_links_and_info(text)
all_info

Unnamed: 0,district,education,experience,link,name,party
0,Alabama 1,"[Duke University (BA), University of Alabama (...","[Alabama Senate, Alabama State Board of Educat...",/wiki/Bradley_Byrne,Bradley Byrne,Republican
1,Alabama 2,"[New York University (BM), Samford University ...",[Montgomery City Council],/wiki/Martha_Roby,Martha Roby,New York University (BM)Samford University (JD)
2,Alabama 3,"[Jacksonville State University (BA, MPA), Birm...","[Calhoun County Commissioner, Alabama House of...",/wiki/Mike_Rogers_(Alabama_politician),Mike Rogers,"Jacksonville State University (BA, MPA)Birming..."
3,Alabama 4,"[University of North Alabama, Birmingham–South...",[Haleyville Municipal Judge],/wiki/Robert_Aderholt,Robert Aderholt,University of North AlabamaBirmingham–Southern...
4,Alabama 5,"[Duke University (BA), University of Alabama (...","[Alabama House of Representatives, Madison Cou...",/wiki/Mo_Brooks,Mo Brooks,Duke University (BA)University of Alabama (JD)
5,Alabama 6,[University of Alabama (BS)],[Policy analyst],/wiki/Gary_Palmer_(politician),Gary Palmer,University of Alabama (BS)
6,Alabama 7,"[Princeton University (BA), St Hilda's College...",[Attorney],/wiki/Terri_Sewell,Terri Sewell,Democratic
7,Alaska at large,"[Yuba College, California State University, Ch...","[Alaska Senate, Ship captain, Mayor of Fort Yu...",/wiki/Don_Young,Don Young,Republican
8,Arizona 1,"[Lewis University, DePaul University]",[Arizona Senate],/wiki/Tom_O%27Halleran,Tom O'Halleran,Democratic
9,Arizona 2,"[United States Air Force Academy (BS), Harvard...",[USAF Colonel],/wiki/Martha_McSally,Martha McSally,Republican


In [6]:
all_info.education

0      [Duke University (BA), University of Alabama (...
1      [New York University (BM), Samford University ...
2      [Jacksonville State University (BA, MPA), Birm...
3      [University of North Alabama, Birmingham–South...
4      [Duke University (BA), University of Alabama (...
5                           [University of Alabama (BS)]
6      [Princeton University (BA), St Hilda's College...
7      [Yuba College, California State University, Ch...
8                  [Lewis University, DePaul University]
9      [United States Air Force Academy (BS), Harvard...
10                          [University of Arizona (BA)]
11                      [Creighton University (BS, DDS)]
12     [Brigham Young University (BA), University of ...
13           [Arizona State University, Tempe (BS, MBA)]
14                             [Harvard University (BA)]
15                [University of Wisconsin–Madison (BA)]
16     [Brigham Young University (BA), Arizona State ...
17                      [Arkans

In [7]:
def highest_education_count():
    count = 0
    for education in all_info.education:
        if len(education) > count:
            count = len(education)
    return count
highest_education_count()

4

In [8]:
def highest_experience_count():
    count = 0
    for experience in all_info.experience:
        if len(experience) > count:
            count = len(experience)
    return count
highest_experience_count()

4

In [9]:
all_info.to_csv(SAVED_CSV_FILE_PATH, index=False)

In [10]:
# to get lists back
converters = {'experience':ast.literal_eval, 'education':ast.literal_eval}
data = pd.read_csv(SAVED_CSV_FILE_PATH, converters=converters)
data

Unnamed: 0,district,education,experience,link,name,party
0,Alabama 1,"[Duke University (BA), University of Alabama (...","[Alabama Senate, Alabama State Board of Educat...",/wiki/Bradley_Byrne,Bradley Byrne,Republican
1,Alabama 2,"[New York University (BM), Samford University ...",[Montgomery City Council],/wiki/Martha_Roby,Martha Roby,New York University (BM)Samford University (JD)
2,Alabama 3,"[Jacksonville State University (BA, MPA), Birm...","[Calhoun County Commissioner, Alabama House of...",/wiki/Mike_Rogers_(Alabama_politician),Mike Rogers,"Jacksonville State University (BA, MPA)Birming..."
3,Alabama 4,"[University of North Alabama, Birmingham–South...",[Haleyville Municipal Judge],/wiki/Robert_Aderholt,Robert Aderholt,University of North AlabamaBirmingham–Southern...
4,Alabama 5,"[Duke University (BA), University of Alabama (...","[Alabama House of Representatives, Madison Cou...",/wiki/Mo_Brooks,Mo Brooks,Duke University (BA)University of Alabama (JD)
5,Alabama 6,[University of Alabama (BS)],[Policy analyst],/wiki/Gary_Palmer_(politician),Gary Palmer,University of Alabama (BS)
6,Alabama 7,"[Princeton University (BA), St Hilda's College...",[Attorney],/wiki/Terri_Sewell,Terri Sewell,Democratic
7,Alaska at large,"[Yuba College, California State University, Ch...","[Alaska Senate, Ship captain, Mayor of Fort Yu...",/wiki/Don_Young,Don Young,Republican
8,Arizona 1,"[Lewis University, DePaul University]",[Arizona Senate],/wiki/Tom_O%27Halleran,Tom O'Halleran,Democratic
9,Arizona 2,"[United States Air Force Academy (BS), Harvard...",[USAF Colonel],/wiki/Martha_McSally,Martha McSally,Republican
