In [6]:
# Install the scraping dependencies used by the cells below: httpx (for the
# HTTP request), lxml (for HTML parsing) and cssselect (used by lxml's
# .cssselect() method). Keep this comment on its own line: text after the
# magic would be passed to pip as arguments.
%pip install httpx lxml cssselect

Collecting cssselect
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Downloading cssselect-1.3.0-py3-none-any.whl (18 kB)
Installing collected packages: cssselect
Successfully installed cssselect-1.3.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
import httpx
import lxml.html

# Fetch the Senate salary page and parse it into an lxml element tree.
response = httpx.get("https://www.senate.gov/senators/SenateSalariesSince1789.htm")
# Fail fast on a 4xx/5xx response instead of silently parsing an error page,
# which would most likely just produce zero rows below.
response.raise_for_status()
root = lxml.html.fromstring(response.text)

# Walking the tree with getchildren() is brittle: a change to the page
# structure would break our scraper. A CSS selector is more resilient:
#
#    #SortableData_table   -- the # character looks up by ID
#    tbody tr              -- names without a leading # or . refer to tag names
#
# So this grabs "all tr elements that are inside the tbody
#  that are inside id=SortableData_table".
rows = root.cssselect("#SortableData_table tbody tr")
for row in rows:
    # Starting the .cssselect() from `row` instead of `root` restricts the
    # match to the <td>s within the current row. The unpacking expects exactly
    # two cells (year, salary) and raises ValueError for any other count.
    year_td, salary_td = row.cssselect("td")

    # .text_content() returns the text of the element and all its descendants.
    # NOTE(review): the printed year column looks like a key fused to the
    # display text (e.g. "19831983") -- presumably the cell contains a nested
    # element whose text gets concatenated; confirm against the page HTML.
    year = year_td.text_content()
    salary = salary_td.text_content()
    print(year, "|", salary)

1789_181517891815 | $6.00 per diem
1815_181718151817 | $1,500 per annum
1817_185518171855 | $8.00 per diem
1855_186518551865 | $3,000 per annum
1865_187118651871 | $5,000 per annum
1871_187318711873 | $7,500 per annum
1873_190718731907 | $5,000 per annum
1907_192519071925 | $7,500 per annum
1925_193219251932 | $10,000 per annum
1932_193319321933 | $9,000 per annum
1933_193519331935 | $8,500 per annum
1935_194719351947 | $10,000 per annum
1947_195519471955 | $12,500 per annum
1955_196519551965 | $22,500 per annum
1965_196919651969 | $30,000 per annum
1969_197519691975 | $42,500 per annum
1975_197719751977 | $44,600 per annum
1977_197819771978 | $57,500 per annum
1979_198319791983 | $60,662.50 per annum
19831983 | $69,800 per annum
19841984 | $72,600 per annum
1985_198619851986 | $75,100 per annum
19870101_19870203Jan 1, 1987  Feb 3, 1987 | $77,400 per annum
19870204Feb 4, 1987 | $89,500 per annum
19900201Feb 1, 1990 | $98,400 per annum
19911991 | $101,900 per annum


In [8]:
import httpx
import lxml.html

# Fetch the Senate salary page and parse it into an lxml element tree.
response = httpx.get("https://www.senate.gov/senators/SenateSalariesSince1789.htm")
# Fail fast on a 4xx/5xx response instead of exploring an error page.
response.raise_for_status()
root = lxml.html.fromstring(response.text)

print(root)

print("\n------\n")

# we can examine the root <html> node and see 3 children
# (a comment node, <head> and <body>)
print("children", root.getchildren())

print("\n------\n")

# but it'll be more reliable to jump straight to the table by ID
table = root.get_element_by_id("SortableData_table")
print("Children of table:", table.getchildren())
# the table's children are tfoot, thead and tbody (in that order),
# so the tbody holding the data rows is the child at index 2
print("Rows: ", table.getchildren()[2].getchildren())

<Element html at 0x257298237a0>

------

children [<!-- <![endif] -->, <Element head at 0x25729823570>, <Element body at 0x25729823750>]

------

Children of table: [<Element tfoot at 0x25729823520>, <Element thead at 0x25729823570>, <Element tbody at 0x25729823750>]
Rows:  [<Element tr at 0x25729823520>, <Element tr at 0x25729823570>, <Element tr at 0x257298234d0>, <Element tr at 0x25729823480>, <Element tr at 0x25729823430>, <Element tr at 0x257298233e0>, <Element tr at 0x25729823390>, <Element tr at 0x25729823340>, <Element tr at 0x257298232f0>, <Element tr at 0x257298232a0>, <Element tr at 0x25729823250>, <Element tr at 0x25729823200>, <Element tr at 0x257298231b0>, <Element tr at 0x25729823160>, <Element tr at 0x25729823110>, <Element tr at 0x257298230c0>, <Element tr at 0x25729823070>, <Element tr at 0x25729823020>, <Element tr at 0x25729822fd0>, <Element tr at 0x25729822f80>, <Element tr at 0x25729822f30>, <Element tr at 0x25729822ee0>, <Element tr at 0x25729822e90>, <Element tr

In [9]:
import httpx
import lxml.html

# Fetch the Senate salary page and parse it into an lxml element tree.
response = httpx.get("https://www.senate.gov/senators/SenateSalariesSince1789.htm")
# Fail fast on a 4xx/5xx response instead of silently parsing an error page,
# which would most likely just produce zero rows below.
response.raise_for_status()
root = lxml.html.fromstring(response.text)

# XPath equivalent of a selector lookup: every <tr> that is a direct child of
# the <tbody> directly under the <table> whose id is SortableData_table.
rows = root.xpath("//table[@id='SortableData_table']/tbody/tr")
for row in rows:
    # The leading "." anchors the query at the current row (not root), so
    # ".//td" matches only the <td> elements beneath this row. The unpacking
    # expects exactly two cells (year, salary) and raises ValueError otherwise.
    year_td, salary_td = row.xpath(".//td")
    # The XPath returns elements; .text_content() then extracts their text,
    # including the text of any nested elements.
    year = year_td.text_content()
    salary = salary_td.text_content()
    print(year, "|", salary)

1789_181517891815 | $6.00 per diem
1815_181718151817 | $1,500 per annum
1817_185518171855 | $8.00 per diem
1855_186518551865 | $3,000 per annum
1865_187118651871 | $5,000 per annum
1871_187318711873 | $7,500 per annum
1873_190718731907 | $5,000 per annum
1907_192519071925 | $7,500 per annum
1925_193219251932 | $10,000 per annum
1932_193319321933 | $9,000 per annum
1933_193519331935 | $8,500 per annum
1935_194719351947 | $10,000 per annum
1947_195519471955 | $12,500 per annum
1955_196519551965 | $22,500 per annum
1965_196919651969 | $30,000 per annum
1969_197519691975 | $42,500 per annum
1975_197719751977 | $44,600 per annum
1977_197819771978 | $57,500 per annum
1979_198319791983 | $60,662.50 per annum
19831983 | $69,800 per annum
19841984 | $72,600 per annum
1985_198619851986 | $75,100 per annum
19870101_19870203Jan 1, 1987  Feb 3, 1987 | $77,400 per annum
19870204Feb 4, 1987 | $89,500 per annum
19900201Feb 1, 1990 | $98,400 per annum
19911991 | $101,900 per annum


In [10]:
# Walk-through of the four clauses of a try statement.
try:
    # this division succeeds, so neither except clause is entered
    1 / 2
except ValueError as err:
    # most specific handler first
    print("got a value error:", err)
except Exception as err:
    # catch-all for any other exception type
    print("got some other error:", type(err), err)
else:
    # runs only when the try body finished without raising
    print("else executed")
finally:
    # runs no matter how the try statement was left
    print("always prints at the end")

else executed
always prints at the end
