In [1]:
from bs4 import BeautifulSoup
import requests

# Plain HTTP fetch of the listing page; nothing on the page is executed, we only get the raw HTML.
response = requests.get("https://opensyllabus.org/results-list/titles?size=50&usState=AK")
# Name the parser explicitly so the parse tree does not depend on which optional
# parsers are installed (and Beautiful Soup does not warn about guessing one).
doc = BeautifulSoup(response.text, "html.parser")

In [2]:
# Two ways to look for elements with the class 'title':
# find_all() below is restricted to <div> tags, while the CSS selector '.title'
# matches any tag that carries the class.
# Only the value of the last expression in the cell is displayed.
doc.find_all('div', class_='title')
doc.select('.title')

[]

In [3]:
# !pip install selenium

In [4]:
# a web driver is the program Selenium uses to talk to a browser; each browser needs its own driver, and it has to be installed
# originally you had to download it yourself from https://chromedriver.chromium.org/, but now a web driver manager can do that for you
# !pip install webdriver-manager

## Part 1: Open Syllabus

In [5]:
# from selenium import webdriver
# driver = webdriver.Chrome()

# the above breaks ^
# the code below says: start a web driver, and also install the proper web driver if you need it

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Selenium 4 wants the chromedriver location wrapped in a Service object;
# passing the path positionally is deprecated and rejected by later releases.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/ilenapeng/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


In [6]:
# IRL, you'll probably need all of these when you work with Selenium
import time
from io import StringIO

import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select

from webdriver_manager.chrome import ChromeDriverManager



In [7]:
# Load the same listing page as before, this time in the Selenium-controlled browser.
listing_url = "https://opensyllabus.org/results-list/titles?size=50&usState=AK"
driver.get(listing_url)

In [8]:
# Sadly can't do this any more
# driver.find_elements_by_class_name("title")
# Instead, pass a locator strategy (By.CLASS_NAME or By.ID or By.CSS_SELECTOR) plus the value to look for
titles = driver.find_elements(By.CLASS_NAME, "title")
# Print the text of every match.
# NOTE: `title` stays bound to the last element after the loop; the next cell displays it.
for title in titles:
    print(title.text)

ADVANCED FILTERS
11,072 Titles
CLEAR FILTER
The Elements of Style
William Strunk
Multiple Editions
The Craft of Scientific Presentations: Critical Steps to Succeed and Critical Errors to Avoid
Michael Alley
Multiple Editions
Chemistry and Chemical Reactivity
Paul M. Treichel, John C. Kotz, Paul Treichel
Multiple Editions
Essential Algebra for Chemistry Students
David W. Ball
Brooks / Cole,2006
The Feynman Lectures on Physics
Richard Phillips Feynman
Multiple Editions
Biological Science
Scott Freeman
Multiple Editions
An Introduction to Dynamic Meteorology
James R. Holton
Multiple Editions
Physics for Scientists and Engineers
Raymond A. Serway
Multiple Editions
Geophysical Fluid Dynamics
Joseph Pedlosky
Springer Publishing,1979
Writing Papers in the Biological Sciences
Vicky McMillan
St. Martin's / Bedford Books,1988
Alcibiades
Plutarch
Multiple Editions
Mathematical Methods in the Physical Sciences
Mary L. Boas
Multiple Editions
Atmosphere-Ocean Dynamics
Adrian E. Gill
Elsevier / Acade

In [9]:
# displaying a WebElement only shows session/element ids, not page content,
# so you just have to get good about knowing what you're searching for
# here i'm showing just one for example: the last match, indexed explicitly instead of
# relying on the loop variable left over from the previous cell (later cells rebind `title`)
titles[-1]
# but for full, print:
# titles

<selenium.webdriver.remote.webelement.WebElement (session="f400e09609735e8df7c9eedc0dff7fcc", element="e2f1452d-2a4b-4220-a4e6-1d97111445d6")>

In [10]:
# driver.page_source gives us the entire page source as the browser currently has it,
# and then you can load it into Beautiful Soup

# the reason fetching this page with requests + Beautiful Soup doesn't work is that the page
# uses JavaScript to build its content and loads on user selection

# Name the parser explicitly so the result does not depend on which optional parsers are installed.
doc = BeautifulSoup(driver.page_source, "html.parser")

# BEAUTIFUL SOUP IS FASTER THAN SELENIUM, BUT SOME PAGES CAN'T BE SCRAPED -- SO IT'S A TRADEOFF

In [11]:
# Go find every book entry: each <li> of a <ul> inside the element with class 'title-list'
books = doc.select('.title-list ul li')
# How many entries matched
len(books)

50

In [12]:
# Where each field lives inside one book's <li>:
#
# title: the <a> inside a <p>
# <div class="name-div"><p><a href="/result/title?id=8297876850707">The Craft of Scientific Presentations: 
# Critical Steps to Succeed and Critical Errors to Avoid</a></p>
#
# author: the span with class 'name'
# <span class="name"><div><a href="/result/author?id=Michael+Alley">Michael Alley</a></div></span>
#
# score: the element with class 'score'

for book in books:
    print('----')
    title = book.select_one('.title p a').get_text()
    print(title)
    name = book.select_one('span.name').get_text()
    print(name)
    score = book.select_one('.score').get_text()
    print(score)

----
The Elements of Style
William Strunk
100
----
The Craft of Scientific Presentations: Critical Steps to Succeed and Critical Errors to Avoid
Michael Alley
17
----
Chemistry and Chemical Reactivity
Paul M. Treichel, John C. Kotz, Paul Treichel
36
----
Essential Algebra for Chemistry Students
David W. Ball
4
----
The Feynman Lectures on Physics
Richard Phillips Feynman
77
----
Biological Science
Scott Freeman
54
----
An Introduction to Dynamic Meteorology
James R. Holton
29
----
Physics for Scientists and Engineers
Raymond A. Serway
87
----
Geophysical Fluid Dynamics
Joseph Pedlosky
13
----
Writing Papers in the Biological Sciences
Vicky McMillan
51
----
Alcibiades
Plutarch
7
----
Mathematical Methods in the Physical Sciences
Mary L. Boas
55
----
Atmosphere-Ocean Dynamics
Adrian E. Gill
17
----
Introduction to Geophysical Fluid Dynamics
Benoit Cushman-Roisin
11
----
Philosophic Classics
Forrest E. Baird
8
----
Div, Grad, Curl, and All That: An Informal Text on Vector Calculus
H. M. S

In [13]:
# Same extraction as the printing loop, but collected as a list with one dictionary per book
dataset = [
    {
        'title': book.select_one('.title p a').text,
        'name': book.select_one('span.name').text,
        'score': book.select_one('.score').text,
    }
    for book in books
]

print(dataset)

[{'title': 'The Elements of Style', 'name': 'William Strunk', 'score': '100'}, {'title': 'The Craft of Scientific Presentations: Critical Steps to Succeed and Critical Errors to Avoid', 'name': 'Michael Alley', 'score': '17'}, {'title': 'Chemistry and Chemical Reactivity', 'name': 'Paul M. Treichel,\xa0John C. Kotz,\xa0Paul Treichel', 'score': '36'}, {'title': 'Essential Algebra for Chemistry Students', 'name': 'David W. Ball', 'score': '4'}, {'title': 'The Feynman Lectures on Physics', 'name': 'Richard Phillips Feynman', 'score': '77'}, {'title': 'Biological Science', 'name': 'Scott Freeman', 'score': '54'}, {'title': 'An Introduction to Dynamic Meteorology', 'name': 'James R. Holton', 'score': '29'}, {'title': 'Physics for Scientists and Engineers', 'name': 'Raymond A. Serway', 'score': '87'}, {'title': 'Geophysical Fluid Dynamics', 'name': 'Joseph Pedlosky', 'score': '13'}, {'title': 'Writing Papers in the Biological Sciences', 'name': 'Vicky McMillan', 'score': '51'}, {'title': 'Al

In [14]:
# One row per dictionary in `dataset`; the dictionary keys become the columns.
# Every value was taken from element text, so 'score' holds strings here, not numbers.
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,title,name,score
0,The Elements of Style,William Strunk,100
1,The Craft of Scientific Presentations: Critica...,Michael Alley,17
2,Chemistry and Chemical Reactivity,"Paul M. Treichel, John C. Kotz, Paul Treichel",36
3,Essential Algebra for Chemistry Students,David W. Ball,4
4,The Feynman Lectures on Physics,Richard Phillips Feynman,77


In [15]:
# Click the "show more" button (class btnPagination) up to 10 times,
# waiting two seconds after each click before looking for the button again.
for _ in range(10):
    try:
        driver.find_element(By.CLASS_NAME, "btnPagination").click()
    except NoSuchElementException:
        # No button on the page any more (presumably everything is loaded), so stop early
        # instead of crashing the cell.
        break
    time.sleep(2)

# and then you can run your scrape code again

In [16]:
# OR you can look @ network tab when interacting with site > look at headers > can find their
# secret API / request URL and fetch it directly with requests -- it answers with JSON, so there is
# no HTML to parse
# NOTE(review): this asks for usState=NY while the earlier cells use AK -- confirm that is intended
response = requests.get("https://explorer-api.opensyllabus.org/v1/works.json?size=500&usState=NY")
# Fail here with a clear HTTP error rather than with a confusing JSON decoding error below
response.raise_for_status()
data = response.json()

In [17]:
# The records sit under results -> works in the API response
works = data['results']['works']
pd.DataFrame(works)

Unnamed: 0,id,rank,name,subtitle,publisher,publishDate,score,appearances,openAccess,persons
0,8306467210251,1,The Communist Manifesto,,"{'id': None, 'name': 'Multiple Editions'}",,100,849,False,"[{'id': 'Karl Marx', 'name': 'Karl Marx'}]"
1,7636452301400,2,The Republic,,"{'id': None, 'name': 'Multiple Editions'}",,100,844,False,"[{'id': 'Plato', 'name': 'Plato'}]"
2,7980049403911,3,Second Treatise of Government,,"{'id': None, 'name': 'Multiple Editions'}",,99,694,False,"[{'id': 'John Locke', 'name': 'John Locke'}]"
3,7507602833850,4,Frankenstein,,"{'id': None, 'name': 'Multiple Editions'}",,100,614,False,"[{'id': 'Mary Wollstonecraft Shelley', 'name':..."
4,7507602833494,5,Confessions,,"{'id': None, 'name': 'Multiple Editions'}",,96,605,False,"[{'id': 'Augustine', 'name': 'Augustine'}]"
...,...,...,...,...,...,...,...,...,...,...
495,33629593930128,496,The C++ Programming Language,,"{'id': 'Pearson / Addison Wesley', 'name': 'Pe...",1986.0,87,78,False,"[{'id': 'Bjarne Stroustrup', 'name': 'Bjarne S..."
496,8048768767966,497,Relativity,,"{'id': None, 'name': 'Multiple Editions'}",,82,78,False,"[{'id': 'Albert Einstein', 'name': 'Albert Ein..."
497,8624294790888,498,Song of Myself,,"{'id': None, 'name': 'Multiple Editions'}",,79,78,False,"[{'id': 'Walt Whitman', 'name': 'Walt Whitman'}]"
498,7507602833869,499,Great Expectations,,"{'id': None, 'name': 'Multiple Editions'}",,79,78,False,"[{'id': 'Charles Dickens', 'name': 'Charles Di..."


## Part 2: South Dakota licenses

In [18]:
# Close the browser left over from the previous part so it does not keep running in the background
driver.quit()

# Start a fresh Chrome; Selenium 4 wants the chromedriver path wrapped in a Service object
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://apps.sd.gov/ld17btp/licenseelist.aspx")



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/ilenapeng/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


In [19]:
# to find the search box, look it up by its id attribute
driver.find_element(By.ID, 'ctl00_ContentPlaceHolder1_txtName')

# XPath is a specific way to describe where an element sits on a page. if you don't want to read the actual code,
# right click the element in the inspector, copy, copy XPath or copy Full XPath (both work):
# driver.find_element(By.XPATH, '/html/body/form/div[3]/div[2]/table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[2]/td/table/tbody/tr[1]/td[3]/input')

<selenium.webdriver.remote.webelement.WebElement (session="b791d4b0b2c65997b12d8c295d1e4a6e", element="9c2abcce-128b-4e83-b319-6b264776c840")>

In [20]:
# type the search term into the search box
search_box = driver.find_element(By.ID, 'ctl00_ContentPlaceHolder1_txtName')
search_box.send_keys('Arnold')

In [21]:
# find the search button, then click it
search_button = driver.find_element(By.ID, 'ctl00_ContentPlaceHolder1_btnSearch')
search_button.click()

In [22]:
# to locate the results table: inspect something inside the table, then scroll up in the inspector
# until you find the table's root element and use its id
driver.find_element(By.ID, 'ctl00_ContentPlaceHolder1_rgLicensee_ctl00')

<selenium.webdriver.remote.webelement.WebElement (session="b791d4b0b2c65997b12d8c295d1e4a6e", element="d84b055a-79bf-430b-afdd-fbe61a9ac5ee")>

In [23]:
# grab the table element off of the page
table = driver.find_element(By.ID, 'ctl00_ContentPlaceHolder1_rgLicensee_ctl00')

# then hand its HTML (the outerHTML attribute) to pandas. read_html returns a list of tables, so take the first.
# The markup is wrapped in StringIO because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(table.get_attribute('outerHTML')))[0]
df.head()

Unnamed: 0_level_0,Command item |,Command item |,Command item |,Command item |,Command item |,Command item |,Command item |,Command item |,Command item |,Command item |,Command item |
Unnamed: 0_level_1,Profession,Name,Address,City,State,Zip,Phone,RegistrationNumber,PEDisc.,ExpirationDate,Status
Unnamed: 0_level_2,Command item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0,,|,,,,,,,,,
1,PE,Barry Kay Arnold,1193 W. Fallow Way,Pleasant View,UT,84414.0,(801) 782-6008,6615,CE,8/31/2022,Active
2,FE,Dawn M Arnold,,,XX,0.0,,E-7281,,,Intern
3,FE,Eric Joseph Arnold,,,XX,0.0,,E-5825,,,Intern
4,FE,George S Arnold,,,XX,0.0,,E-7527,,,Intern


## Part 3: Utah licensee lookup

In [24]:
# Close the browser left over from the previous part so it does not keep running in the background
driver.quit()

# Launch a new Chrome, install the appropriate ChromeDriver if necessary
# (Selenium 4 wants the chromedriver path wrapped in a Service object)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://secure.utah.gov/llv/search/index.html")



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/ilenapeng/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


In [25]:
# find the geologist checkbox by its id, then tick it
geologist_checkbox = driver.find_element(By.ID, 'item115')
geologist_checkbox.click()

In [26]:
# click the search button, located by its full XPath
# NOTE(review): an absolute XPath stops matching as soon as the page layout changes -- consider an id or CSS selector if one exists
driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/fieldset[3]/p[1]/input[1]').click()

In [28]:
# Pull the results table off the current page and let pandas parse its HTML.
table = driver.find_element(By.CLASS_NAME, "resultsTable")
# read_html returns a list of tables, so take the first. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(table.get_attribute('outerHTML')))[0]
df.head()

Unnamed: 0,Licensee Name,City,Profession,License #,Status
0,AARON BURNS,STRASBURG,GEOLOGIST PROFESSIONAL GEOLOGIST,5269857-2250,EXPIRED
1,AARON JOHN CANTRELL,BOISE,GEOLOGIST PROFESSIONAL GEOLOGIST,8494206-2250,ACTIVE
2,AARON LEE NORTON,FARMINGTON,GEOLOGIST PROFESSIONAL GEOLOGIST,9174027-2250,ACTIVE
3,ADAM ISAAC HISCOCK,SALT LAKE CITY,GEOLOGIST PROFESSIONAL GEOLOGIST,9404658-2250,ACTIVE
4,ADAM PAUL MCKEAN,SALT LAKE CITY,GEOLOGIST PROFESSIONAL GEOLOGIST,9255409-2250,ACTIVE


In [29]:
# move on to the next page of results
next_link = driver.find_element(By.ID, "pagination-next")
next_link.click()

In [30]:
# Same table scrape as before, now against the page we just moved to.
table = driver.find_element(By.CLASS_NAME, "resultsTable")
# read_html returns a list of tables, so take the first. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(table.get_attribute('outerHTML')))[0]
df.head()

Unnamed: 0,Licensee Name,City,Profession,License #,Status
0,ANTHONY FELIX STIRBYS,WOODINVILLE,GEOLOGIST PROFESSIONAL GEOLOGIST,5377508-2250,EXPIRED
1,ANTHONY JOHN CARMELI,GOLDEN,GEOLOGIST PROFESSIONAL GEOLOGIST,5526774-2250,ACTIVE
2,ANTHONY MAGLIOCCHINO,SANDY,GEOLOGIST PROFESSIONAL GEOLOGIST,5327371-2250,ACTIVE
3,ANTHONY R PAWLOSKI,TUCSON,GEOLOGIST PROFESSIONAL GEOLOGIST,167723-2250,EXPIRED
4,ARIC MORTON,SANDY,GEOLOGIST PROFESSIONAL GEOLOGIST,11585804-2250,ACTIVE


In [31]:
dataframes = []

# NOTE(review): scraping starts from whatever page the browser is on. An earlier cell already
# clicked "next", so the first page of results does not end up in this list.

# Keep going until there is no "next" link left to click
while True:
    # Wait (at most 3 seconds) for the results table to be on the page, then grab it
    table = WebDriverWait(driver, 3).until(
        EC.presence_of_element_located((By.CLASS_NAME, "resultsTable"))
    )
    # StringIO because passing a literal HTML string to read_html is deprecated
    df = pd.read_html(StringIO(table.get_attribute('outerHTML')))[0]

    # Each time we pull a table off the page,
    # add it to our list of dataframes
    dataframes.append(df)

    # Try to click the next button
    try:
        driver.find_element(By.ID, "pagination-next").click()
    except NoSuchElementException:
        # No next button: that was the last page, so exit the while loop.
        # Any other kind of error is a real problem and is allowed to surface.
        break

In [32]:
# Stack the per-page tables into one dataframe; ignore_index=True renumbers the rows
# instead of keeping each page's own row numbers
df = pd.concat(dataframes, ignore_index=True)
df.head()

Unnamed: 0,Licensee Name,City,Profession,License #,Status
0,ANTHONY FELIX STIRBYS,WOODINVILLE,GEOLOGIST PROFESSIONAL GEOLOGIST,5377508-2250,EXPIRED
1,ANTHONY JOHN CARMELI,GOLDEN,GEOLOGIST PROFESSIONAL GEOLOGIST,5526774-2250,ACTIVE
2,ANTHONY MAGLIOCCHINO,SANDY,GEOLOGIST PROFESSIONAL GEOLOGIST,5327371-2250,ACTIVE
3,ANTHONY R PAWLOSKI,TUCSON,GEOLOGIST PROFESSIONAL GEOLOGIST,167723-2250,EXPIRED
4,ARIC MORTON,SANDY,GEOLOGIST PROFESSIONAL GEOLOGIST,11585804-2250,ACTIVE


In [33]:
# The pagination loop above leaves the browser on a results page, so go back to the
# search form before looking for checkboxes.
# NOTE(review): assumes reloading the form clears the earlier geologist selection -- confirm
driver.get("https://secure.utah.gov/llv/search/index.html")

# click the boxes
profession_checkbox_ids = [
    'item8',    # athlete agents
    'item118',  # handymen
    'item268',  # radiologists
]
for checkbox_id in profession_checkbox_ids:
    driver.find_element(By.ID, checkbox_id).click()

In [35]:
# click search, using the same full XPath for the search button as earlier
# NOTE(review): an absolute XPath stops matching as soon as the page layout changes -- consider an id or CSS selector if one exists
driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/fieldset[3]/p[1]/input[1]').click()

In [36]:
# load each of the pages
dataframes = []

# Keep going until there is no "next" link left to click
while True:
    # can add time.sleep(1) to always wait one second for resultsTable to show up
    # OR wait for resultsTable to appear on the page. if it takes longer than 3 seconds, just give up and throw error
    # until() hands back the element it waited for, so there is no need to look it up a second time
    table = WebDriverWait(driver, 3).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'resultsTable'))
    )

    # Get a new dataframe (StringIO because passing a literal HTML string to read_html is deprecated)
    df = pd.read_html(StringIO(table.get_attribute('outerHTML')))[0]

    # Each time we pull a table off the page,
    # add it to our list of dataframes
    dataframes.append(df)

    # Try to click the next button
    try:
        driver.find_element(By.ID, "pagination-next").click()
    except NoSuchElementException:
        # No next button: that was the last page, so exit the while loop.
        # Any other kind of error is a real problem and is allowed to surface.
        break

In [37]:
# Stack the per-page tables into one dataframe; ignore_index=True renumbers the rows
# instead of keeping each page's own row numbers
df = pd.concat(dataframes, ignore_index=True)
df.head()

Unnamed: 0,Licensee Name,City,Profession,License #,Status
0,AUSTIN KEITH HANSON DBA: WOOLF INTERIORS,OGDEN,CONTRACTOR HANDYMAN EXEMPTION,12234193-5500,ACTIVE
1,DAKOTA JAMES STEWARTDBA: STEWARTS PAINTING AND...,SAINT GEORGE,CONTRACTOR HANDYMAN EXEMPTION,12580385-5500,ACTIVE
2,THE UINTA CRAFTSMAN LLC,PEOA,CONTRACTOR HANDYMAN EXEMPTION,12393868-5500,ACTIVE
3,"""THE FIXER"" CHRIS BROTHERS LLC",MURRAY,CONTRACTOR HANDYMAN EXEMPTION,10847890-5500,ACTIVE
4,1 CREATOR PAINTING AND DESIGN LLC,SALT LAKE CITY,CONTRACTOR HANDYMAN EXEMPTION,11004846-5500,ACTIVE


In [38]:
# (number of rows, number of columns) of the combined table
df.shape

(10500, 5)

In [40]:
# how many rows each profession has, most common first
df['Profession'].value_counts()

RADIOLOGY  RADIOLOGIC TECHNOLOGIST           3915
CONTRACTOR  HANDYMAN EXEMPTION               3826
RADIOLOGY  RADIOLOGY PRACTICAL TECHNICIAN    2616
ATHLETE AGENTS  ATHLETE AGENT                 130
RADIOLOGY  RADIOLOGIST ASSISTANT               13
Name: Profession, dtype: int64

In [42]:
# most common city within each profession: count cities per profession
# (value_counts sorts each group by count), then keep the first row of every group
(
    df.groupby('Profession')['City']
    .value_counts()
    .groupby(level=0)
    .head(1)
)

Profession                                 City          
ATHLETE AGENTS  ATHLETE AGENT              NEW YORK           11
CONTRACTOR  HANDYMAN EXEMPTION             SALT LAKE CITY    594
RADIOLOGY  RADIOLOGIC TECHNOLOGIST         SALT LAKE CITY    324
RADIOLOGY  RADIOLOGIST ASSISTANT           GRAND JUNCTION      2
RADIOLOGY  RADIOLOGY PRACTICAL TECHNICIAN  SALT LAKE CITY    229
Name: City, dtype: int64

## Part 4: Oil and gas wells

In [None]:
# Close the browser left over from the previous part so it does not keep running in the background
driver.quit()

# Launch a new Chrome, install the appropriate ChromeDriver if necessary
# (Selenium 4 wants the chromedriver path wrapped in a Service object)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

driver.get("https://www.dmr.nd.gov/oilgas/findwellsvw.asp")