In [4]:
import requests
from bs4 import BeautifulSoup

url = "https://opensyllabus.org/results-list/titles?size=50&usState=AK"
# Plain HTTP download: we get the HTML exactly as the server sends it.
response = requests.get(url)
# Name the parser explicitly. Without it Beautiful Soup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
doc = BeautifulSoup(response.text, "html.parser")

In [5]:
# Look for the title blocks in the HTML that requests downloaded.
# The result is an empty list (see the output) - presumably because the page
# fills in its content with JavaScript, which requests does not run; TODO confirm.
doc.select(".name-div")

[]

## Pretend you are in a browser
... not python

In [14]:
# HTTP header names are hyphenated: the header that identifies the browser is
# "User-Agent". A key spelled 'User agent' is a different (invalid) header, so
# the server would still see the default python-requests identity.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}

response = requests.get(url, headers = headers)
# Explicit parser so the result does not depend on which parsers are installed.
doc = BeautifulSoup(response.text, "html.parser")

## Or scrape a secret API 

... but sometimes you need to pretend you are a browser

In [11]:
# The JSON endpoint, requested with the same size/usState query parameters.
url = "https://explorer-api.opensyllabus.org/v1/works.json?size=50&usState=AK"
response = requests.get(url)
# Stop here with a clear HTTP error (for example a 403 when the API wants a
# browser-like request) instead of a confusing JSON decoding error below.
response.raise_for_status()
data = response.json()

In [12]:
import pandas as pd
# The works are nested in the JSON under results -> works.
# Passing that list to DataFrame gives one row per work (see the table below).
df = pd.DataFrame(data['results']['works'])
df

Unnamed: 0,id,rank,name,subtitle,publisher,publishDate,score,appearances,openAccess,persons
0,9199819950029,1,The Elements of Style,,"{'id': None, 'name': 'Multiple Editions'}",,100,67,False,"[{'id': 'William Strunk', 'name': 'William Str..."
1,8297876850707,2,The Craft of Scientific Presentations: Critica...,Critical Steps to Succeed and Critical Errors ...,"{'id': None, 'name': 'Multiple Editions'}",,17,52,False,"[{'id': 'Michael Alley', 'name': 'Michael Alle..."
2,65017214971298,3,Chemistry and Chemical Reactivity,,"{'id': None, 'name': 'Multiple Editions'}",,36,40,False,"[{'id': 'Paul M. Treichel', 'name': 'Paul M. T..."
3,41979010351797,4,Essential Algebra for Chemistry Students,,"{'id': 'Brooks / Cole', 'name': 'Brooks / Cole'}",2006.0,4,40,False,"[{'id': 'David W. Ball', 'name': 'David W. Bal..."
4,8323646833678,5,The Feynman Lectures on Physics,,"{'id': None, 'name': 'Multiple Editions'}",,77,31,False,"[{'id': 'Richard Phillips Feynman', 'name': 'R..."
5,33028298506982,6,Biological Science,,"{'id': None, 'name': 'Multiple Editions'}",,54,30,False,"[{'id': 'Scott Freeman', 'name': 'Scott Freema..."
6,9002251772830,7,An Introduction to Dynamic Meteorology,,"{'id': None, 'name': 'Multiple Editions'}",,29,28,False,"[{'id': 'James R. Holton', 'name': 'James R. H..."
7,32598801778758,8,Physics for Scientists and Engineers,,"{'id': None, 'name': 'Multiple Editions'}",,87,27,False,"[{'id': 'Raymond A. Serway', 'name': 'Raymond ..."
8,7602092430926,9,Geophysical Fluid Dynamics,,"{'id': 'Springer Publishing', 'name': 'Springe...",1979.0,13,26,False,"[{'id': 'Joseph Pedlosky', 'name': 'Joseph Ped..."
9,33835752359272,10,Writing Papers in the Biological Sciences,,"{'id': 'St. Martin's / Bedford Books', 'name':...",1988.0,51,25,False,"[{'id': 'Vicky McMillan', 'name': 'Vicky McMil..."


## Or use a browser automation tool

Tools like **Selenium** or **Playwright** exist so people can run ecommerce websites and not worry about accidentally breaking something.

They automatically click buttons, add items to a cart, check out - and at every step they check that things look like they should.

In [16]:
! pip install playwright

Collecting playwright
  Downloading playwright-1.28.0-py3-none-macosx_11_0_arm64.whl (29.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.1/29.1 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0mm
[?25hCollecting greenlet==2.0.1
  Downloading greenlet-2.0.1-cp311-cp311-macosx_10_9_universal2.whl (259 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.4/259.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m[31m3.7 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting pyee==9.0.4
  Downloading pyee-9.0.4-py2.py3-none-any.whl (14 kB)
Collecting typing-extensions
  Using cached typing_extensions-4.4.0-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, greenlet, pyee, playwright
Successfully installed greenlet-2.0.1 playwright-1.28.0 pyee-9.0.4 typing-extensions-4.4.0


In [17]:
from playwright.async_api import async_playwright

In [19]:
! pip install pytest-playwright

Collecting pytest-playwright
  Downloading pytest_playwright-0.3.0-py3-none-any.whl (10 kB)
Collecting pytest
  Downloading pytest-7.2.0-py3-none-any.whl (316 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.8/316.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m[31m3.2 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting pytest-base-url
  Downloading pytest_base_url-2.0.0-py3-none-any.whl (4.6 kB)
Collecting python-slugify
  Downloading python_slugify-7.0.0-py2.py3-none-any.whl (9.4 kB)
Collecting iniconfig
  Downloading iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB)
Collecting pluggy<2.0,>=0.12
  Downloading pluggy-1.0.0-py2.py3-none-any.whl (13 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.2/78.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m[31m3.6 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: text-unidecode, inic

In [21]:
# Download the browser builds that Playwright drives. The output below shows
# Chromium, FFMPEG, Firefox and Webkit being fetched.
!  playwright install 

Downloading Chromium 108.0.5359.29 (playwright build v1033)[2m from https://playwright.azureedge.net/builds/chromium/1033/chromium-mac-arm64.zip[22m


[1G118 Mb [                    ] 0% 7.9s[0K[1G118 Mb [                    ] 0% 115.8s[0K[1G118 Mb [                    ] 0% 73.1s[0K[1G118 Mb [                    ] 0% 92.2s[0K[1G118 Mb [                    ] 0% 91.4s[0K[1G118 Mb [                    ] 0% 86.7s[0K[1G118 Mb [                    ] 0% 84.0s[0K[1G118 Mb [                    ] 0% 89.5s[0K[1G118 Mb [                    ] 0% 89.6s[0K[1G118 Mb [                    ] 0% 92.4s[0K[1G118 Mb [                    ] 0% 91.3s[0K[1G118 Mb [                    ] 0% 98.4s[0K[1G118 Mb [                    ] 0% 102.3s[0K[1G118 Mb [                    ] 0% 100.2s[0K[1G118 Mb [                    ] 0% 130.4s[0K[1G118 Mb [                    ] 0% 117.2s[0K[1G118 Mb [                    ] 0% 106.5s[0K[1G118 Mb [                    ] 0% 92.6s[0K[1G118 Mb [                    ] 0% 86.4s[0K[1G118 Mb [                    ] 0% 73.8s[0K[1G118 Mb [                    ] 0% 74.6s[0K[1G118 Mb [                 

[1G118 Mb [=                   ] 6% 136.0s[0K[1G118 Mb [=                   ] 6% 135.7s[0K[1G118 Mb [=                   ] 6% 135.8s[0K[1G118 Mb [=                   ] 6% 135.1s[0K[1G118 Mb [=                   ] 6% 134.8s[0K[1G118 Mb [=                   ] 6% 134.6s[0K[1G118 Mb [=                   ] 6% 134.2s[0K[1G118 Mb [=                   ] 6% 136.1s[0K[1G118 Mb [=                   ] 6% 135.8s[0K[1G118 Mb [=                   ] 6% 135.3s[0K[1G118 Mb [=                   ] 6% 135.0s[0K[1G118 Mb [=                   ] 7% 134.5s[0K[1G118 Mb [=                   ] 7% 135.1s[0K[1G118 Mb [=                   ] 7% 134.8s[0K[1G118 Mb [=                   ] 7% 134.7s[0K[1G118 Mb [=                   ] 7% 135.9s[0K[1G118 Mb [=                   ] 7% 135.8s[0K[1G118 Mb [=                   ] 7% 135.3s[0K[1G118 Mb [=                   ] 7% 135.1s[0K[1G118 Mb [=                   ] 7% 135.2s[0K[1G118 Mb [=                   ] 7% 134.7s[0K[1G118 Mb [=

[1G118 Mb [===                 ] 13% 115.0s[0K[1G118 Mb [===                 ] 13% 115.5s[0K[1G118 Mb [===                 ] 13% 115.4s[0K[1G118 Mb [===                 ] 13% 115.6s[0K[1G118 Mb [===                 ] 13% 115.4s[0K[1G118 Mb [===                 ] 13% 115.1s[0K[1G118 Mb [===                 ] 13% 114.8s[0K[1G118 Mb [===                 ] 13% 114.7s[0K[1G118 Mb [===                 ] 13% 114.5s[0K[1G118 Mb [===                 ] 13% 114.4s[0K[1G118 Mb [===                 ] 13% 113.9s[0K[1G118 Mb [===                 ] 13% 113.7s[0K[1G118 Mb [===                 ] 13% 113.4s[0K[1G118 Mb [===                 ] 13% 114.4s[0K[1G118 Mb [===                 ] 13% 113.9s[0K[1G118 Mb [===                 ] 13% 113.6s[0K[1G118 Mb [===                 ] 13% 113.3s[0K[1G118 Mb [===                 ] 13% 113.2s[0K[1G118 Mb [===                 ] 13% 112.9s[0K[1G118 Mb [===                 ] 14% 112.8s[0K[1G118 Mb [===                 ] 14% 11

[1G118 Mb [====                ] 20% 98.1s[0K[1G118 Mb [====                ] 20% 98.0s[0K[1G118 Mb [====                ] 20% 97.9s[0K[1G118 Mb [====                ] 20% 97.8s[0K[1G118 Mb [====                ] 20% 97.7s[0K[1G118 Mb [====                ] 20% 97.6s[0K[1G118 Mb [====                ] 20% 97.5s[0K[1G118 Mb [====                ] 20% 97.4s[0K[1G118 Mb [====                ] 20% 97.3s[0K[1G118 Mb [====                ] 20% 97.2s[0K[1G118 Mb [====                ] 20% 97.1s[0K[1G118 Mb [====                ] 20% 97.0s[0K[1G118 Mb [====                ] 20% 96.9s[0K[1G118 Mb [====                ] 20% 96.8s[0K[1G118 Mb [====                ] 20% 97.2s[0K[1G118 Mb [====                ] 20% 97.1s[0K[1G118 Mb [====                ] 20% 97.0s[0K[1G118 Mb [====                ] 20% 96.9s[0K[1G118 Mb [====                ] 20% 96.8s[0K[1G118 Mb [====                ] 20% 96.7s[0K[1G118 Mb [====                ] 21% 96.9s[0K[1G118 Mb [=















Chromium 108.0.5359.29 (playwright build v1033) downloaded to /Users/laurabejderjensen/Library/Caches/ms-playwright/chromium-1033
Downloading FFMPEG playwright build v1008[2m from https://playwright.azureedge.net/builds/ffmpeg/1008/ffmpeg-mac-arm64.zip[22m
FFMPEG playwright build v1008 downloaded to /Users/laurabejderjensen/Library/Caches/ms-playwright/ffmpeg-1008
Downloading Firefox 106.0 (playwright build v1364)[2m from https://playwright.azureedge.net/builds/firefox/1364/firefox-mac-11-arm64.zip[22m


[1G69.2 Mb [                    ] 0% 4.6s[0K[1G69.2 Mb [                    ] 0% 135.8s[0K[1G69.2 Mb [                    ] 0% 61.7s[0K[1G69.2 Mb [                    ] 0% 142.3s[0K[1G69.2 Mb [                    ] 0% 84.2s[0K[1G69.2 Mb [                    ] 0% 234.9s[0K[1G69.2 Mb [                    ] 0% 239.9s[0K[1G69.2 Mb [                    ] 0% 156.5s[0K[1G69.2 Mb [                    ] 0% 151.7s[0K[1G69.2 Mb [                    ] 0% 148.4s[0K[1G69.2 Mb [                    ] 0% 162.7s[0K[1G69.2 Mb [                    ] 0% 144.5s[0K[1G69.2 Mb [                    ] 0% 146.5s[0K[1G69.2 Mb [                    ] 1% 139.6s[0K[1G69.2 Mb [                    ] 1% 135.7s[0K[1G69.2 Mb [                    ] 1% 129.8s[0K[1G69.2 Mb [                    ] 1% 126.9s[0K[1G69.2 Mb [                    ] 1% 124.2s[0K[1G69.2 Mb [                    ] 1% 120.8s[0K[1G69.2 Mb [                    ] 1% 118.6s[0K[1G69.2 Mb [                    ] 1% 114.9s

[1G69.2 Mb [==                  ] 12% 70.2s[0K[1G69.2 Mb [==                  ] 12% 69.9s[0K[1G69.2 Mb [==                  ] 12% 69.8s[0K[1G69.2 Mb [==                  ] 12% 69.3s[0K[1G69.2 Mb [==                  ] 12% 69.0s[0K[1G69.2 Mb [===                 ] 12% 68.6s[0K[1G69.2 Mb [===                 ] 12% 68.3s[0K[1G69.2 Mb [===                 ] 12% 67.8s[0K[1G69.2 Mb [===                 ] 12% 67.7s[0K[1G69.2 Mb [===                 ] 12% 67.6s[0K[1G69.2 Mb [===                 ] 12% 67.5s[0K[1G69.2 Mb [===                 ] 12% 68.0s[0K[1G69.2 Mb [===                 ] 13% 67.9s[0K[1G69.2 Mb [===                 ] 13% 67.8s[0K[1G69.2 Mb [===                 ] 13% 67.5s[0K[1G69.2 Mb [===                 ] 13% 67.3s[0K[1G69.2 Mb [===                 ] 13% 67.0s[0K[1G69.2 Mb [===                 ] 13% 66.5s[0K[1G69.2 Mb [===                 ] 13% 66.3s[0K[1G69.2 Mb [===                 ] 13% 66.1s[0K[1G69.2 Mb [===                 ] 13% 6







Firefox 106.0 (playwright build v1364) downloaded to /Users/laurabejderjensen/Library/Caches/ms-playwright/firefox-1364
Downloading Webkit 16.4 (playwright build v1735)[2m from https://playwright.azureedge.net/builds/webkit/1735/webkit-mac-11-arm64.zip[22m






Webkit 16.4 (playwright build v1735) downloaded to /Users/laurabejderjensen/Library/Caches/ms-playwright/webkit-1735


In [24]:
# Start Playwright through its async API, so every call below is awaited.
playwright = await async_playwright().start()

# Launch the chromium browser. headless = False makes us see the window.
browser = await playwright.chromium.launch(headless = False)

# "await" pauses this cell until the browser has finished the requested step
# and then hands back the result. Nearly every Playwright command here starts with it.

In [28]:
# Open a new tab in the browser we just launched; `page` is our handle to it.
page = await browser.new_page()

In [30]:
# Opening a particular page in chromium.
# The cell's output is the Response object for the page request.
await page.goto("https://opensyllabus.org/results-list/titles?size=50&usState=AK")

<Response url='https://opensyllabus.org/results-list/titles?size=50&usState=AK' request=<Request url='https://opensyllabus.org/results-list/titles?size=50&usState=AK' method='GET'>>

In [32]:
# Giving us all the content on the webpage as one HTML string.
# It is taken from the live browser tab rather than from a plain download.
await page.content()

'<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1"><script type="text/javascript" async="" src="https://www.google-analytics.com/analytics.js"></script><script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-72367808-1"></script><script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","UA-72367808-1")</script><link rel="shortcut icon" href="/favicon.ico"><link rel="stylesheet" href="https://unpkg.com/leaflet@1.0.3/dist/leaflet.css"><meta property="og:site_name" content="Open Syllabus"><meta property="og:title" content="Open Syllabus: Explorer"><meta property="og:description" content="Mapping the college curriculum across 7,292,573 syllabi."><meta property="og:image" content="https://opensyllabus.org/og-image.jpg"><meta name="twitter:card" content="summary_large_image"><meta name="twitter:site" content="@opensyllabus"><title>Open Sylla

In [33]:
from bs4 import BeautifulSoup

html = await page.content()
doc = BeautifulSoup(html)

In [34]:
# Same selector as in the requests attempt, now run on the HTML taken from the
# browser: this time it matches the title blocks.
doc.select('.name-div')

[<div class="name-div"><p><a href="/result/title?id=9199819950029">The Elements of Style</a></p><span class="name"><div><a href="/result/author?id=William+Strunk">William Strunk</a></div></span><span class="publisher"><div class="div-no-link">Multiple Editions</div></span></div>,
 <div class="name-div"><p><a href="/result/title?id=8297876850707">The Craft of Scientific Presentations: Critical Steps to Succeed and Critical Errors to Avoid</a></p><span class="name"><div><a href="/result/author?id=Michael+Alley">Michael Alley</a></div></span><span class="publisher"><div class="div-no-link">Multiple Editions</div></span></div>,
 <div class="name-div"><p><a href="/result/title?id=65017214971298">Chemistry and Chemical Reactivity</a></p><span class="name"><div><a href="/result/author?id=Paul+M.+Treichel">Paul M. Treichel</a>, </div><div><a href="/result/author?id=John+C.+Kotz">John C. Kotz</a>, </div><div><a href="/result/author?id=Paul+Treichel">Paul Treichel</a></div></span><span class="pu

In [35]:
# Count the matches: 50, in line with size=50 in the URL.
len(doc.select('.name-div'))

50

We want to turn the HTML into a list of dictionaries. 

1. What are our rows?
    - Find the CSS selector for them
2. What are our columns?
    - Find the CSS selector for them
        - Do it **one at a time**!!
3. Ready to scrape!

In [38]:
# Our rows: each .title-item element holds one book (rank, title block,
# appearances and score).
rows = doc.select('.title-item')
len(rows)

50

In [39]:
# Dump the raw HTML of every row, with a divider line after each one.
for row in rows:
    print(row, "-------", sep="\n")

<div class="title-item"><div class="rank">1</div><div class="title"><div class="name-div"><p><a href="/result/title?id=9199819950029">The Elements of Style</a></p><span class="name"><div><a href="/result/author?id=William+Strunk">William Strunk</a></div></span><span class="publisher"><div class="div-no-link">Multiple Editions</div></span></div></div><div class="appearances">67</div><div class="score">100</div></div>
-------
<div class="title-item"><div class="rank">2</div><div class="title"><div class="name-div"><p><a href="/result/title?id=8297876850707">The Craft of Scientific Presentations: Critical Steps to Succeed and Critical Errors to Avoid</a></p><span class="name"><div><a href="/result/author?id=Michael+Alley">Michael Alley</a></div></span><span class="publisher"><div class="div-no-link">Multiple Editions</div></span></div></div><div class="appearances">52</div><div class="score">17</div></div>
-------
<div class="title-item"><div class="rank">3</div><div class="title"><div cl

In [40]:
# Get the titles: the <p> inside .name-div inside .title holds the book title.
for row in rows:
    title = row.select_one(".title .name-div p")
    print(title.text, "-------", sep="\n")

The Elements of Style
-------
The Craft of Scientific Presentations: Critical Steps to Succeed and Critical Errors to Avoid
-------
Chemistry and Chemical Reactivity
-------
Essential Algebra for Chemistry Students
-------
The Feynman Lectures on Physics
-------
Biological Science
-------
An Introduction to Dynamic Meteorology
-------
Physics for Scientists and Engineers
-------
Geophysical Fluid Dynamics
-------
Writing Papers in the Biological Sciences
-------
Alcibiades
-------
Mathematical Methods in the Physical Sciences
-------
Atmosphere-Ocean Dynamics
-------
Introduction to Geophysical Fluid Dynamics
-------
Philosophic Classics
-------
Div, Grad, Curl, and All That: An Informal Text on Vector Calculus
-------
Brock Biology of Microorganisms
-------
How to Write and Publish a Scientific Paper
-------
Meno
-------
A Short Guide to Writing About Biology
-------
Flora of Alaska and Adjacent Parts of Canada
-------
Human Anatomy and Physiology
-------
Human Anatomy and Physiology 

The spaces in `(".title .name-div p")` mean "inside of": each space says that the next thing sits somewhere inside the previous one. So this selects a `p` inside a `.name-div` inside a `.title`.

In [43]:
# Here we find the links: read the href attribute of the first <a> in the title block.
for row in rows:
    anchor = row.select_one(".title .name-div a")
    link_url = anchor['href']
    print(link_url, "-------", sep="\n")

/result/title?id=9199819950029
-------
/result/title?id=8297876850707
-------
/result/title?id=65017214971298
-------
/result/title?id=41979010351797
-------
/result/title?id=8323646833678
-------
/result/title?id=33028298506982
-------
/result/title?id=9002251772830
-------
/result/title?id=32598801778758
-------
/result/title?id=7602092430926
-------
/result/title?id=33835752359272
-------
/result/title?id=61924838546397
-------
/result/title?id=8718783931546
-------
/result/title?id=50792283243920
-------
/result/title?id=33337536151615
-------
/result/title?id=8134668567196
-------
/result/title?id=7928509949424
-------
/result/title?id=32822140076391
-------
/result/title?id=7524782967270
-------
/result/title?id=9036611695339
-------
/result/title?id=32976758900349
-------
/result/title?id=56891136803962
-------
/result/title?id=32804960208506
-------
/result/title?id=32530082300371
-------
/result/title?id=32684701125364
-------
/result/title?id=42880953485349
-------
/result/ti

In [45]:
# Here we find the authors.
# .text flattens the whole .name element, so a book with several authors comes
# out as one comma-separated string.

for row in rows:
    authors = row.select_one(".title .name-div .name")
    print(authors.text)
    print("-------")

William Strunk
-------
Michael Alley
-------
Paul M. Treichel, John C. Kotz, Paul Treichel
-------
David W. Ball
-------
Richard Phillips Feynman
-------
Scott Freeman
-------
James R. Holton
-------
Raymond A. Serway
-------
Joseph Pedlosky
-------
Vicky McMillan
-------
Plutarch
-------
Mary L. Boas
-------
Adrian E. Gill
-------
Benoit Cushman-Roisin
-------
Forrest E. Baird
-------
H. M. Schey
-------
Michael T. Madigan
-------
Robert A. Day
-------
Plato
-------
Jan A. Pechenik
-------
Jacob Peter Anderson
-------
Elaine Nicpon Marieb
-------
Elaine Nicpon Marieb
-------
Bruce Alberts
-------
William J. Cody
-------
Geoffrey Chaucer
-------
Jean-Paul Sartre
-------
David Halliday
-------
John Alcock
-------
Frederick A. Bettelheim
-------
Milton Abramowitz
-------
James G. Harris
-------
Henry L. Roediger, Jeffrey D. Karpicke
-------
P. J. Potts
-------
Thomas Jefferson
-------
Donald A. Walker, Martha K. Raynolds, Fred J. A. Daniëls
-------
D. A. Walker, N. A. Auerbach, J. G. Boc

## Turn it into a list of dictionaries!

We are going to turn all of the above into a list of dictionaries

In [48]:
# Collect one dictionary per book; the list becomes a DataFrame afterwards.
all_data = []

for row in rows:
    print("-------")

    # Same three selectors we tried out one at a time above.
    data = {
        'title': row.select_one(".title .name-div p").text,
        'links': row.select_one(".title .name-div a")['href'],
        'authors': row.select_one(".title .name-div .name").text,
    }

    # And then add it to the list as a dictionary
    all_data.append(data)

-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------
-------


In [49]:
# Turn the list of dictionaries into a DataFrame: one row per book, one column
# per dictionary key. head() shows just the first five rows.
df = pd.DataFrame(all_data)
df.head()

Unnamed: 0,title,links,authors
0,The Elements of Style,/result/title?id=9199819950029,William Strunk
1,The Craft of Scientific Presentations: Critica...,/result/title?id=8297876850707,Michael Alley
2,Chemistry and Chemical Reactivity,/result/title?id=65017214971298,"Paul M. Treichel, John C. Kotz, Paul Treichel"
3,Essential Algebra for Chemistry Students,/result/title?id=41979010351797,David W. Ball
4,The Feynman Lectures on Physics,/result/title?id=8323646833678,Richard Phillips Feynman


In [50]:
# Save the table as books.csv; index=False leaves out the row-number column.
df.to_csv("books.csv", index=False)

If there are missing values, you can handle them like this!

First you scrape everything and leave the field blank if it doesn't meet our criteria

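    all_data = []

    for row in rows:
        print("-------")

        data = {}

        try:
            data['title'] = row.select_one(".title .name-div p").text
        except AttributeError:
            # select_one found nothing (None), so leave the title blank
            pass

        try:
            data['links'] = row.select_one(".title .name-div a")['href']
        except (TypeError, KeyError):
            # no <a> tag, or an <a> tag without an href: leave the link blank
            pass

        try:
            data['authors'] = row.select_one(".title .name-div .name").text
        except AttributeError:
            # select_one found nothing (None), so leave the authors blank
            pass

        all_data.append(data)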

And then drop the missing values.  


In [83]:
# The basis for using Playwright:
# start Playwright, launch a visible Chromium window, open a tab.
# NOTE(review): none of the cells shown here close the browser launched earlier,
# so this starts another one next to it - confirm that is intended.

from playwright.async_api import async_playwright

playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)
page = await browser.new_page()

This time we will do it with Playwright's own locators instead of BeautifulSoup.

In [85]:
await page.goto("https://opensyllabus.org/results-list/titles?size=50&usState=AK")
# Ask Playwright itself for the rows, using the same CSS selector as before.
# Note that `rows` now holds a Playwright locator, not the list of
# BeautifulSoup tags it held earlier.
rows = page.locator(".title-item")

# How many elements the locator matches on the page.
count = await rows.count()
count

50

In [76]:
# We count from 0 to 49.
for i in range(count):
    row = rows.nth(i)
    print(await row.text_content())

1The Elements of StyleWilliam StrunkMultiple Editions67100
2The Craft of Scientific Presentations: Critical Steps to Succeed and Critical Errors to AvoidMichael AlleyMultiple Editions5217
3Chemistry and Chemical ReactivityPaul M. Treichel, John C. Kotz, Paul TreichelMultiple Editions4036
4Essential Algebra for Chemistry StudentsDavid W. BallBrooks / Cole,2006404
5The Feynman Lectures on PhysicsRichard Phillips FeynmanMultiple Editions3177
6Biological ScienceScott FreemanMultiple Editions3054
7An Introduction to Dynamic MeteorologyJames R. HoltonMultiple Editions2829
8Physics for Scientists and EngineersRaymond A. SerwayMultiple Editions2787
9Geophysical Fluid DynamicsJoseph PedloskySpringer Publishing,19792613
10Writing Papers in the Biological SciencesVicky McMillanSt. Martin's / Bedford Books,19882551
11AlcibiadesPlutarchMultiple Editions257
12Mathematical Methods in the Physical SciencesMary L. BoasMultiple Editions2455
13Atmosphere-Ocean DynamicsAdrian E. GillElsevier / Academic Pr

In [86]:
# Click a thing on the webpage: the <label> whose "for" attribute is filter10
# (presumably one of the page's filter options - verify in the browser).
await page.locator("label[for='filter10']").click()

In [87]:
# Fresh Playwright session for the next example: start it, launch a visible
# Chromium window, open a tab.
# NOTE(review): none of the cells shown here close the browsers launched
# earlier, so they stay open next to this one - confirm that is intended.
from playwright.async_api import async_playwright

playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)
page = await browser.new_page()

In [88]:
# Open the licensee list page on apps.sd.gov in the new tab.
await page.goto("https://apps.sd.gov/ld17btp/licenseelist.aspx")

<Response url='https://apps.sd.gov/ld17btp/licenseelist.aspx' request=<Request url='https://apps.sd.gov/ld17btp/licenseelist.aspx' method='GET'>>

In [89]:
# First we click the drop down (its arrow element, found by id; judging by the
# id this is the profession dropdown).
await page.locator("#ctl00_ContentPlaceHolder1_ddlProfession_Arrow").click()

In [91]:
# And then select one: the 7th <li> of the list that opened.
# The absolute XPath starts at /html, so it depends on the exact page layout
# and on the order of the options.
await page.locator("xpath=/html/body/form/div[1]/div/div/ul/li[7]").click()

In [92]:
# And then we click the search button.
# The button is addressed by its full path through the nested layout tables,
# so this locator depends on the exact page layout.
await page.locator("xpath=/html/body/form/div[4]/div[2]/table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[4]/td[2]/input[1]").click()


In [93]:
# And download it as a csv
# NOTE(review): this only clicks the button; nothing here says where the
# downloaded file is saved - confirm the file actually ends up where you expect.
await page.locator("xpath=/html/body/form/div[3]/div[2]/table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[4]/td/div/table/thead/tr[1]/td/table/tbody/tr/td[2]/input[2]").click()

In [98]:
# Fresh Playwright session for the next example: start it, launch a visible
# Chromium window, open a tab.
# NOTE(review): none of the cells shown here close the browsers launched
# earlier, so they stay open next to this one - confirm that is intended.
from playwright.async_api import async_playwright

playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)
page = await browser.new_page()

In [99]:
# Open the well search page on www.dmr.nd.gov in the new tab.
await page.goto("https://www.dmr.nd.gov/oilgas/findwellsvw.asp")

<Response url='https://www.dmr.nd.gov/oilgas/findwellsvw.asp' request=<Request url='https://www.dmr.nd.gov/oilgas/findwellsvw.asp' method='GET'>>

In [96]:
# NOTE(review): `url` was last set to the Open Syllabus API address (see the
# Response below), so this navigates away from the well search page. The cell
# numbers suggest it was run out of order - confirm whether it should be kept.
await page.goto(url)

<Response url='https://explorer-api.opensyllabus.org/v1/works.json?size=50&usState=AK' request=<Request url='https://explorer-api.opensyllabus.org/v1/works.json?size=50&usState=AK' method='GET'>>

In [106]:
html = await page.content()
tables = pd.read_html(html)
df = tables[2]

TypeError: read_html() missing 1 required positional argument: 'io'

In [103]:
# The townships we want to search for.
township_ids = [140, 137, 154]
# Try a single one first: pick '154' in the township dropdown.
# The output is the list of option values that ended up selected.
await page.locator("#ddmTownship").select_option('154')

['154']

In [105]:
# Click the first <input> in the form's fourth paragraph - presumably the
# submit button (the loop further down clicks a near-identical path); verify.
await page.locator("xpath=/html/body/table/tbody/tr/td[2]/form/p[4]/input[1]").click()

In [109]:
# make a loop that runs through the process several times and save it as a csv.

township_id = ['140', '137', '154']


for id in township_ids:
    # Select it from the dropdown
    await page.locator("#ddmTownship").select_option(township_id)

    # Click the submit button
    await page.locator("xpath=/html/body/table[1]/tbody/tr/td[2]/form/p[4]/input[1]").click()
    
    # Wait for the table to load
    # because it takes a second, maybe?
    await page.locator('[summary="Well Log search results table"]').wait_for()

    # Save a CSV
    html = await page.content()
    tables = pd.read_html(html)
    df = tables[2]
    filename = f"township-{id}.csv"
    df.to_csv(filename, index=False)