# BoardDocs Crawl

Let's figure out how to crawl BoardDocs!

We'll try the Redwood City School District site using BeautifulSoup.

https://go.boarddocs.com/ca/redwood/Board.nsf/Public

In [2]:
# A single BoardDocs site can host several committees, so we choose the one to index.
# Example: the RCSD Board of Trustees is committee A4EP6J588C05 on the ca/redwood site.

committeeID = "A4EP6J588C05"
site = "ca/redwood"

In [3]:
# Fetch the list of meetings for the chosen committee, using the requests module.

import requests

# Build the BoardDocs endpoints from the site selected above.
baseURL = "https://go.boarddocs.com/" + site + "/Board.nsf"
publicURL = baseURL + "/Public"
meetingsListURL = baseURL + "/BD-GetMeetingsList?open"

# Browser-like headers; per the original note, the server needs these to answer.
headers = {
    "accept": "application/json, text/javascript, */*; q=0.01",
    "accept-language": "en-US,en;q=0.9",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "sec-ch-ua": '"Google Chrome";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "x-requested-with": "XMLHttpRequest",
}

# Form-encoded request body that selects the committee.
data = "current_committee_id=" + committeeID

# POST the request. requests applies no timeout by default, so set one (in seconds)
# to keep an unresponsive server from hanging the notebook.
response = requests.post(meetingsListURL, headers=headers, data=data, timeout=30)

print("Status returned by meetings list request:", response.status_code)

# Stop on a 4xx/5xx reply rather than handing an error page to the JSON parsing step.
response.raise_for_status()

Status returned by meetings list request: 200


In [4]:
# Decode the JSON reply and keep only the fields we need for each meeting.

# The body is a JSON array with one object per meeting, shaped like this:
# [{"unique": "CPSNV9612DF1",
#  "name": "Board of Trustees Regular Meeting - 7:00pm (Closed Session at 6:15 PM)",
#  "current": "1",
#  "preliveoak": "",
#  "numberdate": "20230510",
#  "unid": "BE4CAA121D6BFD458525896E00612DF1"},

# To inspect the raw payload, uncomment:
# print(response.text)

import json

meetingsData = json.loads(response.text)

# Each pair is (key in our record, key in the BoardDocs payload); a key missing
# from the payload yields None.
meetings = [
    {
        outKey: entry.get(srcKey)
        for outKey, srcKey in (
            ("meetingID", "unique"),
            ("date", "numberdate"),
            ("unid", "unid"),
        )
    }
    for entry in meetingsData
]

print(str(len(meetings)) + " meetings found")

278 meetings found


In [None]:
# Here's an alternate approach: there's apparently an XML feed.

import xml.etree.ElementTree as ET

xmlMeetingListURL = baseURL + "/XML-ActiveMeetings"

# requests applies no timeout by default; set one (in seconds) so a stalled
# server cannot hang the notebook, and fail loudly on a 4xx/5xx reply.
xmlMeetingListData = requests.get(xmlMeetingListURL, timeout=30)
xmlMeetingListData.raise_for_status()

# ET.fromstring() parses XML text or bytes, not a requests.Response object, so
# pass the response body. The raw bytes are used so the XML parser can apply
# whatever encoding the document itself declares.
xmlMeetingList = ET.fromstring(xmlMeetingListData.content)

# The returned XML document is in this form:

# <meetings>
# <meeting bodyid="A4EP6J588C05" bodyname="Board of Trustees" id="C55TDQ76E688" order="1">
# <name>Board of Trustees Regular Meeting - 7:00pm</name>
# <start>
# <date format="yyyy-mm-dd">2021-08-11</date>
# <english>
# <weekday>Wednesday</weekday>
# <date>August 11, 2021</date>
# </english>
# </start>
# <description>Please click the video link above to access the regular board meeting EDUCATING EVERY CHILD FOR SUCCESS REDWOOD CITY SCHOOL DISTRICT BOARD OF EDUCATION REGULAR MEETING WEDNESDAY, AUGUST 11, 2021 AT 7:00pm TELECONFERENCE MEETING https://rcsdk8-net.zoom.us/s/86849531859 (to participate in the Regular Board Meeting) US : +1 669 900 6833 or +1 346 248 7799 or +1 301 715 8592 or +1 312 626 6799 or +1 929 436 2866 or +1 253 215 8782 Webinar ID: 868 4953 1859 Password: rcsdbot Backup Password: 0863523 (to listen to the Regular Board Meeting) TELECONFERENCE NOTIFICATION for the REGULAR BOARD MEETING In light of the current Public Health Emergency and consistent with the Governor&#8217s recent order suspending some of the Brown Act&#8217s teleconferencing requirements, the Board will be holding its August 11th regular meeting by teleconference. The Board invites the public to join the open session portion of the meeting and offer public comment via Zoom. Additionally, the meeting will be recorded and staff will be available to receive real-time comments via the links below. Comments received during the open session of the meeting will be shared publicly during the meeting: ENGLISH https://docs.google.com/forms/d/e/1FAIpQLSexN3rAtNYJrhCjKT0s9AG__Eq0-_iAUFPI6ID3Mo0Jn8yeGA/viewform?usp=sf_link SPANISH https://docs.google.com/forms/d/e/1FAIpQLScMO3Wo8kjGmJF7KNhihQqanOLfzfoyQ7IT904jU9QtFFF28Q/viewform?usp=sf_link If you require Spanish interpretation please call: 978-990-5137 and press 8377041# for the password. Si requiere interpretaci&#243n al espa&#241ol por favor llame al: 978-990-5137 y presione 8377041# para la contrase&#241a. If you need special assistance or a modification due to a disability (including auxiliary aids or services) to participate in this meeting, please contact Eliana Garc&#237a at egarcia@rcsdk8.net at least 48 hours in advance of the meeting and we will make our best efforts to accommodate.</description>
# <link>http://go.boarddocs.com/ca/redwood/Board.nsf/goto?open&id=C55TDQ76E688</link>
# <category id="C55TDR76E689" order="1">
# <name>1. Call to Order</name>
# <agendaitems>
# <item id="C55TDS76E68A" order="1">
# <name>1.1 Roll Call</name>
# <link>http://go.boarddocs.com/ca/redwood/Board.nsf/goto?open&id=C55TDS76E68A</link>
# <actiontype>Procedural</actiontype>
# </item>
# </agendaitems>
# </category>

In [17]:
# Ah HA! The detailed "print" agenda has all the info we want - and links to the PDFs!

import html2text
from bs4 import BeautifulSoup

detailedMeetingAgendaURL = baseURL + "/PRINT-AgendaDetailed"

# Hard-coded example meeting (the same ID appears in the sample JSON reply
# documented in the meetings-list parsing cell).
meetingID = "CPSNV9612DF1"

# set the meetingID & committee
data = "id=" + meetingID + "&" + "current_committee_id=" + committeeID

# POST the request. requests applies no timeout by default, so set one (in seconds)
# to keep an unresponsive server from hanging the notebook.
response = requests.post(
    detailedMeetingAgendaURL, headers=headers, data=data, timeout=30
)

print("Status returned by detailed agenda fetch request:", response.status_code)

# Stop on a 4xx/5xx reply rather than parsing an error page as an agenda.
response.raise_for_status()

# parse the returned HTML
soup = BeautifulSoup(response.content, "html.parser")

# find() returns None when no matching div exists, so guard before reading
# .string; a missing date/title becomes None instead of an AttributeError.
agendaDateDiv = soup.find("div", {"class": "print-meeting-date"})
agendaDate = agendaDateDiv.string if agendaDateDiv is not None else None
agendaTitleDiv = soup.find("div", {"class": "print-meeting-name"})
agendaTitle = agendaTitleDiv.string if agendaTitleDiv is not None else None

# Collect the link of the first anchor in each "public-file" div, skipping any
# div that has no anchor or whose anchor has no href.
agendaFiles = [
    fileDiv.a.get("href")
    for fileDiv in soup.find_all("div", {"class": "public-file"})
    if fileDiv.a is not None and fileDiv.a.get("href")
]

# Text rendering of the whole agenda page, produced by html2text.
agendaData = html2text.html2text(response.text)

print("Agenda Title:", agendaTitle)
print("Agenda Date:", agendaDate)
print("Number of Files:", len(agendaFiles))

print(agendaFiles)

Status returned by detailed agenda fetch request: 200
Agenda Title: Board of Trustees Regular Meeting - 7:00pm (Closed Session at 6:15 PM)
Agenda Date: Wednesday, May 10, 2023
Number of Files: 33
['/ca/redwood/Board.nsf/files/CRAQFV6923F8/$file/230510%20RCSD%20%2420k%20and%20Under%20Tracker%20FY%2022-23.pdf', '/ca/redwood/Board.nsf/files/CRASSK741766/$file/230510%20RCSD%20GA%20Bid%20Package%20D%20CO%20No.%2014%20Package.pdf', '/ca/redwood/Board.nsf/files/CRATNB7827AD/$file/230510%20RCSD%20GA%20Bid%20Package%20G%20CO%20No.%2016%20Package.pdf', '/ca/redwood/Board.nsf/files/CR9SWS74B531/$file/01-118012_Invoice_01-13356_2023-04-18.pdf', '/ca/redwood/Board.nsf/files/CRFNZ4615266/$file/3250%20BP_AR%20Transportation%20Fees.pdf', '/ca/redwood/Board.nsf/files/CRFP8N62304A/$file/3540%20BP%20Transportation.pdf', '/ca/redwood/Board.nsf/files/CRFPGE63E9A7/$file/3555%20BP_E%20Nutrition%20Program%20Compliance.pdf', '/ca/redwood/Board.nsf/files/CRFPM964FB8C/$file/4030%20BP_AR%20Nondiscrimination%20in%

In [12]:
# Walk the meetings list and show each meeting's ID, one per line.
# (No agenda is fetched here; this only lists the IDs.)

for meeting in iter(meetings):
    idToShow = meeting["meetingID"]
    print(idToShow)

CPSNV9612DF1
CPNUPZ7B7D09
CQ7TPZ78313B
CR2MCR59EE37
CNUN245B80D7
CNCQ2F663B8C
CPWNM5605E00
CNCPQY64EE36
CMSTNT783963
CMSTML77B689
CN9V837F7242
CMZR4H6C2928
CMBPD95DF6DB
CKYUYU7E62A8
CLLPZT5E8971
CKJKSG533AF1
CKHSER725DEA
CK4PBG638FA6
CJYTL8775FA8
CJANRA6126F9
CK6PAK62FF2D
CK6N565C9EB6
CJ2S33686A4D
CHKLWM588244
CHEM3K58E555
CHEMVQ5D1F0F
CH4UY57E3BD1
CFLT9N7492F3
CFFTMD7567B0
CF8Q7X66C51F
CETRFZ6DD9CE
CF7TF6771C58
CEPKKH523FEC
CEBNMZ5DAC30
CDWQH3694A8D
CDARDL6D82AB
CDFKEW510C6E
CCSN6X5E7859
CCMRJT6E4626
CC5UYY7E6893
CBJQLT6911AB
CBATCX765D01
CAYM47593BD6
CAFRFB6D7A83
CABM9357C659
CACUCV7B77BB
C9BVZ5831E3D
C8SP2G6169F1
C8FTNP72595E
C8MQ92681B5B
C87LTS552926
C7XVCJ801ABC
C7KUF87BCE71
C72NJ46017D1
C75M5L592D5D
C6GTZ9796118
C6DRX2700FAB
C63URL79A65D
C66PAR62DFB1
C5LNS66103E7
C55TDQ76E688
CRN7DG191DCC
CRN63A12EF28
CRP2ZC7DEDD9
CRM2R703650F
CRM2YY0488C9
CRJ2SA01B8F1
CRLUJK7C4CE2
CRJ2QE00512B
CRH24J005DC4
CRKVVW82A567
CRFVN48180D5
CRE4XS0DBC93
CRE4S90CEC88
CRDUU67DB46C
CQNLT957DAEE
CRAUSP7B7A9A