In [2]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [None]:
# Fetch the DMV autonomous-vehicle collision report index page and parse it.
# `base_url` is reused by later cells to resolve the relative PDF links.
base_url = "https://www.dmv.ca.gov"
url = f"{base_url}/portal/vehicle-industry-services/autonomous-vehicles/autonomous-vehicle-collision-reports/"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx response instead of silently parsing an error page,
# which would otherwise only show up later as a missing accordion block.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
def download_pdf(pdf_url, filename, timeout=30):
    """Download the resource at ``pdf_url`` and write it to ``filename``.

    Args:
        pdf_url: Absolute URL of the file to fetch.
        filename: Destination path; the file is opened in binary write mode.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. A failed download is reported with ``print`` rather than raised,
        so a batch loop calling this function keeps going.
    """
    try:
        pdf_response = requests.get(pdf_url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (timeout, DNS, reset) are reported like a bad
        # status code instead of aborting the caller's loop.
        print(f"Failed to download PDF from {pdf_url}: {exc}")
        return

    if pdf_response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(pdf_response.content)
    else:
        print(f"Failed to download PDF from {pdf_url}")

In [None]:
# only download 2023
accordion_block = soup.find('div', {'class': 'accordion-block js-accordion-block', 'id': 'acc-2023'})
if accordion_block is None:
    # soup.find returns None when no element matches; report it instead of
    # crashing with an AttributeError on find_all.
    print("No report section found for id 'acc-2023'")
    links = []
else:
    links = accordion_block.find_all('a', href=True)

for link in links:
    full_url = urljoin(base_url, link['href'])
    # Link text can contain newlines, padding, or path separators; collapse
    # whitespace and replace separators so it is usable as a file name.
    report_name = " ".join(link.text.split()).replace("/", "-").replace("\\", "-")
    if not report_name:
        # A link with no visible text would otherwise be saved as ".pdf".
        continue
    filename = report_name + ".pdf"
    file = os.path.join(os.getcwd(), filename)
    download_pdf(full_url, file)

In [None]:
# to download all reports from 2014-2023, can be time-consuming

for year in range(2014, 2024):
    year_id = 'acc-' + str(year)
    accordion_block = soup.find('div', {'class': 'accordion-block js-accordion-block', 'id': year_id})
    if accordion_block is None:
        # soup.find returns None when no element matches; skip this year
        # rather than aborting the remaining years with an AttributeError.
        print(f"No report section found for id '{year_id}'")
        continue
    links = accordion_block.find_all('a', href=True)

    # One sub-directory per year, created relative to the working directory.
    os.makedirs(year_id, exist_ok=True)
    for link in links:
        full_url = urljoin(base_url, link['href'])
        # Link text can contain newlines, padding, or path separators; collapse
        # whitespace and replace separators so it is usable as a file name.
        report_name = " ".join(link.text.split()).replace("/", "-").replace("\\", "-")
        if not report_name:
            # A link with no visible text would otherwise be saved as ".pdf".
            continue
        filename = report_name + ".pdf"
        file = os.path.join(os.getcwd(), year_id, filename)
        download_pdf(full_url, file)

In [3]:
from PyPDF2 import PdfReader

# Extract the text of the first page of one downloaded report with PyPDF2.
sample_dir = os.path.join(os.getcwd(), 'acc-2023')
pdf_path = os.path.join(sample_dir, 'Cruise July 2, 2023.pdf')
reader = PdfReader(pdf_path)
page = reader.pages[0]
print(page.extract_text())

      
OL 316 (REV. 7/2020)  WWWREPORT OF TRAFFIC COLLISION INVOLVING  
AN AUTONOMOUS VEHICLE
Instructions: Please print within the spaces and boxes on this form. If you need to provide additional information on 
a separate piece of paper(s) or you include a copy of any law enforcement agency report, please check the box to indicate “Additional Information Attached.”
•Write unk (for unknown)  or none  in any space or box when you do not have the information on the other party involved.
•Give insurance information that is complete and which correctly and fully  identifies the company  that issued the insurance
policy or surety bond, or whether there is a certificate of self-insurance.
•Place the National Association of Insurance Commissioners (NAIC) number for your Insurance or Surety Company in the
boxes provided. The NAIC number should be located on the proof of insurance provided by you company or you can contactyour insurer for that information.
•Identify any person involved in the 

In [4]:
from ironpdf import *
# Flatten the sample report with IronPDF and save the result next to the
# original as flatten.pdf.
source_pdf = os.path.join(os.getcwd(), 'acc-2023', 'Cruise July 2, 2023.pdf')
flattened_pdf = os.path.join(os.getcwd(), 'acc-2023','flatten.pdf')
pdf = PdfDocument.FromFile(source_pdf)
pdf.Flatten()
pdf.SaveAs(flattened_pdf)

IronPdf detected root Python package directory of C:\Users\Johnny\anaconda3/IronPdf.Slim
IronPdf will now download dependencies for Windows to C:\Users\Johnny\anaconda3\IronPdf.Native.Chrome.Windows\2023.8.6. If you encounter any issues launching IronPdf, please remove .nupkg files from this directory and try again. Visit https://ironpdf.com/python/docs/ for more information.
Optionally you may set Installation.CustomDeploymentDirectory to a custom directory and manually download IronPdf.Native.Chrome.X and IronSoftware.PdfModel NuGet packages to this directory.


<IronPdf.PdfDocument object at 0x000002199D817A40>

In [5]:
# Re-run the PyPDF2 first-page extraction on the flattened copy.
# `pdf_path` stays a cell-level name because later cells read it.
flattened_dir = os.path.join(os.getcwd(), 'acc-2023')
pdf_path = os.path.join(flattened_dir, 'flatten.pdf')
reader = PdfReader(pdf_path)
page = reader.pages[0]
print(page.extract_text())

      
OL 316 (REV. 7/2020)  WWWREPORT OF TRAFFIC COLLISION INVOLVING  
AN AUTONOMOUS VEHICLE
Instructions: Please print within the spaces and boxes on this form. If you need to provide additional information on 
a separate piece of paper(s) or you include a copy of any law enforcement agency report, please check the box to indicate “Additional Information Attached.”
•Write unk (for unknown)  or none  in any space or box when you do not have the information on the other party involved.
•Give insurance information that is complete and which correctly and fully  identifies the company  that issued the insurance
policy or surety bond, or whether there is a certificate of self-insurance.
•Place the National Association of Insurance Commissioners (NAIC) number for your Insurance or Surety Company in the
boxes provided. The NAIC number should be located on the proof of insurance provided by you company or you can contactyour insurer for that information.
•Identify any person involved in the 

In [69]:
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

# Low-level pdfminer pipeline: every page is interpreted and its text is
# written by the TextConverter into an in-memory buffer.
output_string = StringIO()
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)

# The file has to stay open while pages are processed.
with open(pdf_path, 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

extracted = output_string.getvalue()
print(extracted)

A Public Service Agency

REPORT OF TRAFFIC COLLISION INVOLVING 
AN AUTONOMOUS VEHICLE

DMV USE ONLY

AVT NUMBER

NAME

Instructions: Please print within the spaces and boxes on this form. If you need to provide additional information on 
a  separate  piece  of  paper(s)  or  you  include  a  copy  of  any  law  enforcement  agency  report,  please  check  the  box  to 
indicate “Additional Information Attached.”

• Write unk (for unknown) or none in any space or box when you do not have the information on the other party involved.

• Give insurance information that is complete and which correctly and fully identifies the company that issued the insurance

policy or surety bond, or whether there is a certificate of self-insurance.

•

•

•

Place the National Association of Insurance Commissioners (NAIC) number for your Insurance or Surety Company in the
boxes provided. The NAIC number should be located on the proof of insurance provided by you company or you can contact
your insurer fo

In [71]:
from pdfminer.high_level import extract_text

# Same extraction through pdfminer's one-call high-level helper.
text = extract_text(pdf_file=pdf_path)
print(text)



A Public Service Agency

REPORT OF TRAFFIC COLLISION INVOLVING 
AN AUTONOMOUS VEHICLE

DMV USE ONLY

AVT NUMBER

NAME

Instructions: Please print within the spaces and boxes on this form. If you need to provide additional information on 
a  separate  piece  of  paper(s)  or  you  include  a  copy  of  any  law  enforcement  agency  report,  please  check  the  box  to 
indicate “Additional Information Attached.”

• Write unk (for unknown) or none in any space or box when you do not have the information on the other party involved.

• Give insurance information that is complete and which correctly and fully identifies the company that issued the insurance

policy or surety bond, or whether there is a certificate of self-insurance.

•

•

•

Place the National Association of Insurance Commissioners (NAIC) number for your Insurance or Surety Company in the
boxes provided. The NAIC number should be located on the proof of insurance provided by you company or you can contact
your insurer fo