In [2]:
import requests

url = "https://web3.ncaa.org/aprsearch/gsrsearch"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

response = requests.get(url, headers=headers)

In [3]:
import os
import pandas as pd
import requests
from playwright.async_api import async_playwright

In [4]:
import re

def clean_filename(text):
    """Remove unsafe characters and format for filenames"""
    return re.sub(r'[\\/*?:"<>|]', "", text.strip().replace(" ", "_"))

async def main():
    os.makedirs("pdfs", exist_ok=True)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        await page.goto("https://web3.ncaa.org/aprsearch/asrsearch", timeout=60000)
        
        await page.wait_for_selector('select[name="sportCode"]')
        await page.select_option('select[name="sportCode"]', label="Overall")
        await page.select_option('select[name="cohortYear"]', label="2023-2024")
        await page.wait_for_timeout(1000)
        await page.locator("button:has-text('Search')").click(force=True)

        await page.wait_for_selector("table tbody tr", timeout=20000)

        header_elts = await page.locator("table thead tr th").all_inner_texts()
        headers = [h.strip() for h in header_elts] + ["PDF_URL"]

        all_rows = []
        all_pdf_links = set()
        page_num = 1

        while True:
            print(f"Scraping page {page_num}...")

            row_count = await page.locator("table tbody tr").count()

            for i in range(row_count):
                row_locator = page.locator("table tbody tr").nth(i)
                cell_count = await row_locator.locator("td").count()
                row_cells = []

                for j in range(cell_count):
                    cell_text = await row_locator.locator("td").nth(j).inner_text()
                    row_cells.append(cell_text.strip())

                pdf_cell = row_locator.locator("td").nth(cell_count - 1)
                pdf_link_locator = pdf_cell.locator("a[href*='inst2024']")
                if await pdf_link_locator.count() > 0:
                    pdf_href = await pdf_link_locator.get_attribute("href")
                    full_url = f"https://web3.ncaa.org{pdf_href}"
                    row_cells.append(full_url)
                    all_pdf_links.add((row_cells[0], full_url))
                else:
                    row_cells.append("")

                if len(row_cells) == len(headers):
                    all_rows.append(row_cells)

            next_btn = page.locator("text=Next")
            if await next_btn.count() == 0:
                break
            try:
                btn_class = await next_btn.first.get_attribute("class") or ""
            except:
                break
            if "disabled" in btn_class:
                break

            first_row_before = await page.locator("table tbody tr").nth(0).inner_text()
            await next_btn.click()
            await page.wait_for_timeout(1500)
            first_row_after = await page.locator("table tbody tr").nth(0).inner_text()

            if first_row_after == first_row_before:
                break
            page_num += 1

        df = pd.DataFrame(all_rows, columns=headers)
        df.to_csv("div2_data_2023_2024.csv", index=False)
        print(f"\n Saved {len(all_rows)} rows to div2_data_2023_2024.csv")

        await browser.close()

        downloaded = 0
        for school_name, url in all_pdf_links:
            if not url:
                continue

            filename = f"{clean_filename(school_name)}_FSR.pdf"
            path = os.path.join("pdfs", filename)

            try:
                r = requests.get(url)
                r.raise_for_status()
                with open(path, "wb") as f:
                    f.write(r.content)
                print(f"Downloaded: {filename}")
                downloaded += 1
            except Exception as e:
                print(f"Failed to download {filename}: {e}")

        print(f"Total PDFs downloaded: {downloaded}")

await main()

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...

 Saved 303 rows to div2_data_2023_2024.csv
Failed to download 2017_FSR.pdf: 404 Client Error: Not Found for url: https://web3.ncaa.org/aprsearch/public_reports/inst2024/8744.pdf?v=1755023136420
Downloaded: 2017_FSR.pdf
Downloaded: 2017_FSR.pdf
Failed to download 2017_FSR.pdf: 404 Client Error: Not Found for url: https://web3.ncaa.org/aprsearch/public_reports/inst202

In [None]:
#For the first page that did not load the first time

In [8]:
import pandas as pd
import nest_asyncio
import asyncio
from playwright.async_api import async_playwright

nest_asyncio.apply()

async def fetch_div2_asr_first_page(csv_path="div2_asr_2023_2024_first_page_with_pdfs.csv"):
    rows = []
    pdf_links = {}

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        await page.goto("https://web3.ncaa.org/aprsearch/asrsearch", timeout=60000)

        await page.wait_for_selector('select[name="sportCode"]', timeout=30000)
        await page.wait_for_selector('select[name="cohortYear"]', timeout=30000)

        await page.select_option('select[name="sportCode"]', label="Overall")
        await page.select_option('select[name="cohortYear"]', label="2023-2024")

        await page.wait_for_timeout(1000)
        await page.click("button:has-text('Search')", force=True)

        await page.wait_for_selector("table tbody tr", timeout=20000)
        await page.wait_for_timeout(2000)

        row_count = await page.locator("table tbody tr").count()

        for i in range(row_count):
            row = page.locator("table tbody tr").nth(i)
            cells = row.locator("td")
            cell_texts = []
            for j in range(await cells.count()):
                cell_text = await cells.nth(j).inner_text()
                cell_texts.append(cell_text.strip())

            pdf_link_locator = cells.nth(-1).locator("a[href*='inst2024']")
            if await pdf_link_locator.count() > 0:
                href = await pdf_link_locator.get_attribute("href")
                full_url = f"https://web3.ncaa.org{href}"
            else:
                full_url = ""

            cell_texts.append(full_url)
            rows.append(cell_texts)

        await browser.close()

    headers = [
        "Cohort Year", "School", "Conference", "Sport", "State", 
        "ASR", "FGR", "ASR Report", "FGR Report", "PDF_URL"
    ]

    df = pd.DataFrame(rows, columns=headers)
    df.to_csv(csv_path, index=False)
    print(f"Saved to {csv_path}")

await fetch_div2_asr_first_page()

CancelledError: 