In [1]:
import requests

url = "https://web3.ncaa.org/aprsearch/gsrsearch"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

response = requests.get(url, headers=headers)

In [2]:
import os
import pandas as pd
import requests
from playwright.async_api import async_playwright

In [3]:
import requests

url = "https://web3.ncaa.org/aprsearch/gsrsearch"

response = requests.get(url)
response
from bs4 import BeautifulSoup


soup_doc = BeautifulSoup(response.content, 'html.parser')
soup_doc.prettify()

'<!DOCTYPE HTML>\n<meta content="IE=edge" http-equiv="X-UA-Compatible"/>\n<html>\n <head>\n  <meta content="25b3cfff-472d-497f-8fb9-041dedb095e5" name="_csrf">\n   <meta content="X-CSRF-TOKEN" name="_csrf_header"/>\n   <title>\n    Graduation Success Rate\n   </title>\n   <link href="/aprsearch/css/bootstrap.min.css" rel="stylesheet"/>\n   <link href="https://fonts.googleapis.com/css?family=Roboto:400,900,700,500" rel="stylesheet" type="text/css"/>\n   <link href="https://fonts.googleapis.com/css?family=Raleway:400,900,700,500" rel="stylesheet" type="text/css"/>\n   <link href="/aprsearch/css/ncaaApps.css" rel="stylesheet"/>\n   <link href="/aprsearch/css/styles.css" rel="stylesheet"/>\n   <script src="/aprsearch/js/modernizr-2.6.2-respond-1.1.0.min.js">\n   </script>\n   <script src="/aprsearch/js/jquery-1.11.3.min.js">\n   </script>\n   <script src="/aprsearch/js/bootstrap.min.js">\n   </script>\n   <script>\n    (window.BOOMR_mq=window.BOOMR_mq||[]).push(["addVar",{"rua.upush":"fals

In [16]:
import os
import re
import pandas as pd
import requests
from playwright.async_api import async_playwright

def clean_filename(text):
    """Remove unsafe characters and format for filenames"""
    return re.sub(r'[\\/*?:"<>|]', "", text.strip().replace(" ", "_"))

async def main():
    os.makedirs("pdfs", exist_ok=True)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        await page.goto("https://web3.ncaa.org/aprsearch/gsrsearch", timeout=60000)

        await page.wait_for_selector('select[name="sportCode"]')
        await page.select_option('select[name="sportCode"]', label="Overall")
        await page.select_option('select[name="cohortYear"]', label="2013-2014")
        await page.wait_for_timeout(1000)
        await page.locator("button:has-text('Search')").click(force=True)

        await page.wait_for_selector("table tbody tr", timeout=20000)

        header_elts = await page.locator("table thead tr th").all_inner_texts()
        headers = [h.strip() for h in header_elts] + ["PDF_URL"]

        all_rows = []
        all_pdf_links = set()
        page_num = 1

        while True:
            print(f"Scraping page {page_num}...")

            row_count = await page.locator("table tbody tr").count()
            rows_before = await page.locator("table tbody tr").all_inner_texts()

            for i in range(row_count):
                row_locator = page.locator("table tbody tr").nth(i)
                cell_count = await row_locator.locator("td").count()
                row_cells = []

                for j in range(cell_count):
                    cell_text = await row_locator.locator("td").nth(j).inner_text()
                    row_cells.append(cell_text.strip())

                school_name = row_cells[0] if len(row_cells) > 0 else "Unknown_School"

                pdf_cell = row_locator.locator("td").nth(cell_count - 1)
                links = await pdf_cell.locator("a").all()
                found_pdf = False
                for link in links:
                    href = await link.get_attribute("href")
                    if href and 'inst' in href:
                        full_url = f"https://web3.ncaa.org{href}"
                        found_pdf = True
                        break

                if found_pdf:
                    row_cells.append(full_url)
                    all_pdf_links.add((school_name, full_url))
                else:
                    row_cells.append("")

                if len(row_cells) == len(headers):
                    all_rows.append(row_cells)

            next_btn = page.locator("text=Next")
            if await next_btn.count() == 0:
                break
            try:
                btn_class = await next_btn.first.get_attribute("class") or ""
            except:
                break
            if "disabled" in btn_class:
                break

            await next_btn.click()
            await page.wait_for_timeout(2000)
            rows_after = await page.locator("table tbody tr").all_inner_texts()

            if rows_after == rows_before:
                print("No new rows detected after clicking Next. Ending loop.")
                break

            page_num += 1

        df = pd.DataFrame(all_rows, columns=headers)
        df.to_csv("gsr_data_2013_2014.csv", index=False)
        print(f"Saved {len(all_rows)} rows to gsr_data_2013_2014.csv")

        await browser.close()

        downloaded = 0
        for school_name, url in all_pdf_links:
            if not url:
                continue
            filename = f"{clean_filename(school_name)}_FSR.pdf"
            path = os.path.join("pdfs", filename)
            try:
                r = requests.get(url)
                r.raise_for_status()
                with open(path, "wb") as f:
                    f.write(r.content)
                print(f"Downloaded: {filename}")
                downloaded += 1
            except Exception as e:
                print(f"Failed to download {filename}: {e}")
        print(f"Total PDFs downloaded: {downloaded}")
        downloaded_school_names = sorted(school for school, _ in all_pdf_links)
        pd.Series(downloaded_school_names).to_csv("pdf_downloaded_schools.csv", index=False, header=False)
        print("Saved list of schools with PDFs to 'pdf_downloaded_schools.csv'")

In [17]:
import nest_asyncio
import asyncio

nest_asyncio.apply()
await main()

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
No new rows detected after clicking Next. Ending loop.
✅ Saved 343 rows to gsr_data_2013_2014.csv
Downloaded: 2007_FSR.pdf
Downloaded: 2007_FSR.pdf
Downloaded: 2007_FSR.pdf
Downloaded: 2007_FSR.pdf
Downloaded: 2007_FSR.pdf
Downloaded: 2007_FSR.pdf
Downloaded: 2007_FSR.pdf
Downloaded: 2007

In [1]:
df = pd.read_csv("gsr_data_2013_2014.csv")
all_schools = set(df.iloc[:, 0].dropna().str.strip())

pdf_schools = set(pd.read_csv("pdf_downloaded_schools.csv", header=None)[0].str.strip())

missing_schools = sorted(all_schools - pdf_schools)
print(f"Total missing schools with no PDF downloaded: {len(missing_schools)}")

df_missing = df[df.iloc[:, 0].str.strip().isin(missing_schools)]
df_missing.to_csv("missing_pdfs_2013_2014.csv", index=False)
print("Saved missing schools to 'missing_pdfs_2013_2014.csv'")

NameError: name 'pd' is not defined

In [2]:
nest_asyncio.apply()

async def fetch_2013_2014_first_page_with_pdfs():
    pdf_rows = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        await page.goto("https://web3.ncaa.org/aprsearch/gsrsearch", timeout=60000)

        await page.wait_for_selector('select[name="sportCode"]', timeout=30000)
        await page.wait_for_selector('select[name="cohortYear"]', timeout=30000)

        await page.select_option('select[name="sportCode"]', label="Overall")
        await page.select_option('select[name="cohortYear"]', label="2013-2014")
        await page.wait_for_timeout(1000)
        await page.click("button:has-text('Search')", force=True)

        await page.wait_for_selector("table tbody tr", timeout=30000)
        await page.wait_for_timeout(5000)

        row_count = await page.locator("table tbody tr").count()

        for i in range(row_count):
            row = page.locator("table tbody tr").nth(i)

            if not await row.is_visible():
                continue

            cells = await row.locator("td").all_inner_texts()
            if len(cells) < 7:
                continue

            cohort_year = cells[0].strip()
            school = cells[1].strip()
            conference = cells[2].strip()
            sport = cells[3].strip()
            state = cells[4].strip()
            gsr = cells[5].strip()
            fgr = cells[6].strip()

            pdf_link_locator = row.locator("td").nth(-1).locator("a[href*='inst2014']")
            pdf_url = ""
            if await pdf_link_locator.count() > 0:
                href = await pdf_link_locator.get_attribute("href")
                pdf_url = f"https://web3.ncaa.org{href}"

            pdf_rows.append({
                "Cohort Year": cohort_year,
                "School": school,
                "Conference": conference,
                "Sport": sport,
                "State": state,
                "GSR": gsr,
                "FGR": fgr,
                "PDF_URL": pdf_url
            })

        await browser.close()

    df = pd.DataFrame(pdf_rows)
    df.to_csv("gsr_data_2013_2014_first_page_with_pdfs.csv", index=False)
    print(f"Saved {len(df)} rows to gsr_data_2013_2014_first_page_with_pdfs.csv")

await fetch_2013_2014_first_page_with_pdfs()

NameError: name 'nest_asyncio' is not defined