# Install & core imports

In [10]:
!pip install -q requests beautifulsoup4 reportlab lxml

import os, re, sys, time, random, logging, html, textwrap
from pathlib import Path
from urllib.parse import urljoin, urlparse
import urllib.robotparser as robotparser
import dataclasses as dc
from typing import List, Iterable, Optional

import requests
from bs4 import BeautifulSoup, Tag

# --- ReportLab pieces for PDF ---
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Constants & logging

In [11]:
BASE_LIST_URL = "https://legacy.baseballprospectus.com/prospects/eyewitness.php"
USER_AGENT    = "EyewitnessScoutBot/0.1 (+https://example.com)"

DEFAULT_DELAY_RANGE = (1.0, 3.0)   # polite crawl delay (seconds)
OUTPUT_ROOT         = Path("reports")
OUTPUT_ROOT.mkdir(exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s  %(levelname)-8s %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log = logging.getLogger("eyewitness")


#  Helpers: session, robots check, dataclass


In [12]:
class RespectfulSession(requests.Session):
    """Requests session that respects delays & retries."""
    def __init__(self, delay_range=DEFAULT_DELAY_RANGE):
        super().__init__()
        self.delay_range = delay_range
        self.headers.update({"User-Agent": USER_AGENT})
        adapter = requests.adapters.HTTPAdapter(
            max_retries=requests.adapters.Retry(
                total=5, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504]
            )
        )
        self.mount("http://", adapter); self.mount("https://", adapter)

    def get(self, url, **kw):
        resp = super().get(url, timeout=30, **kw)
        time.sleep(random.uniform(*self.delay_range))
        return resp


def allowed_by_robots(url: str, ua: str = USER_AGENT) -> bool:
    parsed = urlparse(url)
    rp = robotparser.RobotFileParser()
    rp.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    try:
        rp.read()
        return rp.can_fetch(ua, url)
    except Exception as e:
        log.warning("robots.txt check failed (%s) – assuming allowed", e)
        return True  # fallback


@dc.dataclass(slots=True)
class ReportMeta:
    url: str
    player: str
    position: Optional[str] = None
    evaluator: Optional[str] = None
    report_date: Optional[str] = None
    ofp: Optional[str] = None
    org: Optional[str] = None
    body_html: Optional[str] = None

    def pdf_path(self) -> Path:
        clean = lambda s: re.sub(r"[^A-Za-z0-9_]+", "_", s).strip("_")
        name  = f"{clean(self.player)}_{clean(self.org or 'NA')}_{self.report_date or 'undated'}.pdf"
        year  = (self.report_date or "unknown").split("/")[-1]
        out_dir = OUTPUT_ROOT / year
        out_dir.mkdir(exist_ok=True, parents=True)
        return out_dir / name


# Discover report links

In [13]:
def collect_report_links(session: requests.Session, limit: Optional[int] = None) -> List[ReportMeta]:
    """Scrape the index table and return list of ReportMeta (header fields pre-filled)."""
    log.info("Fetching index…")
    soup = BeautifulSoup(session.get(BASE_LIST_URL).text, "lxml")

    links: List[ReportMeta] = []
    for a in soup.select("a[href^=eyewitness_]"):
        row = a.find_parent("tr")
        cells = row.find_all("td") if row else []
        links.append(
            ReportMeta(
                url=urljoin(BASE_LIST_URL, a["href"]),
                player=a.get_text(strip=True),
                position=cells[1].get_text(strip=True) if len(cells) > 1 else None,
                evaluator=cells[2].get_text(strip=True) if len(cells) > 2 else None,
                report_date=cells[3].get_text(strip=True) if len(cells) > 3 else None,
                ofp=cells[4].get_text(strip=True) if len(cells) > 4 else None,
            )
        )
    log.info("Found %d reports", len(links))
    return links[:limit] if limit else links


# Fetch details & generate PDF

In [None]:
# --- PDF helper (ReportLab version) ---
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
import textwrap, html, re

def fetch_report(meta: ReportMeta, session: requests.Session) -> None:
    if not allowed_by_robots(meta.url):
        log.warning("Skipping disallowed URL: %s", meta.url)
        return
    soup = BeautifulSoup(session.get(meta.url).text, "lxml")

    # org from Affiliate line
    aff = soup.find(string=re.compile(r"Affiliate"))
    if aff and (m := re.search(r"Affiliate\s+\([^,]+,\s*([^)]+)\)", aff)):
        meta.org = m.group(1).strip()

    # crude start-stop extraction
    start = soup.find(string=re.compile(re.escape(meta.player)))
    body_bits = []
    for el in start.parent.next_siblings:  # type: ignore
        if isinstance(el, Tag) and el.find(string=re.compile(r"Terms of Service")):
            break
        body_bits.append(str(el))
    meta.body_html = "".join(body_bits) or soup.prettify()

def render_html(meta: ReportMeta) -> str:
    header = (
        f"<h1>{html.escape(meta.player)}</h1>"
        f"<h3>{html.escape(meta.position or '')} | {html.escape(meta.org or '')}</h3>"
        f"<p><b>Evaluator:</b> {html.escape(meta.evaluator or 'N/A')}<br>"
        f"<b>Date:</b> {html.escape(meta.report_date or 'N/A')}  "
        f"<b>OFP:</b> {html.escape(meta.ofp or 'N/A')}</p><hr>"
    )
    css = """
    body { font-family: Helvetica, Arial, sans-serif; margin: 1in; }
    h1 { margin: 0; font-size: 26pt; }
    h3 { margin-top: .2em; color:#555; }
    table { border-collapse: collapse; width:100%; margin-top:1em; }
    th,td { border:1px solid #ccc; padding:4px 6px; font-size:10pt; }
    th { background:#f0f0f0; }
    """
    return f"""<!doctype html><html><head><meta charset='utf-8'><style>{css}</style></head>
<body>{header}{meta.body_html}</body></html>"""

def strip_tags(raw_html: str) -> str:
    """Very simple HTML→text converter — good enough for plain reports."""
    return html.unescape(re.sub(r"<[^>]+>", "", raw_html).replace("\xa0", " "))

def save_pdf(meta: ReportMeta) -> None:
    text = strip_tags(render_html(meta))
    out_path = meta.pdf_path()

    c = canvas.Canvas(str(out_path), pagesize=letter)
    width, height = letter
    margin = 0.75 * inch
    y = height - margin
    line_h = 11

    for line in textwrap.wrap(text, 95):
        if y < margin:          # new page
            c.showPage()
            y = height - margin
        c.drawString(margin, y, line)
        y -= line_h

    c.save()
    log.info("✓  %s (ReportLab)", out_path)


In [18]:
import re, types, certifi

def _clean(s: str) -> str:
    """Keep alphanumerics/underscore, collapse everything else to '_'."""
    return re.sub(r"[^A-Za-z0-9_]+", "_", s).strip("_")

def safe_pdf_path(self):
    safe_date = _clean(self.report_date or "undated")
    name      = f"{_clean(self.player)}_{_clean(self.org or 'NA')}_{safe_date}.pdf"
    year      = (self.report_date or "unknown").split("/")[-1]
    out_dir   = OUTPUT_ROOT / year
    out_dir.mkdir(parents=True, exist_ok=True)
    return out_dir / name

# Hot-patch the class defined earlier
ReportMeta.pdf_path = types.MethodType(safe_pdf_path, ReportMeta)


In [19]:
# Set how many recent reports to grab (None = all)
LATEST = 5

session = RespectfulSession()
for meta in collect_report_links(session, limit=LATEST):
    try:
        fetch_report(meta, session)
        if meta.body_html:
            save_pdf(meta)
        else:
            log.warning("No body for %s", meta.url)
    except Exception as e:
        log.exception("Failed %s: %s", meta.player, e)

print(f"\nDone. PDFs are in {OUTPUT_ROOT.resolve()}")


2025-07-09 17:30:21,718  INFO     Fetching index…
2025-07-09 17:30:23,074  INFO     Found 1180 reports
2025-07-09 17:30:25,395  ERROR    Failed CJ Abrams: expected string or bytes-like object, got 'member_descriptor'
Traceback (most recent call last):
  File "/var/folders/qw/rynbsm9n3yvb9l_bjh7rchgh0000gn/T/ipykernel_87241/2863431571.py", line 9, in <module>
    save_pdf(meta)
    ~~~~~~~~^^^^^^
  File "/var/folders/qw/rynbsm9n3yvb9l_bjh7rchgh0000gn/T/ipykernel_87241/2495931425.py", line 13, in save_pdf
    out_path = meta.pdf_path()
  File "/var/folders/qw/rynbsm9n3yvb9l_bjh7rchgh0000gn/T/ipykernel_87241/1328411676.py", line 8, in safe_pdf_path
    safe_date = _clean(self.report_date or "undated")
  File "/var/folders/qw/rynbsm9n3yvb9l_bjh7rchgh0000gn/T/ipykernel_87241/1328411676.py", line 5, in _clean
    return re.sub(r"[^A-Za-z0-9_]+", "_", s).strip("_")
           ~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/re/__init_