In [2]:
# Notebook-friendly HigherEdJobs Wayback scraper (no argparse)
# pip install -q requests beautifulsoup4 lxml

from __future__ import annotations
import csv, json, re, time
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Dict, Iterable, List, Optional, Tuple
from urllib.parse import urlsplit, parse_qs

import requests
from bs4 import BeautifulSoup

# URL that cdx_query() sends its GET request to (Wayback Machine capture index).
CDX_ENDPOINT = "https://web.archive.org/cdx/search/cdx"
# Base used by build_wayback_url() to form "<prefix>/<timestamp>id_/<original>" links.
WAYBACK_PREFIX = "https://web.archive.org/web"

# URL prefixes that scrape_higheredjobs() submits to the CDX index one by one
# (matchType=prefix). Both https and http spellings of each path are listed.
# NOTE(review): the CDX index may already treat the http/https variants as the
# same key - confirm; any repeated captures are skipped by job id regardless.
DETAIL_PATTERNS = [
    "https://www.higheredjobs.com/details.cfm",
    "https://www.higheredjobs.com/faculty/details.cfm",
    "https://www.higheredjobs.com/admin/details.cfm",
    "https://www.higheredjobs.com/executive/details.cfm",
    "http://www.higheredjobs.com/details.cfm",
    "http://www.higheredjobs.com/faculty/details.cfm",
    "http://www.higheredjobs.com/admin/details.cfm",
    "http://www.higheredjobs.com/executive/details.cfm",
]

# Headers sent with every request (installed on the session and passed to the
# CDX call). The contact address is a placeholder - replace before real use.
HEADERS = {
    "User-Agent": "MonkieResearch/1.0 (+https://monkie.ai; research use; contact: you@example.com)"
}
# Pause, in seconds, between successive CDX pages and before each snapshot fetch.
REQUEST_DELAY_SEC = 0.25  # be polite

@dataclass
class JobPosting:
    """One archived job-detail page together with the fields parsed from it.

    Instances are created by scrape_higheredjobs() and serialised both as CSV
    rows and, via dataclasses.asdict(), as NDJSON lines.
    """

    # Job code taken from the URL query string, or a synthetic "NOJOBCODE-..."
    # id when the URL carries none; used as the de-duplication key.
    job_id: Optional[str]
    # URL of the page as originally captured (the CDX "original" column).
    original_url: str
    # Capture time: the CDX timestamp re-rendered with datetime.isoformat().
    wayback_timestamp: str
    # Wayback Machine URL the HTML was downloaded from.
    wayback_url: str
    # Label derived from the URL path: faculty / administrative / executive / other.
    category: Optional[str]
    # Page title (from <h1>, the og:title meta tag, or <title>).
    title: Optional[str]
    # Values found next to labelled fields in the page; None when not found.
    institution: Optional[str]
    location: Optional[str]
    # Posting date exactly as it appears in the page (not parsed into a date).
    posted_date_str: Optional[str]
    # Whitespace-normalised text of the page's largest content block.
    description_text: Optional[str]

def parse_iso_date(date_str: str) -> str:
    """Normalise a full or partial date to the ``YYYYMMDD`` form used by CDX.

    Accepted spellings are ``YYYY-MM-DD``, ``YYYY/MM/DD``, ``YYYYMMDD``,
    ``YYYY-MM``, ``YYYYMM`` and ``YYYY``. One-digit month/day values are
    accepted where separators are present (e.g. ``2023-8-30``). A missing
    month or day resolves to 1, which is what ``strptime`` supplies for
    fields absent from the format.

    Args:
        date_str: The date text to normalise.

    Returns:
        The date as an eight-digit ``YYYYMMDD`` string.

    Raises:
        ValueError: If ``date_str`` matches none of the accepted spellings.
    """
    if len(date_str) == 6 and date_str.isdigit():
        # Six bare digits always mean YYYYMM. Because strptime accepts
        # one-digit month and day values, "%Y%m%d" would otherwise read
        # "202311" as 2023-01-01 instead of November 2023.
        candidates = ("%Y%m",)
    else:
        candidates = ("%Y-%m-%d", "%Y/%m/%d", "%Y%m%d", "%Y-%m", "%Y%m", "%Y")

    for fmt in candidates:
        try:
            parsed = datetime.strptime(date_str, fmt)
        except ValueError:
            continue
        # No length-based day/month reset is needed: strptime already fills
        # in 1 for missing fields, and resetting by string length used to
        # discard the real day of a 7-digit compact date.
        return parsed.strftime("%Y%m%d")

    raise ValueError(f"Cannot parse date: {date_str}")

def cdx_query(session: requests.Session, url_prefix: str, from_ymd: str, to_ymd: str,
              limit: int = 2000, offset: int = 0) -> List[Dict[str, str]]:
    """Fetch one page of capture records from the Wayback CDX index.

    Only captures with status 200 and mimetype text/html whose URL starts
    with ``url_prefix`` and whose timestamp lies in ``from_ymd``..``to_ymd``
    are requested.

    Args:
        session: Session used to issue the GET request.
        url_prefix: URL prefix to match.
        from_ymd: Lower date bound, passed as the ``from`` parameter.
        to_ymd: Upper date bound, passed as the ``to`` parameter.
        limit: Maximum number of rows requested for this page.
        offset: Number of rows to skip.

    Returns:
        One dict per record, keyed by the column names from the response's
        first row. Rows whose length differs from the header are dropped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    query = {
        "url": url_prefix,
        "matchType": "prefix",
        "from": from_ymd,
        "to": to_ymd,
        "output": "json",
        "fl": "timestamp,original,mimetype,statuscode,digest,length",
        "filter": ["statuscode:200", "mimetype:text/html"],
        "limit": str(limit),
        "offset": str(offset),
    }
    response = session.get(CDX_ENDPOINT, params=query, headers=HEADERS, timeout=30)
    response.raise_for_status()

    payload = response.json()
    if not payload:
        return []

    # The first row carries the column names; every later row is one capture.
    columns, *records = payload
    width = len(columns)
    return [dict(zip(columns, record)) for record in records if len(record) == width]

def iter_all_cdx(session: requests.Session, url_prefix: str, from_ymd: str, to_ymd: str,
                 max_records: Optional[int]) -> Iterable[Dict[str, str]]:
    """Yield CDX records for ``url_prefix``, paging through the index.

    Pages of 2000 rows are requested through cdx_query() with a growing
    offset until a page comes back empty or ``max_records`` rows have been
    yielded. The generator sleeps REQUEST_DELAY_SEC between pages.

    Args:
        session: Session passed through to cdx_query().
        url_prefix: URL prefix to match.
        from_ymd: Lower date bound.
        to_ymd: Upper date bound.
        max_records: Upper bound on yielded rows, or None for no bound.

    Yields:
        One dict per capture record.
    """
    page_size = 2000
    yielded = 0
    offset = 0
    while max_records is None or yielded < max_records:
        page = cdx_query(session, url_prefix, from_ymd, to_ymd,
                         limit=page_size, offset=offset)
        if not page:
            return
        for record in page:
            yield record
            yielded += 1
            if max_records is not None and yielded >= max_records:
                return
        offset += page_size
        time.sleep(REQUEST_DELAY_SEC)

def build_wayback_url(timestamp: str, original: str) -> str:
    """Return the Wayback URL for ``original`` as captured at ``timestamp``.

    The timestamp is suffixed with ``id_`` before the original URL is appended.
    """
    return "{}/{}id_/{}".format(WAYBACK_PREFIX, timestamp, original)

def categorize_from_path(original_url: str) -> Optional[str]:
    """Classify a job URL by the section directory found in its path.

    The path is compared case-insensitively; the query string is ignored.
    Returns "faculty", "administrative" or "executive" for the first matching
    section (checked in that order), and "other" when none matches.
    """
    lowered_path = urlsplit(original_url).path.lower()
    sections = (
        ("/faculty/", "faculty"),
        ("/admin/", "administrative"),
        ("/executive/", "executive"),
    )
    for marker, label in sections:
        if marker in lowered_path:
            return label
    return "other"

def extract_jobcode(original_url: str) -> Optional[str]:
    """Return the job identifier carried in the URL's query string.

    The exact spellings ``JobCode``, ``jobcode``, ``JobID`` and ``jobid`` are
    tried first, in that order. If none is present, the query is searched
    again ignoring case so that variants such as ``JOBCODE`` or ``JobId``
    are recognised as well; a job-code parameter still wins over a job-id one.

    Args:
        original_url: The captured URL.

    Returns:
        The first value of the matching parameter, or None when the URL has
        no such parameter (parameters with blank values are not reported by
        ``parse_qs`` and therefore count as absent).
    """
    qs = parse_qs(urlsplit(original_url).query)

    # Exact spellings keep their historical priority.
    for key in ("JobCode", "jobcode", "JobID", "jobid"):
        values = qs.get(key)
        if values:
            return values[0]

    # Fall back to a case-insensitive match; an unrecognised casing would
    # otherwise leave the posting without a real id for de-duplication.
    for wanted in ("jobcode", "jobid"):
        for key, values in qs.items():
            if key.lower() == wanted and values:
                return values[0]
    return None

def clean(txt: Optional[str]) -> Optional[str]:
    """Collapse every run of whitespace to one space and trim both ends.

    Returns None for None or an empty string. Text made only of whitespace
    comes back as an empty string.
    """
    if txt:
        collapsed = re.sub(r"\s+", " ", txt)
        return collapsed.strip()
    return None

def parse_job_html(html: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """Extract headline fields from a job-detail page.

    Args:
        html: Raw HTML of the page.

    Returns:
        A ``(title, institution, location, posted_date_str)`` tuple. Each item
        is None when it could not be found. The title comes from the first
        non-empty of ``<h1>``, the ``og:title`` meta tag and ``<title>``; the
        other fields are read from text that follows a recognised label. If
        no institution label is found, a trailing "at <name>" in the title is
        used instead.
    """
    soup = BeautifulSoup(html, "html.parser")

    title = None
    h1 = soup.find("h1")
    if h1:
        # An empty heading must not block the og:title / <title> fallbacks.
        title = clean(h1.get_text()) or None
    if not title:
        og = soup.find("meta", attrs={"property": "og:title"})
        if og and og.get("content"):
            title = clean(og["content"])
    if not title and soup.title:
        title = clean(soup.title.get_text())

    def find_label_value(labels: List[str]) -> Optional[str]:
        """Return the text attached to the first of ``labels`` that has a value."""
        alternation = "|".join(map(re.escape, labels))
        # Matches a text node that consists of nothing but "<label>:".
        label_only = re.compile(r"^\s*(%s)\s*[:：]\s*$" % alternation, re.I)
        # Matches just the leading "<label>:" of a longer line.
        label_prefix = re.compile(r"^\s*(%s)\s*[:：]\s*" % alternation, re.I)

        for node in soup.find_all(string=label_only):
            parent = node.parent
            if parent is None:
                continue

            # Preferred source: whatever immediately follows the label's element.
            sibling = parent.next_sibling
            if sibling:
                try:
                    get_text = getattr(sibling, "get_text", None)
                    candidate = clean(get_text() if callable(get_text) else str(sibling))
                except (AttributeError, TypeError):
                    candidate = None
                if candidate:
                    return candidate

            # Otherwise the value may share the label's own container. Strip
            # only the leading label: the fully anchored pattern can never
            # match a line that also contains a value, which used to leave
            # the label in the result.
            line = clean(parent.get_text())
            if line:
                value = label_prefix.sub("", line, count=1).strip(" :-")
                if value:
                    return value
        return None

    institution = find_label_value(["Institution", "College/University", "Employer", "Organization"])
    location = find_label_value(["Location", "City/State", "City", "State"])
    posted_date_str = find_label_value(["Posted", "Posted On", "Date Posted", "Posting Date"])

    if not institution and title:
        # Titles of the form "<position> at <institution>".
        m = re.search(r"\bat\s+([^|–—-]+)$", title)
        if m:
            institution = clean(m.group(1))

    return title, institution, location, posted_date_str

def _fallback_job_id(original_url: str) -> str:
    """Return a synthetic id for a URL that carries no job code."""
    import zlib  # local import keeps this block self-contained

    # CRC-32 is stable across runs; the built-in hash() of a str is salted
    # per process, so ids derived from it differ every time the scraper runs.
    checksum = zlib.crc32(original_url.encode("utf-8")) & 0xFFFFFFFF
    return f"NOJOBCODE-{checksum:x}"


def _iso_timestamp(ts: str) -> str:
    """Render a 14-digit capture timestamp as ISO 8601; pass through anything else."""
    try:
        return datetime.strptime(ts, "%Y%m%d%H%M%S").isoformat()
    except ValueError:
        # Keep the record rather than aborting the whole scrape on one odd row.
        return ts


def _longest_block_text(html: str) -> Optional[str]:
    """Return the longest text among the page's first 50 div/section/article blocks."""
    soup = BeautifulSoup(html, "html.parser")
    best = ""
    for block in soup.find_all(["div", "section", "article"], limit=50):
        txt = clean(block.get_text(" "))
        if txt and len(txt) > len(best):
            best = txt
    # Pages without any such block fall back to the whole document's text.
    return best if best else clean(soup.get_text(" "))


def _write_outputs(jobs: Dict[str, JobPosting], out_csv: str, out_ndjson: str) -> None:
    """Write the collected postings to a CSV file and an NDJSON file."""
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow([
            "job_id", "category", "title", "institution", "location", "posted_date_str",
            "wayback_timestamp", "original_url", "wayback_url", "description_text"
        ])
        for jp in jobs.values():
            w.writerow([
                jp.job_id, jp.category or "", jp.title or "", jp.institution or "",
                jp.location or "", jp.posted_date_str or "", jp.wayback_timestamp,
                # Descriptions are capped at 20000 characters in the CSV only.
                jp.original_url, jp.wayback_url, (jp.description_text or "")[:20000]
            ])

    with open(out_ndjson, "w", encoding="utf-8") as f:
        for jp in jobs.values():
            f.write(json.dumps(asdict(jp), ensure_ascii=False) + "\n")


def scrape_higheredjobs(date_from: str,
                        date_to: str,
                        out_csv: str = "higheredjobs_wayback.csv",
                        out_ndjson: str = "higheredjobs_wayback.ndjson",
                        max_per_pattern: Optional[int] = None,
                        max_pages: Optional[int] = None,
                        timeout: int = 30) -> Dict[str, JobPosting]:
    """Collect archived job postings from the Wayback Machine and save them.

    For every prefix in DETAIL_PATTERNS the CDX index is listed for the given
    date range; each capture whose job id has not been seen yet is downloaded,
    parsed and stored. The first capture encountered for a job id is the one
    kept. Results are written to ``out_csv`` and ``out_ndjson``.

    Args:
        date_from: Start of the capture range, in any form parse_iso_date() accepts.
        date_to: End of the capture range, same forms.
        out_csv: Path of the CSV file to write.
        out_ndjson: Path of the NDJSON file to write.
        max_per_pattern: Cap on CDX rows examined per URL prefix (None = no cap).
        max_pages: Cap on postings collected overall (None = no cap); 0
            collects nothing.
        timeout: Per-request timeout, in seconds, for snapshot downloads.

    Returns:
        The collected postings keyed by job id.

    Raises:
        ValueError: If either date cannot be parsed.
        requests.RequestException: If a CDX index request fails. Failures
            while downloading an individual snapshot are skipped instead.
    """
    from_ymd = parse_iso_date(date_from)
    to_ymd = parse_iso_date(date_to)

    seen_jobs: Dict[str, JobPosting] = {}

    def cap_reached() -> bool:
        return max_pages is not None and len(seen_jobs) >= max_pages

    # The context manager releases pooled connections even if a CDX call raises.
    with requests.Session() as sess:
        sess.headers.update(HEADERS)

        for url_prefix in DETAIL_PATTERNS:
            if cap_reached():
                break
            print(f"[CDX] {url_prefix}  [{from_ymd}..{to_ymd}]")

            for row in iter_all_cdx(sess, url_prefix, from_ymd, to_ymd, max_per_pattern):
                ts, original = row.get("timestamp"), row.get("original")
                if not ts or not original:
                    continue

                job_id = extract_jobcode(original) or _fallback_job_id(original)
                if job_id in seen_jobs:
                    continue  # keep the first capture seen for this job

                wayback_url = build_wayback_url(ts, original)
                time.sleep(REQUEST_DELAY_SEC)
                try:
                    r = sess.get(wayback_url, timeout=timeout)
                except requests.RequestException:
                    continue  # best effort: skip snapshots that fail to download
                if r.status_code != 200:
                    continue
                html = r.text

                # Wayback placeholder pages are returned with status 200.
                if "Page cannot be displayed due to robots.txt" in html or \
                   "Wayback Machine doesn't have that page archived" in html:
                    continue

                title, institution, location, posted_date_str = parse_job_html(html)

                seen_jobs[job_id] = JobPosting(
                    job_id=job_id,
                    original_url=original,
                    wayback_timestamp=_iso_timestamp(ts),
                    wayback_url=wayback_url,
                    category=categorize_from_path(original),
                    title=title,
                    institution=institution,
                    location=location,
                    posted_date_str=posted_date_str,
                    description_text=_longest_block_text(html),
                )

                if cap_reached():
                    break

    print(f"[DONE] Unique jobs collected: {len(seen_jobs)}")

    _write_outputs(seen_jobs, out_csv, out_ndjson)

    print(f"Wrote: {out_csv} and {out_ndjson}")
    return seen_jobs


In [4]:
# Example run: captures dated 2023-07-01 through 2023-08-30, stopping once 200
# unique postings have been collected; output goes to the default file names.
_ = scrape_higheredjobs("2023-07-01", "2023-8-30", max_pages=200)

[CDX] https://www.higheredjobs.com/details.cfm  [20230701..20230830]
[DONE] Unique jobs collected: 200
Wrote: higheredjobs_wayback.csv and higheredjobs_wayback.ndjson
