Agentic Frameworks for Job-Aggregation

The goal of this task is to perform a rapid agentic framework. Build a agent capable of extracting job posting information from several different company career pages.

In [5]:
%pip install --quiet selenium langgraph pandas tqdm pydantic openai


Note: you may need to restart the kernel to use updated packages.


In [1]:
print('hello')

hello


In [7]:
from __future__ import annotations
from typing import Optional, List, TypedDict, NotRequired
from urllib.parse import urlparse
from datetime import datetime
import json, os, random, time, re
import pandas as pd
from tqdm import tqdm
from typing import cast

# LangGraph
from langgraph.graph import StateGraph, END
# No checkpointer for now to avoid thread_id requirements
# from langgraph.checkpoint.memory import InMemorySaver

# Optional LLM normalizer (disabled by default)
from openai import OpenAI  # not used unless USE_LLM=True

# Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ---------- Config ----------
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
]

HEADLESS = True
MAX_SCROLLS = 8
SCROLL_PAUSE = 0.6  # slightly longer for stability

USE_LLM = False
OPENAI_MODEL = "gpt-4o-mini"

def now_iso() -> str:
    return datetime.utcnow().isoformat(timespec="seconds") + "Z"

def get_origin(url: str) -> str:
    p = urlparse(url)
    return f"{p.scheme}://{p.netloc}"

class JobRecordDict(TypedDict, total=False):
    title: Optional[str]
    location: Optional[str]
    company: Optional[str]
    application_link: Optional[str]
    description_snippet: Optional[str]
    source: Optional[str]
    scraped_at: str

# State typing: only `url` is required; the rest are NotRequired
class ScrapeState(TypedDict):
    url: str
    limit: NotRequired[int]
    site: NotRequired[str]
    records: NotRequired[List[JobRecordDict]]
    normalized: NotRequired[List[JobRecordDict]]
    notes: NotRequired[List[str]]


In [8]:
def make_driver() -> webdriver.Chrome:
    ua = random.choice(USER_AGENTS)
    opts = Options()
    if HEADLESS:
        opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1280,2000")
    opts.add_argument(f"user-agent={ua}")
    return webdriver.Chrome(options=opts)

def open_and_render(driver: webdriver.Chrome, url: str, wait_css: Optional[str]=None, timeout: int=30):
    driver.get(url)
    try:
        if wait_css:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, wait_css))
            )
    except Exception:
        # surface a hint for debugging
        print(f"[open_and_render] Timed out waiting for: {wait_css}")

def try_dismiss_overlays(driver: webdriver.Chrome):
    """Best-effort: dismiss cookie/consent banners that block content."""
    xpaths = [
        "//button[contains(., 'Accept all')]",
        "//button[contains(., 'I agree')]",
        "//button[contains(., 'Accept')]",
        "//button[contains(., 'Got it')]",
        "//button[contains(., 'OK')]",
        "//div[@role='dialog']//button[contains(., 'Accept')]",
    ]
    for xp in xpaths:
        try:
            btn = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, xp)))
            btn.click()
            time.sleep(0.3)
        except Exception:
            pass  # ignore if not present

def infinite_scroll(driver: webdriver.Chrome, max_scrolls: int=MAX_SCROLLS, pause: float=SCROLL_PAUSE):
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


In [9]:
def detect_site(url: str) -> str:
    u = url.lower()
    if "careers.google.com" in u:
        return "google"
    if "metacareers.com" in u or "facebookcareers" in u:
        return "meta"
    if "boards.greenhouse.io" in u:
        return "greenhouse"
    return "generic"


In [10]:
def parse_google_with_selenium(driver: webdriver.Chrome, limit: int = 25) -> List[JobRecordDict]:
    anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='/jobs/results/']")
    out: List[JobRecordDict] = []
    seen = set()
    for a in anchors:
        title = (a.text or "").strip()
        link = a.get_attribute("href")
        if not link:
            continue
        key = (title, link)
        if key in seen:
            continue
        seen.add(key)

        loc = None
        try:
            parent = a.find_element(By.XPATH, "./ancestor-or-self::*[1]")
            loc_el = parent.find_elements(By.CSS_SELECTOR, "[data-test*='locations'], [class*='location'], .place")
            if loc_el:
                loc = (loc_el[0].text or "").strip()
        except Exception:
            pass

        rec: JobRecordDict = {
            "title": title or None,
            "location": loc or None,
            "company": "Google",
            "application_link": link,
        }
        out.append(rec)
        if len(out) >= limit:
            break
    return out


In [11]:
def parse_meta_with_selenium(driver: webdriver.Chrome, limit: int = 25) -> List[JobRecordDict]:
    anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='/jobs/']")
    out: List[JobRecordDict] = []
    seen = set()
    for a in anchors:
        link = a.get_attribute("href")
        if not link:
            continue
        title = (a.text or "").strip()
        if not title:
            try:
                h = a.find_element(By.CSS_SELECTOR, "h2, h3")
                title = h.text.strip()
            except Exception:
                pass

        loc = None
        try:
            loc_el = a.find_elements(By.CSS_SELECTOR, "[data-testid*='job-location'], [class*='location']")
            if loc_el:
                loc = (loc_el[0].text or "").strip()
        except Exception:
            pass

        key = (title, link)
        if key in seen:
            continue
        seen.add(key)

        rec: JobRecordDict = {
            "title": title or None,
            "location": loc or None,
            "company": "Meta",
            "application_link": link,
        }
        out.append(rec)
        if len(out) >= limit:
            break
    return out


In [12]:
def parse_greenhouse_with_selenium(driver: webdriver.Chrome, limit: int = 25) -> List[JobRecordDict]:
    anchors = driver.find_elements(By.CSS_SELECTOR, "div.opening a, section#jobs a[href*='/jobs/']")
    out: List[JobRecordDict] = []
    seen = set()
    for a in anchors:
        title = (a.text or "").strip()
        link = a.get_attribute("href")
        if not link:
            continue

        loc = None
        try:
            sibs = a.find_elements(By.XPATH, "following-sibling::*[1]")
            if sibs:
                loc = (sibs[0].text or "").strip()
        except Exception:
            pass

        key = (title, link)
        if key in seen:
            continue
        seen.add(key)

        rec: JobRecordDict = {
            "title": title or None,
            "location": loc or None,
            "company": None,
            "application_link": link,
        }
        out.append(rec)
        if len(out) >= limit:
            break
    return out


In [13]:
def parse_generic_with_selenium(driver: webdriver.Chrome, limit: int = 25) -> List[JobRecordDict]:
    anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='job'], a[href*='/jobs/'], a[href*='careers']")
    out: List[JobRecordDict] = []
    seen = set()
    for a in anchors:
        title = (a.text or "").strip()
        link = a.get_attribute("href")
        if not link:
            continue
        if not title or len(title.split()) > 15:
            continue

        key = (title, link)
        if key in seen:
            continue
        seen.add(key)

        rec: JobRecordDict = {
            "title": title,
            "location": None,
            "company": None,
            "application_link": link,
        }
        out.append(rec)
        if len(out) >= limit:
            break
    return out


In [14]:
DESC_SELECTORS = [
    "section", "article", "[data-test*='description']",
    ".job-description", "#job-details", ".section"
]

def resolve_description_with_selenium(url: Optional[str], max_len: int=800) -> Optional[str]:
    if not url:
        return None
    drv = make_driver()
    try:
        open_and_render(drv, url, wait_css=None, timeout=15)
        try_dismiss_overlays(drv)
        for css in DESC_SELECTORS:
            try:
                el = drv.find_element(By.CSS_SELECTOR, css)
                txt = (el.text or "").strip()
                if txt:
                    return txt[:max_len]
            except Exception:
                continue
        return None
    except Exception:
        return None
    finally:
        drv.quit()


In [15]:
def node_detect(state: ScrapeState) -> ScrapeState:
    url = state.get("url")
    if not url:
        raise ValueError("State missing required key: 'url'")

    site = detect_site(url)

    # cast so Pylance knows this dict is a ScrapeState
    new = cast(ScrapeState, dict(state))
    new["site"] = site

    notes = list(new.get("notes", []))
    notes.append(f"Detected site: {site}")
    new["notes"] = notes
    return new

from typing import cast, List  # make sure this import is present

def node_parse(state: ScrapeState) -> ScrapeState:
    url = state.get("url")
    if not url:
        raise ValueError("State missing required key: 'url'")

    limit = state.get("limit", 25)
    site = state.get("site", "generic")

    wait_map = {
        "google": "a[href*='/jobs/results/']",
        "meta": "a[href*='/jobs/']",
        "greenhouse": "section#jobs a, div.opening a",
        "generic": "a",
    }

    drv = make_driver()
    rows: List[JobRecordDict] = []   # <-- initialize with correct type

    try:
        open_and_render(drv, url, wait_css=wait_map.get(site, "a"), timeout=30)
        try_dismiss_overlays(drv)
        infinite_scroll(drv)

        if site == "google":
            rows = parse_google_with_selenium(drv, limit=limit)
        elif site == "meta":
            rows = parse_meta_with_selenium(drv, limit=limit)
        elif site == "greenhouse":
            rows = parse_greenhouse_with_selenium(drv, limit=limit)
        else:
            rows = parse_generic_with_selenium(drv, limit=limit)
    finally:
        drv.quit()

    new = cast(ScrapeState, dict(state))
    new["records"] = rows

    notes = list(new.get("notes", []))
    notes.append(f"Parsed {len(rows)} rows")
    new["notes"] = notes
    return new


In [16]:
from typing import List, cast

def node_enrich(state: ScrapeState) -> ScrapeState:
    """
    Enrich each parsed row by visiting the job page and extracting a short description.
    Requires:
      - resolve_description_with_selenium(url: Optional[str]) -> Optional[str]
      - now_iso() -> str
      - JobRecordDict, ScrapeState TypedDicts
    """
    url = state.get("url", "")
    rows: List[JobRecordDict] = state.get("records", []) or []

    out: List[JobRecordDict] = []
    for r in tqdm(rows, desc="Resolving details"):
        desc = resolve_description_with_selenium(r.get("application_link"))

        rec: JobRecordDict = {
            "title": r.get("title"),
            "location": r.get("location"),
            "company": r.get("company"),
            "application_link": r.get("application_link"),
            "description_snippet": desc,
            "source": url,
            "scraped_at": now_iso(),
        }
        out.append(rec)

    new = cast(ScrapeState, dict(state))   # tell Pylance this dict conforms to ScrapeState
    new["records"] = out

    notes = list(new.get("notes", []))
    notes.append("Enriched with description snippets")
    new["notes"] = notes
    return new


In [17]:
SYSTEM_PROMPT = """You are a data normalizer.
Normalize 'title' and 'location' to concise standardized forms. Do not invent values.
Keep other fields unchanged. If missing, leave as null.
Return JSON list with the same keys.
"""

def _client_or_none():
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        return None
    try:
        return OpenAI(api_key=key)
    except Exception:
        return None

def _strip_code_fence(text: str) -> str:
    t = text.strip()
    if t.startswith("```"):
        t = t.strip("`")
        # After stripping backticks, it may start with 'json\n'
        t = re.sub(r"^json\s*", "", t, flags=re.IGNORECASE)
    return t.strip()

def node_normalize_llm(state: ScrapeState) -> ScrapeState:
    if not USE_LLM:
        new = cast(ScrapeState, dict(state))
        new["normalized"] = state.get("records", [])
        notes = list(new.get("notes", []))
        notes.append("LLM normalization skipped (USE_LLM=False)")
        new["notes"] = notes
        return new

    client = _client_or_none()
    if client is None:
        new = cast(ScrapeState, dict(state))
        new["normalized"] = state.get("records", [])
        notes = list(new.get("notes", []))
        notes.append("LLM normalization skipped (no OPENAI_API_KEY)")
        new["notes"] = notes
        return new

    records = state.get("records", [])
    content = json.dumps(records, ensure_ascii=False)
    try:
        resp = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"Normalize these records and return JSON only:\n{content}"}
            ],
            temperature=0.0,
        )
        txt = resp.choices[0].message.content or "[]"
        txt = _strip_code_fence(txt)
        normalized = json.loads(txt)
    except Exception:
        normalized = records

    new = cast(ScrapeState, dict(state))
    new["normalized"] = normalized
    notes = list(new.get("notes", []))
    notes.append("Applied LLM normalizer" if normalized is not records else "LLM normalizer failed; passthrough")
    new["notes"] = notes
    return new


In [18]:
# No checkpointer to keep it simple
graph = StateGraph(ScrapeState)
graph.add_node("detect", node_detect)
graph.add_node("parse", node_parse)
graph.add_node("enrich", node_enrich)

graph.set_entry_point("detect")
graph.add_edge("detect", "parse")
graph.add_edge("parse", "enrich")
graph.add_edge("enrich", END)

app = graph.compile()
print("LangGraph (Selenium) pipeline ready.")
app


LangGraph (Selenium) pipeline ready.


ValueError: Failed to reach https://mermaid.ink/ API while trying to render your graph. Status code: 502.

To resolve this issue:
1. Check your internet connection and try again
2. Try with higher retry settings: `draw_mermaid_png(..., max_retries=5, retry_delay=2.0)`
3. Use the Pyppeteer rendering method which will render your graph locally in a browser: `draw_mermaid_png(..., draw_method=MermaidDrawMethod.PYPPETEER)`

<langgraph.graph.state.CompiledStateGraph at 0x112246660>

In [19]:
def run_scrape(url: str, limit: int=10):
    final = app.invoke({"url": url, "limit": limit})
    rows = final.get("records") or []
    df = pd.DataFrame(rows)
    if df.empty:
        print("[run_scrape] Empty results — check console counts above and consider increasing MAX_SCROLLS.")
    return final, df

def export_json(records: List[dict], path: str) -> None:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    print(f"Wrote {len(records)} records to {path}")

def export_csv(records: List[dict], path: str) -> None:
    if not records:
        print("No records to export.")
        return
    pd.DataFrame(records).to_csv(path, index=False)
    print(f"Wrote {len(records)} records to {path}")


In [21]:
final, df = run_scrape("https://www.metacareers.com/jobs", limit=10)
display(df.head())
print("\nNotes:", *final.get("notes", []), sep="\n - ")

  return datetime.utcnow().isoformat(timespec="seconds") + "Z"
Resolving details: 100%|██████████| 10/10 [02:24<00:00, 14.48s/it]


Unnamed: 0,title,location,company,application_link,description_snippet,source,scraped_at
0,"Software Engineer, Machine Learning\nSunnyvale...",,Meta,https://www.metacareers.com/jobs/1436181490732782,,https://www.metacareers.com/jobs,2025-09-24T04:04:46Z
1,"Software Engineer, Infrastructure\nSunnyvale, ...",,Meta,https://www.metacareers.com/jobs/677160418622314,,https://www.metacareers.com/jobs,2025-09-24T04:05:01Z
2,"Product Design Engineer\nSunnyvale, CA +3 loca...",,Meta,https://www.metacareers.com/jobs/1092822929374881,,https://www.metacareers.com/jobs,2025-09-24T04:05:15Z
3,"Research Scientist Intern, AI & System Co-Desi...",,Meta,https://www.metacareers.com/jobs/1859723961565682,,https://www.metacareers.com/jobs,2025-09-24T04:05:30Z
4,Wireless - Embedded HW Connectivity Engineer\n...,,Meta,https://www.metacareers.com/jobs/2540883539615658,,https://www.metacareers.com/jobs,2025-09-24T04:05:45Z



Notes:
 - Detected site: meta
 - Parsed 10 rows
 - Enriched with description snippets


In [22]:
final, df = run_scrape("https://www.metacareers.com/jobs?q=data%20analyst", limit=10)
display(df.head())
print("\nNotes:", *final.get("notes", []), sep="\n - ")

  return datetime.utcnow().isoformat(timespec="seconds") + "Z"
Resolving details: 100%|██████████| 10/10 [03:08<00:00, 18.82s/it]


Unnamed: 0,title,location,company,application_link,description_snippet,source,scraped_at
0,"Data Analyst - People Analytics\nAustin, TX +2...",,Meta,https://www.metacareers.com/jobs/1124394892922401,,https://www.metacareers.com/jobs?q=data%20analyst,2025-09-03T15:28:39Z
1,"Global Operations Data Analyst\nDublin, Irelan...",,Meta,https://www.metacareers.com/jobs/796191043363303,,https://www.metacareers.com/jobs?q=data%20analyst,2025-09-03T15:28:58Z
2,Data Management Analyst (Short Term Employee)\...,,Meta,https://www.metacareers.com/jobs/693648076696098,,https://www.metacareers.com/jobs?q=data%20analyst,2025-09-03T15:29:16Z
3,"Data Analyst\nMenlo Park, CA\n•\nData & Analyt...",,Meta,https://www.metacareers.com/jobs/1851270308789075,,https://www.metacareers.com/jobs?q=data%20analyst,2025-09-03T15:29:34Z
4,Infrastructure Market Research Analyst\nWashin...,,Meta,https://www.metacareers.com/jobs/720998080830793,,https://www.metacareers.com/jobs?q=data%20analyst,2025-09-03T15:29:53Z



Notes:
 - Detected site: meta
 - Parsed 10 rows
 - Enriched with description snippets


In [27]:
final, df = run_scrape("https://www.amazon.jobs/en/search?base", limit=20)
display(df.head(10))
print("\nNotes:", *final.get("notes", []), sep="\n - ")

  return datetime.utcnow().isoformat(timespec="seconds") + "Z"
Resolving details: 100%|██████████| 20/20 [05:58<00:00, 17.92s/it]


Unnamed: 0,title,location,company,application_link,description_snippet,source,scraped_at
0,"Medical Records Specialist, Amazon One Medical...",,,https://www.amazon.jobs/en/jobs/2889177/medica...,,https://www.amazon.jobs/en/search?base,2025-09-03T15:58:46Z
1,...Read more,,,https://www.amazon.jobs/en/jobs/2889177/medica...,,https://www.amazon.jobs/en/search?base,2025-09-03T15:59:04Z
2,"Executive Assistant, Devices & Services Design",,,https://www.amazon.jobs/en/jobs/3063086/execut...,,https://www.amazon.jobs/en/search?base,2025-09-03T15:59:21Z
3,...Read more,,,https://www.amazon.jobs/en/jobs/3063086/execut...,,https://www.amazon.jobs/en/search?base,2025-09-03T15:59:38Z
4,"Site Marketing Manager , Zappos FBA",,,https://www.amazon.jobs/en/jobs/3051114/site-m...,,https://www.amazon.jobs/en/search?base,2025-09-03T15:59:55Z
5,...Read more,,,https://www.amazon.jobs/en/jobs/3051114/site-m...,,https://www.amazon.jobs/en/search?base,2025-09-03T16:00:12Z
6,"Senior UX Designer, Digital Acceleration",,,https://www.amazon.jobs/en/jobs/2985608/senior...,,https://www.amazon.jobs/en/search?base,2025-09-03T16:00:29Z
7,...Read more,,,https://www.amazon.jobs/en/jobs/2985608/senior...,,https://www.amazon.jobs/en/search?base,2025-09-03T16:00:46Z
8,"Applied Scientist, AGI Foundational Modeling -...",,,https://www.amazon.jobs/en/jobs/3006856/applie...,,https://www.amazon.jobs/en/search?base,2025-09-03T16:01:05Z
9,...Read more,,,https://www.amazon.jobs/en/jobs/3006856/applie...,,https://www.amazon.jobs/en/search?base,2025-09-03T16:01:27Z



Notes:
 - Detected site: generic
 - Parsed 20 rows
 - Enriched with description snippets


In [28]:
final, df = run_scrape("https://explore.jobs.netflix.net/careers", limit = 10)
display(df.head())
print("\nNotes:", *final.get("notes", []), sep="\n - ")

  return datetime.utcnow().isoformat(timespec="seconds") + "Z"
Resolving details: 100%|██████████| 4/4 [01:15<00:00, 18.77s/it]


Unnamed: 0,title,location,company,application_link,description_snippet,source,scraped_at
0,Netflix’s culture,,,https://jobs.netflix.com/culture,,https://explore.jobs.netflix.net/careers,2025-09-03T16:07:33Z
1,Netflix House,,,https://apply.netflixhouse.com/careers,,https://explore.jobs.netflix.net/careers,2025-09-03T16:07:52Z
2,Privacy,,,https://jobs.netflix.com/candidate-privacy,,https://explore.jobs.netflix.net/careers,2025-09-03T16:08:11Z
3,Do Not Sell Or Share My Personal Information,,,https://jobs.netflix.com/dnssi,,https://explore.jobs.netflix.net/careers,2025-09-03T16:08:30Z



Notes:
 - Detected site: generic
 - Parsed 4 rows
 - Enriched with description snippets


In [29]:
final, df = run_scrape("https://hiring.amazon.com/%22", limit = 10)
display(df.head())
print("\nNotes:", *final.get("notes", []), sep="\n - ")

  return datetime.utcnow().isoformat(timespec="seconds") + "Z"
Resolving details: 100%|██████████| 4/4 [01:47<00:00, 26.92s/it]


Unnamed: 0,title,location,company,application_link,description_snippet,source,scraped_at
0,contact us.,,,https://www.amazondelivers.jobs/contactus,,https://hiring.amazon.com/%22,2025-09-05T15:06:04Z
1,instagraminstagram,,,https://www.instagram.com/amazonjobs/,Log In\nSign Up\namazonjobs\nAmazon Jobs\n153 ...,https://hiring.amazon.com/%22,2025-09-05T15:06:27Z
2,Amazon Jobs Overview,,,https://hiring.amazon.com/job-opportunities,Open side menu.\nEnglish\nInformation\nHey! We...,https://hiring.amazon.com/%22,2025-09-05T15:06:55Z
3,Job Search,,,https://hiring.amazon.com/app#/jobSearch,,https://hiring.amazon.com/%22,2025-09-05T15:07:22Z



Notes:
 - Detected site: generic
 - Parsed 4 rows
 - Enriched with description snippets
