In [None]:
!pip install requests beautifulsoup4 python-docx tldextract urllib3

In [1]:
import re, time, urllib.parse as up
from collections import deque, defaultdict
import requests
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import tldextract

In [2]:
SITES = [
    # Seed URLs (static forums / blogs). Add/remove as you like.
    "https://www.clearadmit.com/",
    "https://gmatclub.com/forum/",
    "https://find-mba.com/board/general-forum",
]


In [3]:
KEYWORDS = [
    # Exact phrases/keywords to match (case-insensitive).
    # Add/remove freely.
    "Duke Fuqua",
    "Fuqua MBA", "Duke MBA",
    "Fuqua MMS", "Duke MMS",
    "Fuqua MQM", "Duke MQM",
    "Fuqua MSQM", "Duke MSQM",
    "Duke Business Analytics", "Duke MSBA",
    "Fuqua EMBA", "Fuqua GEMBA", "Fuqua WEMBA",
]

In [4]:
DEGREE_TOKENS = [
    r"MSBA", r"MBA", r"M\.B\.A", r"EMBA", r"GEMBA", r"WEMBA",
    r"MMS(?:\:FOB|:DKU)?", r"MQM(?:\:BA)?", r"MSQM(?:\:HA)?"
]


In [5]:
USER_AGENT = "Mozilla/5.0 (compatible; FuquaStaticScanner/1.0; +https://fuqua.duke.edu)"
DEFAULT_TIMEOUT = 15
REQUEST_DELAY_SEC = 1.5     # polite delay between requests
MAX_PAGES_PER_DOMAIN = 40   # cap pages per seed domain
MAX_DEPTH = 2               # BFS crawl depth
ALLOWED_CONTENT_TYPES = ("text/html", "application/xhtml+xml")

In [6]:
STATIC_ALLOWLIST = {"clearadmit.com", "gmatclub.com", "find-mba.com"}
EXCLUDE_DOMAINS = {
    "reddit.com", "twitter.com", "x.com", "quora.com",
    "chasedream.com", # JS-heavy (skip in Week 3)
    "feedspot.com",
}


In [7]:
def new_session():
    s = requests.Session()
    s.headers.update({"User-Agent": USER_AGENT, "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8"})
    retry = Retry(total=3, backoff_factor=0.5,
                  status_forcelist=[429, 500, 502, 503, 504],
                  allowed_methods=["GET"])
    s.mount("http://", HTTPAdapter(max_retries=retry))
    s.mount("https://", HTTPAdapter(max_retries=retry))
    return s

In [8]:
def domain_of(url):
    ext = tldextract.extract(url)
    return ".".join(p for p in [ext.domain, ext.suffix] if p)

In [9]:
def is_static_domain(url):
    dom = domain_of(url)
    if dom in EXCLUDE_DOMAINS:
        return False
    if dom in STATIC_ALLOWLIST:
        return True
    # Heuristic: allow if not in common JS-heavy/excluded list
    return True

In [10]:
def same_reg_domain(u1, u2):
    a, b = tldextract.extract(u1), tldextract.extract(u2)
    return (a.domain, a.suffix) == (b.domain, b.suffix)

In [11]:
def canonicalize_url(base, link):
    try:
        url = up.urljoin(base, link)
        parsed = up.urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return None
        # drop fragments
        return up.urlunparse(parsed._replace(fragment=""))
    except:
        return None

In [12]:
def allowed_by_robots(url, session, cache):
    parsed = up.urlparse(url)
    root = f"{parsed.scheme}://{parsed.netloc}"
    if root not in cache:
        rp = RobotFileParser()
        robots_url = up.urljoin(root, "/robots.txt")
        try:
            r = session.get(robots_url, timeout=DEFAULT_TIMEOUT)
            content = r.text if r.status_code == 200 else ""
        except Exception:
            content = ""
        rp.parse(content.splitlines())
        cache[root] = rp
    return cache[root].can_fetch(USER_AGENT, url)

In [13]:
def fetch(url, session):
    resp = session.get(url, timeout=DEFAULT_TIMEOUT)
    ctype = resp.headers.get("Content-Type", "").lower()
    if not any(ct in ctype for ct in ALLOWED_CONTENT_TYPES):
        raise ValueError("Non-HTML content")
    return resp

In [14]:
def extract_meta(soup):
    title = (soup.title.string or "").strip() if soup.title else ""
    # common timestamp hints
    ts = None
    for sel in [
        "meta[property='article:published_time']",
        "meta[name='pubdate']",
        "meta[itemprop='datePublished']",
        "time[datetime]"
    ]:
        m = soup.select_one(sel)
        if m:
            ts = (m.get("content") or m.get("datetime") or "").strip()
            if ts:
                break
    # author hints
    author = None
    for sel in [
        "meta[name='author']","a[rel='author']",
        ".author a",".author",".byline a",".byline","[itemprop='author']"
    ]:
        m = soup.select_one(sel)
        if m:
            author = (m.get("content") or m.get_text()).strip()
            if author:
                break
    return title[:300], (ts or "")[:100], (author or "")[:120]

In [15]:
def visible_text(soup):
    for s in soup(["script","style","noscript","iframe","svg"]):
        s.decompose()
    return soup.get_text(separator=" ", strip=True)


In [16]:
def build_patterns(phrases):
    exact = [re.compile(re.escape(p), re.IGNORECASE) for p in phrases if p.strip()]
    prox_duke_fuqua = re.compile(r"\bDuke\b(?:\W+\w+){0,10}?\W+\bFuqua\b|\bFuqua\b(?:\W+\w+){0,10}?\W+\bDuke\b", re.IGNORECASE)
    deg_union = "(?:" + "|".join(DEGREE_TOKENS) + ")"
    prox_degree = re.compile(
        rf"(?:\bDuke\b|\bFuqua\b)(?:\W+\w+){{0,5}}?\W+\b{deg_union}\b|\b{deg_union}\b(?:\W+\w+){{0,5}}?\W+(?:\bDuke\b|\bFuqua\b)",
        re.IGNORECASE
    )
    return exact, [prox_duke_fuqua, prox_degree]


In [17]:
def find_snippet(text, span, radius=90):
    s, e = span
    s = max(0, s - radius); e = min(len(text), e + radius)
    snip = text[s:e]
    return re.sub(r"\s+", " ", snip).strip()


In [18]:
def scan_text(text, exact_patterns, prox_patterns):
    hits = []
    for pat in exact_patterns:
        for m in pat.finditer(text):
            hits.append(("exact", m.span()))
    for pat in prox_patterns:
        for m in pat.finditer(text):
            hits.append(("prox", m.span()))
    hits.sort(key=lambda x: x[1][0])
    return hits

In [19]:
def crawl_domain(seed_url, phrases):
    if not is_static_domain(seed_url):
        return []

    session = new_session()
    robots_cache = {}
    visited = set()
    per_domain_count = 0
    results = []
    q = deque([(seed_url, 0)])

    origin = up.urlparse(seed_url).scheme + "://" + up.urlparse(seed_url).netloc
    exact_patterns, prox_patterns = build_patterns(phrases)

    while q and per_domain_count < MAX_PAGES_PER_DOMAIN:
        url, depth = q.popleft()
        if url in visited or depth > MAX_DEPTH:
            continue
        visited.add(url)
        if not same_reg_domain(seed_url, url):
            continue
        if not allowed_by_robots(url, session, robots_cache):
            continue

        try:
            resp = fetch(url, session)
            time.sleep(REQUEST_DELAY_SEC)
        except Exception:
            continue

        per_domain_count += 1
        soup = BeautifulSoup(resp.text, "html.parser")
        title, ts, author = extract_meta(soup)
        text = visible_text(soup)

        matches = scan_text(text, exact_patterns, prox_patterns)
        if matches:
            mtype, mspan = matches[0]
            snippet = find_snippet(text, mspan)
            results.append({
                "title": title or "(no title)",
                "url": url,
                "timestamp": ts,
                "author": author,
                "match_type": mtype,
                "snippet": snippet
            })

        # enqueue internal links
        for a in soup.find_all("a", href=True):
            nxt = canonicalize_url(url, a["href"])
            if not nxt:
                continue
            if not nxt.startswith(origin):
                continue
            if nxt in visited:
                continue
            low = nxt.lower()
            if any(x in low for x in ["/login","/signup","/register","/rss","/feed","/wp-json"]):
                continue
            q.append((nxt, depth + 1))

    return results

In [20]:
def run_all(sites=SITES, phrases=KEYWORDS):
    grouped = defaultdict(list)
    for seed in sites:
        if not seed.startswith(("http://","https://")):
            continue
        dom = domain_of(seed)
        if dom in EXCLUDE_DOMAINS:
            print(f"- Skipping excluded: {seed}")
            continue
        rows = crawl_domain(seed, phrases)
        grouped[dom].extend(rows)

    # Print results
    any_hits = False
    for dom, rows in grouped.items():
        if not rows:
            print(f"[{dom}] No matches found.")
            continue
        any_hits = True
        print(f"\n=== {dom} — {len(rows)} match(es) ===")
        for i, r in enumerate(rows, 1):
            print(f"\n[{i}] {r['title']}")
            print(f"URL: {r['url']}")
            if r["timestamp"]:
                print(f"Time: {r['timestamp']}")
            if r["author"]:
                print(f"Author: {r['author']}")
            print(f"Match: {r['match_type']}")
            print(f"Snippet: {r['snippet']}")
    if not any_hits:
        print("No matches across all seeds.")

In [21]:
run_all()


=== clearadmit.com — 31 match(es) ===

[1] Clear Admit: MBA News, Admissions Advice & Trends
URL: https://www.clearadmit.com/
Match: exact
Snippet: xon Weekly Columns Fridays from the Frontline: Leading With Curiosity in the Age of AI at Duke Fuqua Myth Busters MBA Mythbusters: All MBA Roads Lead to Consulting, Tech or Investment Bankin

[2] Clear Admit: MBA News, Admissions Advice & Trends
URL: https://www.clearadmit.com
Match: exact
Snippet: xon Weekly Columns Fridays from the Frontline: Leading With Curiosity in the Age of AI at Duke Fuqua Myth Busters MBA Mythbusters: All MBA Roads Lead to Consulting, Tech or Investment Bankin

[3] Live MBA Admissions Decisions | MBA Livewire | Clear Admit
URL: https://www.clearadmit.com/livewire/
Match: prox
Snippet: bridge / Judge Carnegie Mellon / Tepper CEIBS Columbia Cornell / Johnson Dartmouth / Tuck Duke / Fuqua Edinburgh emlyon Emory / Goizueta ESADE ESCP ESMT Berlin Florida / Warrington Georgetown

[4] Data Dashboard | Clear Admit
URL: ht