In [None]:
import pandas as pd
import re

def parse_frey_osborne(xlsx_path: str,
                       csv_path: str | None = None) -> pd.DataFrame:
    """
    Parse Frey–Osborne automation-risk data.

    Parameters
    ----------
    xlsx_path : str
        Path to frey_osborne_automation_risk_index.xlsx.
    csv_path : str | None, default None
        If provided, the parsed table is saved to this CSV.

    Returns
    -------
    pd.DataFrame
        Columns: Rank (int), Probability (float), Label (Int64 with NA),
                 SOC code (str), Occupation (str)
    """
    # 1) Load and drop header
    raw = pd.read_excel(xlsx_path, header=None)
    lines = raw.iloc[1:, 0].astype(str)

    # 2) Regex
    pattern = re.compile(
        r"^\s*(\d+)\.\s+([01](?:\.\d+)?)\s+(?:(0|1)\s+)?(\d{2}-\d{4})\s+(.+)$"
    )

    parsed = []
    for line in lines:
        m = pattern.match(line)
        if not m:
            raise ValueError(f"Couldn’t parse: {line!r}")

        rank        = int(m.group(1))
        probability = float(m.group(2))

        # --- NEW: keep blank label as NA -------------------------------
        label_raw = m.group(3)
        label = pd.NA if label_raw is None else int(label_raw)
        # ----------------------------------------------------------------

        soc_code    = m.group(4)
        occupation  = m.group(5).strip()

        parsed.append((rank, probability, label, soc_code, occupation))

    df = (
        pd.DataFrame(
            parsed,
            columns=["Rank", "Probability", "Label", "SOC code", "Occupation"]
        )
        .astype({"Label": "Int64"})        # nullable integer
        .sort_values("Rank")
        .reset_index(drop=True)
    )

    if csv_path:
        df.to_csv(csv_path, index=False)

    return df

# Example usage: parse the raw workbook and save the cleaned table as CSV.
df = parse_frey_osborne("frey_osborne_automation_risk_index.xlsx",
                        "frey_osborne_automation_risk_index_clean.csv")

# Write the Excel copy straight from the parsed frame. Re-reading the CSV
# would re-infer dtypes, so a nullable-integer Label column with blanks
# would come back as floats.
df.to_excel("frey_osborne_automation_risk_index_clean.xlsx", index=False)



In [None]:
# Core libs
# NOTE(review): no versions are pinned here, so a fresh runtime may resolve
# different releases than the ones this notebook was developed against.
!pip -q install selenium pandas lxml webdriver-manager

# Chrome + ChromeDriver (Colab VMs don’t have them pre-installed)
!apt-get -q update
!apt-get -q install -y chromium-browser

# NOTE(review): none of these modules are used in this cell; presumably they
# are kept for later cells — confirm before removing.
import sys, subprocess, os, textwrap, json, pathlib, time
print("Setup complete ✅")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux"

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def build_driver() -> webdriver.Chrome:
    """Create a headless Chrome session using a webdriver-manager ChromeDriver.

    The user agent is picked at random from the module-level ``USER_AGENTS``
    list, which is defined in the scraper cell further down; that cell must
    have been run before this function is called.

    NOTE: the scraper cell also defines a function named ``build_driver``.
    Once that cell runs, its definition replaces this one.

    Returns
    -------
    webdriver.Chrome
        A started Chrome driver configured with the options below.
    """
    # Imported locally because this cell's own imports do not bring these
    # names into scope; without them a fresh kernel fails with NameError.
    import random
    from selenium.webdriver.chrome.options import Options

    opts = Options()
    opts.add_argument("--headless=new")            # keep Chrome fully headless
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument(f"user-agent={random.choice(USER_AGENTS)}")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )
    return driver


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the Colab Selenium helper (with its "undetected" extra) plus pandas
# and lxml, which the scraper cell below imports.
%pip install -q google-colab-selenium[undetected] pandas lxml

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for undetected-chromedriver (setup.py) ... [?25l[?25hdone


In [None]:
# Scrape My Next Move (O*NET) profile summary pages in **Google Colab** using the *google-colab-selenium* helper
# -----------------------------------------------------------------------------
# Setup in Colab:
#   %pip install -q google-colab-selenium[undetected] pandas lxml
#   # upload codes.csv  (one ONET‑SOC code per row)
#   !python scrape_onet.py
# -----------------------------------------------------------------------------

import csv
import random
import time
from pathlib import Path
from typing import Dict, List

import google_colab_selenium as gs
import pandas as pd
from lxml import html
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.common.exceptions import MoveTargetOutOfBoundsException

# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------

# Input and output locations, relative to the current working directory.
CODES_CSV_PATH = Path("codes.csv")          # input file (one code per line)
OUTPUT_CSV_PATH = Path("onet_data_scraped.csv")

# (min, max) bounds; random_delay() draws a uniform float from this range.
DELAY_RANGE = (5, 20)                        # seconds between actions
# (min, max) inclusive bounds for the number of mouse moves per page.
MOUSE_MOVE_COUNT_RANGE = (3, 7)              # wiggles per page
# (min, max) inclusive bounds for the Y position passed to window.scrollTo.
SCROLL_PIXELS_RANGE = (200, 1200)            # scroll distance

# Pool of user-agent strings; build_driver() picks one at random per session.
USER_AGENTS = [
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
]

# Selects which google_colab_selenium driver class build_driver() instantiates.
USE_UNDETECTED = False  # flip to True for `gs.UndetectedChrome()`

# -----------------------------------------------------------------------------
# DRIVER BUILDER
# -----------------------------------------------------------------------------

def build_driver():
    """Start a Chrome session through google-colab-selenium.

    A user agent is chosen at random from USER_AGENTS and passed to Chrome.
    The driver class is gs.UndetectedChrome when USE_UNDETECTED is true,
    otherwise gs.Chrome.
    """
    chrome_options = Options()
    chosen_agent = random.choice(USER_AGENTS)
    chrome_options.add_argument(f"user-agent={chosen_agent}")

    # Pick the driver class once, then instantiate it with the shared options.
    driver_factory = gs.UndetectedChrome if USE_UNDETECTED else gs.Chrome
    return driver_factory(options=chrome_options)


# -----------------------------------------------------------------------------
# HELPER ROUTINES
# -----------------------------------------------------------------------------

def random_delay():
    """Sleep for a random duration drawn uniformly from DELAY_RANGE (seconds)."""
    shortest, longest = DELAY_RANGE
    pause_seconds = random.uniform(shortest, longest)
    time.sleep(pause_seconds)


def simulate_human_interaction(driver):
    """Scroll the page and move the mouse to a few random in-view points.

    The page is scrolled to a random Y position taken from
    SCROLL_PIXELS_RANGE, then between MOUSE_MOVE_COUNT_RANGE[0] and
    MOUSE_MOVE_COUNT_RANGE[1] mouse moves are performed over <body>, each
    followed by a short random pause. A move that lands outside the
    reachable area is skipped without affecting the remaining moves.

    Parameters
    ----------
    driver
        A Selenium WebDriver with a page already loaded.
    """
    # Random scroll down the page
    driver.execute_script(
        "window.scrollTo(0, arguments[0]);",
        random.randint(*SCROLL_PIXELS_RANGE),
    )
    time.sleep(random.uniform(0.5, 1.5))

    body = driver.find_element(By.TAG_NAME, "body")
    rect = body.rect  # dict with width/height/…

    # The pointer can only reach what is inside the viewport, and <body> is
    # usually taller than that, so bound the target area by both.
    viewport = driver.execute_script(
        "return [window.innerWidth, window.innerHeight];"
    ) or [rect.get("width", 1), rect.get("height", 1)]
    width = max(1, int(min(rect.get("width", 1), viewport[0])))
    height = max(1, int(min(rect.get("height", 1), viewport[1])))

    # Selenium 4 measures these offsets from the element's in-view centre,
    # not its top-left corner, so sample symmetrically around zero.
    max_dx = (width - 1) // 2
    max_dy = (height - 1) // 2

    for _ in range(random.randint(*MOUSE_MOVE_COUNT_RANGE)):
        dx = random.randint(-max_dx, max_dx)
        dy = random.randint(-max_dy, max_dy)
        try:
            # Perform each move on its own: the bounds error is only raised
            # by perform(), and a batched chain would abort every queued
            # move as soon as one of them is out of bounds.
            (
                ActionChains(driver)
                .move_to_element_with_offset(body, dx, dy)
                .pause(random.uniform(0.2, 0.8))
                .perform()
            )
        except MoveTargetOutOfBoundsException:
            # Skip this move (e.g. older Selenium using a top-left origin)
            continue


def read_codes(path: Path) -> List[str]:
    """Read codes from the first column of a CSV file.

    Blank rows, rows whose first cell is empty, and a header cell equal to
    ``onet_soc_code`` (case-insensitive) are skipped. Remaining values are
    returned stripped, in file order.

    Parameters
    ----------
    path : Path
        CSV file with one code per row in its first column.

    Returns
    -------
    List[str]
        The codes found in the file.
    """
    codes: List[str] = []
    # "utf-8-sig" strips a leading byte-order mark if one is present.
    # Otherwise the BOM sticks to the first cell, the header check fails,
    # and the header ends up in the list of codes.
    with path.open(newline="", encoding="utf-8-sig") as f:
        for row in csv.reader(f):
            if not row:
                continue
            code = row[0].strip()
            if code and code.lower() != "onet_soc_code":
                codes.append(code)
    return codes


def extract_fields(page_html: str) -> Dict[str, str]:
    """Extract the "synonyms" and "tasks" text from a page's HTML.

    Each field is the text found at a fixed absolute XPath. Its text nodes
    are stripped, empty ones are dropped, and the rest are joined with
    single spaces. A field whose XPath matches nothing is an empty string.
    """
    document = html.fromstring(page_html)

    def joined_text(xpath_expr: str) -> str:
        # Collect the non-empty, stripped text nodes for one XPath.
        pieces = []
        for node_text in document.xpath(xpath_expr):
            cleaned = node_text.strip()
            if cleaned:
                pieces.append(cleaned)
        return " ".join(pieces)

    return {
        "synonyms": joined_text("/html/body/div[1]/div[1]/div/div[2]/p/text()"),
        "tasks": joined_text("/html/body/div[1]/div[1]/div/div[2]/div[2]/div[1]/text()"),
    }


def scrape_codes(codes: List[str]) -> pd.DataFrame:
    """Visit the profile summary page of every code and collect its fields.

    Codes are visited in random order. For each one the page is loaded,
    random delays and simulated interaction are applied, and the fields
    returned by extract_fields() are recorded. The driver is always closed,
    even if a page fails.

    Parameters
    ----------
    codes : List[str]
        Codes to scrape. The caller's list is left untouched.

    Returns
    -------
    pd.DataFrame
        One row per visited code with columns ``onet_soc_code``,
        ``synonyms`` and ``tasks``.
    """
    # Shuffle a copy: shuffling `codes` itself would silently reorder the
    # caller's list.
    visit_order = list(codes)
    random.shuffle(visit_order)  # visit in random order

    driver = build_driver()
    rows: List[Dict[str, str]] = []

    try:
        for code in visit_order:
            url = f"https://www.mynextmove.org/profile/summary/{code}"
            driver.get(url)
            random_delay()
            simulate_human_interaction(driver)
            random_delay()
            data = extract_fields(driver.page_source)
            data["onet_soc_code"] = code
            rows.append(data)
            print(f"✓ {code} done")
    finally:
        driver.quit()

    return pd.DataFrame(rows, columns=["onet_soc_code", "synonyms", "tasks"])


# -----------------------------------------------------------------------------
# MAIN
# -----------------------------------------------------------------------------

def main():
    """Read the codes file, scrape every code, and write the results to CSV.

    Exits via SystemExit with a message when the codes file yields no codes.
    """
    codes_to_scrape = read_codes(CODES_CSV_PATH)
    if not codes_to_scrape:
        raise SystemExit("No codes found in codes.csv")

    scraped = scrape_codes(codes_to_scrape)
    scraped.to_csv(OUTPUT_CSV_PATH, index=False)

    record_count = len(scraped)
    print(f"Saved {record_count} records → {OUTPUT_CSV_PATH}")


# Run the scrape when this code is executed as the main module.
if __name__ == "__main__":
    main()


<IPython.core.display.Javascript object>

✓ 43-3021.00 done
✓ 51-2092.00 done
✓ 17-2199.00 done
✓ 29-2036.00 done
✓ 17-3029.01 done
✓ 25-1062.00 done
✓ 19-2043.00 done
✓ 11-1021.00 done
✓ 27-1022.00 done
✓ 31-9092.00 done
✓ 43-4141.00 done
✓ 33-9099.02 done
✓ 53-6031.00 done
✓ 17-2199.03 done
✓ 49-3092.00 done
✓ 35-1012.00 done
✓ 43-5021.00 done
✓ 15-1299.09 done
✓ 31-9099.00 done
✓ 25-3041.00 done
✓ 51-3091.00 done
✓ 25-1064.00 done
✓ 19-5011.00 done
✓ 17-3023.00 done
✓ 47-2072.00 done
✓ 41-9091.00 done
✓ 13-1081.02 done
✓ 29-2057.00 done
✓ 51-9071.06 done
✓ 25-1031.00 done
✓ 17-2161.00 done
✓ 45-4011.00 done
✓ 49-9044.00 done
✓ 21-1014.00 done
✓ 45-2093.00 done
✓ 51-3092.00 done
✓ 49-9097.00 done
✓ 49-9091.00 done
✓ 39-1013.00 done
✓ 15-1231.00 done
✓ 49-3011.00 done
✓ 13-1071.00 done
✓ 23-2093.00 done
✓ 27-4032.00 done
✓ 31-1133.00 done
✓ 51-6063.00 done
✓ 15-1299.04 done
✓ 43-3031.00 done
✓ 21-1022.00 done
✓ 17-2071.00 done
✓ 29-1125.00 done
✓ 53-7065.00 done
✓ 11-9039.00 done
✓ 43-5052.00 done
✓ 17-1012.00 done
✓ 51-8011.