# USDA Fruit & Vegetables Data Processor

This notebook implements the ETL pipeline for USDA Fruit and Vegetable retail price data.

## Setup & Imports

In [1]:
import os
import re
import tempfile
from collections import defaultdict
from pathlib import Path
from urllib.parse import urljoin, urlsplit
from zipfile import ZipFile

import openpyxl
import requests
from bs4 import BeautifulSoup

## Constants

These constants define the processing behavior:
- `LISTING_URL`: USDA data products page with download links
- `OUTPUT_DIR`: Destination for generated CSV files (layout: `alternative/usda/fruitandvegetables/{product}.csv`)
- `YEAR_REGEX`: Matches 4-digit years (1900-2099) in titles/filenames
- Unit mappings: Canonical forms for price and cup equivalent units

**Output format**: One CSV file per product (e.g., `apples.csv`) containing all forms (Fresh, Applesauce, Juice, etc.) as rows.
CSV columns: `Date,Form,AverageRetailPrice,Unit,PreparationYieldFactor,CupEquivalentSize,CupEquivalentUnit,PricePerCupEquivalent`

In [2]:
# --- Constants ---
LISTING_URL = "https://www.ers.usda.gov/data-products/fruit-and-vegetable-prices"

# Output directory: rooted at the TEMP_OUTPUT_DIRECTORY env var if set (for automated/cloud runs),
# otherwise at /temp-output-directory. The dataset sub-path is appended in both cases.
OUTPUT_DIR = (
    Path(os.environ.get("TEMP_OUTPUT_DIRECTORY", "/temp-output-directory")) / "alternative/usda/fruitandvegetables"
)

# Standalone 4-digit number starting with 19 or 20 (word boundaries on both sides).
YEAR_REGEX = re.compile(r"\b((?:19|20)\d{2})\b")
# Trailing footnote markers: digits, optionally comma-separated (e.g. "1,2"), at the end of a label.
FOOTNOTE_REGEX = re.compile(r"\s*\d+(?:,\d+)*\s*$")
# Lowercase labels that parse_data_row treats as group headers when the row carries no numeric data.
FORM_CATEGORY_LABELS = {
    "fresh",
    "canned",
    "frozen",
    "dried",
    "juice",
    "peas & carrots",
    "green peas & carrots",
    "succotash",
}

# --- XLSX Structure Constants ---
# Header rows appear at index 0-1; MAX_HEADER_SEARCH_ROWS provides safety margin.
# Data rows have 7-9 columns; MIN_DATA_ROW_COLUMNS is the minimum for a valid row.
MAX_HEADER_SEARCH_ROWS = 15
MIN_DATA_ROW_COLUMNS = 7

# Unit maps: key = lowercase substring searched for in the raw cell text, value = canonical unit.
# Note: Extended descriptions like "per pint (16 fluid ounces concentrate)" match via substring.
PRICE_UNIT_MAP = {
    "per pound": "per_pound",
    "per pint": "per_pint",
}
CUP_UNIT_MAP = {
    "pounds": "pounds",
    "pound": "pounds",
    "pints": "pints",
    "fluid ounces": "fluid_ounces",
    "fl oz": "fluid_ounces",
    "fl. oz.": "fluid_ounces",
}

## Helper Functions

Utility functions for text processing.

In [None]:
def slugify(text: str) -> str:
    """Return a lowercase, underscore-delimited slug of ``text``.

    After lowercasing, every run of characters outside ``a-z0-9`` is replaced by a
    single underscore, and underscores at either end of the result are removed.
    """
    lowered = text.lower()
    underscored = re.sub(r"[^a-z0-9]+", "_", lowered)
    return underscored.strip("_")


def collapse_whitespace(text: str) -> str:
    """Return ``text`` with each whitespace run reduced to one space and the ends trimmed."""
    words = text.split()
    return " ".join(words)


def normalize_form_separator(form: str) -> str:
    """Rewrite semicolon separators in a form name as ' - ' (space-hyphen-space).

    Keeps form names comparable across years whatever separator the source used:
    - "Fresh; Boiled" → "Fresh - Boiled"
    - "Juice; Ready to drink" → "Juice - Ready to drink"

    The caller converts commas to semicolons before calling this (commas would break
    the hand-built CSV line), so "Fresh, Boiled" also ends up as "Fresh - Boiled".
    Each semicolon is replaced together with any whitespace that follows it.
    """
    semicolon_with_trailing_space = re.compile(r";\s*")
    return semicolon_with_trailing_space.sub(" - ", form)


def lookup_canonical_unit(raw_unit_text: object, unit_map: dict[str, str]) -> str:
    """Map raw unit text from a cell to its canonical unit name.

    The text is whitespace-collapsed and lowercased, then each key of ``unit_map`` is
    tried as a substring in the map's iteration order; the first hit wins.

    Returns:
        The canonical unit for the first matching key, or "" when the input is None
        or no key matches.
    """
    if raw_unit_text is None:
        return ""

    normalized = collapse_whitespace(str(raw_unit_text)).lower()
    for pattern, canonical in unit_map.items():
        if pattern in normalized:
            return canonical
    return ""


def cell_to_csv(value: object) -> str:
    """Render a cell value as a stripped string; None (and blank text) yields ""."""
    return "" if value is None else str(value).strip()


def normalize_cup_equivalent(size_str: str, unit: str) -> tuple[str, str]:
    """Normalize cup equivalent units: convert fluid_ounces to pints (16 fl oz = 1 pint).

    Args:
        size_str: Cup-equivalent size as text (may be empty).
        unit: Canonical unit name; only "fluid_ounces" triggers a conversion.

    Returns:
        ``(size, unit)``. The input pair is returned unchanged when the size is empty,
        the unit is not "fluid_ounces", or the size cannot be parsed as a float.
        Otherwise the size is divided by 16 and returned with unit "pints"; trailing
        zeros and a dangling decimal point are trimmed (e.g. "10.0" -> "10").
    """
    if not size_str or unit != "fluid_ounces":
        return size_str, unit
    try:
        size_pints = float(size_str) / 16.0
    except ValueError:
        return size_str, unit

    text = str(size_pints)
    # Only trim plain decimal notation. Stripping zeros from an exponent form would
    # change the value (e.g. "1e+20" -> "1e+2", "1e-10" -> "1e-1").
    if "." in text and "e" not in text:
        text = text.rstrip("0").rstrip(".")
    return text, "pints"

## Download & Extract

Downloads XLSX/ZIP files from the USDA website and extracts them to a temporary directory.

In [4]:
def download_and_extract(temp_dir: str) -> list[Path]:
    """Download XLSX/ZIP files linked from LISTING_URL and return the XLSX paths.

    Every ``<a href>`` on the listing page whose URL path ends in ``.xlsx`` or ``.zip``
    (case-insensitive; query string and fragment ignored) is downloaded. ZIP archives
    are opened and their ``.xlsx`` members extracted.

    Each download is stored in its own sub-directory of ``temp_dir`` so that files (or
    archive members) sharing a name cannot overwrite each other.

    Args:
        temp_dir: Existing directory that receives the downloaded/extracted files.

    Returns:
        Paths of all XLSX files available for parsing. A failed download or extraction
        is reported on stdout and skipped (best effort), so the list may be partial.

    Raises:
        requests.RequestException: If the listing page itself cannot be fetched.

    Note: BeautifulSoup's dynamic attribute access doesn't have complete type stubs,
    so we use explicit str() conversion to satisfy type checkers.
    """
    print(f"Fetching {LISTING_URL}")
    response = requests.get(LISTING_URL, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    download_links: list[str] = []
    seen_urls: set[str] = set()
    for link_tag in soup.find_all("a", href=True):
        href: str = str(link_tag["href"]).strip()
        # urljoin resolves absolute, root-relative, page-relative and protocol-relative hrefs.
        url = urljoin(LISTING_URL, href)
        # Check the extension on the URL path only (handles ".xlsx?timestamp=123" and "#frag").
        if not urlsplit(url).path.lower().endswith((".xlsx", ".zip")):
            continue
        # A link repeated on the page would otherwise be parsed twice and duplicate rows.
        if url in seen_urls:
            continue
        seen_urls.add(url)
        download_links.append(url)

    print(f"Found {len(download_links)} files to download")
    xlsx_files: list[Path] = []

    for index, url in enumerate(download_links):
        filename = urlsplit(url).path.rsplit("/", 1)[-1]
        # Isolate each download so same-named files from different links/archives coexist.
        download_dir = Path(temp_dir) / f"download_{index:03d}"
        local_path = download_dir / filename

        try:
            print(f"Downloading {filename}")
            file_response = requests.get(url, timeout=60)
            file_response.raise_for_status()
            download_dir.mkdir(parents=True, exist_ok=True)
            local_path.write_bytes(file_response.content)

            # Case-insensitive, matching the link filter above.
            if filename.lower().endswith(".zip"):
                with ZipFile(local_path, "r") as zip_archive:
                    for archived_name in zip_archive.namelist():
                        if archived_name.lower().endswith(".xlsx"):
                            # extract() returns the sanitized path it actually wrote.
                            extracted_path = zip_archive.extract(archived_name, download_dir)
                            xlsx_files.append(Path(extracted_path))
            else:
                xlsx_files.append(local_path)
        except Exception as e:
            # Deliberate best effort: one bad file must not abort the whole run.
            print(f"Error downloading {filename}: {e}")

    return xlsx_files

## XLSX Parsing

Functions to parse XLSX files and extract price data from worksheets.

In [5]:
def find_header_row(rows: list[list[object]]) -> int:
    """Locate the table header within the first MAX_HEADER_SEARCH_ROWS rows.

    A row is the header when its joined, lowercased cell text contains both 'form' and
    'average retail price'. If a row fails on its own but passes when joined with the
    row that follows it (header split across two rows), the following row's index is
    returned.

    Returns:
        Index of the header row, or -1 when none is found.
    """

    def joined_text(cells: list[object]) -> str:
        # Falsy cells (None, "", 0) are skipped; the rest are whitespace-normalized.
        return " ".join(collapse_whitespace(str(cell)) for cell in cells if cell).lower()

    def is_header(candidate: str) -> bool:
        return "form" in candidate and "average retail price" in candidate

    row_count = len(rows)
    for index in range(min(row_count, MAX_HEADER_SEARCH_ROWS)):
        current = joined_text(rows[index])
        if is_header(current):
            return index

        following = index + 1
        if following >= row_count:
            continue
        if is_header(current + " " + joined_text(rows[following])):
            return following
    return -1


def extract_year(rows: list[list[object]], header_row_index: int, sheet_title: str, filename: str) -> int | None:
    """Return the first year found in the title rows, sheet name, or filename.

    Candidates are searched in that order: the first cell of each row above the header
    (rows with an empty first cell are skipped), then ``sheet_title``, then ``filename``.
    The first YEAR_REGEX match wins.

    Returns:
        The year as an int, or None when no candidate contains one.
    """
    title_cells = [str(row[0]) for row in rows[:header_row_index] if row and row[0]]
    for candidate in (*title_cells, sheet_title, filename):
        found = YEAR_REGEX.search(candidate)
        if found is not None:
            return int(found.group(1))
    return None


def extract_product_name(rows: list[list[object]], header_row_index: int, sheet_title: str) -> str:
    """Derive the product name from the first usable title row, else the sheet name.

    The first row above the header whose first cell is non-empty supplies the title.
    If the title contains an em dash, ' - ', or a spaced en dash (checked in that
    order), the text before the first occurrence is returned; otherwise the whole
    title is returned. With no usable title row the stripped ``sheet_title`` is used.
    """
    separators = ("\u2014", " - ", " \u2013 ")
    for cells in rows[:header_row_index]:
        if not cells or not cells[0]:
            continue
        heading = str(cells[0]).strip()
        separator = next((candidate for candidate in separators if candidate in heading), None)
        if separator is not None:
            return heading.partition(separator)[0].strip()
        if heading:
            return heading
    return sheet_title.strip()

In [None]:
def parse_data_row(
    row: list[object], product_name: str, date_str: str, current_group: str | None
) -> tuple[str, str, str] | tuple[str, str] | None:
    """Parse a data row into (product_code, form, csv_row), ("__GROUP__", group_name), or None.

    Args:
        row: Cell values of one worksheet row (form label in column 0, numeric/unit
            columns at indices 1-6).
        product_name: Product the sheet describes; slugified into the product code.
        date_str: Date string written as the first CSV column.
        current_group: Most recent group header seen (e.g. "Fresh"), or None.

    Returns:
        - (product_code, form, csv_row): Valid data row with product code, form, and CSV data
        - ("__GROUP__", group_name): Group header (e.g., "Fresh", "Canned") - signals context change
        - None: Skip row (footnote, source line, or insufficient data)
    """
    if len(row) < MIN_DATA_ROW_COLUMNS:
        return None

    # Collapse all whitespace, not just the ends: an embedded line break or repeated
    # spaces in the label would corrupt the hand-built CSV line and make form names
    # inconsistent across years.
    form_raw = collapse_whitespace(str(row[0] or ""))
    if not form_raw:
        return None

    # Skip non-data rows: footnotes (start with digit), source/contact lines
    form_lower = form_raw.lower()
    if form_raw[0].isdigit() or form_lower.startswith(("source", "contact", "errata")):
        return None

    # Get raw string values for numeric columns
    avg_price = cell_to_csv(row[1])
    yield_factor = cell_to_csv(row[3])
    cup_size = cell_to_csv(row[4])
    price_per_cup = cell_to_csv(row[6])
    has_numeric_data = any(v for v in (avg_price, yield_factor, cup_size, price_per_cup))

    # Check for group headers (Fresh, Canned, etc.) - must have no numeric data
    # Strip footnotes first (e.g., "Fresh1" → "Fresh") before checking
    form_normalized = FOOTNOTE_REGEX.sub("", form_raw).strip().lower()
    if form_normalized in FORM_CATEGORY_LABELS and not has_numeric_data:
        return ("__GROUP__", form_normalized.title())

    # All numeric values missing = not a data row (but wasn't a group header either)
    if not has_numeric_data:
        return None

    # Strip trailing footnote markers from form
    form = FOOTNOTE_REGEX.sub("", form_raw).strip()
    if not form:
        return None

    # Apply group context (e.g., "Florets" -> "Fresh - Florets")
    # Use hyphen instead of comma to avoid breaking CSV parsing
    if current_group:
        form_lower_clean = form.lower()
        if current_group.lower() not in form_lower_clean:
            form = f"{current_group} - {form}"

    # Sanitize any remaining commas in form names (e.g., "Juice, Ready to drink")
    # Replace with semicolon to preserve readability while avoiding CSV column issues
    form = form.replace(",", ";")

    # Normalize form separators to consistent ' - ' pattern
    # This ensures "Fresh; Boiled" (2024 format) matches "Fresh - Boiled" (earlier years)
    form = normalize_form_separator(form)

    # Parse units (only when corresponding value exists)
    price_unit = lookup_canonical_unit(row[2], PRICE_UNIT_MAP) if avg_price else ""
    cup_unit = lookup_canonical_unit(row[5], CUP_UNIT_MAP) if cup_size else ""

    # Normalize cup equivalent units (fluid_ounces → pints)
    cup_size, cup_unit = normalize_cup_equivalent(cup_size, cup_unit)

    # Product code is just the product name slugified
    product_code = slugify(product_name)

    # The CSV row is complete here: date first, form second, then the value columns.
    # Format: Date,Form,AverageRetailPrice,Unit,PreparationYieldFactor,CupEquivalentSize,CupEquivalentUnit,PricePerCupEquivalent
    csv_row = f"{date_str},{form},{avg_price},{price_unit},{yield_factor},{cup_size},{cup_unit},{price_per_cup}"

    return product_code, form, csv_row

In [7]:
def parse_xlsx(file_path: Path, product_csv_rows: dict[str, list[str]]) -> None:
    """Parse every worksheet of one XLSX file and append its rows to ``product_csv_rows``.

    Rows are keyed by product code only; the different forms of a product end up as
    separate rows under the same key, with the form as the second CSV column.

    Sheets without a recognizable header row or without a year are reported on stdout
    and skipped.

    Args:
        file_path: Workbook to read.
        product_csv_rows: Mapping product_code -> CSV rows, mutated in place. Indexing
            a new product code must yield a list (the caller passes a defaultdict).

    Note: openpyxl's cell.value has complex union types, so rows are annotated as
    list[list[object]] ("list of rows, each a list of cell values").
    """
    workbook = openpyxl.load_workbook(file_path, data_only=True)

    for sheet in workbook.worksheets:
        rows: list[list[object]] = []
        for sheet_row in sheet.iter_rows():
            rows.append([cell.value for cell in sheet_row])
        if len(rows) == 0:
            continue

        header_row_index = find_header_row(rows)
        if header_row_index < 0:
            print(f"Warning: No header found in {file_path.name} sheet {sheet.title}")
            continue

        year = extract_year(rows, header_row_index, sheet.title, file_path.name)
        if not year:
            print(f"Warning: No year found in {file_path.name} sheet {sheet.title}")
            continue

        product_name = extract_product_name(rows, header_row_index, sheet.title)
        # Date is January 1st of the following year: offsetting by one year prevents
        # look-ahead bias (publication date unknown).
        date_str = f"{year + 1}0101"

        first_data_index = header_row_index + 1
        active_group = None  # Latest group header seen (Fresh, Canned, etc.)
        for offset, cells in enumerate(rows[first_data_index:]):
            # 1-based row number of this row in the sheet, used only for warnings.
            row_idx = first_data_index + offset + 1
            try:
                parsed = parse_data_row(cells, product_name, date_str, active_group)
                if parsed is None:
                    continue
                if parsed[0] == "__GROUP__":
                    active_group = parsed[1]
                    continue
                # Data row: (product_code, form, csv_row)
                product_code, _form, csv_row = parsed
                product_csv_rows[product_code].append(csv_row)
            except ValueError as e:
                print(f"Warning: {file_path.name} sheet {sheet.title} row {row_idx}: {e}")


def parse_all_files(xlsx_files: list[Path]) -> dict[str, list[str]]:
    """Parse each XLSX file and collect the CSV rows per product code.

    A file that raises while being parsed is reported on stdout and skipped, so one
    bad workbook does not abort the run.

    Returns:
        Mapping product_code -> unsorted CSV rows. Every row already carries its date
        and form, so ordering can be applied later.
    """
    rows_by_product: dict[str, list[str]] = defaultdict(list)
    for file_path in xlsx_files:
        try:
            parse_xlsx(file_path, rows_by_product)
        except Exception as e:
            print(f"Error parsing {file_path.name}: {e}")
    return rows_by_product

## Output

Writes one CSV file per product (not per form), containing all forms sorted by date then form name.

In [8]:
def write_output(product_csv_rows: dict[str, list[str]]) -> None:
    """Write one CSV file per product, containing all forms sorted by date then form.

    Each product file (e.g., apples.csv) contains rows for all forms (Fresh, Applesauce, etc.)
    with CSV format: Date,Form,AverageRetailPrice,Unit,PreparationYieldFactor,CupEquivalentSize,CupEquivalentUnit,PricePerCupEquivalent

    Files are written to OUTPUT_DIR (created if missing) as UTF-8, with rows separated
    by newlines and no header line.

    Args:
        product_csv_rows: Mapping product_code -> CSV rows; each row must contain at
            least the date and form columns.
    """
    # Create the target directory here as well, so the function works when called directly.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    def sort_key(csv_row: str) -> tuple[str, str]:
        # Only the first two columns matter; maxsplit avoids splitting the whole row.
        fields = csv_row.split(",", 2)
        return fields[0], fields[1]

    for product_code, csv_rows in sorted(product_csv_rows.items()):
        output_path = OUTPUT_DIR / f"{product_code}.csv"
        # Sort by date (column 0), then by form (column 1) for consistent output
        sorted_rows = sorted(csv_rows, key=sort_key)
        # Explicit encoding: the platform default varies and may not encode every form name.
        output_path.write_text("\n".join(sorted_rows), encoding="utf-8")
        print(f"Wrote {len(sorted_rows)} rows to {output_path.name}")

## Main Entry Point

Run the full ETL pipeline.

In [9]:
def main() -> int:
    """Run the pipeline: download, parse, and write the per-product CSV files.

    Returns:
        0 on success; 1 when no XLSX files were obtained or no data could be parsed.
    """
    print("USDA Fruit & Vegetables Data Processor")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Downloads live in a temporary directory that is removed when the block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        xlsx_files = download_and_extract(temp_dir)
        if len(xlsx_files) == 0:
            print("Error: No XLSX files found")
            return 1

        product_csv_rows = parse_all_files(xlsx_files)
        if len(product_csv_rows) == 0:
            print("Error: No data parsed")
            return 1

        write_output(product_csv_rows)
        print(f"Generated {len(product_csv_rows)} product files")

    print("Processing complete")
    return 0


# Execute the pipeline when this cell runs. SystemExit is raised only for a non-zero
# exit code so callers can detect failure; on success nothing is raised, because
# nbconvert would treat even SystemExit(0) as an exception.
_exit_code = main()
if _exit_code:
    raise SystemExit(_exit_code)

USDA Fruit & Vegetables Data Processor
Fetching https://www.ers.usda.gov/data-products/fruit-and-vegetable-prices


Found 82 files to download
Downloading apples-average-retail-price-per-pound-or-pint-and-per-cup-equivalent.xlsx


Downloading apricots-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading bananas-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading berries-mixed-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading blackberries-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading blueberries-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading cantaloupe-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading cherries-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading clementines-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading cranberries-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading dates-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading figs-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading fruit-cocktail-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading grapefruit-average-retail-price-per-pound-or-pint-and-per-cup-equivalent.xlsx


Downloading grapes-average-retail-price-per-pound-or-pint-and-per-cup-equivalent.xlsx


Downloading honeydew-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading kiwi-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading mangoes-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading nectarines-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading oranges-average-retail-price-per-pound-or-pint-and-per-cup-equivalent.xlsx


Downloading papaya-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading peaches-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading pears-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading pineapple-average-retail-price-per-pound-or-pint-and-per-cup-equivalent.xlsx


Downloading plums-average-retail-price-per-pound-or-pint-and-per-cup-equivalent.xlsx


Downloading pomegranate-average-retail-price-per-pound-or-pint-and-per-cup-equivalent.xlsx


Downloading raspberries-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading strawberries-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading watermelon-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading acorn-squash-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading artichoke-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading asparagus-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading avocados-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading beets-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading black-beans-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading blackeye-peas-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading broccoli-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading brussels-sprouts-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading butternut-squash-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading cabbage-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading carrots-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading cauliflower-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading celery-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading collard-greens-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading sweet-corn-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading cucumbers-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading great-northern-beans-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading green-beans-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading green-peas-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading green-peppers-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading kale-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading kidney-beans-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading lentils-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading iceberg-lettuce-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading romaine-lettuce-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading lima-beans-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading mixed-vegetables-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading mushrooms-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading mustard-greens-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading navy-beans-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading okra-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading olives-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading onions-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading pinto-beans-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading potatoes-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading pumpkin-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading radish-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading red-peppers-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading spinach-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading sweet-potatoes-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading tomatoes-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading turnip-greens-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading zucchini-average-retail-price-per-pound-and-per-cup-equivalent.xlsx


Downloading archived-2013-data-tables-for-fruit.zip


Downloading archived-2016-data-tables-for-fruit.zip


Downloading archived-2020-data-tables-for-fruit.zip


Downloading archived-2022-data-tables-for-fruit.zip


Downloading archived-2013-data-tables-for-vegetables.zip


Downloading archived-2016-data-tables-for-vegetables.zip


Downloading archived-2020-data-tables-for-vegetables.zip


Downloading archived-2022-data-tables-for-vegetables.zip


Downloading archived-data-tables-for-snack-substitutions.zip


Wrote 5 rows to acorn_squash.csv
Wrote 20 rows to apples.csv
Wrote 20 rows to apricots.csv
Wrote 12 rows to artichoke.csv
Wrote 15 rows to asparagus.csv
Wrote 5 rows to avocados.csv
Wrote 5 rows to bananas.csv
Wrote 5 rows to beets.csv
Wrote 10 rows to black_beans.csv
Wrote 10 rows to blackberries.csv
Wrote 10 rows to blackeye_peas.csv
Wrote 10 rows to blueberries.csv
Wrote 15 rows to broccoli.csv
Wrote 10 rows to brussels_sprouts.csv
Wrote 5 rows to butternut_squash.csv
Wrote 15 rows to cabbage.csv
Wrote 5 rows to cantaloupe.csv
Wrote 25 rows to carrots.csv
Wrote 15 rows to cauliflower.csv
Wrote 10 rows to celery.csv
Wrote 10 rows to cherries.csv
Wrote 3 rows to clementines.csv
Wrote 15 rows to collard_greens.csv
Wrote 5 rows to cranberries.csv
Wrote 10 rows to cucumbers.csv
Wrote 5 rows to dates.csv
Wrote 5 rows to figs.csv
Wrote 10 rows to fruit_cocktail.csv
Wrote 11 rows to grapefruit.csv
Wrote 20 rows to grapes.csv
Wrote 10 rows to great_northern_beans.csv
Wrote 15 rows to green_b