# EWG Scrape – Extract/Transform Smoke Test

This notebook does a *small* end-to-end scrape test:
- Discover ~5 products from one EWG category
- Extract ingredients from the product pages (keep ~10 unique ingredients)
- Transform into SQL-like tables (products / ingredient dimension / junction)
- (Optional) Enrich ingredients via DeepSeek if `DEEPSEEK_API_KEY` is set

Notes:
- Requires internet access to `https://www.ewg.org`. If EWG blocks requests or the layout changes, the scrape cells catch the error, print it, and continue with empty results (later cells then report that there is nothing to process).
- The enrichment step is optional; if no API key is configured, no DeepSeek calls are made and empty arrays are attached instead.

In [1]:
from __future__ import annotations

import os
import sys
import time
from pathlib import Path

import pandas as pd

# Notebook-wide pandas display limits: column text is shown up to 120 characters
# (the URLs and list-valued cells below are long) and at most 50 rows are rendered.
pd.set_option('display.max_colwidth', 120)
pd.set_option('display.max_rows', 50)

def find_repo_root(start: Path | None = None) -> Path:
    """Locate the repository root by searching upwards from ``start``.

    A directory qualifies as the root when it contains both a
    ``docker-compose.yml`` entry and a ``scraper`` entry.

    Args:
        start: Directory to begin the search from; the current working
            directory is used when omitted.

    Returns:
        The first qualifying directory, as a resolved path.

    Raises:
        RuntimeError: If neither the starting directory nor any of its nine
            nearest ancestors qualifies.
    """
    origin = (start if start else Path.cwd()).resolve()

    # The search is capped at ten candidates: the origin plus nine ancestors.
    for candidate in (origin, *origin.parents)[:10]:
        has_compose_file = (candidate / 'docker-compose.yml').exists()
        if has_compose_file and (candidate / 'scraper').exists():
            return candidate

    raise RuntimeError('Could not find repo root (expected docker-compose.yml + scraper/)')

# Resolve the project layout once; the directories below are derived from ROOT.
ROOT = find_repo_root()
SCRAPER_DIR = ROOT / 'scraper'
SCRIPTS_DIR = SCRAPER_DIR / 'scripts'

# Make the existing scripts importable:
# - scrape_ewg.py is in scraper/scripts and is imported as `import scrape_ewg`
# - utils/ is in scraper/utils and is imported as `from utils...`
# Any earlier copy of an entry is removed before it is re-inserted at the front, so
# re-running this cell keeps the same lookup priority (SCRAPER_DIR first, then
# SCRIPTS_DIR) without growing sys.path by two entries per run.
for _import_dir in (str(SCRIPTS_DIR), str(SCRAPER_DIR)):
    while _import_dir in sys.path:
        sys.path.remove(_import_dir)
    sys.path.insert(0, _import_dir)

print('ROOT:', ROOT)
print('SCRIPTS_DIR:', SCRIPTS_DIR)
print('Python:', sys.executable)

ROOT: D:\IPSSI\skinintel
SCRIPTS_DIR: D:\IPSSI\skinintel\scraper\scripts
Python: c:\Users\abd el moumene\AppData\Local\Programs\Python\Python312\python.exe


In [2]:
# Imports from the existing project
from scrape_ewg import (
    build_session,
    build_sql_like_tables,
    scrape_ewg_category_products,
    scrape_ewg_product_ingredients_from_product_page,
    setup_logging,
)

# Initialise the project's logging, then create the session object that every scrape
# call below receives through its `session=` argument.
# NOTE(review): both helpers come from scrape_ewg; what exactly they configure
# (log format, headers, retries, ...) is not visible from this notebook -- confirm there.
setup_logging()
sess = build_session()

print('Ready.')

Ready.


In [3]:
# Parameters (tweak as needed).
# Each value can be overridden through the environment variable named next to it.
CATEGORY = os.environ.get('EWG_CATEGORY', 'Facial Cleanser')
N_PRODUCTS = int(os.environ.get('EWG_N_PRODUCTS', '5'))
N_UNIQUE_INGREDIENTS = int(os.environ.get('EWG_N_INGREDIENTS', '10'))

# Passed as `timeout=` to the scrape calls.
# NOTE(review): presumably a (connect, read) pair in seconds -- confirm in scrape_ewg.
REQUEST_TIMEOUT = (5.0, 30.0)
# Seconds; passed as `delay=` to discovery and slept between product-page requests.
DELAY_S = 0.35

for _label, _value in (
    ('CATEGORY', CATEGORY),
    ('N_PRODUCTS', N_PRODUCTS),
    ('N_UNIQUE_INGREDIENTS', N_UNIQUE_INGREDIENTS),
):
    print(f'{_label}:', _value)

CATEGORY: Facial Cleanser
N_PRODUCTS: 5
N_UNIQUE_INGREDIENTS: 10


In [4]:
# 1) DISCOVER (scrape one category page, keep ~5 products)
# Any failure is reported and replaced by an empty, correctly-labelled frame so the
# cells further down can detect "nothing discovered" instead of crashing.
try:
    discovery_kwargs = dict(
        category=CATEGORY,
        start_page=1,
        max_pages=1,
        max_products=N_PRODUCTS,
        delay=DELAY_S,
        timeout=REQUEST_TIMEOUT,
        session=sess,
    )
    products = scrape_ewg_category_products(**discovery_kwargs)
    # One row per discovered product, built from each result object's attributes.
    products_df = pd.DataFrame([vars(found) for found in products])
except Exception as exc:
    products_df = pd.DataFrame(columns=['category', 'company', 'product', 'url'])
    print('Discovery failed:', repr(exc))

products_df

2026-01-30 02:38:45 | INFO | scrape_ewg | Scraping category=Facial Cleanser page=1


Unnamed: 0,category,company,product,url
0,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/
1,Facial Cleanser,Dr. Bronner's,Dr. Bronner's,https://www.ewg.org/skindeep/products/1049349-Dr_Bronners_All_One_Hemp_Pure_Castile_Bar_Soap_Green_Tea/
2,Facial Cleanser,Just the Goods,Just the Goods,https://www.ewg.org/skindeep/products/765583-Just_the_Goods_moisturizing_vegan_face_wash_for_oilycombination_skin/
3,Facial Cleanser,Dr. Bronner's,Dr. Bronner's,https://www.ewg.org/skindeep/products/1049348-Dr_Bronners_All_One_Hemp_Pure_Castile_Bar_Soap_Sandalwood_Jasmine/
4,Facial Cleanser,Honey Sweetie Acres,Honey Sweetie Acres,https://www.ewg.org/skindeep/products/829297-Honey_Sweetie_Acres_Goat_Milk_Soap_Avocado_Facial/


In [5]:
# 2) EXTRACT (scrape ingredients per product page; keep ~10 unique ingredients total)
#
# NOTE: ingredients are de-duplicated across the whole run, so an ingredient shared by
# several products is only recorded against the first product it appears in. That is
# acceptable for this small sample but is not a complete product/ingredient mapping.
ingredient_rows: list[dict] = []

if products_df.empty:
    print('No products discovered; cannot extract ingredients.')
else:
    seen_ingredient_keys: set[str] = set()

    for i, p in enumerate(products_df.to_dict(orient='records'), start=1):
        # Stop before issuing another request once the sample is large enough.
        if len(ingredient_rows) >= N_UNIQUE_INGREDIENTS:
            break

        # Pause before every request after the first one -- also when the previous
        # request failed -- so a struggling or rate-limiting server is not hit again
        # immediately.
        if i > 1:
            time.sleep(DELAY_S)

        url = str(p['url'])
        print(f'[{i}/{len(products_df)}] Scraping ingredients:', url)

        try:
            ing = scrape_ewg_product_ingredients_from_product_page(
                url,
                max_ingredients=None,
                timeout=REQUEST_TIMEOUT,
                session=sess,
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next product.
            print('  -> ingredient scrape failed:', repr(e))
            continue

        for r in ing:
            if len(ingredient_rows) >= N_UNIQUE_INGREDIENTS:
                break

            # Rows without an ingredient name are unusable. The name is checked on its
            # own because the combined key below always contains the '|' separator and
            # therefore is never empty.
            ingredient_name = str(r.get('ingredient') or '').strip()
            if not ingredient_name:
                continue

            ingredient_link = str(r.get('ingredient_url') or '').strip()
            key = (ingredient_name + '|' + ingredient_link).lower()
            if key in seen_ingredient_keys:
                continue
            seen_ingredient_keys.add(key)

            functions = r.get('functions')
            concerns = r.get('concerns')
            ingredient_rows.append(
                {
                    'category': p.get('category'),
                    'company': p.get('company'),
                    'product': p.get('product'),
                    'product_url': url,
                    'ingredient': r.get('ingredient'),
                    'ingredient_url': r.get('ingredient_url'),
                    # Anything that is not a list is replaced by an empty list.
                    'functions': functions if isinstance(functions, list) else [],
                    'concerns': concerns if isinstance(concerns, list) else [],
                }
            )

ingredients_df = pd.DataFrame(ingredient_rows)
ingredients_df

[1/5] Scraping ingredients: https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/


Unnamed: 0,category,company,product,product_url,ingredient,ingredient_url,functions,concerns
0,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,AVENA SATIVA (OAT) KERNEL FLOUR,https://www.ewg.org/skindeep/ingredients/700581-AVENA_SATIVA_OAT_KERNEL_FLOUR/,"[abrasive, absorbent, bulking, viscosity controlling, bulking agent, viscosity increasing agent - aqueous]","[Allergies/immunotoxicity (low), Use restrictions (low)]"
1,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,CELLULOSE ACETATE,https://www.ewg.org/skindeep/ingredients/717526-CELLULOSE_ACETATE/,"[film forming, film former]",[]
2,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,COCONUT FATTY ACID,https://www.ewg.org/skindeep/ingredients/701551-COCONUT_FATTY_ACID/,"[cleansing, emollient, emulsifying, surfactant, surfactant - cleansing agentsurfactant-cleansing agent is included a...","[Multiple, additive exposure sources (low)]"
3,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,DISODIUM COCOYL GLUTAMATE,https://www.ewg.org/skindeep/ingredients/702141-DISODIUM_COCOYL_GLUTAMATE/,"[cleansing, surfactant, surfactant - cleansing agent]","[Use restrictions (low), Irritation (skin, eyes, or lungs) (low)]"
4,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,GLUTAMIC ACID,https://www.ewg.org/skindeep/ingredients/702604-GLUTAMIC_ACID/,"[antistatic, hair conditioning, humectant, fragrance ingredient, hair conditioning agent, skin-conditioning agent - ...","[Irritation (skin, eyes, or lungs) (low)]"
5,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,KAOLIN,https://www.ewg.org/skindeep/ingredients/703305-KAOLIN/,"[abrasive, absorbent, anticaking, bulking, opacifying, anticaking agent, bulking agent, opacifying agent, skin prote...",[]
6,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,MAGNESIUM OXIDE,https://www.ewg.org/skindeep/ingredients/703703-MAGNESIUM_OXIDE/,"[absorbent, buffering, opacifying, opacifying agent, ph adjuster]",[Enhanced skin absorption]
7,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,MICROCRYSTALLINE CELLULOSE,https://www.ewg.org/skindeep/ingredients/703956-MICROCRYSTALLINE_CELLULOSE/,"[abrasive, absorbent, anticaking agent, bulking agent, emulsion stabilizer, slip modifier;viscosity increasing agent...",[Allergies/immunotoxicity (moderate)]
8,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,ORYZA SATIVA (RICE) STARCH,https://www.ewg.org/skindeep/ingredients/704365-ORYZA_SATIVA_RICE_STARCH/,"[absorbent, binding, bulking, viscosity controlling, bulking agent]","[Use restrictions (low), Contamination concerns (PESTICIDES)]"
9,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,POPULUS TREMULOIDES BARK EXTRACT,https://www.ewg.org/skindeep/ingredients/722643-POPULUS_TREMULOIDES_BARK_EXTRACT/,"[antiseborrhoeic, skin conditioning, not reported]","[Cancer (low), Developmental/reproductive toxicity (low), Allergies/immunotoxicity (low), Use restrictions (high), E..."


In [6]:
# 3) TRANSFORM (build tables similar to what the DB pipeline writes)
if not products_df.empty and not ingredients_df.empty:
    # build_sql_like_tables expects: products_df has `url` OR `product_url`; ingredients_df has `product_url`
    product_columns = ['category', 'company', 'product', 'url']
    products_out, ingredients_dim, product_ingredients = build_sql_like_tables(
        products_df=products_df[product_columns].copy(),
        ingredients_df=ingredients_df.copy(),
    )
else:
    # Nothing to transform: fall back to three independent empty frames so the
    # reporting below and the later cells still find the names they expect.
    print('Missing products or ingredients; cannot transform.')
    products_out, ingredients_dim, product_ingredients = (pd.DataFrame() for _ in range(3))

transformed_tables = {
    'products_out': products_out,
    'ingredients_dim': ingredients_dim,
    'product_ingredients': product_ingredients,
}

# Shapes first, then a preview of each table (same order as the shapes).
for table_name, table in transformed_tables.items():
    print(f'{table_name}:', table.shape)

for table in transformed_tables.values():
    display(table.head())

products_out: (5, 5)
ingredients_dim: (10, 5)
product_ingredients: (10, 2)


Unnamed: 0,category,company,product,url,ingredient_ids
0,Facial Cleanser,Honest Beauty,Honest Beauty,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,"[""ing_12925a216138cb7f"", ""ing_3345cc8aaec57313"", ""ing_35a1abecacd1f9ba"", ""ing_60ae9e34f35660c8"", ""ing_613012f31fb02d..."
1,Facial Cleanser,Dr. Bronner's,Dr. Bronner's,https://www.ewg.org/skindeep/products/1049349-Dr_Bronners_All_One_Hemp_Pure_Castile_Bar_Soap_Green_Tea/,[]
2,Facial Cleanser,Just the Goods,Just the Goods,https://www.ewg.org/skindeep/products/765583-Just_the_Goods_moisturizing_vegan_face_wash_for_oilycombination_skin/,[]
3,Facial Cleanser,Dr. Bronner's,Dr. Bronner's,https://www.ewg.org/skindeep/products/1049348-Dr_Bronners_All_One_Hemp_Pure_Castile_Bar_Soap_Sandalwood_Jasmine/,[]
4,Facial Cleanser,Honey Sweetie Acres,Honey Sweetie Acres,https://www.ewg.org/skindeep/products/829297-Honey_Sweetie_Acres_Goat_Milk_Soap_Avocado_Facial/,[]


Unnamed: 0,ingredient_id,ingredient,ingredient_url,functions,concerns
0,ing_12925a216138cb7f,MICROCRYSTALLINE CELLULOSE,https://www.ewg.org/skindeep/ingredients/703956-MICROCRYSTALLINE_CELLULOSE/,"[abrasive, absorbent, anticaking agent, bulking agent, emulsion stabilizer, slip modifier;viscosity increasing agent...",[Allergies/immunotoxicity (moderate)]
1,ing_3345cc8aaec57313,DISODIUM COCOYL GLUTAMATE,https://www.ewg.org/skindeep/ingredients/702141-DISODIUM_COCOYL_GLUTAMATE/,"[cleansing, surfactant, surfactant - cleansing agent]","[Use restrictions (low), Irritation (skin, eyes, or lungs) (low)]"
2,ing_35a1abecacd1f9ba,ORYZA SATIVA (RICE) STARCH,https://www.ewg.org/skindeep/ingredients/704365-ORYZA_SATIVA_RICE_STARCH/,"[absorbent, binding, bulking, viscosity controlling, bulking agent]","[Use restrictions (low), Contamination concerns (PESTICIDES)]"
3,ing_60ae9e34f35660c8,AVENA SATIVA (OAT) KERNEL FLOUR,https://www.ewg.org/skindeep/ingredients/700581-AVENA_SATIVA_OAT_KERNEL_FLOUR/,"[abrasive, absorbent, bulking, viscosity controlling, bulking agent, viscosity increasing agent - aqueous]","[Allergies/immunotoxicity (low), Use restrictions (low)]"
4,ing_613012f31fb02d14,COCONUT FATTY ACID,https://www.ewg.org/skindeep/ingredients/701551-COCONUT_FATTY_ACID/,"[cleansing, emollient, emulsifying, surfactant, surfactant - cleansing agentsurfactant-cleansing agent is included a...","[Multiple, additive exposure sources (low)]"


Unnamed: 0,product_url,ingredient_id
0,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,ing_60ae9e34f35660c8
1,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,ing_f8dbe1cb1d166c7c
2,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,ing_613012f31fb02d14
3,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,ing_3345cc8aaec57313
4,https://www.ewg.org/skindeep/products/970204-Honest_Beauty_Skin_Sweep_Exfoliating_Powder_Cleanser/,ing_d81f4a7eabe26c8c


## Optional: Enrich (DeepSeek)

If you set `DEEPSEEK_API_KEY` in your environment, the next cell will call DeepSeek for each ingredient (up to the small sample size) and add:
- `skin_type_compatibility`
- `interactions`
- `recommendation_time`

If no API key is configured, it will just attach empty arrays (so you can still test the wiring).

In [None]:
# Enrichment via DeepSeek – standalone (no DB / psycopg2 required)
import json
import requests
from dataclasses import dataclass
from typing import Any

# Credentials come from the environment ONLY. An API key must never be committed as a
# fallback default: a key that was embedded in a previous revision of this notebook has
# to be treated as leaked and revoked with the provider.
# When the variable is unset (or blank) the key is the empty string, which makes
# enrich_one_local return empty arrays without calling the API.
DEEPSEEK_API_KEY = os.getenv('DEEPSEEK_API_KEY', '').strip()
DEEPSEEK_BASE_URL = os.getenv('DEEPSEEK_BASE_URL', 'https://api.deepseek.com/v1')
DEEPSEEK_MODEL = os.getenv('DEEPSEEK_MODEL', 'deepseek-chat')

# System message sent with every enrichment request.
SYSTEM_PROMPT = (
    "You are a cosmetic ingredient analyst. "
    "Return ONLY valid JSON. No markdown. No extra keys. "
    "If unknown, use empty arrays."
)

def _build_user_prompt(ingredient: str, functions: list[str], concerns: list[str]) -> str:
    payload: dict[str, Any] = {
        "task": "enrich_cosmetic_ingredient",
        "ingredient": ingredient,
        "output_schema": {
            "skin_type_compatibility": ["oily", "dry", "combination", "normal", "sensitive", "acne_prone", "mature", "all"],
            "interactions": ["<list of ingredients that should NOT be used with this ingredient>"],
            "recommendation_time": ["morning", "evening"],
        },
        "rules": [
            "Return ONLY these 3 keys: skin_type_compatibility, interactions, recommendation_time.",
            "All values must be arrays of lowercase snake_case strings.",
            "skin_type_compatibility: which skin types benefit from or tolerate this ingredient.",
            "interactions: list ONLY ingredients that have NEGATIVE interactions with this ingredient (should NOT be used together due to irritation, reduced efficacy, or instability). Examples: retinol + aha, vitamin_c + benzoyl_peroxide, niacinamide + vitamin_c (disputed), retinol + benzoyl_peroxide. Return empty array if no known negative interactions.",
            "recommendation_time: morning, evening, or both.",
            "Keep arrays reasonably sized (<=8 items).",
        ],
    }
    if functions:
        payload["known_functions"] = functions[:10]
    if concerns:
        payload["known_concerns"] = concerns[:10]
    return json.dumps(payload, ensure_ascii=False)

def _norm_token(s: str) -> str:
    v = (s or "").strip().lower().replace("-", "_").replace(" ", "_")
    while "__" in v:
        v = v.replace("__", "_")
    return v.strip("_")

def _normalize_str_list(x: Any, max_len: int) -> list[str]:
    if not isinstance(x, list):
        return []
    out: list[str] = []
    for item in x:
        tok = _norm_token(str(item))
        if tok and tok not in out:
            out.append(tok)
        if len(out) >= max_len:
            break
    return out

def _normalize_enrichment(obj: Any) -> dict[str, Any]:
    if not isinstance(obj, dict):
        return {"skin_type_compatibility": [], "interactions": [], "recommendation_time": []}

    allowed_skin = {"oily", "dry", "combination", "normal", "sensitive", "acne_prone", "mature", "all"}
    allowed_times = {"morning", "evening"}

    skin_types = [s for s in _normalize_str_list(obj.get("skin_type_compatibility"), 8) if s in allowed_skin]
    # interactions: negative interactions only (no whitelist)
    interactions = _normalize_str_list(obj.get("interactions"), 12)
    recommendation_time = [t for t in _normalize_str_list(obj.get("recommendation_time"), 2) if t in allowed_times]

    return {
        "skin_type_compatibility": skin_types,
        "interactions": interactions,
        "recommendation_time": recommendation_time,
    }

def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    body = (text or "").strip()
    if body.startswith("```"):
        # Drop the opening fence line, which may carry a language tag (```json).
        first_newline = body.find("\n")
        body = body[first_newline + 1:] if first_newline != -1 else body[3:]
        body = body.rstrip()
        if body.endswith("```"):
            body = body[:-3]
    return body.strip()


def enrich_one_local(ingredient: str, functions: list[str], concerns: list[str]) -> dict:
    """Ask the DeepSeek chat-completions endpoint to enrich one ingredient.

    Args:
        ingredient: Ingredient name.
        functions: Known functions, passed to the prompt as context.
        concerns: Known concerns, passed to the prompt as context.

    Returns:
        The normalised enrichment dict with the keys ``skin_type_compatibility``,
        ``interactions`` and ``recommendation_time``. When no API key is
        configured, no request is made and all three lists are empty.

    Raises:
        requests.HTTPError: The API answered with an error status.
        KeyError, IndexError: The response does not have the expected structure.
        json.JSONDecodeError: The model's reply is not valid JSON.
    """
    if not DEEPSEEK_API_KEY:
        return {"skin_type_compatibility": [], "interactions": [], "recommendation_time": []}

    url = f"{DEEPSEEK_BASE_URL.rstrip('/')}/chat/completions"
    headers = {"Authorization": f"Bearer {DEEPSEEK_API_KEY}", "Content-Type": "application/json"}
    body = {
        "model": DEEPSEEK_MODEL,
        "temperature": 0.2,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": _build_user_prompt(ingredient, functions, concerns)},
        ],
    }
    resp = requests.post(url, headers=headers, json=body, timeout=45)
    resp.raise_for_status()
    content = resp.json()["choices"][0]["message"]["content"]
    # Chat models sometimes wrap their JSON in a Markdown code fence even when told
    # not to; remove it so an otherwise valid reply is not discarded.
    parsed = json.loads(_strip_code_fence(content))
    return _normalize_enrichment(parsed)

# ---- Run enrichment on ingredients_dim ----
if not ingredients_dim.empty:
    rows = []
    for record in ingredients_dim.to_dict(orient='records'):
        ing_name = str(record.get('ingredient') or '').strip()
        if ing_name:
            try:
                known_functions = record.get('functions')
                known_concerns = record.get('concerns')
                enr = enrich_one_local(
                    ingredient=ing_name,
                    functions=known_functions if isinstance(known_functions, list) else [],
                    concerns=known_concerns if isinstance(known_concerns, list) else [],
                )
            except Exception as exc:
                # Best effort: report the failure and keep the row with empty enrichment.
                print('Enrichment failed for', ing_name, '|', repr(exc))
                enr = dict(skin_type_compatibility=[], interactions=[], recommendation_time=[])

            # Original columns first, enrichment columns appended after them.
            rows.append({**record, **enr})

            # Only pause between rows when real API calls are being made.
            if DEEPSEEK_API_KEY:
                time.sleep(0.6)

    enriched_dim = pd.DataFrame(rows)
else:
    print('No ingredient dimension to enrich.')
    enriched_dim = ingredients_dim.copy()

print('DEEPSEEK_API_KEY configured:', bool(DEEPSEEK_API_KEY))
enriched_dim

DEEPSEEK_API_KEY configured: True


Unnamed: 0,ingredient_id,ingredient,ingredient_url,functions,concerns,skin_type_compatibility,interactions,recommendation_time
0,ing_12925a216138cb7f,MICROCRYSTALLINE CELLULOSE,https://www.ewg.org/skindeep/ingredients/703956-MICROCRYSTALLINE_CELLULOSE/,"[abrasive, absorbent, anticaking agent, bulking agent, emulsion stabilizer, slip modifier;viscosity increasing agent...",[Allergies/immunotoxicity (moderate)],"[oily, dry, combination, normal, sensitive, acne_prone]",[],"[morning, evening]"
1,ing_3345cc8aaec57313,DISODIUM COCOYL GLUTAMATE,https://www.ewg.org/skindeep/ingredients/702141-DISODIUM_COCOYL_GLUTAMATE/,"[cleansing, surfactant, surfactant - cleansing agent]","[Use restrictions (low), Irritation (skin, eyes, or lungs) (low)]","[dry, sensitive, normal, combination, all]",[],"[morning, evening]"
2,ing_35a1abecacd1f9ba,ORYZA SATIVA (RICE) STARCH,https://www.ewg.org/skindeep/ingredients/704365-ORYZA_SATIVA_RICE_STARCH/,"[absorbent, binding, bulking, viscosity controlling, bulking agent]","[Use restrictions (low), Contamination concerns (PESTICIDES)]","[oily, dry, combination, normal, sensitive, acne_prone]",[],"[morning, evening]"
3,ing_60ae9e34f35660c8,AVENA SATIVA (OAT) KERNEL FLOUR,https://www.ewg.org/skindeep/ingredients/700581-AVENA_SATIVA_OAT_KERNEL_FLOUR/,"[abrasive, absorbent, bulking, viscosity controlling, bulking agent, viscosity increasing agent - aqueous]","[Allergies/immunotoxicity (low), Use restrictions (low)]","[dry, sensitive, normal, combination, all]",[],"[morning, evening]"
4,ing_613012f31fb02d14,COCONUT FATTY ACID,https://www.ewg.org/skindeep/ingredients/701551-COCONUT_FATTY_ACID/,"[cleansing, emollient, emulsifying, surfactant, surfactant - cleansing agentsurfactant-cleansing agent is included a...","[Multiple, additive exposure sources (low)]","[dry, normal, mature]",[],[evening]
5,ing_7b8feb076190aa62,KAOLIN,https://www.ewg.org/skindeep/ingredients/703305-KAOLIN/,"[abrasive, absorbent, anticaking, bulking, opacifying, anticaking agent, bulking agent, opacifying agent, skin prote...",[],"[oily, combination, acne_prone]",[],"[morning, evening]"
6,ing_adb1f1e50788ffe0,POPULUS TREMULOIDES BARK EXTRACT,https://www.ewg.org/skindeep/ingredients/722643-POPULUS_TREMULOIDES_BARK_EXTRACT/,"[antiseborrhoeic, skin conditioning, not reported]","[Cancer (low), Developmental/reproductive toxicity (low), Allergies/immunotoxicity (low), Use restrictions (high), E...",[all],[],"[morning, evening]"
7,ing_d81f4a7eabe26c8c,GLUTAMIC ACID,https://www.ewg.org/skindeep/ingredients/702604-GLUTAMIC_ACID/,"[antistatic, hair conditioning, humectant, fragrance ingredient, hair conditioning agent, skin-conditioning agent - ...","[Irritation (skin, eyes, or lungs) (low)]","[dry, normal, sensitive, mature]",[],"[morning, evening]"
8,ing_f8dbe1cb1d166c7c,CELLULOSE ACETATE,https://www.ewg.org/skindeep/ingredients/717526-CELLULOSE_ACETATE/,"[film forming, film former]",[],"[oily, dry, combination, normal, sensitive, acne_prone]",[],"[morning, evening]"
9,ing_f9e3e96ddd7817d5,MAGNESIUM OXIDE,https://www.ewg.org/skindeep/ingredients/703703-MAGNESIUM_OXIDE/,"[absorbent, buffering, opacifying, opacifying agent, ph adjuster]",[Enhanced skin absorption],"[oily, combination, normal]",[],"[morning, evening]"


In [None]:
# 4) Export small outputs (for quick inspection)
out_dir = SCRAPER_DIR / 'notebooks' / 'output'
out_dir.mkdir(parents=True, exist_ok=True)

products_out_path = out_dir / 'products_out_sample.csv'
ingredients_dim_path = out_dir / 'ingredients_dim_sample.csv'
product_ing_path = out_dir / 'product_ingredients_sample.csv'
enriched_dim_path = out_dir / 'ingredients_dim_enriched_sample.csv'

export_plan = [
    (products_out, products_out_path),
    (ingredients_dim, ingredients_dim_path),
    (product_ingredients, product_ing_path),
    # The enrichment cell is optional, so `enriched_dim` may not exist in this kernel.
    (globals().get('enriched_dim'), enriched_dim_path),
]

# Only non-empty tables are written. The paths written in THIS run are tracked so the
# summary below cannot list stale files left in the folder by an earlier run.
written_paths = []
for frame, path in export_plan:
    if isinstance(frame, pd.DataFrame) and not frame.empty:
        frame.to_csv(path, index=False)
        written_paths.append(path)

print('Wrote:', out_dir)
for path in written_paths:
    print(' -', path.name)
if not written_paths:
    print(' (no non-empty tables to write)')

## Optional: Run the DB-backed pipeline (staging + transform + enrich)

This section uses Postgres tables defined in `scraper/utils/db.py` and the scripts:
- `extract_ewg.check_and_stage_discovery()`
- `extract_ewg.extract_new_products_to_staging()`
- `transform_ewg.transform_run_to_staging()`
- `enrich_ingredients.enrich_ingredients_for_run()`

You need a reachable Postgres (e.g. via `docker compose up postgres`) and env vars like `DATABASE_URL` or `DB_HOST/DB_USER/...`.
This is heavier than the in-memory smoke test above.

In [None]:
# Uncomment to run (requires Postgres)
# from extract_ewg import check_and_stage_discovery, extract_new_products_to_staging
# from transform_ewg import transform_run_to_staging
# from enrich_ingredients import enrich_ingredients_for_run
#
# import time
# run_id = f'notebook_{int(time.time())}'
#
# new_count = check_and_stage_discovery(run_id=run_id, categories=[CATEGORY])
# print('New products (vs final table):', new_count)
#
# extracted = extract_new_products_to_staging(
#     run_id=run_id,
#     categories=[CATEGORY],
#     max_ingredients_per_product=30,
# )
# print('Ingredient rows staged:', extracted)
#
# tx = transform_run_to_staging(run_id=run_id)
# print('Transform output:', tx)
#
# enr = enrich_ingredients_for_run(run_id=run_id)
# print('Enrichment output:', enr)