# Robert Wood Johnson Foundation

RWJF populates its [Grants Directory](https://www.rwjf.org/en/grants/awarded-grants.html?s=1) through a series of JSON API calls. Rather than scrape the rendered pages, this notebook queries that JSON API directly.

In [None]:
from datetime import datetime
from typing import Any, Dict, List, Optional

import polars as pl
import requests
from attrs import asdict
from requests.adapters import HTTPAdapter
from requests_cache import CachedSession
from tqdm.notebook import tqdm
from urllib3.util.retry import Retry

from oic_scrape.items import AwardParticipant, AwardItem

In [None]:
# Run configuration for this notebook.
# Earliest award year to request; passed to get_grants() as start_year.
START_YEAR = 1995
# Path of the newline-delimited JSON file written at the end of the notebook.
OUTPUT_LOCATION = "data/rwjf.org.jsonl"
# When True, HTTP responses are cached on disk (see the HTTP configuration cell).
USE_CACHE = True

In [None]:
# Funder identity stamped onto every AwardItem built below.
FUNDER_ORG_NAME = "Robert Wood Johnson Foundation"
# ROR (Research Organization Registry) identifier URL for the funder.
FUNDER_ORG_ROR_ID = "https://ror.org/02ymmdj85"

In [None]:
## HTTP Configuration

# Choose the session implementation up front: an on-disk SQLite cache
# (intended for development only) or a plain requests session.
session = (
    CachedSession(
        "cache.sqlite",
        backend="sqlite",
        allowable_methods=("GET", "POST"),
        allowable_codes=(200, 404),
    )
    if USE_CACHE
    else requests.Session()
)

# One retrying adapter is shared by both URL schemes.
retry = Retry(connect=5, backoff_factor=1.5)
adapter = HTTPAdapter(max_retries=retry)
for scheme in ("http://", "https://"):
    session.mount(scheme, adapter)

In [None]:
# Exploratory request against the grants JSON endpoint: page 1 (s=1), 100
# results per page, years 1972-2024, sorted by year with ascending=false.
# NOTE(review): `r` is not read by any later cell visible here and the year
# range is hard-coded; get_grants() below builds its own URL. Confirm this
# cell is still needed.
base_json = "https://www.rwjf.org/content/rwjf-web/us/en/_jcr_content.grants.json?k=&s=1&resultsPerPage=100&start=1972&end=2024&amt=-1&active=true&closed=true&sortBy=year&ascending=false&m="

r = session.get(base_json)

In [None]:
def get_grants(
    start_year: int = 2000, end_year: Optional[int] = None
) -> List[Dict[str, Any]]:
    """Download every grant record from the Robert Wood Johnson Foundation API.

    Pages through the grants JSON endpoint, 100 results per request, using the
    module-level ``session``, and concatenates each page's ``results`` list.

    Args:
        start_year (int, optional): Start year for grants. Defaults to 2000.
            Values below 1974 are raised to 1974.
        end_year (int, optional): End year for grants. Defaults to the current
            year, resolved each time the function is called.

    Returns:
        List[Dict[str, Any]]: The raw grant dictionaries in the order the API
        returned them (requested sorted by year, ascending).

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    # Resolve the default at call time; a `datetime.now()` default written in
    # the signature is evaluated only once, when the function is defined.
    if end_year is None:
        end_year = datetime.now().year
    start_year = max(start_year, 1974)

    grants: List[Dict[str, Any]] = []
    page = 1
    total_pages = None  # unknown until the first response arrives
    pbar = tqdm(total=None, desc="Downloading RWJF Grants")

    try:
        while total_pages is None or page <= total_pages:
            url = f"https://www.rwjf.org/content/rwjf-web/us/en/_jcr_content.grants.json?k=&s={page}&resultsPerPage=100&start={start_year}&end={end_year}&amt=-1&active=true&closed=true&sortBy=year&ascending=true&m="
            r = session.get(url)
            r.raise_for_status()
            data = r.json()
            results = data["results"]
            grants.extend(results)
            if total_pages is None:
                # The first response reports the overall size of the result set.
                total_pages = data["totalPages"]
                pbar.total = data["totalResults"]
                pbar.refresh()
            pbar.update(len(results))
            page += 1
    finally:
        # Close the progress bar even when a request or JSON decode fails.
        pbar.close()

    return grants


# Fetch every grant from START_YEAR onward; end_year is left at get_grants' default.
grants = get_grants(start_year=START_YEAR)
# Naive local-time timestamp taken once the download has finished; stored on
# each award as `_crawled_at`.
crawl_ts = datetime.now()

In [None]:
def _from_epoch_ms(ms):
    """Convert an epoch-milliseconds value to a naive local ``datetime``."""
    return datetime.fromtimestamp(ms / 1000)


awards = []

for grant in grants:
    # Best-effort conversion: a grant that fails to convert is reported and
    # skipped so one malformed record does not abort the whole run.
    try:
        # `contact` may be absent or null; treat both as "no contacts" instead
        # of letting a null value raise and drop the entire grant.
        named_participants = [
            AwardParticipant(
                full_name=contact["name"],
                is_pi=contact.get("role") == "Project Director",
                grant_role=contact.get("role"),
                identifiers={"email": contact["email"]}
                if contact.get("email")
                else None,
            )
            for contact in grant.get("contact") or []
            if contact.get("name")  # only named contacts become participants
        ]

        # Comma-separated names of the Project Directors, or None when there are none.
        pi_names = [p.full_name for p in named_participants if p.is_pi]
        pi_string = ", ".join(pi_names) if pi_names else None

        grantee = grant["granteeInfo"]
        location_parts = [
            grantee.get(key, "") for key in ("city", "state", "zip", "country")
        ]

        # Parse each timestamp once and reuse it for the dates and the duration.
        # NOTE(review): datetime.fromtimestamp interprets the value in the host's
        # local timezone, so the resulting dates can differ by a day between
        # machines. Confirm whether these timestamps should be read as UTC.
        start_date = _from_epoch_ms(grant["startDate"]).date()
        end_date = _from_epoch_ms(grant["endDate"]).date()
        amount = float(grant["amountAwarded"])

        award = AwardItem(
            source="rwjf.org",
            grant_id=f"rwjf::{grant['grantNumber']}",
            funder_org_name=FUNDER_ORG_NAME,
            funder_org_ror_id=FUNDER_ORG_ROR_ID,
            recipient_org_name=grantee["orgName"],
            # Empty location components are dropped before joining.
            recipient_org_location=", ".join(filter(None, location_parts)),
            pi_name=pi_string,
            # An empty participant list is passed as None.
            named_participants=named_participants or None,
            grant_year=_from_epoch_ms(grant["dateAwarded"]).year,
            grant_start_date=start_date,
            grant_end_date=end_date,
            grant_duration=f"{(end_date - start_date).days} days",
            award_amount=amount,
            award_currency="USD",
            award_amount_usd=amount,
            grant_title=grant["title"],
            grant_description=grant["description"],
            program_of_funder=" | ".join(grant["programs"]),
            _crawled_at=crawl_ts,
            raw_source_data=str(grant),
            _award_schema_version="0.1.1",
        )
        awards.append(asdict(award))
    except Exception as e:
        print(f"Error processing grant {grant.get('grantNumber', 'unknown')}: {e}")

In [None]:
# Write one JSON object per award. Scan every row when inferring the schema:
# by default only the first rows are inspected, so a field that is None there
# but populated later (e.g. named_participants) could be mistyped.
pl.DataFrame(awards, infer_schema_length=None).write_ndjson(OUTPUT_LOCATION)