# National Endowment for the Humanities

Downloads grant data from NEH's open data portal and converts it into our output format.

Base URL (from: https://apps.neh.gov/open/data/)

In [2]:
import requests
import io
import pandas as pd

# from oic_scrape.items import GrantItem
from collections import OrderedDict
import datetime

In [3]:
# Notebook Parameters
# Comma-separated decade start years to download, or "all"; checked by validate_decades().
DECADES = "2000, 2010, 2020"
# Path the converted grant records are written to.
OUTPUT_LOCATION = "data/neh.gov_grants.jsonl"
# Output file format: "json", or "jsonl"/"jsonlines" for line-delimited output.
OUTPUT_FORMAT = "jsonl"

In [4]:
def validate_decades(decades):
    """
    Validates a list of decades for NEH data file downloads.

    Args:
        decades (str): A string representing the decades to validate.
                       It can be either a comma-separated list of decades or the string "all"
                       (case-insensitive, surrounding whitespace ignored).

    Returns:
        list: The valid decades as strings, in the order given, with repeated
              entries removed. For "all", every decade from 1960 up to the
              current decade.

    Raises:
        ValueError: If any entry is not the start year of a decade between
                    1960 and the current decade.

    Example:
        >>> validate_decades("1960, 1970, 1980")
        ['1960', '1970', '1980']
    """
    current_year = datetime.datetime.now().year
    current_decade = current_year - (current_year % 10)
    valid_decades = [str(year) for year in range(1960, current_decade + 1, 10)]

    # Individual entries are stripped below, so treat " all " the same way.
    if decades.strip().lower() == "all":
        return valid_decades

    sanitized_decades = []
    for decade in decades.split(","):
        decade = decade.strip()
        if decade not in valid_decades:
            raise ValueError(
                f"Invalid decade {decade!r}. "
                f"Decade should be between 1960 and {current_decade} (the start of the decade for {current_year})."
            )
        # A repeated decade would make the caller download the same file twice
        # and emit every grant in it twice.
        if decade not in sanitized_decades:
            sanitized_decades.append(decade)

    return sanitized_decades


def validate_output_format(format):
    """
    Check whether an output file format name is supported.

    The comparison ignores case.

    Args:
        format (str): Name of the requested output format.

    Returns:
        bool: True for "json", "jsonl" or "jsonlines"; False for anything else.
    """
    supported_formats = ("json", "jsonl", "jsonlines")
    return format.lower() in supported_formats


# Reject unsupported formats up front, then record whether the output should be
# line-delimited ("jsonl"/"jsonlines") rather than a single JSON document.
if not validate_output_format(OUTPUT_FORMAT):
    raise ValueError("Output format should be either 'json' or 'jsonl'/'jsonlines'.")

output_format_lines = OUTPUT_FORMAT.lower() in ("jsonl", "jsonlines")

In [5]:
# Funder metadata copied onto every output grant record.
FUNDER_NAME = "National Endowment for the Humanities"
FUNDER_ROR_ID = "https://ror.org/02vdm1p28"

In [6]:
# URL template for the per-decade CSV files; "{}" is filled with a decade start year (e.g. 2000).
baseurl = "https://securegrants.neh.gov/open/data/NEH_Grants{}s.csv"

In [7]:
# Download one CSV per requested decade and combine them into a single DataFrame.
dfs = []
for decade in validate_decades(DECADES):
    url = baseurl.format(decade)
    # NOTE(review): verify=False disables TLS certificate verification, so the
    # download is not protected against tampering. Kept as-is because removing
    # it may break the download; confirm whether it is still required.
    # The timeout prevents the notebook from hanging forever on a stalled connection.
    r = requests.get(url, verify=False, timeout=60)
    # Fail loudly on HTTP errors instead of parsing an error page as CSV.
    r.raise_for_status()
    df = pd.read_csv(io.StringIO(r.text))
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # formatted string is unchanged (UTC, no offset suffix).
    timestamp_str = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    df["_crawled_at"] = timestamp_str
    dfs.append(df)
all_grants = pd.concat(dfs)



In [None]:
# Format times: parse the grant period columns into pandas datetimes.
# The source strings use month/day/year with a 12-hour clock and AM/PM marker.
_neh_date_format = "%m/%d/%Y %I:%M:%S %p"
for _date_column in ("BeginGrant", "EndGrant"):
    all_grants[_date_column] = pd.to_datetime(
        all_grants[_date_column], format=_neh_date_format
    )

  all_grants['BeginGrant'] = pd.to_datetime(all_grants['BeginGrant'])
  all_grants['EndGrant'] = pd.to_datetime(all_grants['EndGrant'])


In [None]:
# Format times: convert the grant start and end columns to pandas datetimes,
# using one shared format string for both columns.
_grant_timestamp_format = "%m/%d/%Y %I:%M:%S %p"

_begin_parsed = pd.to_datetime(all_grants["BeginGrant"], format=_grant_timestamp_format)
all_grants["BeginGrant"] = _begin_parsed

_end_parsed = pd.to_datetime(all_grants["EndGrant"], format=_grant_timestamp_format)
all_grants["EndGrant"] = _end_parsed

In [None]:
def month_diff(end_date: pd.Timestamp, start_date: pd.Timestamp) -> int:
    """
    Count the months spanned from ``start_date`` to ``end_date``.

    The result is the calendar-month difference between the two timestamps.
    When the span starts on the first day of a month and ends on the last day
    of a month, one more month is added.

    Parameters:
    end_date (pandas.Timestamp): The timestamp the span ends on.
    start_date (pandas.Timestamp): The timestamp the span starts on.

    Returns:
    int: The number of months between the two timestamps.
    """
    year_gap = end_date.year - start_date.year
    month_gap = end_date.month - start_date.month
    months = 12 * year_gap + month_gap

    if start_date.day != 1:
        return months

    # Number of days in the month that end_date falls in.
    last_day_of_end_month = pd.Timestamp(
        end_date.year, end_date.month, 1
    ).days_in_month
    if end_date.day != last_day_of_end_month:
        return months

    # First-of-month to end-of-month: include the final month as well.
    return months + 1

In [None]:
def _format_recipient_location(grant):
    """Join the institution's city, state, postal code and country with ", ".

    Missing or empty values are skipped. Missing values must be detected with
    pd.isna(): NaN is truthy, so a plain truthiness test lets it through and
    string concatenation then fails (or "nan" ends up in the output).
    """
    parts = []
    for column in ("InstCity", "InstState", "InstPostalCode", "InstCountry"):
        value = grant[column]
        if pd.isna(value) or not value:
            continue
        # Numeric postal codes can arrive as floats; drop the trailing ".0".
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        parts.append(str(value))
    return ", ".join(parts)


# Convert every NEH grant row into a record in our output format.
ioi_grants = []
for _, grant in all_grants.iterrows():
    g = OrderedDict()

    g["grant_id"] = f"neh::{grant['AppNumber']}"
    g["funder_name"] = FUNDER_NAME
    g["funder_ror_id"] = FUNDER_ROR_ID
    g["recipient_org_name"] = grant["Institution"]
    g["recipient_location"] = _format_recipient_location(grant)
    g["pi_name"] = grant["Participants"]

    # date fields
    grant_start_date = grant["BeginGrant"]
    grant_end_date = grant["EndGrant"]
    # A missing date is NaT, which does not support strftime(); emit None for
    # the affected fields instead of aborting the whole conversion.
    has_start = not pd.isna(grant_start_date)
    has_end = not pd.isna(grant_end_date)

    g["grant_year"] = grant_start_date.year if has_start else None
    g["grant_duration"] = (
        month_diff(grant_end_date, grant_start_date)
        if has_start and has_end
        else None
    )
    g["grant_start_date"] = (
        grant_start_date.strftime("%Y-%m-%d") if has_start else None
    )
    g["grant_end_date"] = grant_end_date.strftime("%Y-%m-%d") if has_end else None

    g["award_amount"] = grant["ApprovedOutright"]
    g["award_currency"] = "USD"
    g["award_amount_usd"] = grant["ApprovedOutright"]
    g["source"] = "https://apps.neh.gov/open/data/"
    g["grant_description"] = f"{grant['ProjectTitle']} > {grant['ProjectDesc']}"
    g["program_of_funder"] = f"{grant['Program']} > {grant['Division']}"
    g["_crawled_at"] = grant["_crawled_at"]
    # The crawl timestamp is our own bookkeeping, not part of the source row.
    del grant["_crawled_at"]

    g["raw_source_data"] = grant
    ioi_grants.append(g)

In [None]:
# Collect the per-grant records built above into a single DataFrame.
grants_df = pd.DataFrame(ioi_grants)

In [None]:
# Write the records to OUTPUT_LOCATION as JSON; output_format_lines selects
# line-delimited output (one record per line) when True.
grants_df.to_json(OUTPUT_LOCATION, orient="records", lines=output_format_lines)