In [None]:
import logging
from db import DB, reset_schema
from utils.os import load_csv
from utils.pytorch import seed_everything

# Runs first, before any other work in the notebook.
# NOTE(review): presumably seeds the random number generators for
# reproducibility — confirm against utils.pytorch.
seed_everything()

# Shared database handle; later cells pass it to reset_schema() and
# upsert_us_gaap_concepts().
db = DB()

In [None]:
# Destructive step: ask for an explicit "yes" before touching the schema.
confirm = input(
    "⚠️ This will DROP and RECREATE the schema 'us_gaap'. Type 'yes' to proceed: "
)

# Anything other than "yes" (ignoring case and surrounding whitespace) cancels.
if confirm.strip().lower() != "yes":
    print("❌ Canceled.")
else:
    reset_schema(db, "us_gaap")

    # Replace the handle with a fresh connection after the reset
    db = DB()

In [None]:
# TODO: Refactor accordingly

# import csv
from db import DB
from tqdm import tqdm

# ============================================================================
# ALLOWED_CONCEPT_TYPES
# ----------------------------------------------------------------------------
# This list defines the subset of XBRL and US GAAP data types permitted for
# alignment to OFSS financial statement categories. Only concepts with one of
# these types will be considered during ingestion and mapping.
#
# These types were selected based on:
# - Relevance to numerical financial reporting (e.g., dollar values, shares)
# - Suitability for autoencoding, vector embedding, or semantic comparison
# - Exclusion of non-numeric or non-quantitative data (e.g., booleans, enums)
#
# Currently enabled:
# - Percentages, per-share values, monetary units, share counts
# - Per-unit measures (srt-types:perUnitItemType)
#
# Candidates kept in the list but commented out (each marked TODO):
# - Volume (dtr-types:volumeItemType), decimal, integer, and flow types
#
# NOTE:
# - Types like stringItemType, pureItemType, booleanItemType, etc., are excluded
#   unless explicitly whitelisted elsewhere.
# - For debugging:
#       1. Locate US GAAP concepts which contain these concept types: SELECT * FROM us_gaap.us_gaap_concept INNER JOIN us_gaap_concept_type ON us_gaap_concept_type.id = concept_type_id WHERE us_gaap_concept_type.concept_type LIKE '%integerItemType%'
#       2. Locate symbols that contain these concepts (in Rust): cargo run --example locate_us_gaap_values 'OilAndGasDeliveryCommitmentsAndContractsDailyProduction'
ALLOWED_CONCEPT_TYPES = [
    "dtr-types:percentItemType",
    "dtr-types:perShareItemType",
    "xbrli:monetaryItemType",
    "xbrli:sharesItemType",
    # "dtr-types:volumeItemType", # TODO: Include? Note: _Mostly_ Oil and Gas
    "srt-types:perUnitItemType",
    # "xbrli:decimalItemType",  # TODO: Include?
    # "xbrli:integerItemType",  # TODO: Include?
    # "dtr-types:flowItemType"  # TODO: Include?
]


# Note: This CSV is used instead of the raw XBRL taxonomy because it is easier
# to parse and directly provides each concept's label and description.
#
# https://www.fasb.org/page/detail?pageId=/projects/FASB-Taxonomies/2025-gaap-financial-reporting-taxonomy.html
def upsert_us_gaap_concepts(db: DB, csv_data: list[dict]) -> None:
    """
    Upsert base-level US GAAP concepts into the database.

    Each accepted row is written to the ``us_gaap_concept`` table (keyed on
    ``name``) together with references to its concept type, balance type, and
    period type, which are upserted into their own lookup tables.

    Only rows whose ``prefix`` is ``"us-gaap"`` and whose ``type`` is listed in
    ``ALLOWED_CONCEPT_TYPES`` are imported. The types of skipped ``us-gaap``
    rows are collected and reported as warnings once the loop has finished.

    Args:
        db (DB): Database connection instance; only ``upsert_entity`` is used.
        csv_data (list[dict]): Parsed CSV records with fields:
            - 'prefix': Taxonomy prefix (required)
            - 'name': US GAAP tag (required for imported rows)
            - 'type': XBRL or DTR/SRT item type (required for 'us-gaap' rows)
            - 'balance': 'debit' or 'credit' (optional)
            - 'periodType': 'instant' or 'duration' (optional)
            - 'label': Human-readable label (optional)
            - 'documentation': Description (optional)
            Optional fields may be missing or empty; both are stored as None.

    Raises:
        Exception: Any error raised during the import is logged and re-raised
            unchanged.
    """

    discarded_us_gaap_concept_types = set()

    # Lookup-table values repeat across rows, so remember the id returned for
    # each (table, value) pair instead of upserting it again for every row.
    # NOTE(review): assumes upsert_entity returns a stable id for the same
    # value within a single run — confirm against the DB class.
    lookup_ids: dict[tuple[str, str], object] = {}

    def _lookup_id(table: str, column: str, value):
        """Return the id for ``value`` in ``table``; None when value is None."""
        if value is None:
            return None
        key = (table, value)
        if key not in lookup_ids:
            lookup_ids[key] = db.upsert_entity(table, {column: value}, [column])
        return lookup_ids[key]

    try:
        for row in tqdm(csv_data, desc="Importing US GAAP Concepts"):
            # Rows from other taxonomies are skipped silently.
            if row["prefix"] != "us-gaap":
                continue

            concept_type = row["type"]
            if concept_type not in ALLOWED_CONCEPT_TYPES:
                # Remember the rejected type so it can be reported afterwards.
                discarded_us_gaap_concept_types.add(concept_type)
                continue

            name = row["name"]

            # Optional columns: a missing key and an empty value both become None.
            # Note: Upstream `balance_type` is listed as `balance`
            balance_type = row.get("balance") or None
            period_type = row.get("periodType") or None
            label = row.get("label") or None
            documentation = row.get("documentation") or None

            concept_type_id = _lookup_id(
                "us_gaap_concept_type", "concept_type", concept_type
            )
            balance_type_id = _lookup_id(
                "us_gaap_balance_type", "balance_type", balance_type
            )
            period_type_id = _lookup_id(
                "us_gaap_period_type", "period_type", period_type
            )

            # Upsert the concept itself in the `us_gaap_concept` table
            concept_data = {
                "name": name,
                "concept_type_id": concept_type_id,
                "balance_type_id": balance_type_id,
                "period_type_id": period_type_id,
                "label": label,
                "documentation": documentation,
            }

            db.upsert_entity("us_gaap_concept", concept_data, ["name"])

        # Sorted so the log output is the same from run to run; key=str keeps
        # the sort safe even if a row carried a non-string type value.
        for discarded_type in sorted(discarded_us_gaap_concept_types, key=str):
            logging.warning(f"Discarded US GAAP concept type: {discarded_type}")
        logging.warning(
            f"Total discarded US GAAP {len(discarded_us_gaap_concept_types)} concept types."
        )

        logging.info("US GAAP concept data has been successfully upserted.")
    except Exception as e:
        logging.error(f"Error upserting GAAP concept data: {e}")
        raise

In [None]:
# Announce the start of the ingestion run.
logging.info("Ingesting US GAAP concepts")

# Relative path to the US GAAP concepts CSV that feeds the import.
csv_file = "data/2025_GAAP_Concepts.csv"

# Load the rows, then upsert the accepted concepts through the shared `db` handle.
# NOTE(review): assumes load_csv returns a list of dicts keyed by column
# header — confirm against utils.os.
csv_data = load_csv(csv_file)
upsert_us_gaap_concepts(db, csv_data)