In [None]:
import os
import logging
import xml.etree.ElementTree as ET
from db import DB
from tqdm import tqdm
from collections import defaultdict

# Location of the US-GAAP 2025 taxonomy files and of its elements schema.
TAXONOMY_DIR = "data/us-gaap-2025"
ELTS_XSD = os.path.join(TAXONOMY_DIR, "elts", "us-gaap-2025.xsd")

# ElementTree exposes namespaced attributes in Clark notation,
# i.e. "{namespace-uri}localname".
XBRLI_NS = "http://www.xbrl.org/2003/instance"
BALANCE_KEY = "{" + XBRLI_NS + "}balance"
PERIOD_TYPE_KEY = "{" + XBRLI_NS + "}periodType"

# Module-wide database handle used by the import routine below.
db = DB()


def build_type_resolution_map(schema_paths: list) -> dict:
    """
    Builds a mapping from custom GAAP types to base XBRL types
    (handles both simpleType and complexType).

    Args:
        schema_paths: Paths of candidate schema files. Entries that do not
            end in ".xsd" are ignored.

    Returns:
        Dict mapping each named type to the local name (namespace prefix
        stripped) of the base it restricts or extends. If a name is declared
        more than once, the declaration processed last wins: complexType
        after simpleType within a file, later files after earlier ones.

    Schemas that are not well-formed XML are skipped with a logged warning
    instead of aborting the whole scan.
    """
    xs = "{http://www.w3.org/2001/XMLSchema}"
    # (type element tag, path from that element to the node carrying `base`).
    # A simpleType's restriction is looked up as a direct child only, while a
    # complexType's extension is searched for at any depth below it.
    derivations = (
        (xs + "simpleType", xs + "restriction"),
        (xs + "complexType", ".//" + xs + "extension"),
    )
    type_map = {}

    for path in schema_paths:
        if not path.endswith(".xsd"):
            continue

        # Only parsing can raise ParseError, so keep the try body minimal.
        try:
            root = ET.parse(path).getroot()
        except ET.ParseError as exc:
            logging.warning("Skipping malformed schema %s: %s", path, exc)
            continue

        for type_tag, base_path in derivations:
            for type_el in root.findall(".//" + type_tag):
                name = type_el.attrib.get("name")
                base_el = type_el.find(base_path)
                # Anonymous types and types without a derivation are ignored.
                if not name or base_el is None:
                    continue
                base = base_el.attrib.get("base")
                if base:
                    # "xbrli:monetaryItemType" -> "monetaryItemType"
                    type_map[name] = base.split(":")[-1]

    return type_map


def import_gaap_concepts():
    """
    Imports the concrete concepts declared in the US-GAAP elements schema
    (ELTS_XSD) into the database.

    For every ``xs:element`` found in the schema this:
      * skips it when it has no name or is abstract,
      * skips it (warning once per distinct type) when its type neither
        starts with ``xbrli:`` nor is in the local allow-list,
      * upserts the concept type, the balance type and the period type
        (the latter two only when present on the element),
      * upserts the concept itself, keyed by its name.

    A failing concept-type upsert is logged and the element is skipped.
    Errors from parsing ELTS_XSD and from the other upserts propagate to
    the caller.
    """
    # NOTE: custom types are stored as written in the schema; they are not
    # resolved to their base types (build_type_resolution_map is not applied).
    ALLOWED_NON_XBRLI_CONCEPT_TYPES = frozenset({
        "dtr-types:perShareItemType",
        "dtr-types:percentItemType",
        "dtr-types:volumeItemType",
        "srt-types:perUnitItemType",
    })

    root = ET.parse(ELTS_XSD).getroot()
    elements = root.findall(".//{http://www.w3.org/2001/XMLSchema}element")

    # Types already warned about, so each one is reported only once.
    skipped_types = set()

    for el in tqdm(elements, desc="Importing GAAP concepts"):
        name = el.attrib.get("name")
        # xs:boolean allows both "true" and "1" as lexical forms of true.
        if not name or el.attrib.get("abstract") in ("true", "1"):
            continue

        concept_type = el.attrib.get("type", "")

        if (
            not concept_type.startswith("xbrli:")
            and concept_type not in ALLOWED_NON_XBRLI_CONCEPT_TYPES
        ):
            if concept_type not in skipped_types:
                logging.warning("Skipping concept type: %s", concept_type)
                skipped_types.add(concept_type)
            continue

        balance = el.attrib.get(BALANCE_KEY)
        period_type = el.attrib.get(PERIOD_TYPE_KEY)
        # NOTE(review): label and documentation are read as plain attributes
        # of the element; confirm the schema actually carries them there,
        # otherwise both are stored as None.
        label = el.attrib.get("label")
        documentation = el.attrib.get("documentation")

        try:
            concept_type_id = db.upsert_entity(
                "us_gaap_concept_type", {"concept_type": concept_type},
                unique_fields=["concept_type"]
            )
        except Exception as e:
            # Deliberate best-effort: one rejected type must not abort the
            # whole import.
            # Example: "MalpracticeInsurance-OccurrenceOrClaims-madeItemType"
            logging.warning("Skipping upsert concept type %s: %s", concept_type, e)
            continue

        balance_type_id = (
            db.upsert_entity("us_gaap_balance_type", {"balance": balance}, ["balance"])
            if balance else None
        )

        period_type_id = (
            db.upsert_entity("us_gaap_period_type", {"period_type": period_type}, ["period_type"])
            if period_type else None
        )

        concept_data = {
            "name": name,
            "concept_type_id": concept_type_id,
            "balance_type_id": balance_type_id,
            "period_type_id": period_type_id,
            "label": label,
            "documentation": documentation
        }

        db.upsert_entity("us_gaap_concept", concept_data, unique_fields=["name"])

    logging.info("GAAP concepts successfully imported from schema.")


if __name__ == "__main__":
    # By default the root logger only emits WARNING and above, which would
    # drop the INFO-level success message logged at the end of the import.
    # basicConfig is a no-op if logging has already been configured.
    logging.basicConfig(level=logging.INFO)
    import_gaap_concepts()
