In [None]:


import os
import re
import csv
import xml.etree.ElementTree as ET
from collections import defaultdict

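# Taxonomy source. The script reads the unpacked archive from TAXONOMY_DIR
# (see Configuration below; update the paths there if needed).
# Project page: https://www.fasb.org/page/detail?pageId=/projects/FASB-Taxonomies/2025-gaap-financial-reporting-taxonomy.html
# Download:     https://xbrl.fasb.org/us-gaap/2025/us-gaap-2025.zip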


# Configuration
# Root of the unpacked taxonomy and the two locations read from it.
TAXONOMY_DIR = "data/us-gaap-2025/"
STM_DIR = os.path.join(TAXONOMY_DIR, "stm")
ELTS_XSD = os.path.join(TAXONOMY_DIR, "elts", "us-gaap-2025.xsd")

# Filename token -> statement type written to the "statement_type" column.
FILENAME_STATEMENT_MAP = {
    "scf": "Cash Flow Statement",
    "soi": "Income Statement",
    "sfp": "Balance Sheet",
    "sheci": "Equity Statement",
    "soc": "Comprehensive Income",
}

# Attribute keys in the "{namespace}local" form used for lookups on
# ElementTree's Element.attrib.
XBRLI_NS = "http://www.xbrl.org/2003/instance"
PERIOD_TYPE_KEY = f"{{{XBRLI_NS}}}periodType"
BALANCE_KEY = f"{{{XBRLI_NS}}}balance"

def generate_description(tag_name):
    """Turn a CamelCase tag name into a lowercase, space-separated phrase.

    A space is inserted wherever a lowercase letter is followed by an
    uppercase one, and before the last capital of an uppercase run that is
    followed by a lowercase letter (``"EBITDAMargin"`` -> ``"ebitda margin"``).
    Digits never create a boundary (``"Level1Inputs"`` -> ``"level1inputs"``).
    """
    word_boundary = r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])'
    spaced = re.sub(word_boundary, ' ', tag_name)
    return spaced.lower()

# Step 1: Parse tag metadata from ELTS
# Every xs:element in the schema that has a name and is not marked
# abstract="true" gets an entry keyed by that name. "statements" starts as an
# empty set; the statement-assignment step further down adds to it.
XSD_ELEMENT_PATH = ".//{http://www.w3.org/2001/XMLSchema}element"

tag_metadata = {}
elts_root = ET.parse(ELTS_XSD).getroot()
for element in elts_root.findall(XSD_ELEMENT_PATH):
    attrs = element.attrib
    name = attrs.get("name")
    if name and attrs.get("abstract") != "true":
        tag_metadata[name] = {
            "balance": attrs.get(BALANCE_KEY, ""),
            "period_type": attrs.get(PERIOD_TYPE_KEY, ""),
            "statements": set()
        }

# Step 2: Assign tags to statement types from STM files
# Only ".xml" files whose name contains "-pre-" are read. The statement type
# comes from the fourth dash-separated token of the filename; files whose
# token is not in FILENAME_STATEMENT_MAP are ignored.
LINKBASE_LOC_PATH = ".//{http://www.xbrl.org/2003/linkbase}loc"
XLINK_HREF_ATTR = "{http://www.w3.org/1999/xlink}href"

for filename in os.listdir(STM_DIR):
    is_presentation_xml = filename.endswith(".xml") and "-pre-" in filename
    if not is_presentation_xml:
        continue
    name_tokens = filename.split("-")
    if len(name_tokens) < 4:
        continue
    inferred_statement = FILENAME_STATEMENT_MAP.get(name_tokens[3])
    if not inferred_statement:
        continue

    linkbase_root = ET.parse(os.path.join(STM_DIR, filename)).getroot()
    for loc in linkbase_root.findall(LINKBASE_LOC_PATH):
        # The part of the href after the last "#" names the referenced element.
        fragment = loc.attrib.get(XLINK_HREF_ATTR, "").split("#")[-1]
        if fragment.startswith("us-gaap_"):
            tag = fragment.replace("us-gaap_", "")
        else:
            tag = fragment
        entry = tag_metadata.get(tag)
        if entry is not None:
            entry["statements"].add(inferred_statement)

# Step 3: Add descriptions and prep final dataset
# One row is produced per (tag, statement type) pair; a tag with no assigned
# statement produces no rows. The statement types are sorted because they are
# stored in a set of strings, whose iteration order can change from one run
# to the next; sorting keeps the row order of the output reproducible.
final_output = []
for tag, meta in tag_metadata.items():
    for stmt in sorted(meta["statements"]):
        final_output.append({
            "tag": tag,
            "statement_type": stmt,
            "balance": meta["balance"],
            "period_type": meta["period_type"],
            "description": generate_description(tag)
        })

# Step 4: Subcategory path from presentation tree hierarchy
def walk_presentation_tree_with_fallback(root, label_map, current_id, current_path, arcs_by_from, loc_map, depth=0):
    """Walk the tree below ``current_id`` depth-first and record tag paths.

    For every visited node whose ``loc_map`` entry starts with ``"us-gaap_"``,
    the module-level ``tag_to_path`` dict is assigned the first four entries
    of ``current_path`` joined with ``" > "``, keyed by the entry with
    ``"us-gaap_"`` removed. A later visit of the same tag overwrites the
    earlier value.

    Args:
        root: Not read here; only forwarded to the recursive calls.
        label_map: Node id -> display label.
        current_id: Id of the node being visited.
        current_path: Labels from the starting node down to this node.
        arcs_by_from: Parent id -> list of child ids.
        loc_map: Node id -> referenced element name (href fragment).
        depth: Recursion depth; nodes at depth greater than 4 are not
            visited, so they and their descendants get no path.

    Returns:
        None. Results are written to ``tag_to_path``.
    """
    if depth > 4:
        return

    node_ref = loc_map.get(current_id)
    if node_ref and node_ref.startswith("us-gaap_"):
        path_text = " > ".join(current_path[:4])
        tag_to_path[node_ref.replace("us-gaap_", "")] = path_text

    for child_id in arcs_by_from.get(current_id, []):
        child_label = label_map.get(child_id)
        if not child_label:
            # No explicit label: derive one from the us-gaap element name.
            child_ref = loc_map.get(child_id, "")
            if child_ref.startswith("us-gaap_"):
                child_label = generate_description(child_ref.replace("us-gaap_", ""))
        if not child_label:
            # Still unlabeled: this child and its subtree are skipped.
            continue
        walk_presentation_tree_with_fallback(
            root,
            label_map,
            child_id,
            current_path + [child_label],
            arcs_by_from,
            loc_map,
            depth + 1,
        )

tag_to_path = {}

# Build the parent -> children map of every "-pre-" linkbase in STM_DIR and
# walk it from each root node; the walk fills tag_to_path.
#
# Files and root ids are processed in sorted order. os.listdir() returns names
# in arbitrary order and set iteration order is not stable between runs;
# because a later visit of a tag overwrites its earlier path, an unsorted
# traversal would let the path stored for a tag change from run to run.
for file in sorted(os.listdir(STM_DIR)):
    if not file.endswith(".xml") or "-pre-" not in file:
        continue

    tree = ET.parse(os.path.join(STM_DIR, file))
    root = tree.getroot()

    loc_map = {}                      # loc xlink:label -> href fragment after "#"
    label_map = {}                    # label xlink:label -> stripped label text
    arcs_by_from = defaultdict(list)  # parent id -> child ids, in document order
    all_to_ids = set()                # every id that appears as an arc target

    for loc in root.findall(".//{http://www.xbrl.org/2003/linkbase}loc"):
        loc_id = loc.attrib.get("{http://www.w3.org/1999/xlink}label")
        href = loc.attrib.get("{http://www.w3.org/1999/xlink}href", "")
        loc_map[loc_id] = href.split("#")[-1]

    # NOTE(review): presentation linkbases presumably carry no link:label
    # resources, in which case this map stays empty and labels always come
    # from the tag-name fallback below - confirm against the taxonomy files.
    for label in root.findall(".//{http://www.xbrl.org/2003/linkbase}label"):
        label_id = label.attrib.get("{http://www.w3.org/1999/xlink}label")
        label_text = label.text
        if label_id and label_text:
            label_map[label_id] = label_text.strip()

    for arc in root.findall(".//{http://www.xbrl.org/2003/linkbase}presentationArc"):
        from_id = arc.attrib.get("{http://www.w3.org/1999/xlink}from")
        to_id = arc.attrib.get("{http://www.w3.org/1999/xlink}to")
        if from_id is None or to_id is None:
            # An arc without both endpoints cannot be placed in the tree, and
            # a None id would make the sort of root ids below fail.
            continue
        arcs_by_from[from_id].append(to_id)
        all_to_ids.add(to_id)

    # Roots are parents that never appear as the target of an arc.
    root_ids = sorted(set(arcs_by_from) - all_to_ids)

    for root_id in root_ids:
        label = label_map.get(root_id)
        if not label:
            # No explicit label: derive one from the us-gaap element name.
            fallback_tag = loc_map.get(root_id, "")
            if fallback_tag.startswith("us-gaap_"):
                label = generate_description(fallback_tag.replace("us-gaap_", ""))
        if label:
            walk_presentation_tree_with_fallback(
                root, label_map, root_id, [label], arcs_by_from, loc_map
            )

# Step 5: Attach subcategory paths
# Each row gains a "subcategory_path" column; tags with no entry in
# tag_to_path get an empty string.
lookup_path = tag_to_path.get
for record in final_output:
    record["subcategory_path"] = lookup_path(record["tag"], "")

# Step 6: Write final output
# newline="" leaves line-ending handling to the csv module. The encoding is
# pinned to UTF-8 so the file's bytes do not depend on the platform's default
# text encoding.
with open(
    "data/us_gaap_2025_verified_subcategory_path.csv",
    "w",
    newline="",
    encoding="utf-8",
) as f:
    writer = csv.DictWriter(f, fieldnames=[
        "tag", "statement_type", "balance", "period_type",
        "description", "subcategory_path"
    ])
    writer.writeheader()
    writer.writerows(final_output)
