In [5]:
import csv
import json

# Label encodings for the three classification targets.
# Statement types are folded into three classes; a statement type that is
# absent from statement_map has no fallback id. For balance and period type,
# the "unknown" entry (id 2) is the catch-all class.
_BALANCE_SHEET, _INCOME_STATEMENT, _CASH_FLOW_STATEMENT = 0, 1, 2

statement_map = {
    "Balance Sheet": _BALANCE_SHEET,
    "Income Statement": _INCOME_STATEMENT,
    "Cash Flow Statement": _CASH_FLOW_STATEMENT,
    # Aliases folded into one of the classes above.
    "Equity Statement": _BALANCE_SHEET,
    "Comprehensive Income": _INCOME_STATEMENT,
}

# Ids follow the position of each name in the tuple: 0, 1, 2.
balance_map = {
    name: idx for idx, name in enumerate(("debit", "credit", "unknown"))
}

period_map = {
    name: idx for idx, name in enumerate(("duration", "instant", "unknown"))
}

# Input CSV path and output JSON path, relative to the working directory.
input_csv = "data/us_gaap_2025_verified_subcategory_path.csv"
output_json = "data/us_gaap_multilabel_training_data.json"

# One {"text": ..., "labels": {...}} record per CSV row that passes the checks.
dataset = []


def _cell(row, column):
    """Return the stripped text of *column* in *row*, or "" when unavailable.

    csv.DictReader stores None (its default ``restval``) for columns that are
    missing from a short row, so ``row.get(column, "")`` can still yield None;
    coalescing with ``or ""`` keeps ragged rows from raising AttributeError.
    """
    return (row.get(column) or "").strip()


# newline="" lets the csv module handle line breaks inside quoted fields.
# utf-8-sig decodes plain UTF-8 and also drops a leading BOM, which would
# otherwise become part of the first header name.
with open(input_csv, "r", encoding="utf-8-sig", newline="") as f:
    reader = csv.DictReader(f)
    # i is the 0-based index of the data row (the header row is not counted).
    for i, row in enumerate(reader):
        desc = _cell(row, "description")
        if not desc:
            print(f"[Row {i}] Skipped: Missing description")
            continue

        # statement_type is matched case-sensitively; balance and period_type
        # are lower-cased before lookup.
        statement_type = _cell(row, "statement_type")
        balance = _cell(row, "balance").lower()
        period = _cell(row, "period_type").lower()

        statement_label = statement_map.get(statement_type)
        if statement_label is None:
            print(f"[Row {i}] Skipped: Unmapped statement_type: '{statement_type}'")
            continue  # skip anything unmapped

        # Unrecognised or empty values fall back to the "unknown" class.
        balance_label = balance_map.get(balance, balance_map["unknown"])
        period_label = period_map.get(period, period_map["unknown"])

        dataset.append({
            "text": desc,
            "labels": {
                "statement_type": statement_label,
                "balance": balance_label,
                "period_type": period_label,
            },
        })


# Persist the collected records as pretty-printed JSON (2-space indent).
with open(output_json, "w") as out_file:
    out_file.write(json.dumps(dataset, indent=2))

import os
# Kept as the final expression so the notebook echoes the absolute path
# of the file that was just written.
os.path.abspath(output_json)


'/Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/us_gaap_multilabel_training_data.json'