In [None]:
import math
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

from db import DB

# Step 1: Open the DB handle and load the names of all US GAAP concepts.
# NOTE(review): the second argument to db.get is presumably the list of
# result column names — confirm against the DB helper.
db = DB()
concept_df = db.get("SELECT name FROM us_gaap_concept", ["name"])
valid_concepts = set(concept_df["name"].values)

# Root directory that is searched for CSV files.
data_dir = "../data/us-gaap"

# Step 2: Accumulators filled during the scan, all keyed by unit label.
non_numeric_units = set()         # units seen with a value that is not a number
unit_concepts = defaultdict(set)  # unit -> concept columns the unit appeared in
unit_values = defaultdict(list)   # unit -> every numeric value parsed for it

# Step 3: Walk data_dir recursively and remember the path of every *.csv file
csv_files = []
for folder, _subdirs, names in os.walk(data_dir):
    csv_files.extend(
        os.path.join(folder, name) for name in names if name.endswith(".csv")
    )

# Parse every "<value>::<unit>" cell found in a GAAP concept column.
# Finite numeric values are accumulated per upper-cased unit in unit_values
# (and the concept column is noted in unit_concepts); units whose value part
# is not a finite number are recorded in non_numeric_units.
for path in tqdm(csv_files, desc="Scanning CSVs"):
    try:
        df = pd.read_csv(path, low_memory=False)

        # Only columns named after a known US GAAP concept are inspected.
        tag_columns = [col for col in df.columns if col in valid_concepts]
        if not tag_columns:
            continue

        for col in tag_columns:
            for val in df[col].dropna().astype(str):
                # Cells without the "::" separator carry no unit; skip them.
                if "::" not in val:
                    continue
                val_part, unit_part = val.split("::", 1)

                # Normalize so that "usd" and "USD" land in the same bucket.
                unit_part = unit_part.strip().upper()

                try:
                    num_val = float(val_part.strip())
                except ValueError:
                    non_numeric_units.add(unit_part)
                    continue

                # float() also accepts "nan", "inf" and "infinity". A single
                # such value would turn the unit's min/max/mean/std into
                # nan/inf, so it is treated as non-numeric instead.
                if not math.isfinite(num_val):
                    non_numeric_units.add(unit_part)
                    continue

                unit_values[unit_part].append(num_val)
                unit_concepts[unit_part].add(col)
    except Exception as e:
        # Best-effort scan: report the file that failed and keep going.
        print(f"⚠️ Skipped {path} due to error: {e}")

# Step 4: Summarize how many files were scanned and how many units were seen
print(f"\n✅ Scanned {len(csv_files)} files.")
print(
    f"📦 Found {len(unit_values)} numeric units "
    f"and {len(non_numeric_units)} non-numeric units."
)

# Step 5: Descriptive statistics for each numeric unit, sorted by unit label
for unit_name in sorted(unit_values):
    samples = np.array(unit_values[unit_name])
    print(f"🔹 {unit_name}")
    print(f"   Count: {len(samples)}")
    print(f"   Min:   {samples.min():,.4f}")
    print(f"   Max:   {samples.max():,.4f}")
    print(f"   Mean:  {samples.mean():,.4f}")
    print(f"   Std:   {samples.std():,.4f}")
    concept_list = ", ".join(sorted(unit_concepts[unit_name]))
    print(f"   Concepts: {concept_list}")

# Step 6: List the units that were recorded as non-numeric during the scan
if non_numeric_units:
    print("\n⚠️ Non-numeric units encountered:")
    # The set is non-empty here, so one joined print emits the same lines
    # as printing each unit separately.
    print("\n".join(f"  - {label}" for label in sorted(non_numeric_units)))
