In [None]:
import os
from pathlib import Path

# This snippet ensures consistent import paths across environments.
# When running notebooks via JupyterLab's web UI, the current working
# directory is often different (e.g., /notebooks) compared to VS Code,
# which typically starts at the project root. This handles that by 
# retrying the import after changing to the parent directory.
# 
# Include this at the top of every notebook to standardize imports
# across development environments.

try:
    from utils.os import chdir_to_git_root
except ModuleNotFoundError:
    # `utils` could not be imported from the current working directory:
    # step up one directory and retry exactly once. If the second import
    # also fails, the ModuleNotFoundError propagates and the cell errors out.
    os.chdir(Path.cwd().parent)
    print(f"Retrying import from: {os.getcwd()}")
    from utils.os import chdir_to_git_root

# NOTE(review): presumably switches the working directory to the "python"
# folder relative to the git root -- confirm against utils.os.
chdir_to_git_root("python")

# Echo the resulting working directory so it is visible in the cell output.
print(os.getcwd())

In [None]:
import logging
# from models.pytorch.narrative_stack.stage1.preprocessing import generate_concept_unit_embeddings, generate_concepts_report
from db import DbUsGaap

# Handle that is passed to the CSV walkers in the cells below.
db_us_gaap = DbUsGaap()
# NOTE(review): relative path -- it is resolved against whatever working
# directory the first cell ends up in; confirm it points at the intended folder.
data_dir = "../data/june-us-gaap" # Where CSV data is read from (one CSV file per symbol)


In [None]:
# NOTE: For debugging / monitoring purposes only
# Determine "category stack" depths

from collections import defaultdict
import numpy as np
from utils.csv import walk_us_gaap_csvs, get_filtered_us_gaap_form_rows_for_symbol

class RunningStats:
    """
    Accumulates simple statistics over a stream of numbers.

    Tracks the number of observations, their sum, their maximum, and the raw
    values themselves (the raw values are only needed for the median).
    """

    def __init__(self):
        self.count = 0
        self.total = 0
        # Stays 0 until the first update so that finalize() on an empty
        # accumulator keeps reporting a maximum of 0.
        self.max_val = 0
        self.values = []

    def update(self, val: int):
        """Record one observation."""
        # Seed the maximum from the first observation instead of comparing
        # against the initial 0; otherwise a stream made up only of negative
        # values would wrongly report 0 as its maximum.
        self.max_val = val if self.count == 0 else max(self.max_val, val)
        self.count += 1
        self.total += val
        self.values.append(val)  # Optional: remove this if median not needed

    def finalize(self):
        """
        Return the collected statistics as a dict.

        Keys:
            "avg":    mean of the observations (0 when nothing was recorded).
            "max":    largest observation (0 when nothing was recorded).
            "median": median of the observations; only present when at least
                      one value was recorded.
        """
        result = {
            "avg": self.total / self.count if self.count else 0,
            "max": self.max_val,
        }
        if self.values:
            result["median"] = float(np.median(self.values))
        return result

from pprint import pprint

# One RunningStats accumulator per (balance_type, period_type) key,
# created lazily the first time a key is seen.
stats = defaultdict(RunningStats)

gen = walk_us_gaap_csvs(data_dir, db_us_gaap, "row")

# For each yielded row, count how many of its entries fall under each key,
# then feed those per-row counts into the matching accumulator.
# The generator's return value is not needed here, so a plain for-loop suffices.
for row in gen:
    counter = defaultdict(int)
    for entry in row.entries:
        key = (
            entry.balance_type or "none",
            entry.period_type or "none",
        )
        counter[key] += 1
    for key, val in counter.items():
        stats[key].update(val)

# Final summary: key -> {"avg", "max", "median"}
summary = dict((key, stat.finalize()) for key, stat in stats.items())

pprint(summary)


In [None]:
# NOTE: For debugging / monitoring purposes only

import numpy as np
from utils.csv import walk_us_gaap_csvs, UsGaapRowRecord
from collections import defaultdict


def generate_concepts_report_from_walker(
    data_dir: Path,
    db_us_gaap: DbUsGaap,
    filtered_symbols: set[str] | None = None,
):
    """
    Walk the US-GAAP CSV data row by row and print a per-unit report.

    Every yielded `UsGaapRowRecord` contributes each entry's `value` and
    `concept`, grouped by the entry's `uom`. Items of any other type are
    ignored. Once the walker is exhausted, its return value is used as the
    run summary (its `csv_files` and `non_numeric_units` attributes are read).

    Args:
        data_dir: Forwarded to `walk_us_gaap_csvs` as `data_dir`.
        db_us_gaap: Forwarded to `walk_us_gaap_csvs` as `db_us_gaap`.
        filtered_symbols: Forwarded to `walk_us_gaap_csvs` as
            `filtered_symbols`; defaults to None.

    Returns:
        None. The report is written to stdout.
    """
    walker = walk_us_gaap_csvs(
        data_dir=data_dir,
        db_us_gaap=db_us_gaap,
        walk_type="row",
        filtered_symbols=filtered_symbols,
    )

    values_by_unit = defaultdict(list)
    concepts_by_unit = defaultdict(set)

    # The walker is advanced by hand because its return value is only
    # reachable through StopIteration.value; a for-loop would discard it.
    while True:
        try:
            record = next(walker)
        except StopIteration as finished:
            summary = finished.value
            break

        if not isinstance(record, UsGaapRowRecord):
            continue

        for entry in record.entries:
            values_by_unit[entry.uom].append(entry.value)
            concepts_by_unit[entry.uom].add(entry.concept)

    print(f"\n✅ Scanned {len(summary.csv_files)} files.")
    print(
        f"📦 Found {len(values_by_unit)} numeric units and "
        f"{len(summary.non_numeric_units)} non-numeric units."
    )

    # Per-unit descriptive statistics, in sorted unit order.
    for unit in sorted(values_by_unit):
        arr = np.array(values_by_unit[unit])
        print(f"🔹 {unit}")
        print(f"   Count: {len(arr)}")
        print(f"   Min:   {arr.min():,.4f}")
        print(f"   Max:   {arr.max():,.4f}")
        print(f"   Mean:  {arr.mean():,.4f}")
        print(f"   Std:   {arr.std():,.4f}")
        concept_names = ", ".join(sorted(concepts_by_unit[unit]))
        print(f"   Concepts: {concept_names}")

    if summary.non_numeric_units:
        print("\n⚠️ Non-numeric units encountered:")
        for unit in sorted(summary.non_numeric_units):
            print(f"  - {unit}")

    total_values = sum(map(len, values_by_unit.values()))
    print(f"\n🧮 Total values extracted: {total_values:,}")


In [None]:
# Run the report with `filtered_symbols=None`.
# NOTE(review): presumably None means "no symbol filtering" -- confirm in walk_us_gaap_csvs.
generate_concepts_report_from_walker(data_dir, db_us_gaap, None)

In [None]:
# NOTE(review): no logging configuration is visible in this notebook; with
# Python's default root level (WARNING) this INFO message is not emitted --
# confirm logging is configured elsewhere.
logging.info("Collecting valid concepts...")

from utils.csv import walk_us_gaap_csvs, get_filtered_us_gaap_form_rows_for_symbol


gen = walk_us_gaap_csvs(data_dir, db_us_gaap, "row", {"AAPL"})

# Show every yielded item, then the walker's return value. The walker is
# advanced by hand because the return value is only available through
# StopIteration.value.
while True:
    try:
        data = next(gen)
    except StopIteration as stop:
        summary = stop.value
        display(summary)
        break
    display(data)

# for data in get_filtered_us_gaap_form_rows_for_symbol(data_dir, db_us_gaap, "NVDA", {"10-K", "10-Q"}):
#     display(data)
    


# from simd_r_drive import DataStore

# data_store = DataStore("proto.bin")

# logging.info("Extracting concept unit value tuples...")
# extracted_concept_unit_value_data = extract_concept_unit_value_tuples(data_dir, valid_concepts, data_store)

In [None]:
# # For deterministic hashing (TODO: Move to tests)

# import hashlib
# import pickle

# # def hash_extracted_data(data: ExtractedConceptUnitValueData) -> str:
# def hash_extracted_data(data) -> str:
#     """
#     Computes a SHA-256 hash of the full extracted concept/unit/value data structure.
#     This includes tuples, unit stats, and file list — all serialized deterministically.
#     """
#     # Serialize using protocol=5 (highest and deterministic in modern Python)
#     serialized = pickle.dumps(data.dict(), protocol=5)
#     return hashlib.sha256(serialized).hexdigest()

# hash_extracted_data(extracted_concept_unit_value_data)

In [None]:
# print("Fetching...")
# extracted_concept_unit_value_data_2 = extract_concept_unit_value_tuples(data_dir, valid_concepts)

# print("Hashing...")
# hash_extracted_data(extracted_concept_unit_value_data_2)

In [None]:
# View concepts report (not needed for preprocessing but contains useful information)
# NOTE(review): in the visible notebook the import of `generate_concepts_report`
# and the assignment of `extracted_concept_unit_value_data` are both commented
# out, so this cell raises NameError on a fresh kernel unless they are restored.

generate_concepts_report(extracted_concept_unit_value_data)

In [None]:
# NOTE(review): `seed_everything` is imported but never called in the visible cells.
from utils.pytorch import get_device, seed_everything

device = get_device()

# NOTE(review): `collect_concept_unit_pairs` is not imported anywhere in the
# visible notebook, and `extracted_concept_unit_value_data` is only assigned
# in commented-out code -- this cell fails on a fresh kernel as written.
logging.info("Collecting concept unit pairs...")
concept_unit_pairs = collect_concept_unit_pairs(extracted_concept_unit_value_data)

logging.info(f"Total concept unit pairs: {len(concept_unit_pairs)}")

# NOTE(review): the only visible import of `generate_concept_unit_embeddings`
# is commented out in the imports cell.
logging.info("Generating concept unit embeddings...")
concept_unit_embeddings = generate_concept_unit_embeddings(concept_unit_pairs, device=device)
# concept_unit_embeddings_2 = generate_concept_unit_embeddings(concept_unit_pairs, device=device)



In [None]:
# import numpy as np
# import torch
# from tqdm import tqdm

# # Normalize for cosine similarity
# A = concept_unit_embeddings_1
# B = concept_unit_embeddings_2

# A_norm = A / np.linalg.norm(A, axis=1, keepdims=True)
# B_norm = B / np.linalg.norm(B, axis=1, keepdims=True)

# # Cosine similarity per row
# cos_sim = np.sum(A_norm * B_norm, axis=1)

# # Report
# print(f"Cosine similarity:")
# print(f"  Mean: {cos_sim.mean():.8f}")
# print(f"  Min:  {cos_sim.min():.8f}")
# print(f"  Std:  {cos_sim.std():.8f}")

# # Optional: show rows below threshold
# threshold = 0.999
# bad_indices = np.where(cos_sim < threshold)[0]
# print(f"\n🔻 Below {threshold}: {len(bad_indices)} / {len(cos_sim)} rows")
# if len(bad_indices):
#     for idx in bad_indices[:10]:
#         print(f"  Row {idx}: cosine = {cos_sim[idx]:.8f}")


In [None]:
# concept_unit_embeddings

In [None]:
from models.pytorch.narrative_stack.stage1.preprocessing.plots import plot_pca_explanation

# NOTE(review): presumably `dim` is the number of PCA components needed to
# reach the 0.95 variance threshold -- confirm against plot_pca_explanation.
dim = plot_pca_explanation(concept_unit_embeddings, variance_threshold=0.95)

# Show the returned value in the cell output.
display(dim)

In [None]:
from models.pytorch.narrative_stack.stage1.preprocessing import pca_compress_concept_unit_embeddings


# Returns the compressed embeddings together with a second value bound to `pca`.
# NOTE(review): 243 is hard-coded; presumably it was read off the PCA
# explanation plot in the previous cell -- confirm, or derive it from `dim`.
pca_compressed_concept_unit_embeddings, pca = pca_compress_concept_unit_embeddings(concept_unit_embeddings, n_components=243)


In [None]:
# # TODO: Prototype this

# import joblib
# import io

# # `pca` is the fitted PCA object
# buffer = io.BytesIO()
# joblib.dump(pca, buffer)
# pca_bytes = buffer.getvalue()

In [None]:
from models.pytorch.narrative_stack.stage1.preprocessing.plots import plot_semantic_embeddings

# Plot the PCA-compressed embeddings first, then the uncompressed ones, so the
# two scatterplots can be compared.
plot_semantic_embeddings(pca_compressed_concept_unit_embeddings, title="PCA Semantic Embedding Scatterplot")
plot_semantic_embeddings(concept_unit_embeddings, title="Raw Semantic Embedding Scatterplot")

In [None]:
# Bare expression: renders the compressed embeddings as the cell output.
# NOTE(review): consider showing only `.shape` or a small slice if this is large.
pca_compressed_concept_unit_embeddings

In [None]:
import logging
import numpy as np

# TODO: Add types
def save_concept_unit_value_tuples(pca_compressed_concept_unit_embeddings, concept_unit_pairs, concept_unit_value_tuples, file_path):
    """
    Save embeddings, their keys, and the value tuples to one compressed .npz archive.

    The archive contains three arrays:
        keys:                      one "<concept>::<unit>" string per pair in
                                   `concept_unit_pairs`, in the same order.
        embeddings:                `pca_compressed_concept_unit_embeddings` as given.
        concept_unit_value_tuples: `concept_unit_value_tuples` converted to an
                                   object-dtype array (loading it back requires
                                   `allow_pickle=True`).

    Args:
        pca_compressed_concept_unit_embeddings: Sized collection of embeddings;
            must have the same length as `concept_unit_pairs`.
        concept_unit_pairs: Iterable of 2-item (concept, unit) pairs.
        concept_unit_value_tuples: Sized collection stored alongside the embeddings.
        file_path: Destination passed to `np.savez_compressed`. When it is a
            string or path-like, missing parent directories are created first.

    Raises:
        AssertionError: If the number of embeddings differs from the number of pairs.
    """
    import os
    from pathlib import Path

    assert len(pca_compressed_concept_unit_embeddings) == len(concept_unit_pairs), \
        f"Mismatch: {len(pca_compressed_concept_unit_embeddings)} embeddings vs {len(concept_unit_pairs)} keys"

    # np.savez_compressed does not create directories; without this a missing
    # target folder (e.g. a fresh checkout without "data/") raises
    # FileNotFoundError only after all upstream work has been done.
    # File-like objects are left untouched.
    if isinstance(file_path, (str, os.PathLike)):
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)

    # Save both embeddings and tuples
    np.savez_compressed(
        file_path,
        keys=np.array([f"{c}::{u}" for c, u in concept_unit_pairs]),
        embeddings=pca_compressed_concept_unit_embeddings,
        concept_unit_value_tuples=np.array(concept_unit_value_tuples, dtype=object)
    )

    logging.info(f"Saved {len(concept_unit_value_tuples):,} tuples and {len(pca_compressed_concept_unit_embeddings):,} embeddings to '{file_path}'")


# NOTE(review): `extracted_concept_unit_value_data` is only assigned in
# commented-out code in the visible notebook; this call raises NameError on a
# fresh kernel unless that extraction step is restored.
save_concept_unit_value_tuples(
    pca_compressed_concept_unit_embeddings,
    concept_unit_pairs,
    extracted_concept_unit_value_data.concept_unit_value_tuples,
    "data/stage1_latents.npz" # TODO: Rename! These are not latent vectors!
)

# save_concept_unit_value_tuples(
#     pca_compressed_concept_unit_embeddings_2,
#     concept_unit_pairs,
#     extracted_concept_unit_value_data.concept_unit_value_tuples,
#     "data/stage1_latents_new_2.npz"
# )

In [None]:
# TODO: Validate subsequent

# import numpy as np

# # Load from disk
# new_data = np.load("data/stage1_latents_new.npz", allow_pickle=True)
# new_concept_unit_value_tuples = new_data["concept_unit_value_tuples"].tolist()
# new_embeddings = new_data["embeddings"]

# # Check shape match
# assert len(pca_compressed_concept_unit_embeddings) == len(new_embeddings), \
#     "Mismatch in embedding row counts"

# # Cosine similarity check
# def cosine_similarity(a, b):
#     a = a / np.linalg.norm(a)
#     b = b / np.linalg.norm(b)
#     return np.dot(a, b)

# cos_sims = []
# for a_vec, b_vec in zip(pca_compressed_concept_unit_embeddings, new_embeddings):
#     sim = cosine_similarity(a_vec.astype(np.float64), b_vec.astype(np.float64))
#     cos_sims.append(sim)

# # Report
# cos_sims = np.array(cos_sims)
# print(f"✅ Compared {len(cos_sims)} rows")
# print(f"🔹 Mean cosine similarity: {cos_sims.mean():.8f}")
# print(f"🔹 Min cosine similarity:  {cos_sims.min():.8f}")
# print(f"🔹 Std dev:                {cos_sims.std():.8f}")


In [None]:
# import numpy as np

# # Load saved latent data
# old_data = np.load("data/stage1_latents.npz", allow_pickle=True)

# # Build embedding map
# embedding_map = {
#     tuple(key.split("::", 1)): vec
#     for key, vec in zip(old_data["keys"], old_data["embeddings"])
# }

# # Load concept-unit-value tuples
# old_concept_unit_value_tuples = old_data["concept_unit_value_tuples"].tolist()

# # Load saved latent data
# new_data = np.load("data/stage1_latents_new.npz", allow_pickle=True)

# # Build embedding map
# embedding_map = {
#     tuple(key.split("::", 1)): vec
#     for key, vec in zip(new_data["keys"], new_data["embeddings"])
# }

# # Load concept-unit-value tuples
# new_concept_unit_value_tuples = new_data["concept_unit_value_tuples"].tolist()

In [None]:
# import numpy as np
# from hashlib import sha256

# # a = np.load("data/stage1_latents.npz", allow_pickle=True)
# a = np.load("data/stage1_latents_new_1.npz", allow_pickle=True)
# b = np.load("data/stage1_latents_new_2.npz", allow_pickle=True)

# # with open("data/stage1_latents_new_1.pkl", "rb") as f:
# #     a = pickle.load(f)

# # with open("data/stage1_latents_new_2.pkl", "rb") as f:
# #     b = pickle.load(f)

# def hash_array(arr):
#     return sha256(np.ascontiguousarray(arr)).hexdigest()

# print("Hash a[keys]:", hash_array(a["keys"]))
# print("Hash b[keys]:", hash_array(b["keys"]))

# for k in a.files:
# # for k in a.keys():
#     print(f"Checking: {k}")
#     if k == "embeddings":
#         A = a[k].astype(np.float32)
#         B = b[k].astype(np.float32)
#         assert A.shape == B.shape, "Shape mismatch in embeddings"

#         # Normalize to unit vectors
#         A_norm = A / np.linalg.norm(A, axis=1, keepdims=True)
#         B_norm = B / np.linalg.norm(B, axis=1, keepdims=True)

#         # Cosine similarity
#         cos_sim = np.sum(A_norm * B_norm, axis=1)
#         mean_sim = np.mean(cos_sim)
#         min_sim = np.min(cos_sim)

#         print(f"Mean cosine similarity: {mean_sim:.8f}")
#         print(f"Min cosine similarity:  {min_sim:.8f}")
#         print(f"Std cosine similarity:  {cos_sim.std():.8f}")
#         assert min_sim > 0.999, f"Cosine similarity too low in embeddings: {min_sim}"
#     else:
#         assert np.array_equal(a[k], b[k]), f"Mismatch in {k}"
