In [None]:
import os
from pathlib import Path

# This snippet ensures consistent import paths across environments.
# When running notebooks via JupyterLab's web UI, the current working
# directory is often different (e.g., /notebooks) compared to VS Code,
# which typically starts at the project root. This handles that by 
# retrying the import after changing to the parent directory.
# 
# Include this at the top of every notebook to standardize imports
# across development environments.

# First attempt: import the project helper relative to the current working directory.
try:
    from utils.os import chdir_to_git_root
except ModuleNotFoundError:
    # `utils` is not importable from here, so step up one directory and try again.
    os.chdir(Path.cwd().parent)
    print(f"Retrying import from: {os.getcwd()}")
    from utils.os import chdir_to_git_root

# NOTE(review): presumably switches the working directory to the "python"
# folder of the git checkout -- confirm against utils.os.
chdir_to_git_root("python")

# Echo the working directory that every later relative path is resolved against.
print(os.getcwd())

In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
from db import DB

# Root folder that is searched recursively for CSV exports.
data_dir = "../data/us-gaap"
db = DB()

# Concept names known to the database; only CSV columns with these names are scanned.
concept_df = db.get("SELECT name FROM us_gaap_concept", ["name"])
valid_concepts = set(concept_df["name"].values)

# Accumulators filled by the scan:
#   unit_values       unit -> every numeric value seen with that unit
#   unit_concepts     unit -> concepts that carried a numeric value in that unit
#   non_numeric_units units seen next to a value that float() could not parse
unit_values, unit_concepts = defaultdict(list), defaultdict(set)
non_numeric_units = set()

# Every *.csv file below data_dir, in os.walk order.
csv_files = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk(data_dir)
    for filename in filenames
    if filename.endswith(".csv")
]

for path in tqdm(csv_files, desc="Scanning CSVs"):
    try:
        frame = pd.read_csv(path, low_memory=False)

        # Only columns named after a known US GAAP concept are inspected;
        # a file without any such column contributes nothing.
        for concept_col in [name for name in frame.columns if name in valid_concepts]:
            for cell in frame[concept_col].dropna().astype(str):
                # Cells are expected to look like "<value>::<unit>"; anything
                # without the separator is ignored.
                raw_value, separator, raw_unit = cell.partition("::")
                if not separator:
                    continue

                # Units are compared case-insensitively, so normalise to upper case.
                unit_key = raw_unit.strip().upper()

                try:
                    parsed = float(raw_value.strip())
                except ValueError:
                    # Value part is not a number: remember the unit only.
                    non_numeric_units.add(unit_key)
                else:
                    unit_values[unit_key].append(parsed)
                    unit_concepts[unit_key].add(concept_col)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the next one.
        print(f"⚠️ Skipped {path} due to error: {e}")

# Step 4: Headline numbers for the scan
print(f"\n✅ Scanned {len(csv_files)} files.")
print(f"📦 Found {len(unit_values)} numeric units and {len(non_numeric_units)} non-numeric units.")

# Step 5: Descriptive statistics for each numeric unit, in alphabetical unit order
for unit in sorted(unit_values):
    arr = np.asarray(unit_values[unit])
    print(f"🔹 {unit}")
    print(f"   Count: {len(arr)}")
    print(f"   Min:   {arr.min():,.4f}")
    print(f"   Max:   {arr.max():,.4f}")
    print(f"   Mean:  {arr.mean():,.4f}")
    print(f"   Std:   {arr.std():,.4f}")
    print(f"   Concepts: {', '.join(sorted(unit_concepts[unit]))}")

# Step 6: List the units whose value part could not be parsed as a number
if non_numeric_units:
    print("\n⚠️ Non-numeric units encountered:")
    print("\n".join(f"  - {unit}" for unit in sorted(non_numeric_units)))



In [None]:
# Step 7: Build concept/unit dataset
#
# `unit_concepts` maps each unit to a *set* of concept names. Set iteration
# order for strings changes between Python processes (hash randomisation), so
# the pairs are emitted in sorted order instead. Everything downstream
# (embedding rows, PCA, cluster labels, saved keys) is aligned by position with
# this list, so a stable order makes re-runs comparable.
concept_unit_pairs = [
    (concept, unit)
    for unit in sorted(unit_concepts)
    for concept in sorted(unit_concepts[unit])
]

# Convert to DataFrame
# concept_unit_df = pd.DataFrame(concept_unit_pairs,
#                                columns=["concept", "unit"])
# concept_unit_df.to_csv("data/concept_unit_pairs.csv", index=False)
# print("✅ data/concept_unit_pairs.csv saved.")


In [None]:
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from utils import generate_us_gaap_description

# One sentence per (concept, unit) pair, in the same order as concept_unit_pairs.
input_texts = [
    f"{generate_us_gaap_description(concept)} measured in {unit}"
    for concept, unit in concept_unit_pairs
]

# Run on the MPS backend when PyTorch reports it as available, otherwise on the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the sentence-embedding model and move its weights to the chosen device.
model = SentenceTransformer("BAAI/bge-large-en-v1.5")
model.to(device)

def encode_on_device(texts, model, batch_size=64, device=None):
    """
    Encode `texts` in batches and return the sentence embeddings as a NumPy array.

    Parameters:
        texts (List[str]): Sentences to encode.
        model: Object exposing `tokenize(batch)`, `forward(features)` (returning a
            dict with a "sentence_embedding" tensor), `eval()`, `device` and
            `get_sentence_embedding_dimension()` -- e.g. a SentenceTransformer.
        batch_size (int): Number of texts tokenised and encoded per step.
        device (optional): Device the tokenised tensors are moved to. Defaults to
            `model.device`, so inputs always land where the weights are instead
            of depending on a notebook-level `device` variable.

    Returns:
        np.ndarray: One row per input text. For empty `texts` an array with zero
        rows is returned instead of letting `torch.cat` fail on an empty list.

    Side effects:
        Puts `model` into eval mode (and leaves it there) so that layers such as
        dropout do not add noise to the embeddings.
    """
    if device is None:
        device = model.device

    model.eval()

    if len(texts) == 0:
        # The dimension may be reported as None; fall back to zero columns then.
        dim = model.get_sentence_embedding_dimension() or 0
        return torch.empty((0, dim)).numpy()

    all_embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Encoding"):
            batch = texts[i:i + batch_size]
            tokens = model.tokenize(batch)
            # Move tensors to the target device; leave any non-tensor entry untouched.
            tokens = {
                k: (v.to(device) if isinstance(v, torch.Tensor) else v)
                for k, v in tokens.items()
            }
            output = model.forward(tokens)
            # Collect on the CPU so accelerator memory is released batch by batch.
            all_embeddings.append(output["sentence_embedding"].cpu())
    return torch.cat(all_embeddings).numpy()

# Encode every concept/unit sentence; the result is a NumPy array with one row per text.
embeddings = encode_on_device(input_texts, model)


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Fit a full PCA (all components kept) on the embeddings -- noted upstream as an
# np.ndarray of shape (N, 1024) -- to see how much variance each extra component adds.
pca = PCA()
pca.fit(embeddings)

# Running total of the per-component variance ratios.
explained = pca.explained_variance_ratio_.cumsum()
component_counts = np.arange(1, len(explained) + 1)

fig, ax = plt.subplots()
ax.plot(component_counts, explained)
ax.set_xlabel("Number of PCA components")
ax.set_ylabel("Cumulative explained variance")
ax.grid(True)
ax.axhline(0.95, color='red', linestyle='--')  # e.g. 95% threshold
ax.set_title("Explained Variance vs PCA Components")
plt.show()


In [None]:
import numpy as np
from sklearn.decomposition import PCA
import joblib

# Assuming `embeddings` is your (N, 1024) array
n_components = 150  # or 128 if you're more memory-conscious

# random_state is pinned so the projection is identical across re-runs: with a
# reduced component count scikit-learn may pick a randomized SVD solver, whose
# output otherwise changes from run to run.
pca = PCA(n_components=n_components, random_state=42)

# Fit PCA and transform the embeddings -> array of shape (N, n_components)
compressed = pca.fit_transform(embeddings)

# # Save PCA model and compressed embeddings
# joblib.dump(pca, "pca_model.joblib")
# np.save("concept_uom_embeddings_pca.npy", compressed)


In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import
import numpy as np

def plot_embeddings(embeddings, labels=None, title="Embedding Scatterplot"):
    """
    Draw the rows of `embeddings` as points in a 2D or 3D scatterplot and show it.

    Parameters:
        embeddings (np.ndarray): Point coordinates, shape (N, 2) or (N, 3).
        labels (List[str], optional): If given, the i-th label is written next
            to the i-th point.
        title (str): Title placed above the axes.

    Raises:
        AssertionError: If the second dimension of `embeddings` is not 2 or 3.
    """
    n_dims = embeddings.shape[1]
    assert n_dims in (2, 3), "Embeddings must be 2D or 3D for scatterplot"

    figure = plt.figure(figsize=(10, 8))

    # A 3D projection is only requested when there is a third coordinate.
    if n_dims == 3:
        axes = figure.add_subplot(111, projection='3d')
    else:
        axes = figure.add_subplot(111)

    # Each row of the transposed array is one coordinate axis (x, y[, z]).
    axes.scatter(*embeddings.T, s=10, alpha=0.7)

    if labels is not None:
        for row_idx, text in enumerate(labels):
            # Unpacking the point supplies x, y (and z in 3D) positionally.
            axes.text(*embeddings[row_idx], text, fontsize=6)

    axes.set_title(title)
    plt.tight_layout()
    plt.show()

# Scatter only the first two columns of the PCA-compressed embeddings (2D plot).
plot_embeddings(compressed[:, :2])

In [None]:
import hdbscan
import umap
import matplotlib.pyplot as plt
from collections import defaultdict

# Cluster the PCA-reduced embeddings; HDBSCAN labels points it cannot assign as -1.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=5, cluster_selection_method="leaf")
labels = clusterer.fit_predict(compressed)  # PCA-reduced embeddings

# Group input_texts by cluster (labels[i] belongs to input_texts[i])
clusters = defaultdict(list)
for idx, label in enumerate(labels):
    clusters[label].append(input_texts[idx])

# Print samples from each cluster
for cluster_id, examples in clusters.items():
    if cluster_id == -1:
        continue  # Skip noise
    print(f"\n📦 Cluster {cluster_id} ({len(examples)} samples):")
    for e in examples[:10]:
        print(f"  - {e}")

# UMAP visualization
# random_state is fixed because UMAP's layout is stochastic: without a seed the
# 2D picture changes on every run, which makes the plot impossible to compare
# between sessions.
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine", random_state=42)
umap_2d = umap_model.fit_transform(compressed)

plt.figure(figsize=(10, 6))
plt.scatter(umap_2d[:, 0], umap_2d[:, 1], c=labels, cmap="tab10", s=5)
plt.title("Concept/UOM Embeddings Clustered")
plt.show()


In [None]:
# One row per (concept, unit) pair, with the cluster label assigned to that pair.
df = pd.DataFrame(concept_unit_pairs, columns=["concept", "unit"]).assign(cluster=labels)
grouped = df.groupby("cluster")

# Show up to ten members of every cluster (the noise label -1 included).
for cluster_id, members in grouped:
    print(f"\nCluster {cluster_id} ({len(members)} items):")
    print(members.head(10).to_string(index=False))

# Rows labelled -1 are the points left unclustered.
noise = df.loc[df["cluster"].eq(-1)]

print(f"Noise points: {len(noise)}")


In [None]:
# Keep only the unclustered rows (label -1) and renumber them from zero.
is_noise = df["cluster"] == -1
noise_points = df.loc[is_noise, ["concept", "unit"]].reset_index(drop=True)

# Written relative to the current working directory, index column included.
noise_points.to_csv("noise_points.csv")

In [None]:
# pca = joblib.load("pca_model.joblib")
# compressed = np.load("concept_uom_embeddings_pca.npy")

In [None]:
import numpy as np

# Assuming these are already defined in your notebook
# - `concept_unit_pairs` is a list of (concept, unit) tuples
# - `compressed` is the corresponding array of embeddings

# Single source of truth for the output location, so the log line below cannot
# drift from the path that is actually written.
latents_path = "data/stage1_latents.npz"

# np.savez_compressed does not create missing folders; make sure `data/` exists.
os.makedirs(os.path.dirname(latents_path), exist_ok=True)

# Build a NumPy string array of "concept::unit" keys, aligned row-for-row with `compressed`
keys = np.array([f"{c}::{u}" for c, u in concept_unit_pairs])

# Save compressed embeddings
np.savez_compressed(latents_path, keys=keys, embeddings=compressed)
print(f"✅ Saved {len(keys)} embeddings to '{latents_path}'")


In [None]:
import numpy as np

# keys: ["Revenue::USD", "Assets::EUR", ...]
# embeddings: numpy array of shape (N, n_components) -- the PCA-compressed rows
# saved above (n_components is 150 in this notebook), one row per key.
#
# The archive is opened in a `with` block so its file handle is closed once the
# lookup table is built; arrays are read into memory when indexed, so the
# vectors stay valid afterwards.
with np.load("data/stage1_latents.npz") as data:
    # (concept, unit) -> embedding vector
    embedding_map = {
        tuple(key.split("::", 1)): vec
        for key, vec in zip(data["keys"], data["embeddings"])
    }


In [None]:
import os
import pandas as pd
from db import DB
from tqdm import tqdm

# Instantiate the project's DB helper; it is queried below for the US GAAP concept names.
db = DB()

def extract_concept_unit_value_tuples(data_dir, valid_concepts):
    """
    Collect every finite numeric "<value>::<unit>" cell found in the CSV files
    below `data_dir`.

    Parameters:
        data_dir (str): Folder that is walked recursively for `*.csv` files.
        valid_concepts (Container[str]): Column names to inspect; all other
            columns are ignored.

    Returns:
        List[Tuple[str, str, float]]: `(concept, unit, value)` rows, where
        `concept` is the column name, `unit` is the text after "::" stripped and
        upper-cased, and `value` is the parsed number. Files are visited in
        sorted path order so the result does not depend on the filesystem's
        directory listing order.

    Notes:
        * Cells without "::" and cells whose value part is not a number are skipped.
        * `float()` also accepts spellings such as "nan" and "inf". Those are
          skipped too, because a non-finite value would corrupt any statistic
          or scaler later computed from these rows.
        * A file that cannot be read or processed is reported and skipped; rows
          already collected from it are kept.
    """
    import math  # local import keeps this function self-contained

    all_files = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(data_dir)
        for file in files
        if file.endswith(".csv")
    )

    rows = []
    for path in tqdm(all_files, desc="Scanning CSV files"):
        try:
            df = pd.read_csv(path, low_memory=False)
            tag_columns = [col for col in df.columns if col in valid_concepts]
            for col in tag_columns:
                for val in df[col].dropna().astype(str):
                    if "::" not in val:
                        continue
                    val_part, unit_part = val.split("::", 1)
                    unit_part = unit_part.strip().upper()
                    try:
                        num_val = float(val_part.strip())
                    except ValueError:
                        continue
                    if not math.isfinite(num_val):
                        continue
                    rows.append((col, unit_part, num_val))
        except Exception as e:
            # Deliberate best effort: one bad file must not abort the whole scan.
            print(f"⚠️ Skipped {path}: {e}")
    return rows


# Names of all US GAAP concepts stored in the database; used as the column whitelist.
concept_df = db.get("SELECT name FROM us_gaap_concept", ["name"])
valid_concepts = set(concept_df["name"].values)

# Every numeric (concept, unit, value) observation found under ../data/us-gaap.
concept_unit_value_tuples = extract_concept_unit_value_tuples(
    data_dir="../data/us-gaap",
    valid_concepts=valid_concepts,
)



In [None]:
# concept_unit_value_tuples

In [None]:
from collections import defaultdict
from sklearn.preprocessing import StandardScaler

# Step 1: Bucket the raw values by their (concept, unit) pair
grouped = defaultdict(list)
for concept_name, unit_name, raw_value in concept_unit_value_tuples:
    grouped[concept_name, unit_name].append(raw_value)

# Step 2: Standardise each bucket on its own
#   scalers        (concept, unit) -> the StandardScaler fitted on that bucket
#   scaled_tuples  (concept, unit, scaled value) rows, bucket by bucket
scalers, scaled_tuples = {}, []

for (concept_name, unit_name), raw_values in tqdm(grouped.items(), desc="Scaling per concept/unit"):
    # StandardScaler expects a 2D (n_samples, n_features) array: one column here.
    column = np.asarray(raw_values).reshape(-1, 1)
    fitted = StandardScaler().fit(column)
    scalers[concept_name, unit_name] = fitted

    # Flatten the scaled column back to 1D and re-attach the pair to each value.
    for scaled in fitted.transform(column).ravel():
        scaled_tuples.append((concept_name, unit_name, scaled))


In [None]:
# Show the number of fitted scalers and a five-entry sample. Displaying the whole
# dict would print one repr per (concept, unit) pair and flood the cell output.
print(f"{len(scalers)} fitted scalers; first 5 shown below")
dict(list(scalers.items())[:5])

In [None]:
import torch
from torch import nn
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Stage 1 dataset: concept+uom embedding + value
class ConceptValueDataset(Dataset):
    """
    Map-style dataset that turns (concept, unit, value) rows into autoencoder samples.

    Each sample is the embedding of the row's (concept, unit) pair with the
    row's value appended as the last element. Input and target are equal in
    content but are separate tensors.
    """

    def __init__(self, concept_unit_value_tuples, embedding_lookup):
        """
        Args:
            concept_unit_value_tuples: List of (concept, unit, value)
            embedding_lookup: Dict[(concept, unit)] -> np.ndarray
        """
        self.rows = concept_unit_value_tuples
        self.lookup = embedding_lookup

    def __len__(self):
        """Number of (concept, unit, value) rows."""
        return len(self.rows)

    def __getitem__(self, idx):
        """
        Return `(x, y)` for row `idx`, both float32 tensors of length len(embedding) + 1.

        Raises:
            ValueError: If the row's (concept, unit) pair has no embedding.
        """
        concept, unit, value = self.rows[idx]
        pair = (concept, unit)

        try:
            vector = self.lookup[pair]
        except KeyError:
            raise ValueError(f"Missing embedding for ({concept}, {unit})")

        # Embedding first, value last: the training code slices on that layout.
        features = np.concatenate([vector, [value]])
        x = torch.tensor(features, dtype=torch.float32)
        # The target carries the same numbers in its own storage.
        return x, x.clone()

# LightningModule
class Stage1Autoencoder(pl.LightningModule):
    """
    Two-layer MLP autoencoder over vectors laid out as [embedding..., value].

    The reconstruction loss is a weighted sum of two mean-squared errors: one
    over the embedding part (every element except the last) and one over the
    value part (the last element).

    Args:
        input_dim (int): Length of the input vector (embedding size + 1).
        latent_dim (int): Size of the bottleneck representation.
        lr (float): Learning rate for Adam.
        embedding_weight (float): Weight of the embedding reconstruction loss.
        value_weight (float): Weight of the value reconstruction loss.

    The defaults reproduce the weights 0.2 and 1.0 that were previously
    hard-coded in the step methods. All arguments are stored in `self.hparams`.
    """

    def __init__(self, input_dim=129, latent_dim=64, lr=1e-3,
                 embedding_weight=0.2, value_weight=1.0):
        super().__init__()
        self.save_hyperparameters()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim)
        )
        self.loss_fn = nn.MSELoss()

    def forward(self, x):
        """Encode `x` to the latent space and decode it back to `input_dim`."""
        z = self.encoder(x)
        return self.decoder(z)

    def _shared_step(self, batch, stage):
        """
        Compute and log the weighted reconstruction loss for one batch.

        Args:
            batch: `(x, target)` pair of tensors shaped (batch, input_dim).
            stage (str): Prefix for the logged metric names ("train" or "val").

        Returns:
            torch.Tensor: The scalar weighted loss.
        """
        x, target = batch
        recon = self(x)

        # Last column is the value; everything before it is the embedding.
        # `[:, -1:]` keeps the column dimension, i.e. shape (batch, 1).
        embedding_loss = self.loss_fn(recon[:, :-1], target[:, :-1])
        value_loss = self.loss_fn(recon[:, -1:], target[:, -1:])
        loss = (
            self.hparams.embedding_weight * embedding_loss
            + self.hparams.value_weight * value_loss
        )

        self.log(f"{stage}_loss", loss, prog_bar=True)
        self.log(f"{stage}_embedding_loss", embedding_loss, prog_bar=False)
        self.log(f"{stage}_value_loss", value_loss, prog_bar=False)
        return loss

    def training_step(self, batch, batch_idx):
        """Log `train_*` metrics and return the loss to optimise."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Log `val_*` metrics and return the validation loss."""
        return self._shared_step(batch, "val")

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

# Example usage:
# Train on the standardised values (`scaled_tuples`), not on the raw ones.
# The raw figures in `concept_unit_value_tuples` are on wildly different scales
# per concept/unit, which would let the value term swamp the MSE loss; the
# per-pair StandardScaler step earlier in the notebook exists to prevent that.
# Use `scalers[(concept, unit)]` to map reconstructed values back to raw units.
dataset = ConceptValueDataset(scaled_tuples, embedding_map)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Determine true input dimension (embedding length + 1) from an actual sample
sample_x, _ = dataset[0]
model = Stage1Autoencoder(input_dim=sample_x.shape[0], latent_dim=64)

# NOTE(review): only a training dataloader is passed to fit(), so
# `validation_step` is presumably never exercised here -- confirm whether a
# held-out split should be added.
trainer = pl.Trainer(max_epochs=20, accelerator="mps" if torch.backends.mps.is_available() else "cpu")
trainer.fit(model, dataloader)
