In [None]:
import os
from pathlib import Path

# This snippet ensures consistent import paths across environments.
# When running notebooks via JupyterLab's web UI, the current working
# directory is often different (e.g., /notebooks) compared to VS Code,
# which typically starts at the project root. This handles that by 
# retrying the import after changing to the parent directory.
# 
# Include this at the top of every notebook to standardize imports
# across development environments.

# First attempt: succeeds when the project-local `utils` package is importable
# from the kernel's current working directory.
try:
    from utils.os import chdir_to_git_root
except ModuleNotFoundError:
    # Fallback: move one directory up and retry the very same import.
    os.chdir(Path.cwd().parent)
    print(f"Retrying import from: {os.getcwd()}")
    from utils.os import chdir_to_git_root

# NOTE(review): presumably switches the working directory to the "python"
# folder under the git root — confirm against utils.os.
chdir_to_git_root("python")

# Show the directory that the relative paths used below resolve against.
print(os.getcwd())

In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
from db import DB

# === Extraction Logic ===
def extract_concept_unit_value_tuples(data_dir, valid_concepts):
    """
    Scan every CSV under `data_dir` and collect "<number>::<unit>" cell values.

    Only columns whose name is in `valid_concepts` are inspected. Each cell is
    split on the first "::"; the right-hand side is stripped and upper-cased
    to form the unit, the left-hand side is parsed as a float.

    Parameters:
        data_dir (str): Directory that is walked recursively for "*.csv" files.
        valid_concepts (Container[str]): Column names to treat as concepts.

    Returns:
        tuple: (rows, unit_values, unit_concepts, non_numeric_units, csv_files)
            rows: list of (concept, unit, value) tuples, in scan order.
            unit_values: dict unit -> list of parsed values.
            unit_concepts: dict unit -> set of concepts seen with that unit.
            non_numeric_units: set of units that appeared with a value that
                is not a finite number (unparseable text, NaN or +/-inf).
            csv_files: sorted list of every CSV path that was considered.

    Files that cannot be read or processed are reported on stdout and skipped.
    """
    rows = []
    unit_values = defaultdict(list)
    unit_concepts = defaultdict(set)
    non_numeric_units = set()

    # os.walk yields entries in arbitrary directory-listing order. Sorting
    # makes the order of `rows` (and every slice taken from it later)
    # reproducible across machines and runs.
    csv_files = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(data_dir)
        for file in files
        if file.endswith(".csv")
    )

    for path in tqdm(csv_files, desc="Scanning CSV files"):
        try:
            df = pd.read_csv(path, low_memory=False)
            tag_columns = [col for col in df.columns if col in valid_concepts]
            if not tag_columns:
                continue

            for col in tag_columns:
                for val in df[col].dropna().astype(str):
                    if "::" not in val:
                        continue
                    val_part, unit_part = val.split("::", 1)
                    unit_part = unit_part.strip().upper()
                    try:
                        num_val = float(val_part.strip())
                    except ValueError:
                        non_numeric_units.add(unit_part)
                        continue
                    # float() happily parses "nan" and "inf"; such values
                    # would corrupt the summary statistics and any scaler
                    # fitted on them, so treat them as non-numeric.
                    if not np.isfinite(num_val):
                        non_numeric_units.add(unit_part)
                        continue
                    rows.append((col, unit_part, num_val))
                    unit_values[unit_part].append(num_val)
                    unit_concepts[unit_part].add(col)
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole scan.
            print(f"⚠️ Skipped {path}: {e}")

    return rows, unit_values, unit_concepts, non_numeric_units, csv_files

# === Main Execution ===
# Directory that is scanned recursively for CSV files.
data_dir = "../data/us-gaap"

# Concept names come from the database; only columns with these names are read.
db = DB()
concept_df = db.get("SELECT name FROM us_gaap_concept", ["name"])
valid_concepts = set(concept_df["name"].values)

(
    concept_unit_value_tuples,
    unit_values,
    unit_concepts,
    non_numeric_units,
    csv_files,
) = extract_concept_unit_value_tuples(data_dir, valid_concepts)

print(f"\n✅ Scanned {len(csv_files)} files.")
print(f"📦 Found {len(unit_values)} numeric units and {len(non_numeric_units)} non-numeric units.")

# Summary statistics for each unit, in alphabetical unit order.
for unit in sorted(unit_values):
    values = unit_values[unit]
    arr = np.array(values)
    print(f"🔹 {unit}")
    print(f"   Count: {len(arr)}")
    print(f"   Min:   {arr.min():,.4f}")
    print(f"   Max:   {arr.max():,.4f}")
    print(f"   Mean:  {arr.mean():,.4f}")
    print(f"   Std:   {arr.std():,.4f}")
    print(f"   Concepts: {', '.join(sorted(unit_concepts[unit]))}")

# List the units that showed up with at least one unparseable value.
if len(non_numeric_units) > 0:
    print("\n⚠️ Non-numeric units encountered:")
    for unit in sorted(non_numeric_units):
        print(f"  - {unit}")


In [None]:
# Report how many (concept, unit, value) rows the scan produced in total.
print(f"\n🧮 Total values extracted: {len(concept_unit_value_tuples):,}")


In [None]:
# Step 7: Build concept/unit dataset
# Flatten the unit -> {concepts} mapping into one (concept, unit) pair per
# combination that was actually observed in the data.
concept_unit_pairs = [
    (concept, unit)
    for unit, concepts in unit_concepts.items()
    for concept in concepts
]

# Convert to DataFrame
# concept_unit_df = pd.DataFrame(concept_unit_pairs,
#                                columns=["concept", "unit"])
# concept_unit_df.to_csv("data/concept_unit_pairs.csv", index=False)
# print("✅ data/concept_unit_pairs.csv saved.")


In [None]:
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from utils import generate_us_gaap_description

# One text per (concept, unit) pair: the generated concept description followed
# by the unit. These strings are what gets embedded.
input_texts = [f"{generate_us_gaap_description(concept)} measured in {unit}" for concept, unit in concept_unit_pairs]

# Use Apple's MPS backend when PyTorch reports it as available, else the CPU.
# NOTE(review): CUDA is never selected here — confirm that is intended.
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = SentenceTransformer("BAAI/bge-large-en-v1.5")
model.to(device)

def encode_on_device(texts, model, batch_size=64):
    """
    Embed `texts` in batches by calling the model's tokenizer and forward pass.

    Parameters:
        texts (Sequence[str]): Sentences to embed.
        model: A torch module exposing `tokenize(batch)` (returning a dict of
            tensors) and `forward(tokens)` (returning a dict that contains
            "sentence_embedding").
        batch_size (int): Number of texts per forward pass.

    Returns:
        np.ndarray: Row-wise concatenation of every batch's
        "sentence_embedding" output, moved to the CPU.

    Side effects:
        Puts `model` into evaluation mode (and leaves it there), so dropout
        and similar train-only layers cannot make the embeddings random.
    """
    # Take the device from the model itself instead of a notebook-level
    # `device` global, which a later cell rebinds to something else.
    target_device = next(model.parameters()).device
    model.eval()

    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding"):
        batch = texts[i:i+batch_size]
        tokens = model.tokenize(batch)
        tokens = {k: v.to(target_device) for k, v in tokens.items()}
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            output = model.forward(tokens)
            embeddings = output["sentence_embedding"]
        # Move each batch off the accelerator right away to keep its memory free.
        all_embeddings.append(embeddings.cpu())
    return torch.cat(all_embeddings).numpy()

# Embed every entry of `input_texts` with the model loaded above.
embeddings = encode_on_device(input_texts, model)


In [None]:
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
# import numpy as np

# # embeddings: np.ndarray of shape (N, 1024)
# pca = PCA()
# pca.fit(embeddings)

# explained = np.cumsum(pca.explained_variance_ratio_)

# plt.plot(np.arange(1, len(explained)+1), explained)
# plt.xlabel("Number of PCA components")
# plt.ylabel("Cumulative explained variance")
# plt.grid(True)
# plt.axhline(0.95, color='red', linestyle='--')  # e.g. 95% threshold
# plt.title("Explained Variance vs PCA Components")
# plt.show()


In [None]:
import numpy as np
from sklearn.decomposition import PCA
import joblib

# Assuming `embeddings` is your (N, 1024) array
n_components = 200  # or 128 if you're more memory-conscious

# Pin the seed: for an input of this size scikit-learn's automatic solver
# choice can pick the randomized SVD, which without a fixed `random_state`
# yields slightly different components (and therefore different saved
# latents) on every run.
pca = PCA(n_components=n_components, random_state=42)

# Fit PCA and transform the embeddings
compressed = pca.fit_transform(embeddings)

# # Save PCA model and compressed embeddings
# joblib.dump(pca, "pca_model.joblib")
# np.save("concept_uom_embeddings_pca.npy", compressed)


In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import
import numpy as np

def plot_embeddings(embeddings, labels=None, title="Embedding Scatterplot"):
    """
    Draw a scatterplot of low-dimensional embeddings and show it.

    Parameters:
        embeddings (np.ndarray): Points to plot, shape (N, 2) or (N, 3).
        labels (List[str], optional): When given, label i is written next to
            point i.
        title (str): Title placed on the axes.

    Raises:
        AssertionError: If the second dimension is neither 2 nor 3.
    """
    dim = embeddings.shape[1]
    assert dim in (2, 3), "Embeddings must be 2D or 3D for scatterplot"

    # A 3-column input needs a 3D projection; a 2-column input uses plain axes.
    subplot_kwargs = {"projection": '3d'} if dim == 3 else {}

    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, **subplot_kwargs)

    # Transposing yields one coordinate array per axis (x, y[, z]).
    ax.scatter(*embeddings.T, s=10, alpha=0.7)

    if labels is not None:
        for i, label in enumerate(labels):
            # Unpacks to (x, y) or (x, y, z) depending on the input width.
            ax.text(*embeddings[i], label, fontsize=6)

    ax.set_title(title)
    plt.tight_layout()
    plt.show()

# Quick visual check: scatter the first two columns of the PCA output.
plot_embeddings(compressed[:, :2])

In [None]:
# import hdbscan
# import umap
# import matplotlib.pyplot as plt
# from collections import defaultdict

# # Cluster
# # clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=5, cluster_selection_method="leaf")
# clusterer = hdbscan.HDBSCAN(min_cluster_size=20, min_samples=10)
# labels = clusterer.fit_predict(compressed)  # PCA-reduced embeddings

# # Group input_texts by cluster
# clusters = defaultdict(list)
# for idx, label in enumerate(labels):
#     clusters[label].append(input_texts[idx])

# # Print samples from each cluster
# for cluster_id, examples in clusters.items():
#     if cluster_id == -1:
#         continue  # Skip noise
#     print(f"\n📦 Cluster {cluster_id} ({len(examples)} samples):")
#     for e in examples[:10]:
#         print(f"  - {e}")




In [None]:
# # UMAP visualization
# umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine")
# umap_2d = umap_model.fit_transform(compressed)

# plt.figure(figsize=(10, 6))
# plt.scatter(umap_2d[:, 0], umap_2d[:, 1], c=labels, cmap="tab10", s=5)
# plt.title("Concept/UOM Embeddings Clustered")
# plt.show()

In [None]:
# df = pd.DataFrame({
#     "concept": [c for c, _ in concept_unit_pairs],
#     "unit": [u for _, u in concept_unit_pairs],
#     "cluster": labels
# })
# grouped = df.groupby("cluster")

# for cluster_id, group in grouped:
#     print(f"\nCluster {cluster_id} ({len(group)} items):")
#     print(group.head(10).to_string(index=False))

# noise = df[df["cluster"] == -1]

# print(f"Noise points: {len(noise)}")


In [None]:
# noise_points = df[df["cluster"] == -1][["concept", "unit"]].reset_index(drop=True)

# noise_points.to_csv("noise_points.csv")

In [None]:
# pca = joblib.load("pca_model.joblib")
# compressed = np.load("concept_uom_embeddings_pca.npy")

In [None]:
import numpy as np

# Save both embeddings and tuples
# Create the output folder first: np.savez_compressed does not create missing
# directories, and failing here would throw away the embedding/PCA work above.
os.makedirs("data", exist_ok=True)

# `keys` stores each pair as "concept::unit"; row i of `embeddings` belongs to
# keys[i]. The tuples are stored as an object array (mixed str/float columns).
np.savez_compressed(
    "data/stage1_latents.npz",
    keys=np.array([f"{c}::{u}" for c, u in concept_unit_pairs]),
    embeddings=compressed,
    concept_unit_value_tuples=np.array(concept_unit_value_tuples, dtype=object)
)

print(f"✅ Saved {len(concept_unit_value_tuples):,} tuples and {len(compressed):,} embeddings to 'stage1_latents.npz'")


In [None]:
import numpy as np

# Re-open the archive written by the save cell. allow_pickle is required
# because the tuples entry is stored as an object array; only use this on
# files produced locally by this notebook.
data = np.load("data/stage1_latents.npz", allow_pickle=True)

# Map (concept, unit) -> embedding vector; each key is split on the first "::".
saved_keys = data["keys"]
saved_vectors = data["embeddings"]
embedding_map = dict(
    (tuple(key.split("::", 1)), vec)
    for key, vec in zip(saved_keys, saved_vectors)
)

# Restore the concept/unit/value rows as plain Python lists.
concept_unit_value_tuples = data["concept_unit_value_tuples"].tolist()


In [None]:
# embedding_map

In [None]:
from tqdm import tqdm
from collections import defaultdict
from sklearn.preprocessing import StandardScaler

# TODO: Document why this happens before data splitting in this particular dataset

# Step 1: bucket the raw values by their (concept, unit) key
grouped = defaultdict(list)
for row in concept_unit_value_tuples:
    concept, unit, value = row
    grouped[(concept, unit)].append(value)

# Step 2: standardize each bucket with its own scaler; the fitted scalers are
# kept in `scalers` under the same (concept, unit) key
scalers = {}
scaled_tuples = []

for key, vals in tqdm(grouped.items(), desc="Scaling per concept/unit"):
    concept, unit = key

    # StandardScaler expects a 2-D column, hence the reshape.
    column = np.array(vals).reshape(-1, 1)
    scaler = StandardScaler()
    scaled_column = scaler.fit_transform(column)
    scalers[key] = scaler

    # Emit one (concept, unit, scaled_value) tuple per original value
    scaled_tuples.extend((concept, unit, v) for v in scaled_column.flatten())


In [None]:
# Display the fitted scalers, keyed by (concept, unit).
scalers

In [None]:
import torch
from torch import nn
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from utils.pytorch import seed_everything
import numpy as np
from torch.nn.functional import cosine_similarity, l1_loss

# Stage 1 dataset: concept+uom embedding + value
class ConceptValueDataset(Dataset):
    """
    Dataset of (embedding + value) vectors for the stage-1 autoencoder.

    Each item is built from one (concept, unit, value) row: the embedding
    registered for (concept, unit) is concatenated with the value, giving a
    float32 vector one element longer than the embedding.

    Parameters:
        scaled_tuples (Sequence): Rows of (concept, unit, value).
        embedding_lookup (Mapping): Maps (concept, unit) to a 1-D embedding.
        device (torch.device | str): Device the returned tensors are created
            on. Defaults to "cpu".
        value_noise_std (float): Standard deviation of the Gaussian noise
            added to the value when `train` is True.
        train (bool): Enables the value noise.
    """

    def __init__(self, scaled_tuples, embedding_lookup, device: "torch.device | str" = "cpu", value_noise_std=0.0, train=False):
        self.rows = scaled_tuples
        self.lookup = embedding_lookup
        self.value_noise_std = value_noise_std
        self.train = train
        self.device = device

    def __len__(self):
        """Number of (concept, unit, value) rows."""
        return len(self.rows)

    def __getitem__(self, idx):
        """
        Return the (input, target) pair for row `idx`.

        Both tensors hold the same numbers but are separate objects.

        Raises:
            ValueError: If no embedding is registered for the row's
                (concept, unit).
        """
        concept, unit, value = self.rows[idx]

        try:
            embedding = self.lookup[(concept, unit)]
        except KeyError as err:
            # Chain explicitly so the traceback shows the failed lookup as the cause.
            raise ValueError(f"Missing embedding for ({concept}, {unit})") from err

        if self.train and self.value_noise_std > 0:
            # NOTE(review): the noise ends up in the target as well as the
            # input (jitter augmentation, not denoising) — confirm intended.
            value = value + np.random.normal(0, self.value_noise_std)

        # Build the feature vector once; the target is an independent copy.
        features = np.concatenate([embedding, [value]])
        x = torch.tensor(features, dtype=torch.float32, device=self.device)
        y = x.clone()
        return x, y

# LightningModule
class Stage1Autoencoder(pl.LightningModule):
    """
    Autoencoder over a concatenated [embedding, value] vector.

    The last input feature is treated as a scalar value and everything before
    it as the embedding. The value is first lifted to `latent_dim` features by
    `value_proj`, concatenated with the embedding, and encoded to a
    `latent_dim` code. Two separate decoder heads rebuild the embedding and
    the value from that code.

    Parameters:
        input_dim (int): Width of the input vector (embedding width + 1).
        latent_dim (int): Width of the latent code and of the value projection.
        dropout_rate (float): Dropout probability in the encoder and decoders.
        lr (float): Adam learning rate.
        batch_size (int): Recorded as a hyperparameter; not read by this class.
        gradient_clip (float): Recorded as a hyperparameter; not read by this class.
        alpha_embed (float): Weight of the embedding MSE in the total loss.
        alpha_value (float): Weight of the value MSE in the total loss.
        weight_decay (float): Adam weight decay.
    """

    def __init__(
            self,
            input_dim=201,
            latent_dim=64,
            dropout_rate=0.1,
            lr=1e-3,
            batch_size=24,
            gradient_clip=1.0,
            alpha_embed=0.5,
            alpha_value=1.0,
            weight_decay=0.0
    ):
        super().__init__()

        # Exposes every constructor argument as self.hparams.<name>.
        self.save_hyperparameters()

        # Lifts the single scalar value to latent_dim features so that it is
        # not drowned out by the much wider embedding when the two are fused.
        self.value_proj = nn.Sequential(
            nn.Linear(1, 32),
            nn.GELU(),
            nn.Linear(32, latent_dim),
            nn.LayerNorm(latent_dim)
        )

        # Encoder input = embedding (input_dim - 1) + projected value (latent_dim).
        self.encoder = nn.Sequential(
            nn.Linear(input_dim - 1 + latent_dim, 256),
            nn.GELU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(256, latent_dim)
        )

        self.embedding_decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.GELU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(256, input_dim - 1)
        )

        self.value_decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.GELU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(128, 1)
        )

        self.loss_fn = nn.MSELoss()

    def forward(self, x):
        """
        Reconstruct the embedding and the value from `x`.

        Parameters:
            x (torch.Tensor): Shape [batch_size, input_dim]; the last column
                is the value.

        Returns:
            tuple: (recon_emb, recon_val) with shapes
            [batch_size, input_dim - 1] and [batch_size, 1].
        """
        x_emb = x[:, :-1]
        x_val = x[:, -1].unsqueeze(1)

        # project the value separately and then fuse
        val_proj = self.value_proj(x_val)
        fused_input = torch.cat([x_emb, val_proj], dim=1)

        z = self.encoder(fused_input)

        recon_emb = self.embedding_decoder(z)
        recon_val = self.value_decoder(z)

        return recon_emb, recon_val

    def compute_losses(self, x, target):
        """
        Run the model on `x` and score the reconstruction against `target`.

        Returns:
            tuple: (loss, embedding_loss, value_loss, cos_sim, mae_value,
            euclidean_dist, r2_value). `loss` is the alpha-weighted sum of
            the two MSE terms; the remaining entries are monitoring metrics.
        """
        recon_emb, recon_val = self(x)

        target_emb = target[:, :-1]
        target_val = target[:, -1].unsqueeze(1)

        embedding_loss = self.loss_fn(recon_emb, target_emb)
        value_loss = self.loss_fn(recon_val, target_val)
        loss = self.hparams.alpha_embed * embedding_loss + self.hparams.alpha_value * value_loss

        cos_sim = cosine_similarity(recon_emb, target_emb, dim=1).mean()
        mae_value = l1_loss(recon_val, target_val)
        euclidean_dist = torch.norm(recon_emb - target_emb, dim=1).mean()

        ss_res = ((recon_val - target_val) ** 2).sum()
        ss_tot = ((target_val - target_val.mean()) ** 2).sum()
        # A batch with identical targets (or a single sample) has ss_tot == 0;
        # dividing by it would log nan/inf. Flooring the denominator keeps the
        # metric finite, and the clamp below bounds it to [-10, 1].
        eps = torch.finfo(ss_tot.dtype).eps
        r2_raw = 1 - ss_res / ss_tot.clamp_min(eps)
        r2_value = torch.clamp(r2_raw, min=-10.0, max=1.0)

        return loss, embedding_loss, value_loss, cos_sim, mae_value, euclidean_dist, r2_value

    def _log_metrics(self, prefix, metrics):
        """Log the tuple returned by `compute_losses` under `<prefix>_*` names."""
        loss, embedding_loss, value_loss, cos_sim, mae_value, euclidean_dist, r2_value = metrics

        # Only the total loss is shown in the progress bar.
        self.log(f"{prefix}_loss", loss, prog_bar=True)
        self.log(f"{prefix}_embedding_loss", embedding_loss)
        self.log(f"{prefix}_value_loss", value_loss)
        self.log(f"{prefix}_embedding_cos_sim", cos_sim)
        self.log(f"{prefix}_value_mae", mae_value)
        self.log(f"{prefix}_embedding_euclidean", euclidean_dist)
        self.log(f"{prefix}_value_r2", r2_value)

    def training_step(self, batch, batch_idx):
        """Compute and log the training metrics; return the loss to optimize."""
        x, target = batch
        metrics = self.compute_losses(x, target)
        self._log_metrics("train", metrics)
        return metrics[0]

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation metrics; return the total loss."""
        x, target = batch
        metrics = self.compute_losses(x, target)
        self._log_metrics("val", metrics)
        return metrics[0]

    def configure_optimizers(self):
        """Adam over all parameters, using the stored lr and weight_decay."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)


In [None]:
import os
import optuna
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from torch.utils.data import DataLoader
from utils.pytorch import get_device

# Device handed to the datasets built inside `objective`.
device = get_device()

# === CONFIG ===
# Folder that receives checkpoints, TensorBoard logs and the Optuna database.
OUTPUT_PATH = "data/stage1"
os.makedirs(OUTPUT_PATH, exist_ok=True)
OPTUNA_DB_PATH = os.path.join(OUTPUT_PATH, "optuna_study.db")
# NOTE(review): PATIENCE is larger than EPOCHS, so the EarlyStopping callback
# presumably can never trigger with these values — confirm this is only a
# debugging setting.
EPOCHS = 3
PATIENCE = 5
# Fraction of the sampled rows held out for validation.
VAL_SPLIT = 0.2

def objective(trial):
    """
    Optuna objective: train one Stage1Autoencoder configuration.

    Samples batch size, learning rate, latent size, weight decay and gradient
    clipping from `trial`, trains for up to EPOCHS epochs on a subset of
    `scaled_tuples`, and returns the validation loss.

    Parameters:
        trial (optuna.trial.Trial): Trial used to sample hyperparameters.

    Returns:
        float: The `val_loss` entry of `trainer.callback_metrics` after fitting.
    """
    batch_size = trial.suggest_int("batch_size", 8, 64, step=8)
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    latent_dim = trial.suggest_int("latent_dim", 32, 128, step=32)
    # dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.2, step=0.1)
    weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-4, log=True)
    gradient_clip = trial.suggest_float("gradient_clip", 0.0, 1.0, step=0.1)

    # === Sample Subset for Faster Debugging ===
    # NOTE(review): `scaled_tuples` is assembled one (concept, unit) group at
    # a time, so this contiguous slice and the split below are not random
    # samples — consider shuffling with a fixed seed.
    SAMPLE_SIZE = 500_000
    subset = scaled_tuples[:SAMPLE_SIZE]

    # 80/20 Train/Val Split
    # Both halves must come from `subset`; slicing `scaled_tuples` here would
    # make the validation set everything after `split`, ignoring SAMPLE_SIZE.
    split = int(len(subset) * (1 - VAL_SPLIT))
    train_data = subset[:split]
    val_data = subset[split:]

    train_loader = DataLoader(
        ConceptValueDataset(train_data, embedding_map, device=device, value_noise_std=0.005, train=True),
        batch_size=batch_size,
        shuffle=True
    )

    val_loader = DataLoader(
        ConceptValueDataset(val_data, embedding_map, device=device, value_noise_std=0.00, train=False),
        batch_size=batch_size,
        shuffle=False
    )

    # Model input = embedding width + 1 for the value.
    input_dim = len(next(iter(embedding_map.values()))) + 1

    model = Stage1Autoencoder(
        input_dim=input_dim,
        latent_dim=latent_dim,
        # dropout_rate=dropout_rate,
        lr=lr,
        batch_size=batch_size,
        weight_decay=weight_decay,
        gradient_clip=gradient_clip
    )

    early_stop_callback = EarlyStopping(monitor="val_loss", patience=PATIENCE, verbose=True, mode="min")

    model_checkpoint = ModelCheckpoint(
        dirpath=OUTPUT_PATH,
        # f-string so each trial's checkpoint name carries its trial number;
        # without the prefix the braces were passed to Lightning verbatim.
        filename=f"best_model_trial_{trial.number}",
        monitor="val_loss",
        mode="min",
        save_top_k=1,
        verbose=True
    )

    trainer = pl.Trainer(
        max_epochs=EPOCHS,
        logger=TensorBoardLogger(OUTPUT_PATH, name="stage1_autoencoder"),
        callbacks=[early_stop_callback, model_checkpoint],
        accelerator="auto",
        devices=1,
        gradient_clip_val=gradient_clip
    )

    trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
    return trainer.callback_metrics["val_loss"].item()

# === Optuna Study ===
# A fixed study name is what lets `load_if_exists=True` resume the study stored
# in the SQLite file; without a name Optuna generates a fresh one on every run
# and nothing is ever reloaded.
study = optuna.create_study(direction="minimize",
                            study_name="stage1_autoencoder",
                            storage=f"sqlite:///{OPTUNA_DB_PATH}",
                            load_if_exists=True)
study.optimize(objective, n_trials=20)

print("Best params:", study.best_params)
print("Best trial value:", study.best_trial.value)


In [None]:
# from torch.utils.data import DataLoader

# # Instantiate dataset
# dataset = ConceptValueDataset(scaled_tuples, embedding_map)

# # Sample inspection
# sample_x, sample_y = dataset[0]
# print("Sample input:", sample_x)
# print("Min:", sample_x.min().item(), "Max:", sample_x.max().item())
# print("Mean:", sample_x.mean().item(), "Std:", sample_x.std().item())
# print("Input dim:", sample_x.shape[0], "Target dim:", sample_y.shape[0])

# # Optional: test batch loading
# loader = DataLoader(dataset, batch_size=4)
# for xb, yb in loader:
#     print("Batch shape:", xb.shape)
#     break
