In [None]:
import os
from pathlib import Path

# This snippet ensures consistent import paths across environments.
# When running notebooks via JupyterLab's web UI, the current working
# directory is often different (e.g., /notebooks) compared to VS Code,
# which typically starts at the project root. This handles that by 
# retrying the import after changing to the parent directory.
# 
# Include this at the top of every notebook to standardize imports
# across development environments.

try:
    from utils.os import chdir_to_git_root
except ModuleNotFoundError:
    # The import failed from the current working directory, so move up one
    # level and attempt the same import again from there.
    os.chdir(Path.cwd().parent)
    print(f"Retrying import from: {os.getcwd()}")
    from utils.os import chdir_to_git_root

# Project helper; presumably switches the working directory to the "python"
# folder relative to the git root -- TODO confirm against utils.os.
chdir_to_git_root("python")

# Show the working directory the remaining cells will run from.
print(os.getcwd())

In [None]:
from models.pytorch.us_gaap_alignment import build_us_gaap_alignment_dataset
from utils.pytorch import seed_everything, get_device

# Device chosen by the project helper (presumably a torch device such as
# CPU/GPU -- TODO confirm against utils.pytorch).
device = get_device()

# Path handed to the dataset builder below (JSON Lines, going by the extension).
output_file = "data/us_gaap_concepts_with_variations_and_embeddings.jsonl"

# Project helper; presumably generates the alignment dataset and writes it to
# `output_file` -- TODO confirm against models.pytorch.us_gaap_alignment.
build_us_gaap_alignment_dataset(output_file, device)

# Baseline Analysis (no alignment model)

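Result: ~65% top-1 accuracy when each description variation is matched to its nearest US GAAP concept description by cosine similarity of the raw BGE `[CLS]` embeddings (no alignment model applied).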

In [None]:
import pandas as pd
from db import DB
from utils import generate_us_gaap_description
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from utils.pytorch import seed_everything, get_device

# --- Embedding model ---------------------------------------------------------
# BGE encoder plus its tokenizer; the encoder is moved onto whichever device
# the project helper selects.
MODEL_NAME = "BAAI/bge-large-en-v1.5"
device = get_device()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
encoder = AutoModel.from_pretrained(MODEL_NAME).to(device)

# --- Data source -------------------------------------------------------------
db = DB()

# Named SQL statements. "concept_variations" pairs every description variation
# (input_text) with the name of the concept it belongs to, which the query
# aliases as us_gaap_description.
queries = {
    "concept_variations": """
        SELECT
            v.text AS input_text,
            t.name AS us_gaap_description
        FROM us_gaap_concept_description_variation v
        JOIN us_gaap_concept t ON t.id = v.us_gaap_concept_id
    """
}

# Per-text memo of embeddings, filled on demand by generate_embeddings().
embedding_cache = {}

# Generate embeddings for text using the BGE model
def generate_embeddings(texts):
    """
    Embed one or more texts with the BGE encoder, reusing cached results.

    Every text is tokenized and encoded on its own (a batch of one, truncated
    to 512 tokens) and represented by the hidden state at sequence position 0,
    i.e. the [CLS] token. Results are memoized in the module-level
    ``embedding_cache`` keyed by the text, so a repeated text is only encoded
    once.

    Args:
        texts: A single string, or an iterable (typically a list) of items to
            embed. A bare string is treated as one text rather than being
            iterated character by character. Non-string items are converted
            with ``str()`` before tokenization and caching.

    Returns:
        A tensor of shape (number_of_texts, 1, hidden_size): one
        (1, hidden_size) [CLS] embedding per input, stacked along a new
        leading dimension. Callers squeeze the singleton dimension they do
        not need.

    Note:
        An empty input is not supported; ``torch.stack`` rejects an empty
        list of tensors.
    """
    # A lone string must be wrapped, otherwise the loop below would walk
    # over its characters.
    if isinstance(texts, str):
        texts = [texts]

    embeddings = []
    for text in texts:
        # Normalize to a string once so the cache key and the tokenizer input
        # always agree. The input list itself is never rebound inside the
        # loop, so each item is encoded strictly on its own.
        key = text if isinstance(text, str) else str(text)

        text_embedding = embedding_cache.get(key)
        if text_embedding is None:
            inputs = tokenizer(
                [key],
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=512,
            ).to(device)

            with torch.no_grad():
                outputs = encoder(**inputs)

            text_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token, shape (1, hidden_size)
            embedding_cache[key] = text_embedding

        embeddings.append(text_embedding)
    return torch.stack(embeddings)

# Pick the us_gaap_description whose embedding is nearest to a variation's embedding
def find_closest_description(variation, descriptions):
    """
    Return the candidate description most similar to ``variation``.

    Both the variation and every candidate are embedded with
    ``generate_embeddings`` and compared by cosine similarity.

    Args:
        variation: The input text to match.
        descriptions: Indexable collection of candidate description texts.

    Returns:
        A ``(description, score)`` pair: the candidate with the highest cosine
        similarity and that similarity value.
    """
    # Drop the leading stacking dimension, leaving one row for the query.
    query_vec = generate_embeddings([variation]).squeeze(0)

    # Drop the per-text singleton dimension, leaving one row per candidate.
    candidate_vecs = generate_embeddings(descriptions).squeeze(1)

    # One row of scores: the query against every candidate.
    scores = cosine_similarity(query_vec.cpu().numpy(), candidate_vecs.cpu().numpy())

    # With a single query row, the flat argmax is the winning candidate's index.
    best = scores.argmax()
    return descriptions[best], scores[0][best]

def build_dataset(query: str):
    """
    Evaluate nearest-description matching for every row returned by ``query``.

    Runs ``query`` through the module-level ``db``, rewrites the
    ``us_gaap_description`` column with ``generate_us_gaap_description``, and
    then, for each ``input_text``, selects the description with the highest
    cosine similarity via ``find_closest_description``. A row counts as
    correct when the selected description equals the row's own description.

    Side effects:
        Prints the accuracy and error rate, and writes the annotated rows
        (with ``closest_match``, ``similarity`` and ``is_correct_match``
        columns added) to ``data/matched_tags.jsonl`` as JSON Lines. When the
        query returns no rows, a notice is printed and nothing is written.

    Args:
        query: SQL text yielding ``input_text`` and ``us_gaap_description``
            columns.
    """
    df = db.get(query, ["input_text", "us_gaap_description"])

    total_entries = len(df)
    if total_entries == 0:
        # Without this guard the accuracy division below raises ZeroDivisionError.
        print("No rows returned by query; nothing to evaluate.")
        return

    # Apply the generate_us_gaap_description function to the us_gaap_description
    df["us_gaap_description"] = df["us_gaap_description"].apply(generate_us_gaap_description)

    # The candidate pool is the same for every row, so materialize it once
    # instead of rebuilding the list on each iteration.
    descriptions = df["us_gaap_description"].tolist()

    closest_matches = []
    is_correct_matches = []
    similarities = []

    # `descriptions` is in row order, so zipping it with the inputs pairs each
    # variation with its own expected description.
    for input_text, expected in tqdm(
        zip(df["input_text"], descriptions), total=total_entries, desc="Processing data"
    ):
        closest_match, similarity = find_closest_description(input_text, descriptions)
        closest_matches.append(closest_match)
        similarities.append(similarity)

        # Correct when the nearest description is the row's own description
        is_correct_matches.append(closest_match == expected)

    # Add the closest match and correctness flag to the dataframe
    df["closest_match"] = closest_matches
    df["similarity"] = similarities
    df["is_correct_match"] = is_correct_matches

    # Accuracy is the share of correct matches; the error rate is its complement
    correct_matches = sum(is_correct_matches)
    accuracy = correct_matches / total_entries
    error_rate = 1 - accuracy

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Error Rate: {error_rate * 100:.2f}%")

    # Save as JSONL
    df.to_json("data/matched_tags.jsonl", orient="records", lines=True)
    print(f"Matched data saved to 'matched_tags.jsonl' with {len(df)} rows.")


# Run the baseline evaluation using the "concept_variations" query. When this
# cell is executed in a notebook kernel the code runs as `__main__`, so the
# guard passes.
if __name__ == "__main__":
    build_dataset(queries["concept_variations"])
