In [None]:
# For training

import pandas as pd
import numpy as np
from db import DB
from utils import generate_us_gaap_description
from utils.pytorch import get_device, seed_everything
from tqdm import tqdm  # For progress bar
from transformers import AutoTokenizer, AutoModel
import torch
import os

# === Setup for BGE model ===
# Hugging Face model id; the tokenizer and the encoder are both loaded from it.
MODEL_NAME = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
encoder = AutoModel.from_pretrained(MODEL_NAME)

# The device is chosen by the project helper `get_device` (imported from utils.pytorch).
device = get_device()
print(f"Using device: {device}")

# Move the model to the selected device
encoder = encoder.to(device)
encoder.eval()  # Inference behaviour for layers such as dropout; gradients are disabled separately via torch.no_grad() in generate_embeddings

# Database setup
db = DB()

# SQL statements used by this cell, keyed by a short name.
# "concept_variations" yields one row per (concept id, variation text) pair and
# collapses the concept's distinct OFSS category ids into a single string via
# GROUP_CONCAT (build_concept_dataset later splits that string on ",").
# NOTE(review): the WHERE filter on m.ofss_category_id removes the NULL rows a
# LEFT JOIN would add, so the join effectively behaves like an inner join.
queries = {
    "concept_variations": """
        SELECT
            t.id AS us_gaap_concept_id,
            t.name AS us_gaap_concept_name,
            v.text AS variation_text,
            GROUP_CONCAT(DISTINCT m.ofss_category_id ORDER BY m.ofss_category_id) AS ofss_category_ids
        FROM us_gaap_concept t
        JOIN us_gaap_concept_description_variation v ON v.us_gaap_concept_id = t.id
        LEFT JOIN us_gaap_concept_ofss_category m ON m.us_gaap_concept_id = t.id
        WHERE m.ofss_category_id IS NOT NULL -- Ensure we only select rows that have an associated ofss_category_id
        GROUP BY t.id, v.text
    """
}

def generate_embeddings(texts, batch_size=16):
    """
    Encode `texts` with the module-level tokenizer/encoder and return their [CLS] vectors.

    The texts are consumed in consecutive slices of `batch_size`. Each slice is
    tokenized (padded, truncated to at most 512 tokens), moved to `device`, and
    passed through `encoder` with gradient tracking switched off. The hidden
    state of the first token of every sequence is kept, and all rows are
    returned together as one NumPy array in input order.
    """
    collected = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Generating Embeddings"):
        chunk = texts[start:start + batch_size]

        # Tokenize the whole slice at once so it is padded to a common length
        encoded = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        encoded = encoded.to(device)

        # Pure inference: no autograd graph is needed
        with torch.no_grad():
            model_output = encoder(**encoded)

        # Position 0 of each sequence is the [CLS] token
        cls_vectors = model_output.last_hidden_state[:, 0, :]
        for row in cls_vectors.cpu().numpy():
            collected.append(row)

    return np.array(collected)

def build_concept_dataset(query: str):
    """
    Build the training dataset of concept variations with embeddings and save it as JSONL.

    Steps:
      1. Run `query` through `db.get`, expecting the columns
         us_gaap_concept_id, us_gaap_concept_name, variation_text, ofss_category_ids.
      2. Derive `us_gaap_concept_description` from each concept name with
         `generate_us_gaap_description`.
      3. Embed every variation text and every concept description with
         `generate_embeddings` and store the vectors in
         `variation_embedding` / `description_embedding`.
      4. Parse the comma-separated `ofss_category_ids` string into a list of
         ints, keeping at most the first two ids per row.
      5. Write the frame to
         data/us_gaap_concepts_with_variations_and_embeddings.jsonl.

    Args:
        query: SQL text returning the four columns listed above.

    Returns:
        None. The result is written to disk and a summary line is printed.
    """
    output_file = "data/us_gaap_concepts_with_variations_and_embeddings.jsonl"
    # Make sure the target folder exists *before* the expensive embedding pass,
    # so a missing directory cannot waste the whole run at the final write.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Fetch data from the database
    df = db.get(query, ["us_gaap_concept_id", "us_gaap_concept_name", "variation_text", "ofss_category_ids"])

    # Apply generate_us_gaap_description to the concept names
    df["us_gaap_concept_description"] = df["us_gaap_concept_name"].apply(generate_us_gaap_description)

    print("Generating embeddings for variation text...")
    variation_embeddings = generate_embeddings(df["variation_text"].tolist())

    print("Generating embeddings for concept descriptions...")
    # The query returns one row per (concept, variation), so the same description
    # repeats for every variation of a concept. Encode each distinct description
    # once (first-seen order) and map the vectors back onto the rows.
    descriptions = df["us_gaap_concept_description"].tolist()
    unique_descriptions = list(dict.fromkeys(descriptions))
    unique_embeddings = generate_embeddings(unique_descriptions)
    embedding_by_description = dict(zip(unique_descriptions, unique_embeddings))

    # Add embeddings as columns directly to the DataFrame
    df["variation_embedding"] = list(variation_embeddings)
    df["description_embedding"] = [embedding_by_description[text] for text in descriptions]

    # "1,4,7" -> [1, 4, 7]; an empty/None value becomes an empty list
    df["ofss_category_ids"] = df["ofss_category_ids"].apply(lambda s: [int(x) for x in s.split(",")] if s else [])

    # Limit to a maximum of 2 labels per row
    df["ofss_category_ids"] = df["ofss_category_ids"].apply(lambda x: x[:2])

    # Save the dataset as JSONL (with embeddings and category IDs included)
    df.to_json(output_file, orient="records", lines=True)

    print(f"Dataset saved to {output_file} with {len(df)} rows, including embeddings and categories.")

# Run the function to build the dataset.
# The guard only fires when this code is the entry point (not when it is imported),
# and it feeds the "concept_variations" SQL defined in `queries` to the builder.
if __name__ == "__main__":
    build_concept_dataset(queries["concept_variations"])


In [None]:
# # For inference

# import pandas as pd
# import numpy as np
# from db import DB
# from utils import generate_us_gaap_description
# from utils.pytorch import get_device
# from tqdm import tqdm
# from transformers import AutoTokenizer, AutoModel
# import torch
# import os

# # === Setup for BGE model ===
# MODEL_NAME = "BAAI/bge-large-en-v1.5"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# encoder = AutoModel.from_pretrained(MODEL_NAME)

# device = get_device()
# print(f"Using device: {device}")
# encoder = encoder.to(device)
# encoder.eval()

# # Database setup
# db = DB()

# queries = {
#     "reference_concepts": """
#         SELECT
#             t.id AS us_gaap_concept_id,
#             t.name AS us_gaap_concept_name
#         FROM us_gaap_concept t
#     """
# }

# def generate_embeddings(texts, batch_size=16):
#     """
#     Generate embeddings for texts using the transformer model.
#     """
#     embeddings = []
#     for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
#         batch_texts = texts[i:i+batch_size]

#         inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
#         with torch.no_grad():
#             outputs = encoder(**inputs)
#         batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
#         embeddings.extend(batch_embeddings)

#     return np.array(embeddings)

# def build_reference_embedding_dataset(query: str):
#     df = db.get(query, ["us_gaap_concept_id", "us_gaap_concept_name"])

#     # Apply the description generator
#     df["us_gaap_concept_description"] = df["us_gaap_concept_name"].apply(generate_us_gaap_description)

#     # Generate embeddings only for concept descriptions
#     print("Generating embeddings for reference concept descriptions...")
#     description_embeddings = generate_embeddings(df["us_gaap_concept_description"].tolist())
#     df["description_embedding"] = list(description_embeddings)

#     # Keep only the necessary columns
#     ref_df = df[["us_gaap_concept_id", "us_gaap_concept_name", "description_embedding"]]

#     output_file = "data/us_gaap_concept_reference_embeddings.jsonl"
#     ref_df.to_json(output_file, orient="records", lines=True)

#     print(f"Reference dataset saved to {output_file} with {len(ref_df)} rows.")

# # Run the function to build the reference dataset
# if __name__ == "__main__":
#     build_reference_embedding_dataset(queries["reference_concepts"])

In [None]:
# import pandas as pd
# from db import DB
# from utils import generate_us_gaap_description
# from tqdm import tqdm

# # Database setup
# db = DB()

# queries = {
#     "gaap_concepts": """
#         SELECT DISTINCT
#             t.id AS us_gaap_concept_id,
#             t.name AS us_gaap_concept_name
#         FROM us_gaap_concept t
#         JOIN us_gaap_concept_description_variation v ON v.us_gaap_concept_id = t.id
#     """
# }

# # Function to load the data from the database using the above query
# def build_gaap_concept_description_dataset(query: str):
#     # Fetch concept names and ids from the database where concepts have variations
#     df = db.get(query, ["us_gaap_concept_id", "us_gaap_concept_name"])
    
#     # Apply generate_us_gaap_description to generate descriptions for the concept names
#     df["us_gaap_concept_description"] = df["us_gaap_concept_name"].apply(generate_us_gaap_description)

#     # Save the dataset as JSONL
#     df.to_json("data/us_gaap_concepts_with_descriptions.jsonl", orient="records", lines=True)
#     print(f"Dataset saved with {len(df)} rows.")

# # Run the function to build the dataset
# if __name__ == "__main__":
#     build_gaap_concept_description_dataset(queries["gaap_concepts"])


In [None]:
# import pandas as pd
# from db import DB
# from utils import generate_us_gaap_description 

# db = DB()

# queries = {
#     "train": """
#         SELECT
#             v.text AS input_text,
#             COALESCE(t.balance_type_id, 0) AS balance_type_id,
#             COALESCE(t.period_type_id, 0) AS period_type_id,
#             GROUP_CONCAT(DISTINCT m.ofss_category_id ORDER BY m.ofss_category_id) AS labels
#         FROM us_gaap_concept_description_variation v
#         JOIN us_gaap_concept t ON t.id = v.us_gaap_concept_id
#         JOIN us_gaap_concept_ofss_category m ON m.us_gaap_concept_id = t.id
#         GROUP BY v.text, t.balance_type_id, t.period_type_id
#     """,
#     "val": """
#         SELECT
#             t.name AS input_text,
#             COALESCE(t.balance_type_id, 0) AS balance_type_id,
#             COALESCE(t.period_type_id, 0) AS period_type_id,
#             GROUP_CONCAT(DISTINCT m.ofss_category_id ORDER BY m.ofss_category_id) AS labels
#         FROM us_gaap_concept t
#         JOIN us_gaap_concept_ofss_category m ON m.us_gaap_concept_id = t.id
#         GROUP BY t.name, t.balance_type_id, t.period_type_id
#     """
# }

# def build_dataset(name: str, query: str):
#     df = db.get(query, ["input_text", "balance_type_id", "period_type_id", "labels"])
    
#     if name == "val":
#         # Apply generate_us_gaap_description to input_text for validation dataset
#         df["input_text"] = df["input_text"].apply(generate_us_gaap_description)

#     # Process the labels
#     df["labels"] = df["labels"].apply(lambda s: [int(x) for x in s.split(",")] if s else [])

#     # Limit to a maximum of 2 labels per row
#     df["labels"] = df["labels"].apply(lambda x: x[:2])
    
#     # Save as JSONL
#     df.to_json(f"data/{name}.jsonl", orient="records", lines=True)
#     print(f"{name}.jsonl saved with {len(df)} rows.")

# if __name__ == "__main__":
#     for split, sql in queries.items():
#         build_dataset(split, sql)


# Baseline Analysis

In [None]:
import pandas as pd
from db import DB
from utils import generate_us_gaap_description  # Importing the necessary function
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm  # For progress bar

# Setup for BGE model
# Hugging Face model id; the tokenizer and the encoder are both loaded from it.
MODEL_NAME = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
encoder = AutoModel.from_pretrained(MODEL_NAME)

# Pick an accelerator when one is present: Apple MPS first (as before), then
# CUDA, and fall back to the CPU otherwise.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
encoder = encoder.to(device)
# Make inference mode explicit (dropout etc. off), matching the training-data cell.
encoder.eval()

# Database setup
db = DB()

# SQL statements used by this cell, keyed by a short name.
# "concept_variations" pairs every description variation with the name of the
# concept it belongs to.
queries = {
    "concept_variations": """
        SELECT
            v.text AS input_text,
            t.name AS us_gaap_description
        FROM us_gaap_concept_description_variation v
        JOIN us_gaap_concept t ON t.id = v.us_gaap_concept_id
    """
}

# Cache embeddings for each text individually (text -> tensor), filled by generate_embeddings
embedding_cache = {}

# Generate embeddings for text using the BGE model
def generate_embeddings(texts, batch_size=16):
    """
    Return the [CLS] embedding of each text, using `embedding_cache` to avoid re-encoding.

    Args:
        texts: A single string or an iterable of texts. Items that are not
            strings are converted with `str()` before encoding.
        batch_size: Number of not-yet-cached texts sent through the encoder
            per forward pass.

    Returns:
        A tensor made by stacking one cached entry per input text, in input
        order. Each cached entry keeps a leading axis of size 1, so callers
        can `squeeze` it exactly as before. An empty input yields an empty
        tensor with the same trailing layout instead of raising.

    Raises:
        ValueError: If `texts` is neither a string nor iterable.
    """
    # Normalise the input once, up front. (Previously `texts` was reassigned
    # inside the loop that was iterating it, which made a non-string item
    # trigger tokenization of the entire list and cache a multi-row tensor
    # under a single key.)
    if isinstance(texts, str):
        texts = [texts]
    else:
        try:
            texts = list(texts)
        except TypeError as err:
            raise ValueError("Input must be a string or a list of strings.") from err
    texts = [item if isinstance(item, str) else str(item) for item in texts]

    if not texts:
        # torch.stack cannot handle an empty list; return an explicit empty result.
        return torch.empty((0, 1, encoder.config.hidden_size), device=device)

    # Distinct texts that still need encoding, in first-seen order.
    pending = [item for item in dict.fromkeys(texts) if item not in embedding_cache]

    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

        with torch.no_grad():
            outputs = encoder(**inputs)

        cls_vectors = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token embedding
        for item, vector in zip(batch, cls_vectors):
            # Keep the leading size-1 axis that earlier cache entries had.
            embedding_cache[item] = vector.unsqueeze(0)

    return torch.stack([embedding_cache[item] for item in texts])

# Function to find the most similar us_gaap_description to a given variation (input text)
def find_closest_description(variation, descriptions):
    """
    Pick the entry of `descriptions` whose embedding is most similar to `variation`.

    Both sides are embedded with `generate_embeddings`, compared with cosine
    similarity, and the best-scoring description is returned together with
    its similarity score as `(description, score)`.
    """
    # generate_embeddings stacks one tensor per text; the squeezes drop the
    # size-1 axis so both operands reach cosine_similarity as 2-D arrays
    # (assumes each per-text tensor has a leading axis of 1 -- see generate_embeddings).
    query_tensor = generate_embeddings([variation])
    query_tensor = query_tensor.squeeze(0)

    candidate_tensor = generate_embeddings(descriptions)
    candidate_tensor = candidate_tensor.squeeze(1)

    query_array = query_tensor.cpu().numpy()
    candidate_array = candidate_tensor.cpu().numpy()
    similarity_matrix = cosine_similarity(query_array, candidate_array)

    # Position of the highest score; with a single query row this is the column index
    best_index = similarity_matrix.argmax()

    return descriptions[best_index], similarity_matrix[0][best_index]

def build_dataset(query: str):
    """
    Baseline evaluation: match each variation to its nearest concept description.

    Rows are fetched with `db.get` (columns input_text, us_gaap_description),
    the concept name is expanded with `generate_us_gaap_description`, and every
    `input_text` is compared by cosine similarity against the set of
    descriptions. The best match, its score and whether it equals the row's
    own description are added as `closest_match`, `similarity` and
    `is_correct_match`. Accuracy and error rate are printed and the frame is
    written to data/matched_tags.jsonl.

    Args:
        query: SQL text returning the two columns listed above.

    Returns:
        None. When the query returns no rows, a notice is printed and nothing
        is written.
    """
    # Function-scope import: this cell's import list does not bring in `os`.
    import os

    output_file = "data/matched_tags.jsonl"
    # Ensure the target folder exists before the long matching loop starts.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    df = db.get(query, ["input_text", "us_gaap_description"])

    # Apply the generate_us_gaap_description function to the us_gaap_description
    df["us_gaap_description"] = df["us_gaap_description"].apply(generate_us_gaap_description)

    total_entries = len(df)
    if total_entries == 0:
        # Avoids embedding an empty list and dividing by zero further down.
        print("No rows returned by the query; nothing to evaluate.")
        return

    # The candidate set is the same for every row, so embed it once instead of
    # re-stacking and copying it to the CPU on each iteration. Duplicates (one
    # per variation of a concept) are dropped in first-seen order, which keeps
    # the argmax winner identical to searching the full list.
    candidates = list(dict.fromkeys(df["us_gaap_description"].tolist()))
    candidate_matrix = generate_embeddings(candidates).squeeze(1).cpu().numpy()

    closest_matches = []
    is_correct_matches = []
    similarities = []

    rows = zip(df["input_text"], df["us_gaap_description"])
    for variation, expected in tqdm(rows, total=total_entries, desc="Processing data"):
        variation_vector = generate_embeddings([variation]).squeeze(0).cpu().numpy()
        scores = cosine_similarity(variation_vector, candidate_matrix)[0]
        best_idx = scores.argmax()

        closest_match = candidates[best_idx]
        closest_matches.append(closest_match)
        similarities.append(scores[best_idx])

        # Correct when the nearest description is the row's own description
        is_correct_matches.append(closest_match == expected)

    # Add the closest match and correctness flag to the dataframe
    df["closest_match"] = closest_matches
    df["similarity"] = similarities
    df["is_correct_match"] = is_correct_matches

    # Calculate error rate (percentage of incorrect matches)
    correct_matches = sum(is_correct_matches)
    accuracy = correct_matches / total_entries
    error_rate = 1 - accuracy

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Error Rate: {error_rate * 100:.2f}%")

    # Save as JSONL
    df.to_json(output_file, orient="records", lines=True)
    print(f"Matched data saved to 'matched_tags.jsonl' with {len(df)} rows.")


# Entry point for the baseline analysis: runs the matching evaluation on the
# "concept_variations" SQL defined in `queries` when this code is executed directly.
if __name__ == "__main__":
    build_dataset(queries["concept_variations"])
