In [None]:
import os

# Show the kernel's working directory; the relative "../data/..." paths used
# by the cells in this notebook are resolved against it.
print(os.getcwd())

In [None]:
import os

# Define the starting directory
start_dir = "../data/fund-holdings"


def count_csv_files(directory):
    """Count the files ending in ".csv" anywhere below ``directory``.

    Only files whose own name ends in ".csv" are counted; other files that
    merely share a directory with a CSV are ignored.

    Args:
        directory: Root directory to walk recursively.

    Returns:
        Number of ".csv" files found (0 if the directory does not exist,
        because ``os.walk`` then yields nothing).
    """
    return sum(
        1
        for _, _, files in os.walk(directory)
        for name in files
        if name.endswith(".csv")
    )


# Count CSV files
csv_count = count_csv_files(start_dir)

print(f"Total CSV files found: {csv_count}")


# Determine Unmapped Fund CIK Entries

In [None]:
import os
import pandas as pd

# Define the starting directory
start_dir = "../data/fund-holdings"


def collect_unmapped_entries(directory, cik_column="mapped_company_cik_number"):
    """Collect the distinct rows that have no mapped company CIK number.

    Every ".csv" file below ``directory`` is read with all columns as strings.
    Rows whose ``cik_column`` is missing, empty or whitespace-only are kept.
    Files that cannot be parsed, or that do not contain ``cik_column``, are
    reported and skipped instead of aborting the whole run.

    Args:
        directory: Root directory to walk recursively.
        cik_column: Name of the column holding the mapped CIK number.

    Returns:
        A DataFrame of the unmapped rows, de-duplicated across all files
        (an empty DataFrame when nothing was found).
    """
    records = []
    file_count = 0

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".csv"):
                continue
            file_path = os.path.join(root, file)

            try:
                # Read CSV as string to avoid conversion issues (e.g. leading zeros)
                df = pd.read_csv(file_path, dtype=str)
            except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
                print(f"Skipping {file_path}: {exc}")
                continue

            if cik_column not in df.columns:
                print(f"Skipping {file_path}: no '{cik_column}' column")
                continue

            # Keep rows where the CIK column is NaN, empty or only whitespace
            unmapped = df[df[cik_column].fillna("").str.strip() == ""]

            # De-duplicate per file first to keep the record list small
            records.extend(unmapped.drop_duplicates().to_dict(orient="records"))

            file_count += 1
            print(f"Processed {file_count} files...")

    # The per-file de-duplication cannot see repeats across files, so
    # de-duplicate once more over the combined result.
    return pd.DataFrame(records).drop_duplicates().reset_index(drop=True)


# DataFrame of unique entries without a mapped company CIK number
unique_df = collect_unmapped_entries(start_dir)

In [None]:
# Persist the unmapped entries. The frame's index is only a positional row
# counter, so it is left out rather than written as an unnamed first column.
unique_df.to_csv("unmapped.csv", index=False)

# Determine Most Common (and currently maintained) US-GAAP Columns

In [None]:
import os
import pandas as pd
from collections import defaultdict

# Define the directory to search for CSV files
start_dir = "../data/orig.us-gaap"

# Dictionary to store column reporting frequency per form type
column_distribution = defaultdict(lambda: {"10-K": 0, "10-Q": 0, "latest_filed": 0})

# Track the number of processed files
file_count = 0

# Define the minimum year threshold
current_year = pd.Timestamp.now().year
min_year = current_year - 4  # Consider only filings within the last 4 years


def accumulate_column_distribution(df, distribution, min_year, form_types=("10-K", "10-Q")):
    """Fold one filings table into the per-column reporting statistics.

    Only rows whose "filed" date parses to a year >= ``min_year`` are used.
    For every column, a row counts only if the column holds a value in it,
    so a column is credited with the filings that actually report it and
    "latest_filed" is the latest year in which that column had a value.

    Args:
        df: One file's rows; must contain "form" and "filed" columns.
        distribution: Mapping of column name to a dict with one counter per
            form type plus "latest_filed"; missing entries are created by
            indexing, so a ``defaultdict`` is expected. Updated in place.
        min_year: Earliest filing year to take into account.
        form_types: Values of "form" that are tallied.

    Returns:
        True if the table had the required columns and at least one row in
        the year window, otherwise False (``distribution`` is then untouched).
    """
    if "form" not in df.columns or "filed" not in df.columns:
        return False

    # Unparseable or missing dates become NaN and fail the comparison below
    filed_year = pd.to_datetime(df["filed"], errors="coerce").dt.year
    is_recent = (filed_year >= min_year).to_numpy()
    if not is_recent.any():
        return False

    recent = df.loc[is_recent]
    recent_years = filed_year.to_numpy()[is_recent]

    # Row masks per form type are the same for every column: compute them once
    form_masks = {
        form_type: (recent["form"] == form_type).to_numpy() for form_type in form_types
    }

    for column in recent.columns:
        has_value = recent[column].notna().to_numpy()
        if not has_value.any():
            continue  # column exists in the file but was not reported in the window

        stats = distribution[column]
        for form_type, form_mask in form_masks.items():
            stats[form_type] += int((has_value & form_mask).sum())
        stats["latest_filed"] = max(
            stats["latest_filed"], int(recent_years[has_value].max())
        )

    return True


def percent_of_max(counts):
    """Scale ``counts`` so that its largest value becomes 100.

    Returns a Series of zeros (same index) when ``counts`` is empty or its
    maximum is 0/NaN, instead of dividing by zero.
    """
    peak = counts.max() if len(counts) else 0
    if pd.isna(peak) or peak == 0:
        return pd.Series(0.0, index=counts.index)
    return counts / peak * 100


# Iterate through all CSV files in the directory
for root, _, files in os.walk(start_dir):
    for file in files:
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(root, file)

        try:
            # Read CSV file with dtype=str to prevent automatic type conversion
            df = pd.read_csv(file_path, dtype=str)

            # Files without the required columns or without recent rows are skipped
            if not accumulate_column_distribution(df, column_distribution, min_year):
                continue

            # Increment processed file counter
            file_count += 1
            if file_count % 10 == 0:
                print(f"Processed {file_count} files...")

        except Exception as e:
            # Best effort: report the failing file and carry on with the rest
            print(f"Error processing {file_path}: {e}")

# Convert results to a DataFrame (one row per column name). The reindex keeps
# the expected columns present even when no file contributed any data.
df_distribution = pd.DataFrame.from_dict(column_distribution, orient="index").reindex(
    columns=["10-K", "10-Q", "latest_filed"]
)

# Keep only columns reported within the year window, most frequently
# reported in "10-K" and then "10-Q" first
df_distribution = (
    df_distribution[df_distribution["latest_filed"] >= min_year]
    .copy()
    .sort_values(by=["10-K", "10-Q"], ascending=False)
)

# Express each count relative to the most frequently reported column (= 100%).
# Any common denominator would cancel out in this normalisation, so none is used.
df_distribution["10-K %"] = percent_of_max(df_distribution["10-K"])
df_distribution["10-Q %"] = percent_of_max(df_distribution["10-Q"])

# Keyword table for classify_statement_type:
# statement type -> category -> substrings looked for in a column name.
# Order matters: when two keywords of equal length match, the earlier entry wins.
_STATEMENT_KEYWORDS = {
    "Balance Sheet": {
        "Current Assets": [
            "CashAndCashEquivalents", "MarketableSecurities", "AccountsReceivable",
            "Inventory", "PrepaidExpenses", "LoansReceivable", "FairValueMeasurement",
            "CustomerLoyaltyProgram", "FrequentFlierLiability",
        ],
        "Non-Current Assets": [
            "PropertyPlantEquipment", "Goodwill", "IntangibleAssets",
            "DeferredTaxAssets", "LongTermInvestments", "LiabilityForAsbestos",
            "DeferredTaxLiabilities", "AmortizationOfIntangibleAssets",
        ],
        "Liabilities & Equity": [
            "AccountsPayable", "AccruedLiabilities", "DeferredRevenue",
            "ShortTermDebt", "LongTermDebt", "Equity", "RetainedEarnings",
            "OperatingLeaseLiability", "PreferredStock", "UnrecognizedTaxBenefits",
            "AdditionalPaidInCapital",
        ],
    },
    "Income Statement": {
        "Revenue": [
            "Revenue", "Sales", "ServiceRevenue", "InterestAndFeeIncome",
            "SecScheduleSupplementalInformation",
        ],
        "Expenses": [
            "CostOfRevenue", "OperatingExpenses", "InterestExpense", "Depreciation",
            "Amortization", "SellingGeneralAdministrative", "LossOnDisposal",
            "ForeignCurrencyExchangeLoss", "StockBasedCompensationExpense",
            "AdjustmentsForIncomeTaxExpense", "AdjustmentsForAmortizationExpense",
            "OtherFinanceIncomeCost", "OperatingLeaseCost",
        ],
        "Profit & Loss": [
            "GrossProfit", "NetIncome", "Loss", "ComprehensiveIncome",
            "ComprehensiveIncomeNetOfTax", "ProfitLoss",
        ],
        "Cash-Related Adjustments": [
            "InterestIncome", "InterestExpense", "Depreciation",
            "Amortization", "StockBasedCompensation",
        ],
    },
    "Cash Flow Statement": {
        "Operating Activities": [
            "CashFlowsFromOperatingActivities", "IncreaseInInventory", "Depreciation",
            "StockBasedCompensation", "AccountsReceivableChanges", "OperatingActivities",
            "NoncashAcquisition", "ReorganizationValueCash", "OperatingLeasePayments",
        ],
        "Investing Activities": [
            "CapitalExpenditures", "Investments", "PurchaseOfProperty", "SaleOfInvestments",
            "ProceedsFromIssuanceOfSecurities", "PaymentsForInvestment",
        ],
        "Financing Activities": [
            "DebtIssuance", "StockRepurchase", "DividendsPaid", "ProceedsFromStockOptions",
            "RepaymentOfLongTermDebt", "LiabilitiesArisingFromFinancingActivities",
            "PaymentsForRepurchaseOfStock",
        ],
        "Other Cash Flow Items": [
            "ForeignCurrencyExchangeEffects", "InterestPaid", "IncomeTaxesPaid",
            "NonCashItems", "ChangesInWorkingCapital", "PaymentsForRestrictedCash",
        ],
    },
}


def classify_statement_type(column_name):
    """Map a column name to a (statement type, category) pair by keyword lookup.

    Every keyword in the table that occurs as a substring of ``column_name``
    is a candidate; the longest one decides, and on equal length the keyword
    listed first in the table wins.

    Returns:
        ``(statement, category)``, or ``("Unclassified", None)`` when no
        keyword occurs in the name.
    """
    hits = (
        (keyword, statement, category)
        for statement, categories in _STATEMENT_KEYWORDS.items()
        for category, keywords in categories.items()
        for keyword in keywords
        if keyword in column_name
    )
    # max() keeps the first of several equally long hits, i.e. table order
    _, statement, category = max(
        hits,
        key=lambda hit: len(hit[0]),
        default=("", "Unclassified", None),
    )
    return statement, category


# Apply classification to each column.
# classify_statement_type returns a (statement, category) pair, so the two
# parts are stored in separate scalar columns instead of one column of tuples
# (which would end up as stringified tuples in the exported CSV).
classified = [classify_statement_type(name) for name in df_distribution.index]
df_distribution["Statement Type"] = [statement for statement, _ in classified]
df_distribution["Statement Category"] = [category for _, category in classified]

In [None]:
# Export the per-column statistics. The index (the column names being
# analysed) is written as the first CSV column, which has no header name.
df_distribution.to_csv("column_distribution.csv")

In [None]:
import re
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

# Load better embedding model optimized for semantic similarity.
# `tokenizer` and `model` are module-level objects used by get_embedding();
# from_pretrained() presumably downloads the weights on first use — TODO confirm
# caching behaviour for offline runs.
model_name = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Category structure (maintained as provided).
# Layout: statement -> category -> either a list of related terms, or a dict
# of subcategory -> list of related terms. prepare_category_embeddings() turns
# every leaf into one text that is embedded and matched against field names.
# The "####" rulers frame subtotal / total lines of each statement.
STATEMENT_TYPES = {
    "Balance Sheet": {
        "Assets": {
            "Cash and Short Term Assets": [
                "Cash", "Cash & Equivalents", "Short Term Investments"
            ],
            "Accounts Receivable - Trade, Net": ["Accounts Receivable - Trade, Gross"],
            "Total Receivables, Net": ["Receivables, Other"],
            "Total Inventory": ["Inventories - Finished Goods", "Inventories - Work in Progress"],
            "Prepaid Expenses": [],
            "Other Current Assets, Total": ["Restricted Cash - Current", "Other Current Assets"],
            ########################################
            "Total Current Assets": [],
            ########################################
            "Property/Plant/Equipment, Total - Gross": [
                "Buildings - Gross",
                "Land/Improvements - Gross",
                "Machinery/Equipment - Gross",
                "Other Property/Plant/Equipment - Gross"
            ],
            "Property/Plant/Equipment, Total - Net": ["Accumulated Depreciation, Total"],
            "Goodwill, Net": [],
            "Intangibles, Net": [],
            "Long Term Investments": ["LT Investments - Other"],
            "Note Receivable - Long Term": [],
            "Other Long Term Assets, Total": [
                "Restricted Cash - Long Term",
                "Other Long Term Assets"
            ],
            ########################################
            "Total Assets": []
            ########################################
        },
        "Liabilities": {
            "Accounts Payable": [],
            "Payable/Accrued": [],
            "Accrued Expenses": [],
            "Notes Payable/Short Term Debt": [],
            "Current Port. of LT Debt/Capital Leases": [],
            "Other Current Liabilities, Total": ["Customer Advances", "Other Current Liabilities"],
            ########################################
            "Total Current Liabilities": [],
            ########################################
            "Total Long Term Debt": ["Long Term Debt", "Capital Lease Obligations"],
            "Total Debt": [],
            "Deferred Income Tax": [],
            "Minority Interest": [],
            "Other Liabilities, Total": ["Other Long Term Liabilities"],
            ########################################
            "Total Liabilities": []
            ########################################
        },
        "Shareholders' Equity": {
            "Redeemable Preferred Stock, Total": [],
            "Preferred Stock - Non Redeemable, Net": [],
            "Common Stock, Total": ["Common Stock"],
            "Additional Paid-In Capital": [],
            "Retained Earnings (Accumulated Deficit)": [],
            "Treasury Stock - Common": [],
            "ESOP Debt Guarantee": [],
            "Unrealized Gain (Loss)": [],
            "Other Equity, Total": ["Translation Adjustment", "Other Comprehensive Income"],
            ########################################
            "Total Equity": [],
        },
        "Total Liabilities & Shareholders' Equity": []
    },
    "Income Statement": {
        "Revenue": {
            "Net Sales": []
        },
        "Other Revenue, Total": [],
        "Cost of Revenue, Total": ["Cost of Revenue"],
        ########################################
        "Gross Profit": [],
        ########################################
        "Selling/General/Admin. Expenses, Total": ["Selling/General/Administrative Expenses"],
        "Research & Development": [],
        "Depreciation/Amortization": [],
        "Interest Expense, Net - Operating": [],
        "Interest/Investment Income - Operating": [],
        "Interest Expense(Income) - Net Operating": [],
        "Interest Exp.(Inc.),Net-Operating, Total": [],
        "Unusual Expense (Income)": [],
        "Other Operating Expenses, Total": [],
        "Total Operating Expense": [],
        ########################################
        "Operating Income": [],
        ########################################
        "Interest Expense, Net Non-Operating": ["Interest Expense - Non-Operating"],
        "Interest/Invest Income - Non-Operating": ["Interest Income - Non-Operating"],
        "Interest Income(Exp), Net Non-Operating": [],
        "Interest Inc.(Exp.),Net-Non-Op., Total": [],
        "Gain (Loss) on Sale of Assets": [],
        "Other, Net": ["Other Non-Operating Income (Expense)"],
        "Net Income Before Taxes": [],
        "Provision for Income Taxes": [],
        "Net Income After Taxes": [],
        "Minority Interest": [],
        "Equity In Affiliates": [],
        "U.S. GAAP Adjustment": [],
        ########################################
        "Net Income Before Extra. Items": [],
        ########################################
        "Accounting Change": [],
        "Discontinued Operations": [],
        "Extraordinary Item": [],
        "Tax on Extraordinary Items": [],
        "Total Extraordinary Items": [],
        ########################################
        "Net Income": [],
        ########################################
        "Preferred Dividends": [],
        "General Partners' Distributions": [],
        "Miscellaneous Earnings Adjustment": [],
        "Pro Forma Adjustment": [],
        "Interest Adjustment - Primary Earnings Per Share": [],
        "Total Adjustments to Net Income": [],
        "Income Available to Com Excl ExtraOrd": [],
        "Income Available to Com Incl ExtraOrd": [],
        "Basic Weighted Average Shares": [],
        "Basic Earnings Per Share Excluding Extraordinary Items": [],
        "Basic Earnings Per Share Including Extraordinary Items": [],
        "Dilution Adjustment": [],
        "Diluted Net Income": [],
        "Diluted Weighted Average Shares": [],
        "Diluted Earnings Per Share Excluding ExtraOrd Items": [],
        "Diluted Earnings Per Share Including ExtraOrd Items": []
    },
    "Cash Flow": {
        # NOTE(review): only operating activities are listed here; investing and
        # financing activities look like they are still to be added — confirm.
        "Operating Activities": {
            "Net Income/Starting Line": [],
            "Depreciation/Depletion": ["Depreciation"],
            "Amortization": [],
            "Deferred Taxes": [],
            "Non-Cash Items": ["Other Non-Cash Items"],
            "Changes in Working Capital": [
                "Accounts Receivable",
                "Inventories",
                "Other Assets",
                "Accounts Payable",
                "Other Liabilities"
            ],
            ########################################
            "Cash from Operating Activities": []
            ########################################
        },
    },
}

# Function to generate embeddings with pooling
def get_embedding(text):
    """Embed ``text`` as the attention-masked mean of the model's token states.

    Uses the module-level ``tokenizer`` and ``model``. Input longer than 512
    tokens is truncated. Positions whose attention mask is 0 are multiplied by
    0 and therefore do not contribute to the average.

    Returns:
        The pooled tensor; presumably of shape (batch, hidden_size) — TODO
        confirm against the model's ``last_hidden_state`` layout.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: no gradients are needed
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state

    # Add a trailing axis so the mask broadcasts over the hidden dimension
    keep = encoded['attention_mask'].unsqueeze(-1)
    pooled_sum = (token_states * keep).sum(1)
    return pooled_sum / keep.sum(1)

def build_match_terms(statement_types):
    """Flatten a statement/category hierarchy into text rows to embed.

    ``statement_types`` maps statement -> category -> either a list of terms
    or a dict of subcategory -> list of terms. Each leaf becomes one row whose
    text is the whole path plus its terms, joined by single spaces and
    lower-cased as a whole (the classified field text is lower-cased as well).

    Returns:
        A list of ``(text, statement, category, subcategory)`` tuples;
        ``subcategory`` is "" for categories that directly hold a term list.
    """
    rows = []
    for statement, categories in statement_types.items():
        for category, subcategories in categories.items():
            if isinstance(subcategories, dict):
                for subcategory, terms in subcategories.items():
                    text = " ".join([statement, category, subcategory, *terms]).lower()
                    rows.append((text, statement, category, subcategory))
            else:
                text = " ".join([statement, category, *subcategories]).lower()
                rows.append((text, statement, category, ""))
    return rows


# Prepare category embeddings
def prepare_category_embeddings():
    """Build the lookup table of category texts and their embeddings.

    Returns:
        A DataFrame with one row per leaf of ``STATEMENT_TYPES`` and the
        columns "Processed Terms", "Statement", "Main Category",
        "Subcategory" and "Embedding" (the result of ``get_embedding`` on
        "Processed Terms").
    """
    match_df = pd.DataFrame(
        build_match_terms(STATEMENT_TYPES),
        columns=["Processed Terms", "Statement", "Main Category", "Subcategory"],
    )
    match_df["Embedding"] = match_df["Processed Terms"].apply(get_embedding)
    return match_df

def split_camel_case(field_name):
    """Turn a CamelCase identifier into lower-case, space-separated words.

    A space is inserted between a lower-case and an upper-case letter, and
    before the last capital of an upper-case run that is followed by a
    lower-case letter, so acronyms stay together ("EPSBasic" -> "eps basic").
    """
    return re.sub(
        r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])',
        ' ',
        field_name
    ).lower()


# Function to classify US-GAAP field names using embeddings
def classify_us_gaap_field(field_name, match_df, top_n=3):
    """Rank the category rows of ``match_df`` by similarity to a field name.

    Args:
        field_name: CamelCase US-GAAP field name.
        match_df: Table with an "Embedding" column, as produced by
            ``prepare_category_embeddings``. It is not modified.
        top_n: Number of best matches to return.

    Returns:
        A new DataFrame holding the ``top_n`` rows with the highest cosine
        similarity, with an added "Similarity" column, best match first.
    """
    processed_field = split_camel_case(field_name)

    field_embedding = get_embedding(processed_field)

    print(f"Processed Field: {processed_field}")

    similarity = match_df["Embedding"].apply(
        lambda emb: F.cosine_similarity(field_embedding, emb).item()
    )
    # assign() works on a copy, so repeated calls do not leave a stale
    # "Similarity" column on the caller's table
    return (
        match_df.assign(Similarity=similarity)
        .sort_values(by="Similarity", ascending=False)
        .head(top_n)
    )

# Example usage: embed every category once, then rank them for a single field
match_df = prepare_category_embeddings()
us_gaap_field = "DeferredTaxAssetsLiabilitiesNetNoncurrent"
top_matches = classify_us_gaap_field(us_gaap_field, match_df)

# Display the results (the bare last expression renders the table).
# Narrower alternative that leaves out the text and embedding columns:
# top_matches[["Statement", "Main Category", "Subcategory", "Similarity"]]
top_matches

Processed Field: deferred tax assets liabilities net noncurrent


Unnamed: 0,Processed Terms,Statement,Main Category,Subcategory,Embedding,Similarity
24,Balance Sheet Liabilities Deferred Income Tax,Balance Sheet,Liabilities,Deferred Income Tax,"[[tensor(-0.5943), tensor(-0.0223), tensor(-0....",0.839012
6,Balance Sheet Assets Total Current Assets,Balance Sheet,Assets,Total Current Assets,"[[tensor(-0.5913), tensor(0.3480), tensor(-0.1...",0.755094
10,"Balance Sheet Assets Intangibles, Net",Balance Sheet,Assets,"Intangibles, Net","[[tensor(-0.8014), tensor(0.0492), tensor(-0.4...",0.75097


# US-GAAP 2025 Hierarchy

https://www.sec.gov/data-research/standard-taxonomies/operating-companies