In [None]:
import os

# Show the kernel's working directory so the relative "../data/..." paths
# used by the cells below can be sanity-checked.
current_dir = os.getcwd()
print(current_dir)

In [None]:
import os

# Define the starting directory
start_dir = "../data/fund-holdings"


def count_csv_files(directory):
    """Return how many ``.csv`` files exist anywhere below ``directory``.

    Only file names ending in ``.csv`` are counted; other files that share a
    folder with a CSV file are ignored. A directory that does not exist
    yields 0, because ``os.walk`` produces no entries for it.

    Args:
        directory: Root folder that is walked recursively.

    Returns:
        The number of matching files as an ``int``.
    """
    return sum(
        1
        for _, _, files in os.walk(directory)
        for name in files
        if name.endswith(".csv")
    )


# Count CSV files
csv_count = count_csv_files(start_dir)

print(f"Total CSV files found: {csv_count}")


# Determine Unmapped Fund CIK Entries

In [None]:
import os
import pandas as pd

# Define the starting directory
start_dir = "../data/fund-holdings"


def collect_unmapped_rows(directory, cik_column="mapped_company_cik_number"):
    """Gather the distinct rows that have no mapped company CIK number.

    Every ``.csv`` file below ``directory`` is read with all values as
    strings. Rows whose ``cik_column`` value is missing or whitespace-only
    are kept, and exact duplicate rows are removed across *all* files, not
    just within each individual file.

    Files that do not contain ``cik_column`` are skipped with a message
    instead of aborting the whole scan with a ``KeyError``.

    Args:
        directory: Root folder that is walked recursively.
        cik_column: Name of the column holding the mapped CIK number.

    Returns:
        A DataFrame with the distinct unmapped rows and a fresh 0..n-1
        index. It is empty when no file contributes a row.
    """
    unmapped_records = []
    file_count = 0

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".csv"):
                continue

            file_path = os.path.join(root, file)
            # Read everything as string to avoid conversion issues (e.g. CIKs
            # losing leading zeros or becoming floats).
            df = pd.read_csv(file_path, dtype=str)

            if cik_column not in df.columns:
                print(f"Skipping {file_path}: no '{cik_column}' column")
                continue

            # A row is unmapped when the CIK cell is NaN or blank.
            is_unmapped = df[cik_column].fillna("").str.strip() == ""
            unmapped_records.extend(
                df[is_unmapped].drop_duplicates().to_dict(orient="records")
            )

            # Increment file counter and print progress
            file_count += 1
            print(f"Processed {file_count} files...")

    # The per-file de-duplication above only keeps the record list small; the
    # final pass removes rows that repeat across different files.
    return pd.DataFrame(unmapped_records).drop_duplicates().reset_index(drop=True)


# Distinct rows without a mapped company CIK number
unique_df = collect_unmapped_rows(start_dir)

In [None]:
# Persist the unmapped rows as "unmapped.csv" in the working directory
# (the DataFrame index is written as the first column).
unique_df.to_csv(path_or_buf="unmapped.csv")

# Determine Most Common (and currently maintained) US-GAAP Columns

In [None]:
import os
import pandas as pd
from collections import defaultdict

# Define the directory to search for CSV files
start_dir = "../data/orig.us-gaap"

# Define the minimum year threshold
current_year = pd.Timestamp.now().year
min_year = current_year - 4  # Consider only filings within the last 4 years


def tally_column_usage(directory, min_year, form_types=("10-K", "10-Q")):
    """Count, per CSV column, how often it is populated in recent filings.

    Every ``.csv`` file below ``directory`` is read as strings. Files without
    a ``form`` or ``filed`` column are ignored. Rows whose ``filed`` year
    cannot be parsed or is older than ``min_year`` are dropped. For each
    column, only rows where that column is non-null are considered:

    * the number of such rows per form type is added to the column's tally;
    * the most recent ``filed`` year among such rows updates ``latest_filed``.

    A column that is present in a file but empty in all recent rows gets no
    entry from that file.

    Args:
        directory: Root folder that is walked recursively.
        min_year: Oldest ``filed`` year (inclusive) that is still counted.
        form_types: Values of the ``form`` column that are tallied.

    Returns:
        A tuple ``(column_distribution, form_totals, file_count)``:
        ``column_distribution`` maps column name to a dict with one counter
        per form type plus ``latest_filed``; ``form_totals`` maps each form
        type to the number of recent rows with that form (assumed to be one
        row per filing, TODO confirm against the data layout); and
        ``file_count`` is the number of files that contributed rows.
    """
    # Dictionary to store column reporting frequency per form type
    column_distribution = defaultdict(
        lambda: {**{form_type: 0 for form_type in form_types}, "latest_filed": 0}
    )
    form_totals = {form_type: 0 for form_type in form_types}
    file_count = 0

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".csv"):
                continue

            file_path = os.path.join(root, file)

            try:
                # Read CSV file with dtype=str to prevent automatic type conversion
                df = pd.read_csv(file_path, dtype=str)

                # Ensure required columns exist
                if "form" not in df.columns or "filed" not in df.columns:
                    continue

                # Reduce "filed" to its year; unparsable dates become NaN and
                # fail the comparison below, so they are dropped as well.
                filed_year = pd.to_datetime(df["filed"], errors="coerce").dt.year
                recent = df.assign(filed=filed_year)[filed_year >= min_year]

                if recent.empty:
                    continue

                for form_type in form_types:
                    form_totals[form_type] += int((recent["form"] == form_type).sum())

                # Computed once per file: which cells actually hold a value.
                reported = recent.notna()

                for column in recent.columns:
                    rows = recent.loc[reported[column], ["form", "filed"]]
                    if rows.empty:
                        # Column exists in the file but was not reported recently.
                        continue

                    stats = column_distribution[column]
                    form_counts = rows["form"].value_counts()
                    for form_type in form_types:
                        stats[form_type] += int(form_counts.get(form_type, 0))

                    stats["latest_filed"] = max(
                        stats["latest_filed"], int(rows["filed"].max())
                    )

                # Increment processed file counter
                file_count += 1
                if file_count % 10 == 0:
                    print(f"Processed {file_count} files...")

            except Exception as e:
                # Best effort: report the broken file and keep scanning.
                print(f"Error processing {file_path}: {e}")

    return column_distribution, form_totals, file_count


def _percent_of_max(counts, total):
    """Scale ``counts / total`` so that the largest share becomes 100."""
    if total == 0:
        return counts * 0.0
    share = counts / total
    peak = share.max()
    # ``peak`` is NaN for an empty Series and 0 when nothing was counted;
    # both cases would otherwise produce NaN percentages.
    return share / peak * 100 if peak > 0 else share * 0.0


def build_distribution_table(column_distribution, form_totals, min_year):
    """Turn the tallies from ``tally_column_usage`` into a sorted table.

    Args:
        column_distribution: Mapping of column name to its counters.
        form_totals: Mapping of form type to the total number of recent rows
            of that form; its keys also define the form-type columns.
        min_year: Columns whose ``latest_filed`` is older are removed.

    Returns:
        A DataFrame indexed by column name with one count column per form
        type, ``latest_filed``, and one ``"<form> %"`` column per form type
        in which the most frequently reported column is 100. The table has
        these columns even when there are no tallies.
    """
    form_types = list(form_totals)
    columns = [*form_types, "latest_filed"]

    if column_distribution:
        table = pd.DataFrame.from_dict(column_distribution, orient="index")[columns]
    else:
        # No tallies would give a frame without columns and fail on lookup.
        table = pd.DataFrame(columns=columns, dtype="int64")

    # Filter columns that haven't been reported in the last 4 years
    table = table[table["latest_filed"] >= min_year]

    # Sort by most frequently reported, form types in the given order
    table = table.sort_values(by=form_types, ascending=False)

    for form_type in form_types:
        table[f"{form_type} %"] = _percent_of_max(table[form_type], form_totals[form_type])

    return table


column_distribution, form_totals, file_count = tally_column_usage(start_dir, min_year)

# Total number of recent 10-K and 10-Q rows (not the number of table rows)
total_10k_docs = form_totals["10-K"]
total_10q_docs = form_totals["10-Q"]

# Convert results to a DataFrame with normalized percentages
df_distribution = build_distribution_table(column_distribution, form_totals, min_year)



In [None]:
# Persist the distribution table as "column_distribution.csv" in the working
# directory; the index (the column names) is written as the first CSV column.
df_distribution.to_csv(path_or_buf="column_distribution.csv")