In [2]:
# TODO: consider faster parsers or multiprocessing to speed up processing
import datamule as dm
import json
import os
import csv
from time import time
from tqdm import tqdm

def _cik_from_path(file_path):
    """Return the CIK from a ``CIK<digits>.json`` path, without leading zeros."""
    name = os.path.basename(file_path)
    if name.endswith('.json'):
        name = name[:-len('.json')]
    if name.startswith('CIK'):
        name = name[len('CIK'):]
    # An all-zero identifier would otherwise strip down to '' and make the
    # per-company directory resolve to output_dir itself.
    return name.lstrip('0') or '0'


def _table_fieldnames(rows):
    """Return every key used by any row, in first-seen order."""
    fieldnames = []
    seen = set()
    for row in rows:
        for key in row:
            if key not in seen:
                seen.add(key)
                fieldnames.append(key)
    return fieldnames


def process_company_concept_file(file_path, output_dir):
    """Convert one company-concepts JSON file into a directory of CSV files.

    The JSON is loaded and passed to ``dm.parse_company_concepts``; the result
    is iterated as dicts that are read through ``item.items()``,
    ``item['fact']`` and an optional ``'table'`` entry holding a list of row
    dicts. Output is written to ``<output_dir>/<cik>/``:

    * ``metadata.csv``  - one row per parsed item, without its ``'table'``.
    * ``NNNN.csv``      - one file per item with a non-empty table, numbered
      by the item's 1-based position in the parsed data.
    * ``crosswalk.csv`` - maps each table file name to the item's ``'fact'``.

    Args:
        file_path: Path to the input JSON file. The CIK is taken from the
            file name (``CIK`` prefix, ``.json`` suffix and leading zeros
            removed).
        output_dir: Directory under which the per-CIK directory is created.

    Returns:
        None. Any exception is reported with ``print`` and swallowed so that a
        single bad file does not stop a batch run; output for that file may
        then be incomplete.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            company_concepts = json.load(f)

        parsed_data = dm.parse_company_concepts(company_concepts)

        cik_dir = os.path.join(output_dir, _cik_from_path(file_path))
        os.makedirs(cik_dir, exist_ok=True)

        # Write metadata.csv
        metadata_path = os.path.join(cik_dir, 'metadata.csv')
        with open(metadata_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(
                csvfile,
                fieldnames=['cik', 'category', 'fact', 'label', 'description', 'unit'],
            )
            writer.writeheader()
            writer.writerows(
                {k: v for k, v in item.items() if k != 'table'}
                for item in parsed_data
            )

        # Write table CSVs and collect the rows for crosswalk.csv
        locations = []
        for index, item in enumerate(parsed_data, start=1):
            table = item.get('table')
            if not table:
                continue

            table_file = f"{index:04d}.csv"
            table_path = os.path.join(cik_dir, table_file)
            with open(table_path, 'w', newline='', encoding='utf-8') as csvfile:
                # Build the header from all rows rather than only the first:
                # DictWriter raises ValueError on a key missing from
                # fieldnames, and fills absent keys with ''.
                writer = csv.DictWriter(csvfile, fieldnames=_table_fieldnames(table))
                writer.writeheader()
                writer.writerows(table)

            locations.append({'file': table_file, 'label': item['fact']})

        # Write crosswalk.csv
        crosswalk_path = os.path.join(cik_dir, 'crosswalk.csv')
        with open(crosswalk_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['file', 'label'])
            writer.writeheader()
            writer.writerows(locations)
    except Exception as e:
        # Deliberate best-effort boundary: report and move on to the next file.
        print(f"Error processing {file_path}: {e}")

def main():
    """Convert every ``.json`` file in ``company_concepts`` to CSV output.

    Creates the ``company_concept_csv`` output directory if needed, runs
    ``process_company_concept_file`` on each JSON file with a progress bar,
    and prints the total elapsed time.

    Raises:
        OSError: If the ``company_concepts`` directory cannot be listed
            (for example, because it does not exist).
    """
    start_time = time()

    company_concepts_dir = 'company_concepts'
    output_dir = 'company_concept_csv'
    os.makedirs(output_dir, exist_ok=True)

    json_files = [f for f in os.listdir(company_concepts_dir) if f.endswith('.json')]

    # Debugging aid: show which file sits at index 803. Guarded so a directory
    # with fewer files does not abort the run with an IndexError. Note that
    # os.listdir returns entries in arbitrary order.
    debug_index = 803
    if len(json_files) > debug_index:
        print(json_files[debug_index])

    for filename in tqdm(json_files, desc="Processing files"):
        file_path = os.path.join(company_concepts_dir, filename)
        process_company_concept_file(file_path, output_dir)

    print(f"Processing completed in {time() - start_time:.2f} seconds")

# Run the batch conversion only when executed directly (script or notebook
# cell), not when this code is imported as a module.
if __name__ == "__main__":
    main()

CIK0000759828.json


Processing files:   0%|          | 11/7975 [00:02<33:35,  3.95it/s]


KeyboardInterrupt: 