# Collect data

In [None]:
import pandas as pd
import glob
import os
import json  # Add this import

# Where to look for the per-year cleaned JSON files, and where to write the result.
directory = "./files/*/"
file_pattern = "*_cleaned.json"
output_file = "dataset.json"


def load_json_frames(paths):
    """Read each JSON file in ``paths`` into a DataFrame.

    Files that cannot be read or parsed are reported and skipped (best-effort),
    so the returned list may be shorter than ``paths`` and may be empty.

    Parameters
    ----------
    paths : iterable of str
        Paths of the JSON files to load, in the order they should be combined.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per successfully loaded file, in input order.
    """
    frames = []
    for path in paths:
        try:
            frames.append(pd.read_json(path))
            print(f"Processed: {os.path.basename(path)}")
        except Exception as e:
            # Deliberately broad: one bad file should not abort the whole run.
            print(f"Error processing {path}: {str(e)}")
    return frames


# glob returns matches in arbitrary (filesystem-dependent) order; sort so the
# row order of the combined dataset is reproducible across runs and machines.
file_paths = sorted(glob.glob(os.path.join(directory, file_pattern)))

if not file_paths:
    print("No matching files found!")
else:
    print(f"Found {len(file_paths)} files to process")

    dfs = load_json_frames(file_paths)

    if not dfs:
        # pd.concat raises ValueError on an empty list, so stop here with a
        # clear message when every file failed to load.
        print("No files could be read; nothing to combine.")
    else:
        combined_df = pd.concat(dfs, ignore_index=True)

        # NOTE(review): json.dump serialises missing values (NaN) as bare `NaN`
        # tokens by default, which strict JSON parsers reject. Confirm that the
        # consumers of dataset.json tolerate this before relying on the file.
        data = combined_df.to_dict(orient='records')
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"\nCombined data saved to {output_file}")

Found 21 files to process
Processed: output_2005_cleaned.json
Processed: output_2006_cleaned.json
Processed: output_2007_cleaned.json
Processed: output_2008_cleaned.json
Processed: output_2009_cleaned.json
Processed: output_2010_cleaned.json
Processed: output_2011_cleaned.json
Processed: output_2012_cleaned.json
Processed: output_2013_cleaned.json
Processed: output_2014_cleaned.json
Processed: output_2015_cleaned.json
Processed: output_2016_cleaned.json
Processed: output_2017_cleaned.json
Processed: output_2018_cleaned.json
Processed: output_2019_cleaned.json
Processed: output_2020_cleaned.json
Processed: output_2021_cleaned.json
Processed: output_2022_cleaned.json
Processed: output_2023_cleaned.json
Processed: output_2024_cleaned.json
Processed: output_2025_cleaned.json

Combined data saved to dataset.json


In [2]:
# Count the rows of the combined frame whose 'metadata' value is not missing.
metadata_present = combined_df['metadata'].notna()
metadata_count = metadata_present.sum()
metadata_count

1603