In [None]:
import json

# List of JSON files to merge
files_to_merge = [
    "clusters_biology.json",
    "clusters_computer_science.json",
    "clusters_finances.json",
    "clusters_others.json",
    "clusters_physics.json",
    "clusters_math.json"
]

# Output file for the merged data
output_file = "merged_clusters.json"

# Categories with chunked data
chunked_categories = {"computer_science", "math", "physics"}

# Dictionary to store merged results
merged_clusters = {}

# Merge the files
for filename in files_to_merge:
    try:
        with open(filename, "r") as f:
            data = json.load(f)
            for category, content in data.items():
                # Initialize category if not present
                if category not in merged_clusters:
                    merged_clusters[category] = {"clusters": [], "noise": []}

                if category in chunked_categories:
                    # Merge chunked data
                    for chunk_key, chunk_data in content.items():
                        merged_clusters[category]["clusters"].extend(chunk_data.get("clusters", []))
                        merged_clusters[category]["noise"].extend(chunk_data.get("noise", []))
                else:
                    # Non-chunked data
                    merged_clusters[category]["clusters"].extend(content.get("clusters", []))
                    merged_clusters[category]["noise"].extend(content.get("noise", []))
    except Exception as e:
        print(f"Error reading {filename}: {e}")

# Save the merged clusters to a single JSON file
try:
    with open(output_file, "w") as f:
        json.dump(merged_clusters, f, indent=4)
    print(f"Merged clusters saved to {output_file}")
except Exception as e:
    print(f"Error writing merged file: {e}")

Merged clusters saved to merged_clusters.json


In [9]:
import json

with open("merged_clusters.json", "r") as f:
    data = json.load(f)

cluster_sizes = {}
for category, results in data.items():
    clusters = results["clusters"]
    noise_count = len(results["noise"])

    cluster_sizes[category] = {
        "num_clusters": len(clusters),
        "total_docs": sum(cluster["num_docs"] for cluster in clusters),
        "noise_docs": noise_count
    }

for category, stats in cluster_sizes.items():
    print(f"  Category: {category}")
    print(f"  Number of Clusters: {stats['num_clusters']}")
    print(f"  Documents in Clusters: {stats['total_docs']}")
    print(f"  Noise Documents: {stats['noise_docs']}")

  Category: biology
  Number of Clusters: 72
  Documents in Clusters: 21750
  Noise Documents: 7807
  Category: computer_science_chunk_1
  Number of Clusters: 373
  Documents in Clusters: 78031
  Noise Documents: 63119
  Category: computer_science_chunk_2
  Number of Clusters: 382
  Documents in Clusters: 82053
  Noise Documents: 59097
  Category: computer_science_chunk_3
  Number of Clusters: 359
  Documents in Clusters: 84443
  Noise Documents: 56707
  Category: computer_science_chunk_4
  Number of Clusters: 347
  Documents in Clusters: 77960
  Noise Documents: 63190
  Category: finances
  Number of Clusters: 120
  Documents in Clusters: 11414
  Noise Documents: 8234
  Category: others
  Number of Clusters: 3
  Documents in Clusters: 8182
  Noise Documents: 21
  Category: physics_chunk_1
  Number of Clusters: 4
  Documents in Clusters: 342317
  Noise Documents: 686
  Category: physics_chunk_2
  Number of Clusters: 2
  Documents in Clusters: 339466
  Noise Documents: 3537
  Category: 