In [15]:
import os
import zipfile
import json

# Paths
input_dir = "server/ncbi_reports"  # Input directory
output_file_path = "server/utils/combined_taxonomy_reports.json"  # Output file

output_data = []
unique_tax_ids = set()  # Track unique taxonomy codes

def load_existing_data():
    """Load existing data from the JSON file if it exists."""
    if os.path.exists(output_file_path):
        try:
            with open(output_file_path, "r") as f:
                data = json.load(f)
                # Extract and store unique tax_ids
                for entry in data:
                    tax_id = entry.get("tax_id")  # Adjust key name if different
                    if tax_id is not None:
                        unique_tax_ids.add(tax_id)
                return data
        except json.JSONDecodeError:
            print("Warning: Failed to decode existing JSON. Starting fresh.")
            return []
    return []

def process_zip_file(zip_path):
    """Process a ZIP file and extract new taxonomy data."""
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_entries = zip_ref.namelist()

            # Find the taxonomy_report.jsonl inside the zip file
            jsonl_file = next((e for e in zip_entries if e.startswith("ncbi_dataset/data/taxonomy_report.jsonl")), None)

            if jsonl_file:
                print(f"Processing {jsonl_file} from {zip_path}")

                with zip_ref.open(jsonl_file) as f:
                    for line in f:
                        try:
                            json_data = json.loads(line.decode("utf-8"))
                            tax_id = json_data.get("tax_id")  # Adjust key name if different

                            # Ensure tax_id exists and is unique
                            if tax_id is not None and tax_id not in unique_tax_ids:
                                unique_tax_ids.add(tax_id)
                                output_data.append(json_data)
                        except json.JSONDecodeError as e:
                            print(f"Error decoding JSON line in {jsonl_file}: {e}")
            else:
                print(f"No taxonomy_report.jsonl found in {zip_path}")
    except zipfile.BadZipFile as e:
        print(f"Error processing file {zip_path}: Bad ZIP file {e}")
    except Exception as e:
        print(f"Error processing file {zip_path}: {e}")

def main():
    try:
        # Load existing data
        existing_data = load_existing_data()
        prev_count = len(existing_data)  # Count before processing

        # Process new ZIP files
        for file_name in os.listdir(input_dir):
            if file_name.endswith(".zip"):
                zip_path = os.path.join(input_dir, file_name)
                print(f"Processing file: {zip_path}")
                process_zip_file(zip_path)

        # Append new unique data and save
        combined_data = existing_data + output_data
        new_count = len(combined_data)  # Count after processing

        with open(output_file_path, "w") as output_file:
            json.dump(combined_data, output_file, indent=2)

        print(f"Updated taxonomy reports saved to: {output_file_path}")
        print(f"Previously, there were {prev_count} unique entries. Now, there are {new_count} unique entries.")
        print(f"Added {new_count - prev_count} new unique entries.")

    except Exception as e:
        print(f"Error during processing: {e}")

if __name__ == "__main__":
    main()

Processing file: server/ncbi_reports/ncbi_dataset_452646.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_452646.zip
Processing file: server/ncbi_reports/ncbi_dataset_9915.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_9915.zip
Processing file: server/ncbi_reports/ncbi_dataset_1206715.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_1206715.zip
Processing file: server/ncbi_reports/ncbi_dataset_230844.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_230844.zip
Processing file: server/ncbi_reports/ncbi_dataset_416366.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_416366.zip
Processing file: server/ncbi_reports/ncbi_dataset_9691.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_9691.zip
Processing file: server/ncbi_reports/ncb

In [13]:
# read combined_taxonomy_reports.json
import json

with open('/Users/vishwaas.singh/Documents/Personal/Projects/Histology_DB/server/utils/combined_taxonomy_reports.json') as f:
    data = json.load(f)

In [14]:
len(data)

560