In [2]:
import os
import zipfile
import json

# Path to the input directory and output file
input_dir = "server/ncbi_reports"  # Change this to your input directory
output_file_path = "combined_taxonomy_reports.json"  # Output file for combined data

output_data = []

def process_zip_file(zip_path):
    try:
        # Open the ZIP file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # List all files in the ZIP
            zip_entries = zip_ref.namelist()

            # Find the taxonomy_report.jsonl inside the zip file
            jsonl_file = None
            for entry in zip_entries:
                if entry.startswith("ncbi_dataset/data/taxonomy_report.jsonl"):
                    jsonl_file = entry
                    break

            if jsonl_file:
                print(f"Processing {jsonl_file} from {zip_path}")

                # Extract the JSONL file content
                with zip_ref.open(jsonl_file) as f:
                    # Read the content of the JSONL file
                    for line in f:
                        try:
                            # Decode the line and parse it as JSON
                            json_data = json.loads(line.decode("utf-8"))
                            output_data.append(json_data)
                        except json.JSONDecodeError as e:
                            print(f"Error decoding JSON line in {jsonl_file}: {e}")
            else:
                print(f"No taxonomy_report.jsonl found in {zip_path}")
    except zipfile.BadZipFile as e:
        print(f"Error processing file {zip_path}: Bad ZIP file {e}")
    except Exception as e:
        print(f"Error processing file {zip_path}: {e}")

def main():
    try:
        # List all files in the input directory
        for file_name in os.listdir(input_dir):
            if file_name.endswith(".zip"):
                zip_path = os.path.join(input_dir, file_name)
                print(f"Processing file: {zip_path}")
                process_zip_file(zip_path)

        # After processing all ZIP files, save the combined data into a JSON file
        with open(output_file_path, 'w') as output_file:
            json.dump(output_data, output_file, indent=2)

        print(f"Combined taxonomy reports saved to: {output_file_path}")
    except Exception as e:
        print(f"Error during processing: {e}")

if __name__ == "__main__":
    main()


Processing file: server/ncbi_reports/ncbi_dataset_27658.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_27658.zip
Processing file: server/ncbi_reports/ncbi_dataset_9530.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_9530.zip
Processing file: server/ncbi_reports/ncbi_dataset_249015.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_249015.zip
Processing file: server/ncbi_reports/ncbi_dataset_58073.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_58073.zip
Processing file: server/ncbi_reports/ncbi_dataset_36225.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_36225.zip
Processing file: server/ncbi_reports/ncbi_dataset_30640.zip
Processing ncbi_dataset/data/taxonomy_report.jsonl from server/ncbi_reports/ncbi_dataset_30640.zip
Processing file: server/ncbi_reports/ncbi_data