In [3]:
import os
import json

In [4]:
# Directory scanned for ``*.json`` records by the loading cell below. The path is
# relative, so it is resolved against the kernel's current working directory.
# NOTE(review): presumably the MIBiG 4.0 JSON dump — confirm the dataset version.
mibig_json_dir = "../mibig_json_4.0"

In [5]:
# Load every ``*.json`` file in mibig_json_dir into a list of parsed records.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the order of bgc_data (and therefore bgc_data[-1]) reproducible across
# runs and machines.
bgc_data = []
for file in sorted(os.listdir(mibig_json_dir)):
    if file.endswith(".json"):
        file_path = os.path.join(mibig_json_dir, file)
        # JSON text is UTF-8; do not rely on the platform's locale encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        bgc_data.append(data)

In [6]:
# Sanity check: report how many JSON records were loaded into bgc_data.
print(f"{len(bgc_data)} BGCs")

3013 BGCs


In [7]:
# Peek at the top-level keys of the last loaded record. This is a single sample,
# not the full schema: other records can carry additional keys (e.g. "genes").
print(bgc_data[-1].keys())

dict_keys(['accession', 'version', 'changelog', 'quality', 'status', 'completeness', 'loci', 'biosynthesis', 'compounds', 'taxonomy', 'legacy_references'])


In [8]:
import json

def count_attribute(bgc_data, attribute_path, key=None):
    """
    Count how many BGCs have a specific, non-empty attribute.

    For every BGC the nested dictionaries are walked along ``attribute_path``.
    A BGC is skipped when the path cannot be followed (a key is missing or an
    intermediate value is not a dictionary) or when the value found at the end
    of the path is empty/falsy (``None``, ``{}``, ``[]``, ``""``, ``0``, ``False``).

    Without ``key`` every remaining BGC is counted. With ``key`` the BGC is only
    counted when

    * the value is a dict that contains ``key``, or
    * the value is a list in which at least one dict item contains ``key``.

    Only the presence of ``key`` is checked; its value is not inspected.

    :param bgc_data: List of BGC dictionaries.
    :param attribute_path: List of keys representing the path to the attribute
    :param key: Key to check within the attribute. A falsy key (``None``, ``""``)
        disables the key check.
    :return: Count of BGCs with the specified attribute.
    """
    count = 0
    for bgc in bgc_data:
        current = bgc
        for key_part in attribute_path:
            # Only dictionaries can be descended by key. Without the type check,
            # ``key_part in current`` would do a substring test on strings or a
            # membership test on lists and then fail on ``current[key_part]``.
            if isinstance(current, dict) and key_part in current:
                current = current[key_part]
            else:
                current = None
                break

        # Missing and empty values are both treated as "attribute not present".
        if not current:
            continue

        if not key:
            # General check for non-empty attribute
            count += 1
        elif isinstance(current, list):
            # Non-dict list items are ignored rather than searched.
            if any(isinstance(item, dict) and key in item for item in current):
                count += 1
        elif isinstance(current, dict) and key in current:
            count += 1

    return count



def count_cyclic_compounds(bgc_data):
    """
    Count the BGCs in which at least one compound entry carries a "cyclic" key.

    Thin wrapper around count_attribute for the "compounds" attribute. Only the
    presence of the key is checked; the value stored under it is not inspected.

    :param bgc_data: List of BGC dictionaries.
    :return: Number of matching BGCs.
    """
    compounds_path = ["compounds"]
    return count_attribute(bgc_data, compounds_path, key="cyclic")



Count the total number of BGCs:

In [15]:
# Total number of loaded BGC records.
total_bgcs = len(bgc_data)
print(f"Total Number of BGCs: {total_bgcs}")

Total Number of BGCs: 3013


Count the number of BGCs with a non-empty `genes` entry:

In [18]:
# A BGC is counted when its top-level "genes" entry exists and is non-empty.
bgc_with_genes = count_attribute(bgc_data, attribute_path=["genes"])
print(f"BGCs with genes: {bgc_with_genes}")

BGCs with genes: 674


Count the number of BGCs with gene annotations:

In [17]:
# Follows the nested path genes -> annotations; a BGC is counted when the value
# found there is non-empty.
bgcs_with_annotations = count_attribute(
    bgc_data, attribute_path=["genes", "annotations"]
)
print(f"BGCs with gene annotations: {bgcs_with_annotations}")

BGCs with gene annotations: 637


Count the number of BGCs with at least one cyclic compound:

In [21]:
# Use the dedicated helper instead of repeating its count_attribute call inline,
# so the definition of a "cyclic" match lives in one place.
bgcs_with_cyclic_compounds = count_cyclic_compounds(bgc_data)
print(f"BGCs with at least one cyclic compound: {bgcs_with_cyclic_compounds}")

BGCs with at least one cyclic compound: 260


Count the number of BGCs with biosynthesis classes:

In [23]:
# Follows the nested path biosynthesis -> classes; a BGC is counted when the
# value found there is non-empty.
num_biosynthesis_class = count_attribute(
    bgc_data, attribute_path=["biosynthesis", "classes"]
)
print(f"Number of BGCs with biosynthesis classes: {num_biosynthesis_class}")

Number of BGCs with biosynthesis classes: 3013


Count the number of BGCs with active modules:

In [20]:
# Tally the BGCs that list at least one biosynthesis module whose "active" flag
# is truthy. Records without "biosynthesis" or "modules" contribute nothing.
count = 0
for bgc in bgc_data:
    modules = bgc.get("biosynthesis", {}).get("modules", [])
    if not any(module.get("active", False) for module in modules):
        continue
    count += 1
print(f"Number of BGCs with active modules: {count}")

Number of BGCs with active modules: 270
