# Data overview

## p279

In [12]:
import os
import glob
import random
from collections import defaultdict, deque
import matplotlib.pyplot as plt
import networkx as nx
from tqdm.auto import tqdm


def load_triples(p279_dir):
    """
    Read all TSV files in the directory and build two mappings:
    - child_to_parents: child -> list of parent classes
    - parent_to_children: parent -> list of subclasses
    """
    child_to_parents = defaultdict(list)
    parent_to_children = defaultdict(list)
    tsv_files = glob.glob(os.path.join(p279_dir, "*.tsv"))
    print(f"Found {len(tsv_files)} TSV files in '{p279_dir}'.")

    for filename in tsv_files:
        with open(filename, "r", encoding="utf-8") as f:
            header = f.readline()  # skip header line
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) != 3:
                    continue
                child, prop, parent = parts
                child_to_parents[child].append(parent)
                parent_to_children[parent].append(child)
    return child_to_parents, parent_to_children


child_to_parents, parent_to_children = load_triples("P279")
print(
    f"Loaded {len(child_to_parents)} child entities and {len(parent_to_children)} parent entities.\n"
)

# 1. Count nodes with multiple parents
multiple_parents = {
    child: parents for child, parents in tqdm(child_to_parents.items()) if len(parents) > 1
}
print(
    f"Nodes with multiple parents: {len(multiple_parents)} out of {len(child_to_parents)} children."
)



def find_cycles_in_subsample(child_to_parents, sample_size):
    # Take a random subsample of the keys in child_to_parents
    sample_nodes = random.sample(list(child_to_parents.keys()), min(sample_size, len(child_to_parents)))
    print(f"sample size is {len(sample_nodes)}")

    cycles = []
    visited = set()

    def dfs(node, path, local_visited):
        if node in path:
            cycles.append(path[path.index(node):] + [node])
            return
        if node in local_visited:
            # Already processed this branch from another starting point
            return
        local_visited.add(node)
        for parent in child_to_parents.get(node, []):
            dfs(parent, path + [node], local_visited)

    for node in sample_nodes:
        if node not in visited:
            local_visited = set()
            dfs(node, [], local_visited)
            visited.update(local_visited)
    return cycles

# Example usage:
subsample_cycles = find_cycles_in_subsample(child_to_parents, sample_size=1000000000)
print(f"Found {len(subsample_cycles)} cycles in the subsample.")

Found 2275 TSV files in 'P279'.
Loaded 4201747 child entities and 282692 parent entities.



100%|██████████| 4201747/4201747 [00:00<00:00, 6318185.85it/s]


Nodes with multiple parents: 666843 out of 4201747 children.
sample size is 4201747
Found 4834625 cycles in the subsample.
