In [None]:
import sys
import json
import polars as pl
import matplotlib.pyplot as plt
import blitzbeaver as bb

# Make the repository root importable so the local `genealogy` package can be
# imported below. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")
# Allow up to 100 rows when polars renders a table.
pl.Config.set_tbl_rows(100)

In [None]:
from genealogy.processing import (
    load_dataframes,
    serialize_tracking_chains,
    serialize_dataframes,
)
from genealogy.tree import GenealogyTree
from genealogy.models import GenealogyNode, TrackingChain
from genealogy.chain import chain_summary

In [None]:
# Year range and input location for the records, kept as named constants so
# they can be tuned in one place. START_YEAR is also reused further down when
# summarising tracking chains.
START_YEAR = 1835
END_YEAR = 1892
NORMALIZED_DATA_DIR = "../data/normalized"

dataframes = load_dataframes(
    folder_path=NORMALIZED_DATA_DIR,
    start_year=START_YEAR,
    end_year=END_YEAR,
)

In [None]:
# Schema of one record. The position of each field matters: the `*_idx`
# settings of the genealogy configuration refer to fields by position.
record_schema = bb.RecordSchema(
    [
        bb.FieldSchema(field_name, element_type)
        for field_name, element_type in (
            ("nom_rue", bb.ElementType.String),  # 0
            ("chef_prenom", bb.ElementType.String),  # 1
            ("chef_nom", bb.ElementType.String),  # 2
            ("chef_origine", bb.ElementType.String),  # 3
            ("epouse_nom", bb.ElementType.String),  # 4
            ("chef_vocation", bb.ElementType.String),  # 5
            ("enfants_chez_parents_prenom", bb.ElementType.MultiStrings),  # 6
        )
    ]
)

In [None]:
# Location of the serialized tracking graph, relative to this notebook.
path_graph = "../graph.beaver"

# Load the tracking graph. It is used below both to materialize individual
# tracking chains and as input to the genealogy process.
graph = bb.read_beaver(path_graph)

In [None]:
# String distance settings handed to the genealogy process.
distance_metric_config = bb.DistanceMetricConfig(
    metric="lv_substring",
    lv_substring_weight=0.6,
    use_sigmoid=False,
    caching_threshold=4,
)

# The `*_idx` values appear to be 0-based positions into `record_schema`:
#   1 -> chef_prenom, 2 -> chef_nom, 3 -> chef_origine,
#   4 -> epouse_nom,  6 -> enfants_chez_parents_prenom
genealogy_config = bb.GenealogyConfig(
    # Field positions in the record schema.
    husband_name_idx=1,
    last_name_idx=2,
    origin_idx=3,
    wife_name_idx=4,
    children_idx=6,
    # Which tracking chains are eligible.
    min_tracking_chain_length=8,
    min_child_count=3,
    # How candidates are searched and matched.
    search_last_frame_child=False,
    search_wife=False,
    search_year_range=2,
    matching_threshold=0.95,
)

In [None]:
# Run the genealogy process and parse its result with json.loads. The parsed
# value is used below as a list of nested tree nodes: dicts that are read
# through their "id", "children" and "leaf_children" keys.
# NOTE(review): this passes the private `graph._raw` attribute; confirm there
# is no public accessor that should be used instead.
genealogy_trees = json.loads(bb.execute_genealogy_process(
    genealogy_config,
    distance_metric_config=distance_metric_config,
    record_schema=record_schema,
    tracking_graph=graph._raw,
    dataframes=dataframes,
))

In [None]:
def summary_trees(genealogy_trees: list[GenealogyNode], max_depth: int = 10) -> None:
    """Print aggregate statistics for the trees and plot a histogram of their depths.

    Args:
        genealogy_trees: Root nodes of the trees to summarise.
        max_depth: Exclusive upper bound of the depths counted in the
            histogram. A tree whose depth is at or beyond this bound is
            reported with a warning and left out of the histogram; it still
            contributes to the printed totals.
    """
    depth_counts = [0] * max_depth
    total_nodes = 0
    total_children = 0
    for tree in genealogy_trees:
        # The size is taken twice, without and with leaf children; the
        # difference between the two totals is reported as the leaf children.
        total_nodes += GenealogyTree.size(tree, include_leaf_children=False)
        total_children += GenealogyTree.size(tree, include_leaf_children=True)
        depth = GenealogyTree.depth(tree)
        if depth < max_depth:
            depth_counts[depth] += 1
        else:
            # The bound is exclusive, so a depth equal to max_depth lands here too.
            print(f"Warning: got tree depth {depth} >= {max_depth}")

    print(f"Number of trees: {len(genealogy_trees)}")
    print(f"Total nodes: {total_nodes}")
    print(f"Total children: {total_children}")
    print(f"Total leaf children: {total_children - total_nodes}")

    # The depth-0 bucket is deliberately left out of the plot.
    plt.bar(range(1, len(depth_counts)), depth_counts[1:])
    plt.title("Histogram of trees depth")
    plt.xlabel("Tree depth")
    plt.ylabel("Number of trees")
    plt.show()

# Print the aggregate statistics and show the depth histogram for all trees.
summary_trees(genealogy_trees)

In [None]:
def find_tree_with_depth(
    genealogy_trees: list[GenealogyNode], depth: int
) -> GenealogyNode:
    """Return the first tree whose depth is at least ``depth``.

    Trees are examined in list order and the first match wins.

    Raises:
        ValueError: If no tree reaches the requested depth.
    """
    # A private sentinel distinguishes "nothing matched" from any real node.
    not_found = object()
    match = next(
        (
            candidate
            for candidate in genealogy_trees
            if GenealogyTree.depth(candidate) >= depth
        ),
        not_found,
    )
    if match is not_found:
        raise ValueError(f"Tree with depth {depth} not found")
    return match


def _display_tree_summary(tree: GenealogyNode) -> list[str]:
    """Build the indented summary lines for ``tree`` and all of its descendants.

    The first line is the chain summary of the node itself. It is followed by
    the node's leaf children and then, recursively, the lines of each child
    subtree, every one of them indented by two extra spaces.
    """
    lines = [
        chain_summary(
            graph.materialize_tracking_chain(tree["id"], dataframes, record_schema),
            genealogy_config,
            START_YEAR,
        )
    ]
    # Leaf children are listed before the child subtrees.
    lines.extend("  " + leaf_name for leaf_name in tree["leaf_children"])
    for subtree in tree["children"]:
        lines.extend("  " + line for line in _display_tree_summary(subtree))
    return lines

def display_tree_summary(tree: GenealogyNode) -> str:
    """Return the indented summary of ``tree`` as a single newline-joined string.

    Despite its name this function does not print anything; the caller is
    expected to print or display the returned text.
    """
    return "\n".join(_display_tree_summary(tree))

In [None]:
# Pick the first tree whose depth is at least 3 as the worked example for the
# cells below.
tree = find_tree_with_depth(genealogy_trees, 3)

In [None]:
# display_tree_summary returns the text; print it so the line breaks render.
print(display_tree_summary(tree))

In [None]:
# Materialize one tracking chain per generation: root -> child -> grandchild.
# The tree was selected for its overall depth, so its *first* child is not
# guaranteed to have children of its own. Descend through the first child that
# actually has some instead of blindly indexing [0] twice, which would raise
# an IndexError when the deep branch hangs off another child.
child_node = next(
    (node for node in tree["children"] if node["children"]), None
)
if child_node is None:
    raise ValueError("Selected tree has no child with children of its own")
grandchild_node = child_node["children"][0]

chain1 = graph.materialize_tracking_chain(tree["id"], dataframes, record_schema)
chain2 = graph.materialize_tracking_chain(
    child_node["id"], dataframes, record_schema
)
chain3 = graph.materialize_tracking_chain(
    grandchild_node["id"], dataframes, record_schema
)

In [None]:
# Inspect the tracking chain of the root of the selected tree.
chain1.as_dataframe()

In [None]:
# Inspect the tracking chain of a child of the root.
chain2.as_dataframe()

In [None]:
# Inspect the tracking chain of a grandchild of the root.
chain3.as_dataframe()

In [None]:
# Show the raw nested structure of the selected tree.
tree

In [None]:
# Output folder for the exported JSON files; the export cells below reuse it.
public_path = "../data/public"

# Export the genealogy trees as JSON.
with open(public_path + "/trees.json", "w") as trees_file:
    json.dump(genealogy_trees, trees_file)


In [None]:
# Export the tracking chains as JSON. The project helper turns them into a
# structure that json.dump can write.
chains = serialize_tracking_chains(graph, dataframes, record_schema)
with open(public_path + "/tracking_chains.json", "w") as chains_file:
    json.dump(chains, chains_file)

In [None]:
# Export the input dataframes as JSON. The project helper turns them into a
# structure that json.dump can write.
serialized_dataframes = serialize_dataframes(dataframes)
with open(public_path + "/dataframes.json", "w") as dataframes_file:
    json.dump(serialized_dataframes, dataframes_file)