In [1]:
import sys
import json
import polars as pl
import matplotlib.pyplot as plt
import blitzbeaver as bb

sys.path.append("..")
pl.Config.set_tbl_rows(100)

polars.config.Config

In [2]:
from genealogy.processing import (
    load_dataframes,
)
from genealogy.models import GenealogyNode, Element

In [3]:
START_YEAR = 1835
dataframes = load_dataframes(
    folder_path="../data/normalized",
    start_year=START_YEAR,
    end_year=1898,
)

In [4]:
record_schema = bb.RecordSchema(
    [
        bb.FieldSchema("nom_rue", bb.ElementType.String),
        bb.FieldSchema("chef_prenom", bb.ElementType.String),
        bb.FieldSchema("chef_nom", bb.ElementType.String),
        bb.FieldSchema("chef_origine", bb.ElementType.String),
        bb.FieldSchema("epouse_nom", bb.ElementType.String),
        bb.FieldSchema("chef_vocation", bb.ElementType.String),
        bb.FieldSchema("enfants_chez_parents_prenom", bb.ElementType.MultiStrings),
    ]
)

In [5]:
path_graph = "../graph_35_98.beaver"

graph = bb.read_beaver(path_graph)

In [7]:
public_path = "../data/public"

with open(f"{public_path}/trees.json", "r") as f:
    genealogy_trees = json.load(f)

In [19]:
import Levenshtein

Metric = tuple[str, str, float]

def get_chain(id: bb.ID) -> bb.MaterializedTrackingChain:
    return graph.materialize_tracking_chain(id, dataframes, record_schema)

def get_most_frequent_value(df: pl.DataFrame, col: str) -> Element:
    return df.select([
        pl.col(col).value_counts(sort=True)
        .head(1)
    ]).get_column(col)[0][col]

def lv(v1: str, v2: str) -> float:
    return 1.0 - Levenshtein.distance(v1, v2) / max(len(v1), len(v2))

def compute_tree_metrics(node: GenealogyNode, col: str) -> list[Metric]:
    metrics = []
    chain = get_chain(node["id"])
    v = get_most_frequent_value(chain.as_dataframe(), col)
    for child in node["children"]:
        child_chain = get_chain(child["id"])
        child_v = get_most_frequent_value(child_chain.as_dataframe(), col)
        if not None in (v, child_v):
            dist = lv(v, child_v)
            metrics.append((v, child_v, dist))
        metrics += compute_tree_metrics(child, col)
    return metrics

In [110]:
col = "chef_vocation"
dataframes[34].select([
        pl.col(col).value_counts(sort=True)
        .head(20)
    ])

chef_vocation
struct[2]
"{""rentiere"",372}"
"{null,338}"
"{""rentier"",210}"
"{""journalier"",190}"
"{""agriculteur"",175}"
"{""cordonnier"",134}"
"{""gociant"",134}"
"{""menuisier"",126}"
"{""proprietaire"",125}"
"{""charpentier"",110}"


In [105]:
metrics = []
for tree in genealogy_trees:
    metrics += compute_tree_metrics(tree, "chef_vocation")

In [103]:
def compute_match_ratio(metrics: list[Metric], threshold: float, parent_value: str | None=None) -> tuple[int, int]:
    count = 0
    num_match = 0
    for metric in metrics:
        if parent_value is not None and metric[0] != parent_value:
            continue
        count += 1
        if metric[2] > threshold:
            num_match += 1
    return (num_match, count)

In [112]:
match, count = compute_match_ratio(metrics, 0.9, parent_value="gociant")
print(f"{match/count:.2f}% ({match}/{count})")

0.62% (16/26)


In [6]:


def lv(v1: str, v2: str) -> float:
    return 1.0 - Levenshtein.distance(v1, v2) / max(len(v1), len(v2))

In [8]:
distance("fgss", "foss")

0.75