In [1]:
import numpy as np
import pandas as pd

In [2]:
import os

# The cells below read "data/csv/<year>.csv" relative to the working directory,
# which is expected to be the parent of the directory this notebook starts in.
# Only step up when that data directory is not already reachable: an
# unconditional chdir("..") climbs one more level every time the cell is
# re-run and silently breaks every relative path afterwards.
if not os.path.isdir(os.path.join("data", "csv")):
    os.chdir("..")

In [3]:
from analytics import BasicTracker, MinChangesTracker, TrackingConfig, TrackingHistoryConfig

In [83]:
# Tracker settings shared by every person tracked in this notebook.
# NOTE(review): the exact meaning of 0.7 and 5 is defined in `analytics`;
# confirm there before tuning these values.
config = TrackingConfig(
    threshold_mismatch=0.7,
    history=TrackingHistoryConfig(max_num_value_to_score=5),
)

In [103]:
# Columns handed to the tracker and used to display baselines and matches.
# Other candidate columns ("nom_rue_norm", "chef_vocation_norm",
# "chef_origine_norm") are currently left out.
columns = ["chef_prenom_norm", "chef_nom_norm", "epouse_nom_norm"]

# Years to search, 1836 through 1850 inclusive. Kept as strings because they
# are interpolated into the CSV file names.
years = [str(year) for year in range(1836, 1851)]

# How many baseline persons to sample and follow.
num_persons_tracked = 10

In [104]:
# Load the 1835 records; the persons to track are sampled from this table.
baseline_df = pd.read_csv("data/csv/1835.csv")

In [86]:
# Draw the persons to follow from the 1835 table. The fixed seed makes the
# sample, and therefore every result derived from it, identical across kernel
# restarts; change the seed to explore a different set of persons.
baselines = baseline_df.sample(n=num_persons_tracked, random_state=42)

In [87]:
# Show the sampled persons, restricted to the columns used for tracking.
baselines.loc[:, columns]

Unnamed: 0,chef_prenom_norm,chef_nom_norm,nom_rue_norm,chef_vocation_norm,epouse_nom_norm
2365,jean christian,paget,malle,·,destraz
2150,veuve d henri,vettil,chaueran,parpentiere,corbaz
167,nicolas,siegfried,martherey,marches,mauch
3542,jeanne francoise,emery|emery,derrue de faus,proprietaire,·
3430,jean,votz,rosemont,proprietaire,reber
1956,jean georges,obreith,montee de st lade,henriette,vaucher
2406,marie,curchod,rudelermann,·,separie
3262,veuve,suat,barre,proprietaire,duvoisin
3671,gabriel,regamey,vuachere,taniel,raimondin
377,henry,wuistaz,f pierre,voiturier,stabersol


In [88]:
def track_person(args) -> list:
    """Follow one person through a sequence of yearly CSV files.

    Args:
        args: A ``(tracker, years)`` pair packed into a single argument so the
            function can be mapped over a list of work items. ``tracker`` must
            provide ``compute_record_scores(df)`` and
            ``select_record(df, scores)``; ``years`` is an iterable of values
            that are interpolated into ``data/csv/<year>.csv``.

    Returns:
        A list with one entry per year, in the order given, holding whatever
        ``tracker.select_record`` returned for that year.
    """
    person_tracker, wanted_years = args

    def _select_for(wanted_year):
        # Score every row of that year's table, then let the tracker choose.
        yearly_table = pd.read_csv(f"data/csv/{wanted_year}.csv")
        row_scores = person_tracker.compute_record_scores(yearly_table)
        return person_tracker.select_record(yearly_table, row_scores)

    # Years are processed strictly in the order given, one after the other.
    return [_select_for(wanted_year) for wanted_year in wanted_years]

In [105]:
import multiprocessing
import os

# Build one (tracker, years) work item per sampled baseline person.
# The loop is bounded by the actual number of sampled rows rather than by
# num_persons_tracked, so the cell stays correct even if that constant was
# edited after the sample was drawn (cells here are often run out of order).
inputs = []
for i in range(len(baselines)):
    baseline = baselines.iloc[i][columns].values.tolist()

    tracker = MinChangesTracker(config, columns, baseline)

    inputs.append((tracker, years))

# One worker per person, but never more workers than CPU cores, and at least
# one because Pool rejects processes=0.
num_workers = max(1, min(len(inputs), os.cpu_count() or 1))

# pool.map preserves input order, so results[i] belongs to baselines.iloc[i].
with multiprocessing.Pool(processes=num_workers) as pool:
    results = pool.map(track_person, inputs)




In [106]:
# One line per tracked person and one character per year:
# "Y" when a record was selected for that year, "." when the entry is None.
for v in results:
    print("".join("." if match is None else "Y" for match in v))

YYYYYYYYYYY....
YYYY..Y..Y.....
YYYYYYYYY.YY...
YYYY.YYY....YYY
Y.Y............
YYYYYYYYYYY..Y.
YYYYYY..Y......
..YYYYY.YYYYYYY
YYYYYYYYY.YYYYY
YY.....Y.......


In [109]:
# Reference values of the first sampled person, shown for comparison with
# the records selected for that person.
baseline = baselines.iloc[0][columns].tolist()
baseline

['jean christian', 'paget', 'destraz']

In [108]:
# Print the tracked columns of every record selected for the first person,
# skipping the years where nothing was selected.
for record in results[0]:
    if record is None:
        continue
    print(record[columns].tolist())

['jean christian', 'paget', 'destraz']
['jean christian', 'paget', 'destraz']
['jean christian', 'paget', 'detrak']
['jean christian', 'paget', 'destraz']
['jean christian', 'fayet', 'jry detraz']
['jean christiane', 'paget', 'destraz']
['jean christian', 'paget', 'destraz']
['jean christi', 'jayet', 'destraz']
['jean christian', 'jayet', 'destraz']
['jean christian', 'paget', 'destraz']
['jean christian', 'paget', 'destraz']


In [96]:
# Keep this run's results reachable under a second name so that a later run
# which rebinds `results` does not lose them. This is an alias to the same
# list object, not a copy.
results1 = results