# Data Extraction

In [1]:
import re

import pandas as pd
import numpy as np
from gedcom5.parser import GEDCOM5Parser

In [2]:
# Build the GEDCOM parser; the parsed result is stored in ``gedcom`` and read
# by the extraction cells further down.
parser = GEDCOM5Parser()

# No ``encoding`` is passed, so the file is decoded with the platform default.
# NOTE(review): confirm that matches the encoding declared in the file header.
with open("data/royal92.ged") as f:
    gedcom = parser.parse_string(f.read())

## Individuals

In [3]:
def iterator():
    """Yield one ``(xref_id, given_name, surname)`` tuple per individual.

    Only the first NAME entry of each individual is inspected. The surname
    is the text between the first pair of ``/`` delimiters and the given
    name is the text in front of it; both are whitespace-stripped.

    Missing parts are reported as ``np.nan``:

    * no NAME entry at all (``None`` or an empty collection) -> both NaN
    * empty given name or empty surname -> that part is NaN
    * a value with no ``/`` at all is a name without a surname, so the
      whole text becomes the given name
    * a value with unbalanced ``/`` delimiters -> both NaN

    Yields:
        tuple: ``(xref_id, givn, surn)`` where ``givn`` / ``surn`` are
        ``str`` or ``np.nan``.
    """
    for indi in gedcom.indi:
        # ``not`` covers both ``None`` and an empty collection, so the
        # ``[0]`` lookup below cannot raise IndexError.
        if not indi.name:
            yield indi.xref_id, np.nan, np.nan
            continue

        # Guard against a NAME record that carries no text.
        value = indi.name[0].value or ""

        if match := re.search("([^/]*)/([^/]*)/", value):
            givn = match.group(1).strip()
            surn = match.group(2).strip()
        elif "/" not in value:
            # No surname delimiters: the whole value is the given name.
            givn, surn = value.strip(), ""
        else:
            # Unbalanced delimiters: the parts cannot be told apart.
            givn = surn = ""

        # Empty strings are turned into NaN so pandas treats them as missing.
        yield indi.xref_id, givn or np.nan, surn or np.nan


# Collect the generated rows into a table: individual id, given name, surname.
df_individual = pd.DataFrame(iterator(), columns=["indi", "givn", "surn"])
df_individual

Unnamed: 0,indi,givn,surn
0,@I1@,Victoria,Hanover
1,@I2@,Albert Augustus Charles,
2,@I3@,Victoria Adelaide Mary,
3,@I4@,Edward_VII,Wettin
4,@I5@,Alice Maud Mary,
...,...,...,...
3005,@I3006@,John Sanford,Scobell
3006,@I3007@,James,Cartland
3007,@I3008@,Flora,
3008,@I3009@,,Cartland


## Births

In [4]:
def iterator():
    """Yield ``(xref_id, date, place)`` for each birth event of each individual.

    An individual with several birth events produces several tuples; one
    with none produces nothing. A missing date or place is ``np.nan``.
    """

    def value_or_nan(node):
        # Unwrap ``.value`` from an optional sub-record.
        return np.nan if node is None else node.value

    for person in gedcom.indi:
        for event in person.birt:
            yield person.xref_id, value_or_nan(event.date), value_or_nan(event.plac)


# Collect the generated rows into a table: individual id, birth date, birth place.
df_birth = pd.DataFrame(iterator(), columns=["indi", "date", "plac"])
df_birth

Unnamed: 0,indi,date,plac
0,@I1@,24 MAY 1819,"Kensington,Palace,London,England"
1,@I2@,26 AUG 1819,"Schloss Rosenau,Near Coburg,Germany"
2,@I3@,21 NOV 1840,"Buckingham,Palace,London,England"
3,@I4@,9 NOV 1841,"Buckingham,Palace,London,England"
4,@I5@,25 APR 1843,"Buckingham,Palace,London,England"
...,...,...,...
1734,@I2997@,3 JAN 1907,
1735,@I2998@,4 JAN 1912,
1736,@I3004@,,"Florence,Italy"
1737,@I3006@,1879,


## Deaths

In [5]:
def iterator():
    """Yield ``(xref_id, date, place)`` for each death event of each individual.

    A missing date or place sub-record is reported as ``np.nan``.
    """
    for person in gedcom.indi:
        for death in person.deat:
            if death.date is None:
                when = np.nan
            else:
                when = death.date.value

            if death.plac is None:
                where = np.nan
            else:
                where = death.plac.value

            yield person.xref_id, when, where


# Collect the generated rows into a table: individual id, death date, death place.
df_death = pd.DataFrame(iterator(), columns=["indi", "date", "plac"])
df_death

Unnamed: 0,indi,date,plac
0,@I1@,22 JAN 1901,"Osborne House,Isle of Wight,England"
1,@I2@,14 DEC 1861,"Windsor Castle,Berkshire,England"
2,@I3@,5 AUG 1901,"Friedrichshof,Near,Kronberg,Taunus"
3,@I4@,6 MAY 1910,"Buckingham,Palace,London,England"
4,@I5@,14 DEC 1878,"Darmstadt,,,Germany"
...,...,...,...
1687,@I2995@,27 MAY 1917,
1688,@I2997@,30 MAY 1940,"Nr Cassel,France"
1689,@I2998@,29 MAY 1940,
1690,@I3004@,BEF 1877,


## Marriages

In [6]:
def iterator():
    """Yield ``(husband, wife, date, place)`` for each marriage event of each family.

    The husband and wife references are read once per family and repeated
    for every marriage event it has; a family without marriage events
    produces nothing. Any missing piece is reported as ``np.nan``.
    """

    def value_or_nan(node):
        # Unwrap ``.value`` from an optional sub-record.
        return np.nan if node is None else node.value

    for family in gedcom.fam:
        partners = (value_or_nan(family.husb), value_or_nan(family.wife))

        for wedding in family.marr:
            yield (*partners, value_or_nan(wedding.date), value_or_nan(wedding.plac))


# Collect the generated rows into a table: husband id, wife id, marriage date, place.
df_marriage = pd.DataFrame(iterator(), columns=["husb", "wife", "date", "plac"])
df_marriage

Unnamed: 0,husb,wife,date,plac
0,@I2@,@I1@,10 FEB 1840,"Chapel Royal,St. James Palace,England"
1,@I4@,@I12@,10 MAR 1863,"St. George Chap.,Windsor,,England"
2,@I20@,@I3@,25 JAN 1858,"London,England"
3,@I37@,@I39@,26 NOV 1894,"Winter Palace,,St. Petersburg,Russia"
4,@I10@,@I23@,27 APR 1882,"St. George Chap.,Windsor,,England"
...,...,...,...,...
551,@I54@,@I2977@,17 DEC 1978,
552,@I109@,@I2979@,AFT 1989,
553,@I2985@,@I243@,ABT 1947,
554,@I2986@,@I806@,28 DEC 1936,"Guildhall,London,England"


## Families

In [7]:
def iterator():
    """Yield ``(xref_id, family_ref)`` for every family link of every individual.

    For each individual the ``famc`` links come first, followed by the
    ``fams`` links. Both kinds share the same two-column shape, so the
    output does not record which kind a row came from.
    """
    for person in gedcom.indi:
        # Same order as the record: ``famc`` links, then ``fams`` links.
        for link_kind in ("famc", "fams"):
            for link in getattr(person, link_kind):
                yield person.xref_id, link.value


# Collect the generated rows into a table: individual id, linked family id.
df_family = pd.DataFrame(iterator(), columns=["indi", "fam"])
df_family

Unnamed: 0,indi,fam
0,@I1@,@F42@
1,@I1@,@F1@
2,@I2@,@F43@
3,@I2@,@F1@
4,@I3@,@F1@
...,...,...
4573,@I3006@,@F1419@
4574,@I3007@,@F1422@
4575,@I3008@,@F1422@
4576,@I3009@,@F1418@
