In [1]:
import re
import os
import json

In [2]:
def readFile(filename):

    with open(filename) as file:
        lines = file.readlines()

    return lines

parsed_data = readFile('processos.txt')

In [3]:
def correctData(lines: list[str]) -> list[str]:

    new_lines: list[str] = []
    expressionFlag = re.compile(r'[\s\.]\s\s')

    for line in lines:
        match = re.search(expressionFlag, line)
        if match:
            start_index = match.start()
            end_index = match.end()
            id = line[:3]
            date = line[5:14]
            previous_line = line[:start_index].rstrip()
            new_line = id + "::" + date + "::" + \
                line[end_index:].rstrip() + "::"
            new_line = re.sub(r'\s\s\s', os.linesep, new_line)
            new_lines.append(previous_line)
            new_lines.append(new_line)
        else:
            new_lines.append(line.rstrip())

    return new_lines

parsed_data = correctData(parsed_data)

In [4]:
def parseInfo(lines: list[str]) -> list[dict]:

    data = []

    pattern = re.compile(
        r'(?P<folder>\d*)::(?P<date>\d{4}-\d{2}-\d{2})::(?P<name>.+?)::(?P<father>.*?)::(?P<mother>.*?)::(?P<observations>[^,\n]*(?:,(?P<family>[^,.\n]*)\.)?.*)::')

    familiy_restriction = re.compile(r'\w+ ?\w* ?\w*')
    family_names_restriction = ['Sao', 'Santa', 'Santiago', 'Frei']

    for line in lines:
        match = re.search(pattern, line)
        if match:
            info = match.groupdict()
            if info['family'] is None and info['observations'] is not None:
                observations = info['observations'].split(',')
                valid_content = observations[-1].split('.')[0]
                if valid_content:
                    info['family'] = valid_content
            if info['family'] is not None:
                if re.match(familiy_restriction, info['family']) and not any(info['family'].startswith(name) for name in family_names_restriction):
                    data.append(info)
            else:
                data.append(info)

    return data

parsed_data = parseInfo(parsed_data)

In [5]:
def recordsPerYear(data: list[dict]):

    print("Loading results...")

    years_dict = dict()
    counter = 1
    current_year: str

    for i, element in enumerate(data):
        current_year = (element['date'])[:4]

        for next_indexes in range(i, len(data)):
            if current_year == ((data[next_indexes])['date'])[:4]:
                counter += 1

        if years_dict.get(current_year) is None or counter > years_dict[current_year]:
            years_dict[current_year] = counter

        counter = 1

    return years_dict

years_dict = recordsPerYear(parsed_data)
print(years_dict)

Loading results...
{'1894': 72, '1909': 39, '1896': 76, '1904': 52, '1901': 58, '1883': 34, '1900': 50, '1902': 80, '1880': 54, '1889': 67, '1908': 52, '1906': 62, '1856': 62, '1892': 56, '1733': 957, '1778': 765, '1899': 78, '1869': 22, '1898': 88, '1877': 36, '1910': 28, '1907': 48, '1884': 39, '1879': 51, '1897': 69, '1730': 974, '1707': 98, '1689': 552, '1713': 226, '1824': 188, '1691': 810, '1720': 148, '1890': 42, '1732': 1802, '1863': 19, '1895': 76, '1729': 32, '1694': 38, '1765': 5, '1754': 250, '1755': 263, '1823': 159, '1708': 130, '1759': 73, '1683': 126, '1712': 56, '1687': 81, '1704': 263, '1888': 67, '1786': 269, '1798': 48, '1773': 368, '1821': 227, '1822': 232, '1809': 217, '1734': 761, '1722': 409, '1680': 148, '1738': 141, '1728': 396, '1716': 207, '1849': 97, '1777': 766, '1703': 123, '1851': 52, '1717': 232, '1785': 639, '1857': 63, '1686': 155, '1881': 62, '1784': 281, '1727': 111, '1719': 331, '1799': 83, '1829': 125, '1847': 100, '1787': 575, '1805': 90, '1819':

In [6]:
def recordsStatsFirstNamePerCentury(data: list[dict]):

    print("Loading results...")

    century_dict = {century: set() for century in range(15, 20)}
    counter = 1
    current_year: str
    current_name: tuple(str, int)
    visited_names = []

    for i, element in enumerate(data):

        current_year = (element['date'])[:4]
        century = (int(current_year[:2]) - 1)

        current_name = (element['name']).split()[0]
        verification_tuple = (current_name, century)
        if verification_tuple not in visited_names:
            visited_names.append((current_name, century))
            for next_indexes in range(i, len(data)):
                if current_name == ((data[next_indexes])['name']).split()[0]:
                    counter += 1

        if (century_dict.get(century) is None or
                counter > century_dict.get((current_name, century), (0, 0))[1] or current_name in [name for name in century_dict.get(century, set())]):
            current_name = (current_name, counter)
            century_dict[century].add(current_name)

        counter = 1

    for element in century_dict:
        century_dict[element] = sorted(
            century_dict[element], key=lambda x: x[1], reverse=True)
        century_dict[element] = (century_dict[element])[:5]

    return century_dict

first_name_century_dict = recordsStatsFirstNamePerCentury(parsed_data)
print(first_name_century_dict)


Loading results...
{15: [('Manuel', 5060), ('Antonio', 4169), ('Joao', 3961), ('Jose', 3810), ('Francisco', 2627)], 16: [('Manuel', 5066), ('Antonio', 4168), ('Joao', 3964), ('Jose', 3812), ('Francisco', 2626)], 17: [('Manuel', 5054), ('Antonio', 4167), ('Joao', 3965), ('Jose', 3811), ('Francisco', 2622)], 18: [('Manuel', 5012), ('Antonio', 4162), ('Joao', 3944), ('Jose', 3569), ('Francisco', 2477)], 19: []}


In [7]:
def recordsStatsLastNamePerCentury(data: list[dict]):
    print("Loading results...")

    century_dict = {century: set() for century in range(15, 20)}
    counter = 1
    current_year: str
    current_name: tuple(str, int)
    visited_names = []

    for i, element in enumerate(data):

        current_year = (element['date'])[:4]
        century = (int(current_year[:2]) - 1)

        current_name = (element['name']).split()[-1]
        verification_tuple = (current_name, century)
        if verification_tuple not in visited_names:
            visited_names.append((current_name, century))
            for next_indexes in range(i, len(data)):
                if current_name == ((data[next_indexes])['name']).split()[0]:
                    counter += 1

        if (century_dict.get(century) is None or
                counter > century_dict.get((current_name, century), (0, 0))[1] or current_name in [name for name in century_dict.get(century, set())]):
            current_name = (current_name, counter)
            century_dict[century].add(current_name)

        counter = 1

    for element in century_dict:
        century_dict[element] = sorted(
            century_dict[element], key=lambda x: x[1], reverse=True)
        century_dict[element] = (century_dict[element])[:5]

    return century_dict

last_name_century_dict = recordsStatsLastNamePerCentury(parsed_data)
print(last_name_century_dict)

Loading results...
{15: [('Manuel', 5066), ('Joao', 3965), ('Francisco', 2627), ('Luis', 761), ('Gaspar', 163)], 16: [('Manuel', 5066), ('Joao', 3965), ('Jose', 3812), ('Francisco', 2627), ('Antonio', 1459)], 17: [('Manuel', 5066), ('Joao', 3965), ('Jose', 3812), ('Francisco', 2627), ('Luis', 761)], 18: [('Matias', 99), ('Santos', 24), ('Roque', 21), ('Lino', 14), ('Gomes', 2)], 19: []}


In [12]:
def recordsFamilyStats(data: list[dict]):

    print("Loading results...")

    family_dict = dict()
    counter = 1
    family_degree: str
    restriction = re.compile(r'\w+ ?\w* ?\w*')
    names_restriction = ['Em', 'Ver', 'Doc', 'Embargo', 'Para', 'Fundador' 'Foi']

    for i, element in enumerate(data):
        family_degree = element['family']

        if family_degree is not None and re.match(restriction, family_degree) and not any(family_degree.startswith(name) for name in names_restriction):
            for next_indexes in range(i, len(data)):
                if family_degree == (data[next_indexes])['family']:
                    counter += 1

            if family_dict.get(family_degree) is None or counter > family_dict[family_degree]:
                family_dict[family_degree] = counter

    return family_dict

family_dict = recordsFamilyStats(parsed_data)
print(family_dict)

Loading results...
{'Irmao': 14321908, 'Sobrinho Materno': 14321910, 'Tio Materno': 14321909, 'Sobrinho Paterno': 14320759, 'Filho': 14321709, 'Sobrinhos Paternos': 14321872, 'Tio Paterno': 14319025, 'Sobrinho Neto Paterno': 14306318, 'Irmaos': 14320807, 'Primo': 14320899, 'Sobrinho Bisneto Paterno': 14082590, 'Irmao Materno': 14317844, 'Exposto': 14252844, 'Pai': 14321708, 'Arvore Genealogica': 14288303, 'Sobrinho Neto Materno': 14250213, 'Irmao Paterno': 14321570, 'Foi Arcebispo Primaz de Braga': 1067299, 'Fundo Jose Gomes': 10476793, 'Hoje BANHO E CARVALHOSA': 14300513, 'Antes do Crisma era SILVESTRE': 2063295, 'Autos Carta Rogatoria vinda do Bispado do Porto': 2507220, 'Neto Paterno': 14198904, 'Primo Materno': 14255475, 'Primo Paterno': 14315606, 'O Pai tambem aparece como Domingos Lourenco Eiras': 2935055, 'Primos': 13122194, 'Filhos': 14321543, 'Tem Impedimento': 3477393, 'Tios Paternos': 11589853, 'Irmao da Congregacao do Oratorio da cidade de Viseu': 3920548, 'Assistente no Po

In [13]:
def recordsToJSON(data: list[dict]):

    try:
        output = data[:20]
        with open('processos.json', 'w') as file:
            json.dump(output, file, indent=4)
        print("JSON file created successfully!")
    except Exception as e:
        print(e)

recordsToJSON(parsed_data)

JSON file created successfully!
