In [1]:
import re
import os
import json

In [2]:
def readFile(filename):
    """Return the contents of the text file ``filename`` as a list of lines.

    Each entry keeps its line ending. The file is opened in text mode with
    the default encoding and is closed before the function returns.
    """
    with open(filename) as handle:
        return list(handle)

# Load the raw text lines of 'processos.txt'. The name parsed_data is rebound
# by the following cells, so at this point it still holds unprocessed lines.
parsed_data = readFile('processos.txt')

In [3]:
def correctData(lines: list[str]) -> list[str]:
    """Split lines that carry extra text after a wide gap into two entries.

    A line is treated as run-on when it contains a whitespace character or a
    dot followed by two whitespace characters. Such a line is cut at that gap:
    the text before the gap is kept as one entry, and the text after it becomes
    a new entry prefixed with the folder id and date copied from the start of
    the original line and terminated with ``::``. Every other line is returned
    with its trailing whitespace removed.

    Args:
        lines: Raw text lines, with or without trailing newlines.

    Returns:
        The cleaned lines; each run-on input line contributes two entries.
    """
    gap = re.compile(r'[\s\.]\s\s')
    # Leading "<folder>::<YYYY-MM-DD>" header, for a folder id of any width.
    header = re.compile(r'(\d+)::(\d{4}-\d{2}-\d{2})')

    new_lines: list[str] = []

    for line in lines:
        match = gap.search(line)
        if match is None:
            new_lines.append(line.rstrip())
            continue

        prefix = header.match(line)
        if prefix:
            folder, date = prefix.groups()
        else:
            # No recognisable header: fall back to fixed-width slices for a
            # 3-character id, the "::" separator and the full 10-character date.
            folder, date = line[:3], line[5:15]

        previous_line = line[:match.start()].rstrip()
        new_line = f"{folder}::{date}::{line[match.end():].rstrip()}::"
        # Any remaining run of three whitespace characters inside the rebuilt
        # entry is replaced by a line separator; the entry is still appended
        # as a single list item.
        new_line = re.sub(r'\s\s\s', os.linesep, new_line)
        new_lines.extend((previous_line, new_line))

    return new_lines

# Rebinds parsed_data to the cleaned lines. Re-running only this cell feeds the
# already-cleaned lines back into correctData rather than the raw file lines.
parsed_data = correctData(parsed_data)

In [4]:
def parseInfo(lines: list[str]) -> list[dict]:
    """Turn ``::``-separated text lines into record dictionaries.

    Each line is searched for the layout
    ``folder::date::name::father::mother::observations::``. Lines that do not
    contain it are ignored. Every record has the keys ``folder``, ``date``,
    ``name``, ``father``, ``mother``, ``observations`` and ``family``.

    ``family`` is captured from the first ``,<text>.`` fragment in the
    observations. When nothing is captured, it falls back to the text after
    the last comma of the observations, cut at the first dot (kept only if
    non-empty).

    A record whose ``family`` is set is kept only if that value starts with a
    word character and does not begin with one of the excluded prefixes.
    Records without a ``family`` value are always kept.

    Args:
        lines: Text lines to parse.

    Returns:
        The list of accepted record dictionaries, in input order.
    """
    record_re = re.compile(
        r'(?P<folder>\d*)::(?P<date>\d{4}-\d{2}-\d{2})::(?P<name>.+?)::(?P<father>.*?)::(?P<mother>.*?)::(?P<observations>[^,\n]*(?:,(?P<family>[^,.\n]*)\.)?.*)::')
    family_shape = re.compile(r'\w+ ?\w* ?\w*')
    excluded_prefixes = ('Sao', 'Santa', 'Santiago', 'Frei')

    records = []

    for text in lines:
        found = record_re.search(text)
        if found is None:
            continue

        fields = found.groupdict()

        if fields['family'] is None and fields['observations'] is not None:
            # Fallback: last comma-separated chunk, truncated at its first dot.
            last_chunk = fields['observations'].split(',')[-1]
            candidate = last_chunk.split('.')[0]
            if candidate:
                fields['family'] = candidate

        relation = fields['family']
        if relation is None:
            records.append(fields)
        elif family_shape.match(relation) and not relation.startswith(excluded_prefixes):
            records.append(fields)

    return records

# parsed_data changes type here: it goes in as a list of text lines and comes
# out as a list of record dicts, which is what the cells below consume.
parsed_data = parseInfo(parsed_data)

In [None]:
def recordsPerYear(data: list[dict]):
    """Count how many records fall in each year.

    The year is taken as the first four characters of each record's
    ``'date'`` value.

    Args:
        data: Records, each with a ``'date'`` string.

    Returns:
        A dict mapping each year string to the number of records dated in
        that year, with keys in order of first appearance.

    Raises:
        KeyError: If a record has no ``'date'`` key.
    """
    print("Loading results...")

    years_dict: dict[str, int] = {}

    # Single pass: each record adds exactly one to its own year.
    for element in data:
        current_year = element['date'][:4]
        years_dict[current_year] = years_dict.get(current_year, 0) + 1

    return years_dict

# Build the year -> count dict from the parsed records and print it.
years_dict = recordsPerYear(parsed_data)
print(years_dict)

In [None]:
def recordsStatsFirstNamePerCentury(data: list[dict]):
    """Rank the most frequent first names within each century bucket.

    The first name is the first whitespace-separated token of ``'name'``.
    The bucket key is the first two digits of ``'date'`` minus one, so a date
    starting with ``18`` is filed under ``17``.

    NOTE(review): this key is not the conventional century number; it is kept
    unchanged so the keys of the returned dict stay the same — confirm intent.

    Args:
        data: Records with ``'name'`` and ``'date'`` strings.

    Returns:
        A dict mapping each bucket key to a list of up to five
        ``(first_name, count)`` tuples, most frequent first and ties broken
        alphabetically. Keys 15 to 19 are always present; other keys appear
        only when the data contains them.
    """
    print("Loading results...")

    tallies: dict[int, dict[str, int]] = {century: {} for century in range(15, 20)}

    for element in data:
        name_parts = element['name'].split()
        if not name_parts:
            # A blank name has no first token to count.
            continue

        century = int(element['date'][:2]) - 1
        first_name = name_parts[0]

        # setdefault keeps out-of-range centuries from raising KeyError.
        per_century = tallies.setdefault(century, {})
        per_century[first_name] = per_century.get(first_name, 0) + 1

    # Sort by descending count, then by name so ties are deterministic.
    return {
        century: sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:5]
        for century, counts in tallies.items()
    }

# Compute the per-century first-name ranking (at most five entries per
# century key) from the parsed records and print it.
first_name_century_dict = recordsStatsFirstNamePerCentury(parsed_data)
print(first_name_century_dict)


In [None]:
def recordsStatsLastNamePerCentury(data: list[dict]):
    """Rank the most frequent last names within each century bucket.

    The last name is the last whitespace-separated token of ``'name'``.
    The bucket key is the first two digits of ``'date'`` minus one, so a date
    starting with ``18`` is filed under ``17``.

    NOTE(review): this key is not the conventional century number; it is kept
    unchanged so the keys of the returned dict stay the same — confirm intent.

    Args:
        data: Records with ``'name'`` and ``'date'`` strings.

    Returns:
        A dict mapping each bucket key to a list of up to five
        ``(last_name, count)`` tuples, most frequent first and ties broken
        alphabetically. Keys 15 to 19 are always present; other keys appear
        only when the data contains them.
    """
    print("Loading results...")

    tallies: dict[int, dict[str, int]] = {century: {} for century in range(15, 20)}

    for element in data:
        name_parts = element['name'].split()
        if not name_parts:
            # A blank name has no last token to count.
            continue

        century = int(element['date'][:2]) - 1
        last_name = name_parts[-1]

        # setdefault keeps out-of-range centuries from raising KeyError.
        per_century = tallies.setdefault(century, {})
        per_century[last_name] = per_century.get(last_name, 0) + 1

    # Sort by descending count, then by name so ties are deterministic.
    return {
        century: sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:5]
        for century, counts in tallies.items()
    }

# Compute the per-century last-name ranking (at most five entries per
# century key) from the parsed records and print it.
last_name_century_dict = recordsStatsLastNamePerCentury(parsed_data)
print(last_name_century_dict)

In [None]:
def recordsFamilyStats(data: list[dict]):
    """Count how often each ``'family'`` value occurs across the records.

    A record is counted only when its ``'family'`` value is set, starts with
    a word character, and does not begin with one of the excluded prefixes.

    Args:
        data: Records that may carry a ``'family'`` string (or ``None``).

    Returns:
        A dict mapping each accepted family value to its number of
        occurrences, with keys in order of first appearance.
    """
    print("Loading results...")

    restriction = re.compile(r'\w+ ?\w* ?\w*')
    # 'Fundador' and 'Foi' are separate prefixes; without the comma between
    # them Python would silently join the two literals into 'FundadorFoi'.
    names_restriction = ('Em', 'Ver', 'Doc', 'Embargo', 'Para', 'Fundador', 'Foi')

    family_dict: dict[str, int] = {}

    for element in data:
        family_degree = element.get('family')

        if family_degree is None:
            continue
        if not restriction.match(family_degree):
            continue
        if family_degree.startswith(names_restriction):
            continue

        # Single pass: each accepted record adds exactly one to its value.
        family_dict[family_degree] = family_dict.get(family_degree, 0) + 1

    return family_dict

# Build the dict keyed by the records' 'family' values and print it.
family_dict = recordsFamilyStats(parsed_data)
print(family_dict)

In [None]:
def recordsToJSON(data: list[dict], filename: str = 'processos.json', limit=20):
    """Write the leading records of ``data`` to a JSON file.

    The export is best-effort: any error raised while opening the file or
    serialising the records is printed instead of propagated, so the caller
    never sees an exception from this function.

    Args:
        data: Records to export.
        filename: Destination path; defaults to ``'processos.json'``.
        limit: Maximum number of leading records to write; defaults to 20.
            Pass ``None`` to write every record.

    Returns:
        None.
    """
    try:
        output = data[:limit]
        with open(filename, 'w') as file:
            json.dump(output, file, indent=4)
        print("JSON file created successfully!")
    except Exception as e:
        print(e)

# Export the parsed records to JSON. recordsToJSON prints either a success
# message or the error text, so a failed export does not stop the notebook.
recordsToJSON(parsed_data)