In [2]:
!pip install --upgrade pip
!pip install openpyxl
!pip install xlsxwriter

import pandas as pd
import openpyxl
import xlsxwriter

print("Installation done.")

Collecting pip
  Downloading pip-24.0-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.1
    Uninstalling pip-23.3.1:
      Successfully uninstalled pip-23.3.1
Successfully installed pip-24.0
Installation done.


In [3]:
# OPTION 1: get file from Github

import requests

def read_raw_text_from_github(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print("Failed to fetch data from GitHub.")
        return None

# Define URL
github_raw_url = "https://github.com/ieg-dhr/DigiKAR/raw/main/Sample%20Data/students_corr18.txt"
file_content = read_raw_text_from_github(github_raw_url)
if file_content is not None:
    print(file_content[:550])

Failed to fetch data from GitHub.


In [18]:
# OPTION 1:

# Restructure input file to improve event separation
# IDEALLY, EACH LINE IN THE TXT FILE HAS TO CAPTURE ONE EVENT

import re
from string import punctuation

print(punctuation)

# define character sets

brackets = (("(", "(", ")", "[", "]")) # brackets to strip
non_letters = ((",", ".", "!", "?", ";", ":", "(", "(", ")", "[", "]"))
digits = (("1", "2", "3", "4", "5", "6", "7", "8", "9", "0"))

# create function for line processing

def process_line(lines):
    
    # define global variable and empty list for line processing
    global processed_lines 
    processed_lines = []
    
    # iterate through lines to strip leading and trailing whitespace
    stripped_lines = [l.strip() for l in lines]
    
    # define the characters at line ending to make sure that all lines end in comma or semicolon
    # semicolon, denoting the end of one event, is treated as the default mode
    lines_punctuation = []
    end_characters = {',', ';'}

    # Iterate through each line and add a semicolon if necessary
    for x, s in enumerate(stripped_lines):
        if not s.endswith(tuple(end_characters)):
            s += ';'
            lines_punctuation.append(s)
        else:
            lines_punctuation.append(s)
    
    for i, line in enumerate(lines_punctuation):
        prev_line = lines[i - 1] if i > 0 else ''
        next_line = lines[i + 1] if i < len(lines) - 1 else ''
        next_next_line = lines[i + 2] if i < len(lines) - 2 else ''

        if line.endswith(non_letters): 
            #print("Punctuation found:", line)
            #print("Next line:", next_line)  # Add this line for debugging
            if next_line.startswith(("V:", "M:", " V:", " M:")):
                #print("Related person found.")
                processed_line = line.rstrip(punctuation) + ","
                #print("New line:", processed_line)
            elif line.startswith("[Source]"):  # separate several sources with comma, not semicolon
                #print("Source found.")
                if next_line.startswith("#PERSON"):
                    #print("Source ends.")
                    processed_line = line.rstrip(punctuation) + ";"
                elif next_line.startswith("#IDENTITAET"):
                    #print("Source ends.")
                    processed_line = line.rstrip(punctuation) + ";"
                else:
                    #print("Source continues.")
                    processed_line = line.rstrip(punctuation) + ','
                    #print("Line if source continues", processed_line)
            elif line.startswith("#PERSON"):  # add semicolon after person name in first line
                #print("Person found.")
                processed_line = re.sub(r'(\D)(\d)', r'\1;\n\2', line) # INSERT A SEMICOLON BEFORE DIGITS IF NO star is there!
            else:
                processed_line = line # cases where line is already one event and ends in semicolon

        processed_lines.append(processed_line)
    return processed_lines

def split_line(processed_lines):
    global split_lines
    split_lines = []
    for line in processed_lines:
        # check if the line contains a semicolon
        if ';' in line and line.index(';') != len(line) - 1:
            # split the line at each semicolon
            split_segments = line.split(';')
            # iterate over each segment and add it as a separate line
            for i, segment in enumerate(split_segments):
                # if it's not the last segment, add the semicolon back to the segment
                if i < len(split_segments) - 1:
                    segment += ';'
                # add the segment to the list of split lines
                if segment.strip():  # Check if segment is not empty
                    split_lines.append(segment)
        else:
            # if the line does not contain a semicolon, simply add it to the list
            split_lines.append(line)
    return split_lines

def merge_line(split_lines):
    global merged_lines
    merged_lines = []
    i = 0
    while i < len(split_lines):
        line = split_lines[i]
        if line.endswith(','):
            next_line = split_lines[(i + 1) % len(split_lines)]
            merged_lines.append(line + ' ' + next_line)
            i += 1  # Increment i to skip the next line since it's already merged
        else:
            merged_lines.append(line)
        i += 1
    return merged_lines

def replace_dates(merged_lines):
    def format_date(year, month='', day=''):
        if month and day:
            return f"{year}-{month:02d}-{day:02d}"  # Format: YYYY-MM-DD
        elif month:
            return f"{year}-{month:02d}"             # Format: YYYY-MM
        else:
            return str(year)                        # Format: YYYY

    replaced_lines = []

    for line in merged_lines:
        if line.startswith("[Source]"):
            replaced_lines.append(line)
        else:
            # Define regular expressions for different date formats
            date_patterns = [
                r'\b(\d{4})\b',                        # YYYY
                r'\b(\d{1,2})\.(\d{4})\b',             # MM.YYYY
                r'\b(\d{1,2})\.(\d{4})\b',             # M.YYYY
                r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b',  # DD.MM.YYYY
                r'\b(\d{1,2})\.(\d{4})\b',             # D.MM.YYYY
                r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b',  # D.M.YYYY
                r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b',  # DD.M.YYYY
                r'\b(\d{1,2})\.(\d{1,2})\.\b',         # DD.MM. (no year)
                r'\b(\d{1,2})\.(\d{1,2})\.\b',         # D.M. (no year)
            ]

            # Iterate through the date patterns and find matches
            for pattern in date_patterns:
                matches = re.finditer(pattern, line)
                for match in matches:
                    groups = match.groups()
                    if len(groups) == 1:  # Only year is present
                        year = int(groups[0])
                        replaced_date = format_date(year)
                    elif len(groups) == 2:  # Month and year are present
                        month, year = map(int, groups)
                        replaced_date = format_date(year, month)
                    elif len(groups) == 3:  # Day, month, and year are present
                        day, month, year = map(int, groups)
                        replaced_date = format_date(year, month, day)

                    # Replace the original date with the formatted one
                    line = line.replace(match.group(0), replaced_date, 1)
            replaced_lines.append(line)

    return replaced_lines

def add_missing_years(replaced_lines):
    updated_lines = []
    year_pattern = r'\b\d{4}\b'
    for i, line in enumerate(replaced_lines):
        prev_line = replaced_lines[i - 1] if i > 1 else ''
        # Check if the line contains "0000-MM-DD" date
        if "0000-" in line:
            # Regular expression pattern to find YYYY in the previous line
            try:
                years = re.findall(pattern, line)
                # Extract the last year from previous line
                previous_year = years[-1]
                # Replace "0000" with previous year
                line = line.replace("0000", previous_year)
            except:
                line = line
        else:
            line = line

        updated_lines.append(line)
    return updated_lines

def main(input_file, output_file):
    with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
        # read original lines
        lines = f_in.readlines()
        # process lines to unify punctuation etc.
        processed_lines = process_line(lines)
        # split lines if more than one event per line
        split_lines = split_line(processed_lines)
        # merge lines after commas
        merged_lines = merge_line(split_lines)
        # replace dates that are not in machine-readable format
        replaced_lines = replace_dates(merged_lines)
        for r in replaced_lines[:15]:
            print("REPLACED LINE:", r)
        updated_lines = add_missing_years(replaced_lines)
        for line in updated_lines[:15]:
            print("UPDATED LINE:", line)
            
    with open(output_file, 'w') as f:
        f.write('\n'.join(updated_lines))
        
if __name__ == "__main__":
    input_file = "students_corr18.txt"
    output_file = "students_updated_lines.txt"
    main(input_file, output_file)


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
REPLACED LINE: #PERSON [pers_name] ABEL, Joannes Caspar;
REPLACED LINE: Francofurtensis [event: Geburt];
REPLACED LINE: 1765-05-09 bacc. [title: bacc.] [event: Graduation] [pers_func: Absolvent] phil. [institution: Universität Mainz, $ Fakultät];
REPLACED LINE: [Source] RPh 267r, Jung, Thesenblatt 1765);
REPLACED LINE: #PERSON [pers_name] ABLEY (Abeley), Andreas;
REPLACED LINE: Moguntinus [event: Geburt] * E 1719-02-12, V: Joannes Michael;
REPLACED LINE: A. 1736-04-12 poeta [title: poeta] [event: Ernennung] [pers_func: Träger Ehrentitel];
REPLACED LINE: paup. [event: Stipendium] [pers_func: Stipendiat];
REPLACED LINE: [Source] RR I 30r;
REPLACED LINE: #PERSON [pers_name] ABT, Joannes Adamus;
REPLACED LINE: Ursellanus [event: Geburt], * 1698-08-09, V: Friderich A. (1701 Kirchenrechner);
REPLACED LINE: 1717-06-07 bacc. [title: bacc.] [event: Graduation] [pers_func: Absolvent];
REPLACED LINE: 1718 mag. [title: mag.] [event: Graduation] [pers_func: Absolven

In [19]:
# get list of places from places list
### adjust the file path according to your own file structure

filename = "Ortsontologie_Geocoded_geprüft.xlsx"

### read excel file to dataframe

places_df = pd.read_excel(filename, dtype=str) # axis=1, sort=False
places_df = places_df.fillna("n/a") # replace empty fields for string

places_to_check = places_df["place_name"].drop_duplicates().to_list()
print("All places loaded.")

All places loaded.


In [24]:
## SPLIT AND WRITE TO TABLE FORMAT

import pandas as pd
import re

# Read the file and split by #PERSON
with open('students_updated_lines.txt', 'r') as file:
    data = file.read()

person_data = re.split(r'#PERSON', data)[1:]  # [1:] to remove empty first entry

# Define regex patterns
event_pattern = r'\[event: (.*?)\]'
full_date_pattern = r'(\d{4}-\d{2}-\d{2})'
year_pattern = r'(\d{4})'
birth_date_pattern = r'\*([A-Z])?\s*(\d{4}-\d{2}-\d{2})'
pers_func_pattern = r'\[pers_func: (.*?)\]'
institution_pattern = r'\[institution: (.*?)\]'
place_pattern = r'(?:nach|in)\s+([A-ZÄÖÜ][a-zäöüß]+(?:\s+[A-ZÄÖÜ][a-zäöüß]+)*)'
rel_pers_pattern = r'(?:V|M): ([A-Z][a-z]+(?: [A-Z][a-z]+)*)'
birth_place_pattern = r'^[^*\d]+(?=\*|\[|\d)'  # Capture all characters until "*" or first digit

# Initialize lists to store data
rows = []

# Loop through each person's data
for person in person_data:
    lines = person.strip().split('\n')
    events_dict = {}
    dates_dict = {}
    pers_func_dict = {}
    title_dict = {}
    rel_pers_dict = {}
    institution_dict = {}
    place_dict = {}
    source_quotation_dict = {}
    count = 0

    # Extract pers_name
    pers_name_match = re.search(r'\[pers_name\] (.*?);', lines[0])
    pers_name = pers_name_match.group(1) if pers_name_match else 'n/a'

    # Extract source and source_quotation
    source_match = re.search(r'\[Source\] (.*)', person)
    if source_match:
        source = source_match.group(0).strip() # Remove source from person record
    else:
        source = 'n/a'

    for line in lines[1:-1]: # exclude first line (name) and last ine (source)
        line = str(line)
        count += 1

        # Extract event
        if count == 1:
            event_match = re.search(event_pattern, line)
            if event_match:
                event = event_match.group(1)
            else:
                event = "Geburt"
        elif count > 1:
            event_match = re.search(event_pattern, line)
            if event_match:
                event = event_match.group(1)
            else:
                event = "Funktionsausübung"

        # Extract date
        if event == 'Geburt':
            try:
                date_match = re.search(birth_date_pattern, line)
                if date_match:
                    date = date_match.group(1)
                else:
                    date = "n/a"   
            except AttributeError as e:
                print("Birthday pattern not found:", e)
        elif event != "Geburt":
            full_date_match = re.search(full_date_pattern, line)
            year_match = re.search(year_pattern, line)
            if full_date_match:
                date = full_date_match.group(0)
            elif year_match:
                date = year_match.group(0)
            else:
                date = "n/a"
        else:
            date = "n/a"

        # Extract pers_func
        pers_func_match = re.search(pers_func_pattern, line)
        pers_func = pers_func_match.group(1) if pers_func_match else "n/a"

        # Extract title
        title_match = re.search(r'\[title: (.*?)\]', line)
        title = title_match.group(1) if title_match else "n/a"

        # Extract institution
        if event == 'Studium' or "Graduation" or "Promotion" or "Prüfung":
            try:
                institution_match = re.search(institution_pattern, line)
                institution = institution_match.group(1) if institution_match else "n/a"
            except:
                institution = "Universität #"
        elif event == "Geburt":
            institution = "Geburtshaus #"
        elif event == "Tod":
            try:
                institution_match = re.search(institution_pattern, line)
                institution = institution_match.group(1) if institution_match else "n/a"
            except:
                institution = "Sterbehaus #"
        else:
            try:
                institution_match = re.search(institution_pattern, line)
                institution = institution_match.group(1) if institution_match else "n/a"
            except:
                institution = "n/a"

        # Extract place
        if event == 'Geburt':
            birth_place_match = re.search(birth_place_pattern, line)
            if birth_place_match:
                place = birth_place_match.group(0).strip() 
            elif "[" in line:
                place = re.sub(r'\[.*?\]', '', line)
            else:
                place = "#Geburtsort"
        else:
            # Iterate through list of places to check
            place_match = re.search(place_pattern, line)
            if place_match:
                place = place_match.group(1) 
            else:
                place = next((p for p in places_to_check if p in line), "[Mainz]")

        # Extract rel_pers
        rel_pers_match = re.search(rel_pers_pattern, line)
        rel_pers = rel_pers_match.group(1) if rel_pers_match else "n/a"
        
        ## STILL ADD HANDLING FOR #IDENTITAET AS ALTERNATIVE NAMES

        # Append data to dictionaries
        events_dict[event] = event
        dates_dict[event] = date
        pers_func_dict[event] = pers_func
        title_dict[event] = title
        institution_dict[event] = institution
        place_dict[event] = place
        rel_pers_dict[event] = rel_pers
        source_quotation_dict[event] = re.sub(r'\[.*?\]', '', line)
        
    # Append data to the list of rows
    for event, event_date in dates_dict.items():
        rows.append({
            'pers_name': pers_name,
            'event': event,
            'date': event_date,
            'pers_func': pers_func_dict[event],
            'title': title_dict[event],
            'institution': institution_dict[event],
            'place': place_dict[event],
            'rel_pers': rel_pers_dict[event],
            'source': source,
            'source_quotation': source_quotation_dict[event]
        })

# Create DataFrame from the list of rows
df = pd.DataFrame(rows)

# Fill empty cells with 'n/a'
df = df.fillna('n/a')

# Print DataFrame
display(df)

Unnamed: 0,pers_name,event,date,pers_func,title,institution,place,rel_pers,source,source_quotation
0,"ABEL, Joannes Caspar",Geburt,,,,,Francofurtensis,,"[Source] RPh 267r, Jung, Thesenblatt 1765);",Francofurtensis ;
1,"ABEL, Joannes Caspar",Graduation,1765-05-09,Absolvent,bacc.,"Universität Mainz, $ Fakultät",Mainz,,"[Source] RPh 267r, Jung, Thesenblatt 1765);",1765-05-09 bacc. phil. ;
2,"ABLEY (Abeley), Andreas",Geburt,,,,,Moguntinus [event: Geburt],Joannes Michael,[Source] RR I 30r;,"Moguntinus * E 1719-02-12, V: Joannes Michael;"
3,"ABLEY (Abeley), Andreas",Ernennung,1736-04-12,Träger Ehrentitel,poeta,,[Mainz],,[Source] RR I 30r;,A. 1736-04-12 poeta ;
4,"ABLEY (Abeley), Andreas",Stipendium,,Stipendiat,,,[Mainz],,[Source] RR I 30r;,paup. ;
...,...,...,...,...,...,...,...,...,...,...
35844,"WOLFF, Johannes Jacobus",Geburt,,,,,Bommersheimensis,,,Bommersheimensis ;
35845,"WOLFF, Johannes Jacobus",Funktionsausübung,1715-10-09,,,,[Mainz],Johann Adam Wolff,,"get. 1715-10-09, V: Johann Adam Wolff, M: Dor..."
35846,"WOLFF, Johannes Paulus",Geburt,,,,,Oberurseilanus,,"[Source] unbekannte Quelle,",Oberurseilanus ;
35847,"WOLFF, Johannes Paulus",Funktionsausübung,1758,,Lehrer,,Reifenberg,,"[Source] unbekannte Quelle,",1758-1811 Lehrer in Reifenberg;


In [25]:
!pip install xlsxwriter
import xlsxwriter

# save preliminary version

workbook='df_students_exported_v4.xlsx'
print(workbook)
writer = pd.ExcelWriter(workbook, engine='xlsxwriter') # create a Pandas Excel writer using XlsxWriter as the engine.
df.to_excel(writer, sheet_name='StudExp1') # Convert the dataframe to an XlsxWriter Excel object.
writer.save() # Close the Pandas Excel writer and output the Excel file.
print("Done.")

df_students_exported_v4.xlsx


  writer.save() # Close the Pandas Excel writer and output the Excel file.


Done.


In [None]:
# geocode dataframe

from factoids2_add_geodata import add_geodata
df_students_geocoded = add_geodata(df)

display(df_students_geocoded)

In [None]:
# save enriched df to DRIVE

workbook='df_students_geocoded.xlsx'
print(workbook)
writer = pd.ExcelWriter(workbook, engine='xlsxwriter') # create a Pandas Excel writer using XlsxWriter as the engine.
df_students_geocoded.to_excel(writer, sheet_name='FactCons1') # Convert the dataframe to an XlsxWriter Excel object.
writer.save() # Close the Pandas Excel writer and output the Excel file.
print("Done.")

In [None]:
## EXAMPLE OF PROBLEMS IN THE INPUT DATA
 
"""
#PERSON [pers_name] ACKERMANN, Franciscus; 
 Moguntinus [event: Geburt] 
 #IDENTITAET ein Franz Peter Aloys Judas Thaddäus * Q 8.12.1771,  V: Faktor Friedrich Edmund A.;
 jur. cand. 
 [Source] Prot. Univ. Justizsenat vom 27.11.1787, jur. ex acad. Mog. [Mainz];
 22.10.1789 in Göttingen imMatr. [Immatrikulation];
 # IDENTITAET e. Franz Josef A.;
 Kanzleiakzessist der Hofkammer 1794 und 1797;
 [Source] StAD Hdschr. 282/5a S.120, Matr. [Immatrikulation] Gött. Nr. 15 408, Hofkal. 1794 S.146, 1797 S.109;
"""