# Einleitung
Dieses Jupyter Notebook demonstriert, wie aufbereitete Daten des Historischen Grundbuchs der Stadt Basel durchsucht werden können.

Bemerkung: Je nach Suche, Suchbegriff und Suchzeitraum kann die Suche einige Zeit dauern.

# Importiere Packages
Installiere für dieses Notebook notwendige Packages.

In [None]:
import subprocess
import sys

# Requirements file used when the notebook runs outside of Google Colab.
FILENAME_REQUIREMENTS = 'requirements.txt'

# Detect Google Colab: the 'google.colab' package can only be imported there.
try:
    from google.colab import drive
    in_colab = True
except ImportError:
    in_colab = False

# Call pip through the interpreter of the running kernel. A bare 'pip' is
# resolved via PATH and may install into a different Python environment
# than the one this notebook is executed with.
PIP_INSTALL = [sys.executable, '-m', 'pip', 'install']

if in_colab:

    # Install the package 'thefuzz'.
    subprocess.run(PIP_INSTALL + ['thefuzz'], check=True)

else:

    # Install all packages listed in the locally available requirements.txt.
    subprocess.run(PIP_INSTALL + ['-r', FILENAME_REQUIREMENTS], check=True)

Importiere für dieses Skript notwendige Funktionen.

In [None]:
import zipfile
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from pyproj import Transformer
from ipyleaflet import Map, WMSLayer, Circle, Popup
import ipywidgets as widgets
from thefuzz import fuzz, process

# Definiere Arbeitsverzeichnis für Google Colab
Definiere Arbeitsverzeichnis in Google Colab zur Speicherung der Daten und Ergebnisse.

In [None]:
if in_colab:
    # Folder inside the personal Google Drive that is used for data and results.
    WORKING_DIRECTORY = '/content/drive/My Drive/Colab Notebooks/'

    # Make the personal Google Drive available below '/content/drive'.
    drive.mount('/content/drive')

# Bereite Datengrundlage auf
Lade und entpacke für die Suche verwendete Datengrundlage.

In [None]:
FILENAME_DATA = 'hgb_corpus_24_07_26_inline_full_updated.zip'
URL_DATA = 'https://github.com/history-unibas/FS2024-Research-Seminar-Demo-Notebook/raw/main/hgb_corpus_24_07_26_inline_full_updated.zip'

# The xml file is expected to carry the same base name as the zip archive.
filename_xml = os.path.splitext(FILENAME_DATA)[0] + '.xml'

if in_colab:

    path_zip = WORKING_DIRECTORY + FILENAME_DATA
    path_xml = WORKING_DIRECTORY + filename_xml

    # Decide on the extracted xml file, not on the archive: an archive that
    # was downloaded but never (completely) extracted is unpacked again.
    if not os.path.exists(path_xml):

        if not os.path.exists(path_zip):

            # Download the data in working directory.
            try:
                subprocess.run(['wget', URL_DATA, '-O', path_zip], check=True)
            except subprocess.CalledProcessError:
                # 'wget -O' may leave a partial file behind; remove it so
                # that the next run starts a fresh download.
                if os.path.exists(path_zip):
                    os.remove(path_zip)
                raise

        # Unzip the file.
        with zipfile.ZipFile(path_zip, 'r') as myzip:
            myzip.extractall(WORKING_DIRECTORY)

    # Update the filepath.
    FILENAME_DATA = path_xml

else:

    # Unzip the file (the archive is expected in the current directory).
    with zipfile.ZipFile(FILENAME_DATA, 'r') as myzip:
        myzip.extractall('.')

    # Update the filepath.
    FILENAME_DATA = filename_xml

# Suchen
In diesem Jupyter Notebook kann auf folgende drei Arten gesucht werden:
- Freitextsuche: Suche nach einem beliebigen Begriff in Paragraphen von Dokumenten.
- Entitätssuche: Suche nach annotierten Entitäten in Paragraphen von Dokumenten.
- Eventsuche: Suche nach annotierten Events in Paragraphen von Dokumenten.

Bei jeder Suche wird ein Suchzeitraum definiert.

## Freitextsuche
Beispiel: Suche nach dem Begriff 'wysung' in Paragraphen von Dokumenten im Zeitraum 1550 – 1600.

### Definiere Suchparameter
Festlegen der gewünschten Parameter für die Suche.

Die 'Fuzzy'-Suche ist eine fehlertolerante Suche.

Mit dem 'Verhältnis-Schwellenwert' wird definiert, wie ähnlich Suchresultate zum Suchbegriff sein sollen. Je kleiner dieser Wert, desto mehr Treffer werden generiert.

In [None]:
# Term that is looked for in the document texts.
SEARCH_KEYWORD = 'wysung'

# First and last year of the search period.
YEAR_MIN, YEAR_MAX = 1550, 1600

# True: search for the exact term. False: error-tolerant (fuzzy) search.
DO_EXACT_SEARCH = False

# Minimum similarity ratio (0 to 100) a hit needs in the fuzzy search.
RATIO_THRESHOLD = 80

### Suche ausführen

In [None]:
# Load xml file incrementally.
context = ET.iterparse(FILENAME_DATA, events=('start', 'end'))

# Columns of the table that stores the search results.
RESULT_COLUMNS = ['id', 'house', 'coord_x', 'coord_y',
                  'pages', 'year', 'text', 'imagelinks',
                  'keyword', 'ratio'
                  ]

# Collect the hits in a plain list and build the dataframe once at the end.
# Appending row by row with 'results.loc[len(results)]' gets slower with
# every additional hit.
rows = []

# Iterate over each element.
collection_element = None
for event, elem in context:

    # The start event is triggered when the parser encounters the opening tag
    # of an element. Its attributes are already available at that point.
    if event == 'start':
        if elem.tag == 'Collection':

            # Store current collection element.
            collection_element = elem
        continue

    # The end event is triggered when the parser encounters the closing tag of
    # an element. Only completely parsed documents are evaluated.
    if elem.tag != 'Document':
        continue

    # Get the header element and determine the year of the document.
    # Documents without header or without a usable year cannot be assigned
    # to the search period and are skipped instead of raising an error.
    header = elem.find('Header')
    try:
        year = int(header.get('year'))
    except (AttributeError, TypeError, ValueError):
        year = None

    # Only consider documents within the desired search period.
    if year is not None and YEAR_MIN <= year <= YEAR_MAX:

        # Get the text attribute. Documents without text cannot match.
        text = header.get('text')
        match = None

        if text:
            if DO_EXACT_SEARCH:
                if SEARCH_KEYWORD in text:
                    match = (SEARCH_KEYWORD, None)
            else:
                # Perform a fuzzy search over the whitespace-separated tokens.
                # The result is None if no token reaches the threshold.
                best_match = process.extractOne(
                    SEARCH_KEYWORD, text.split(),
                    score_cutoff=RATIO_THRESHOLD,
                    scorer=fuzz.ratio
                    )
                if best_match is not None:
                    match = (best_match[0], best_match[1])

        if match is not None:
            keyword, ratio = match

            # Store search result.
            rows.append([
                collection_element.attrib['id'],
                collection_element.attrib['house'],
                collection_element.attrib['coord_x'],
                collection_element.attrib['coord_y'],
                header.get('pages'),
                year,
                text,
                header.get('imagelinks'),
                keyword,
                ratio
                ])

    # Release the content of the processed document. iterparse keeps every
    # parsed element in the tree otherwise, so memory grows with the corpus.
    elem.clear()

# Create the dataframe with the search results.
results = pd.DataFrame(rows, columns=RESULT_COLUMNS)

# Print the number of search results.
print(f'The keyword "{SEARCH_KEYWORD}" was found in {results.shape[0]} documents '
      f'in the period between {YEAR_MIN} and {YEAR_MAX}.'
      )

## Entitätssuche
Beispiel: Suche nach dem Begriff "Fischer" in annotierten Entitäten des Subtyps "occ" in Paragraphen von Dokumenten im Zeitraum 1550 – 1600.

### Suchparameter

In [None]:
# Term that is looked for in the head text of the annotated entities.
SEARCH_KEYWORD = 'Fischer'

# Only entities with this mention subtype are considered.
MENTION_SUBTYPE = 'occ'

# First and last year of the search period.
YEAR_MIN, YEAR_MAX = 1550, 1600

# True: search for the exact term. False: error-tolerant (fuzzy) search.
DO_EXACT_SEARCH = False

# Minimum similarity ratio (0 to 100) a hit needs in the fuzzy search.
RATIO_THRESHOLD = 80

### Suche ausführen

In [None]:
# Load xml file incrementally.
context = ET.iterparse(FILENAME_DATA, events=('start', 'end'))

# Columns of the table that stores the search results.
RESULT_COLUMNS = ['id', 'house', 'coord_x', 'coord_y',
                  'pages', 'year', 'text', 'imagelinks',
                  'keyword', 'ratio',
                  'head', 'confidence'
                  ]

# Path to all annotated entities of the requested mention subtype.
attribute_path = f'./Body//Attribute[@mention_subtype="{MENTION_SUBTYPE}"]'

# Collect the hits in a plain list and build the dataframe once at the end.
# Appending row by row with 'results.loc[len(results)]' gets slower with
# every additional hit.
rows = []

# Iterate over each element.
collection_element = None
for event, elem in context:

    # Attributes of an element are already available at its start event.
    if event == 'start':
        if elem.tag == 'Collection':

            # Store current collection element.
            collection_element = elem
        continue

    # Only completely parsed documents are evaluated.
    if elem.tag != 'Document':
        continue

    # Get the header element and determine the year of the document.
    # Documents without header or without a usable year cannot be assigned
    # to the search period and are skipped instead of raising an error.
    header = elem.find('Header')
    try:
        year = int(header.get('year'))
    except (AttributeError, TypeError, ValueError):
        year = None

    # Only consider documents within the desired search period.
    if year is not None and YEAR_MIN <= year <= YEAR_MAX:

        # Iterate over all matching attributes of the document.
        for attribute in elem.findall(attribute_path):

            # Get the head text. The head element itself may be missing,
            # therefore check it before accessing its text.
            head = attribute.find('.//Head')
            head_text = head.text if head is not None else None

            # Skip attribute with no head text.
            if not head_text:
                continue

            if DO_EXACT_SEARCH:

                # Continue if keyword not in text.
                if SEARCH_KEYWORD not in head_text:
                    continue
                keyword, ratio = SEARCH_KEYWORD, None

            else:
                # Perform a fuzzy search over the whitespace-separated tokens.
                best_match = process.extractOne(
                    SEARCH_KEYWORD, head_text.split(),
                    score_cutoff=RATIO_THRESHOLD,
                    scorer=fuzz.ratio
                    )

                # Continue if no token reaches the threshold.
                if best_match is None:
                    continue
                keyword, ratio = best_match[0], best_match[1]

            # Store search result.
            rows.append([
                collection_element.attrib['id'],
                collection_element.attrib['house'],
                collection_element.attrib['coord_x'],
                collection_element.attrib['coord_y'],
                header.get('pages'),
                year,
                header.get('text'),
                header.get('imagelinks'),
                keyword,
                ratio,
                head_text,
                attribute.attrib.get('confidence')
                ])

    # Release the content of the processed document. iterparse keeps every
    # parsed element in the tree otherwise, so memory grows with the corpus.
    elem.clear()

# Create the dataframe with the search results.
results = pd.DataFrame(rows, columns=RESULT_COLUMNS)

# Print the number of search results.
print(f'The keyword "{SEARCH_KEYWORD}" was found in {results.shape[0]} entities of '
      f'entity mention subtype "{MENTION_SUBTYPE}" in documents from the period {YEAR_MIN} to {YEAR_MAX}.'
      )

## Eventsuche
Beispiel: Suche nach Fröhnungsdokumenten in Dokumenten im Zeitraum 1550 – 1600.

### Suchparameter

In [None]:
# Only events of this type are collected.
EVENT_TYPE = 'seizure'

# First and last year of the search period.
YEAR_MIN, YEAR_MAX = 1550, 1600

### Suche ausführen

In [None]:
# Load xml file incrementally.
context = ET.iterparse(FILENAME_DATA, events=('start', 'end'))

# Columns of the table that stores the search results.
RESULT_COLUMNS = ['id', 'house', 'coord_x', 'coord_y',
                  'pages', 'year', 'text', 'imagelinks',
                  'keyword',
                  'trigger_text', 'confidence'
                  ]

# Path to all annotated events of the requested type.
event_path = f'./Standoff/Events//Event[@type="{EVENT_TYPE}"]'

# Collect the hits in a plain list and build the dataframe once at the end.
# Appending row by row with 'results.loc[len(results)]' gets slower with
# every additional hit.
rows = []

# Iterate over each element.
collection_element = None
for event, elem in context:

    # Attributes of an element are already available at its start event.
    if event == 'start':
        if elem.tag == 'Collection':

            # Store current collection element.
            collection_element = elem
        continue

    # Only completely parsed documents are evaluated.
    if elem.tag != 'Document':
        continue

    # Get the header element and determine the year of the document.
    # Documents without header or without a usable year cannot be assigned
    # to the search period and are skipped instead of raising an error.
    header = elem.find('Header')
    try:
        year = int(header.get('year'))
    except (AttributeError, TypeError, ValueError):
        year = None

    # Only consider documents within the desired search period.
    if year is not None and YEAR_MIN <= year <= YEAR_MAX:

        # Iterate over all matching events of the document.
        for doc_event in elem.findall(event_path):

            # Get the trigger text and confidence. An event without trigger
            # element is kept, with both values set to None.
            trigger = doc_event.find('.//Trigger')
            if trigger is None:
                trigger_text = None
                confidence = None
            else:
                trigger_text = trigger.attrib.get('text')
                confidence = trigger.attrib.get('confidence')

            # Store selected elements. The trigger text is also stored as
            # 'keyword', the column used to highlight hits in the result table.
            rows.append([
                collection_element.attrib['id'],
                collection_element.attrib['house'],
                collection_element.attrib['coord_x'],
                collection_element.attrib['coord_y'],
                header.get('pages'),
                year,
                header.get('text'),
                header.get('imagelinks'),
                trigger_text,
                trigger_text,
                confidence
                ])

    # Release the content of the processed document. iterparse keeps every
    # parsed element in the tree otherwise, so memory grows with the corpus.
    elem.clear()

# Create the dataframe with the search results.
results = pd.DataFrame(rows, columns=RESULT_COLUMNS)

# Print the number of search results.
print(f'The event type "{EVENT_TYPE}" was found in {results.shape[0]} events '
      f'in documents from the period {YEAR_MIN} to {YEAR_MAX}.'
      )

# Ergebnisse in Tabelle darstellen
Die Suchergebnisse werden in einer Tabelle dargestellt. Der Suchbegriff wird im Text hervorgehoben und das zugehörige Digitalisat zur Verfügung gestellt.

In diesem Notebook werden maximal vier Resultate dargestellt. Alle Suchresultate werden in einer HTML-Datei im Arbeitsverzeichnis gespeichert.

In [None]:
# Name of the html file that receives the complete result table.
FILENAME_HTML = 'search_results.html'

# Keep only the columns needed for the table. Work on a copy so that the
# search results themselves stay untouched.
display_columns = ['id', 'house', 'pages', 'year', 'text', 'imagelinks', 'keyword']
filtered_results = results.loc[:, display_columns].copy()

# Define the integration of the images in the table.
def make_clickable_images(val):
    """Turn a ' | '-separated list of image urls into clickable thumbnails.

    Args:
        val: String with one or more image urls separated by ' | '.

    Returns:
        String with one linked ``<img>`` tag per url, joined by a space.
        An empty string is returned for missing values (None, NaN) and
        for values that contain no url.
    """
    import html

    # Missing image links must not break the table.
    if not isinstance(val, str):
        return ''

    # Ignore empty entries, they would be rendered as broken images.
    urls = [url.strip() for url in val.split(" | ")]
    img_tags = []
    for url in urls:
        if not url:
            continue
        # Escape the url as it is written into html attributes.
        safe_url = html.escape(url, quote=True)
        img_tags.append(
            f'<a href="{safe_url}" target="_blank"><img src="{safe_url}" width="100" /></a>'
        )
    return " ".join(img_tags)

# Replace every cell of image links by its html representation.
filtered_results['imagelinks'] = filtered_results['imagelinks'].map(make_clickable_images)

# Highlight the search word.
def highlight_keyword(keyword, val):
    """Mark every occurrence of the keyword in a text in red.

    Args:
        keyword: Term to highlight. May be None or empty.
        val: Text in which the keyword is highlighted.

    Returns:
        The text with every occurrence of the keyword wrapped in a red
        ``<span>``. The text is returned unchanged if there is nothing to
        highlight. A missing (non-string) text yields an empty string.
    """
    # A missing text is displayed as an empty cell.
    if not isinstance(val, str):
        return ''

    # A missing keyword cannot be looked up. An empty keyword would match
    # between all characters and wrap each gap in a span.
    if not isinstance(keyword, str) or not keyword:
        return val

    return val.replace(keyword, f'<span style="color: red;">{keyword}</span>')

# Highlight the keyword of each row in its text. A list comprehension is used
# instead of DataFrame.apply(axis=1): for an empty table apply returns a
# dataframe, which cannot be assigned to a single column.
filtered_results['text'] = [
    highlight_keyword(keyword, text)
    for keyword, text in zip(filtered_results['keyword'], filtered_results['text'])
]

# The keyword was only needed for the highlighting.
filtered_results.drop(columns=['keyword'], inplace=True)

# Convert line breaks for the html display.
def clean_pagebreaks(val):
    """Replace the newline characters of a text by html line breaks.

    Args:
        val: Text to convert.

    Returns:
        The text with every newline character replaced by ``<br>``. A missing
        (non-string) text yields an empty string.
    """
    # A missing text is displayed as an empty cell.
    if not isinstance(val, str):
        return ''
    return val.replace('\n', '<br>')

# Convert the line breaks of every text for the html display.
filtered_results['text'] = filtered_results['text'].map(clean_pagebreaks)

# Render the dataframe as HTML. The column width limit of pandas is lifted
# while rendering: to_html shortens cell contents to 'display.max_colwidth'
# characters, which cuts off the texts and breaks the image tags.
with pd.option_context('display.max_colwidth', None):

    # Table with all search results for the export.
    html = filtered_results.to_html(escape=False)

    # Table with a maximum of four results for the display in the notebook.
    html_preview = filtered_results.to_html(escape=False, max_rows=4)

# Export all search results as html file. The file is written to the working
# directory in Google Colab, otherwise to the current directory.
path_html = WORKING_DIRECTORY + FILENAME_HTML if in_colab else FILENAME_HTML
with open(path_html, 'w', encoding='utf-8') as file:
    # Declare the encoding, so that browsers display special characters
    # of the utf-8 encoded file correctly.
    file.write('<meta charset="utf-8">\n')
    file.write(html)

# Display a maximum of four results in a table.
display(HTML(html_preview))

# Histogramm der Resultate über die Zeit
Die Verteilung der Suchresultate über den Suchzeitraum wird in einem Histogramm visualisiert.

In [None]:
# Years of all search results as numbers.
years = pd.to_numeric(results['year'])

if years.empty:

    # Without search results there is nothing to plot.
    print('No search results available for the histogram.')

else:
    first_year = int(years.min())
    last_year = int(years.max())

    # Define the appearance of the histogram.
    plt.figure(figsize=(10, 6))
    n, bins, patches = plt.hist(
        years,
        # One bin per year. The bin edges have to run up to last_year + 1:
        # with range(first_year, last_year) the results of the last year are
        # dropped and a search with results from a single year fails.
        bins=range(first_year, last_year + 2),
        edgecolor='black'
        )
    plt.title('Histogram')
    plt.xlabel('Year')
    plt.ylabel('Frequency')
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Draw the grid behind the bars.
    plt.gca().set_axisbelow(True)

    # Use integer ticks, as the frequencies are counts.
    plt.yticks(np.arange(0, np.max(n) + 1, 1))

    # Display the plot.
    plt.show()

# Visualisiere Ergebnisse im Raum
Die Suchresultate werden als Punkte auf dem "Loeffel"-Plan dargestellt.

In [None]:
# Create the transformer from EPSG:2056 to EPSG:4326 only once. Building it
# anew for every search result is needlessly slow.
COORD_TRANSFORMER = Transformer.from_crs(2056, 4326)


# Transform the coordinates of the search results.
def transform_coords(x, y):
    """Transform a coordinate pair from EPSG:2056 to EPSG:4326.

    Args:
        x: First coordinate in EPSG:2056 ('coord_x' of a search result).
        y: Second coordinate in EPSG:2056 ('coord_y' of a search result).

    Returns:
        Tuple with the transformed coordinates, used below as (lat, lon).
    """
    return COORD_TRANSFORMER.transform(x, y)


if results.empty:

    # Without search results there is nothing to transform. Create the
    # columns nevertheless, so that the empty map can still be displayed.
    results['lat'] = pd.Series(dtype='float64')
    results['lon'] = pd.Series(dtype='float64')

else:
    results['lat'], results['lon'] = zip(*(
        transform_coords(x, y)
        for x, y in zip(results['coord_x'], results['coord_y'])
        ))

# Define the basemap.
wms = WMSLayer(
    url='https://wms.geo.bs.ch/',
    layers='HP_Situationsplan_Basel_1862',
    attribution='Geodaten Kanton Basel-Stadt'
)
m = Map(basemap=wms, center=(47.557, 7.595), zoom=14)

# Add the search results to the map.
for _, row in results.iterrows():

    # Create Circle.
    circle = Circle(location=(row['lat'], row['lon']),
                    radius=3,
                    color='blue',
                    fill_color='blue')

    # Add pop-up to display attributes of the circle.
    popup_content = f"""ID: {row['id']}<br>
    House: {row['house']}<br>
    Pages: {row['pages']}<br>
    Year: {row['year']}"""
    popup = Popup(location=(row['lat'], row['lon']), child=widgets.HTML(popup_content), close_button=True)
    circle.popup = popup

    # Add the circle to the map.
    m.add_layer(circle)

#  Display the map.
m

# Suchresultate exportieren
Exportiere Suchergebnisse als Exceltabelle ins Arbeitsverzeichnis.

In [None]:
# Name of the excel file that receives the search results.
FILENAME_EXCEL = 'search_results.xlsx'

# In Google Colab the file is written to the working directory, otherwise
# to the current directory.
excel_target = WORKING_DIRECTORY + FILENAME_EXCEL if in_colab else FILENAME_EXCEL

# Write all search results without the row index.
results.to_excel(excel_target, index=False)

print('Search results were exported.')