<a href="https://colab.research.google.com/github/hsandaver/hsandaver/blob/main/Manifest_Enricher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================== #
#      Enhanced Python Script    #
# ============================== #

# ==============================
# 1. Install Required Libraries
# ==============================

# Uncomment the following lines to install required libraries if not already installed.
# You can run this cell separately to install the dependencies.

# !pip install requests
# !pip install networkx
# !pip install plotly
# !pip install ipywidgets
# !pip install fuzzywuzzy
# !pip install python-Levenshtein  # Optional for improved fuzzy matching performance
# !pip install unidecode
# !pip install cachetools
# !pip install jsonschema

# ==============================
# 2. Import Necessary Libraries
# ==============================

from google.colab import output
output.enable_custom_widget_manager()

import copy
import html
import json
import logging
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse

import networkx as nx
import plotly.graph_objs as go
import plotly.io as pio
import requests
from cachetools import TTLCache, cached
from fuzzywuzzy import fuzz
from IPython.display import display, clear_output
from ipywidgets import (
    widgets, VBox, HBox, Layout, Button, Textarea, IntSlider, Output, HTML
)
from jsonschema import validate, ValidationError
from unidecode import unidecode

# ==============================
# 3. Configure Logging
# ==============================

# Root logger configuration: INFO and above, timestamped as
# "YYYY-MM-DD HH:MM:SS - LEVEL: message". Every function in this script logs
# through the root logger (logging.info / logging.error / logging.debug).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# ==============================
# 4. Set Plotly Renderer
# ==============================

# Make the Colab renderer the default for every figure shown by this script.
pio.renderers.default = 'colab'

# ==============================
# 5. Setup Caching
# ==============================

# Cache fetched data to avoid redundant network requests
# Cache size: 100 items, TTL: 1 hour
# NOTE(review): this single cache object is shared by every fetch function
# that references it; cachetools documents its caches as not thread-safe
# unless access is guarded by a lock.
cache = TTLCache(maxsize=100, ttl=3600)

# ==============================
# 6. Define JSON Schemas
# ==============================

# Minimal JSON Schemas for validation
# Shape a fetched IIIF manifest must have before it is enriched: an "@id", a
# "label" (string or object) and a "metadata" array of string label/value
# pairs. Downstream matching reads the metadata values as plain strings.
IIIF_MANIFEST_SCHEMA = {
    "type": "object",
    "properties": {
        "@id": {"type": "string"},
        "label": {"type": ["string", "object"]},
        "metadata": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "label": {"type": "string"},
                    "value": {"type": "string"}
                },
                "required": ["label", "value"]
            }
        }
    },
    "required": ["@id", "label", "metadata"]
}

# Shape a fetched linked-data entity must have. Only the identifying fields
# are required: extract_artist_info() already substitutes defaults for a
# missing altLabel, dateOfBirth, dateOfDeath or description, so requiring them
# here would reject otherwise usable entities (for example one with no death
# date). When present, those fields must still have the declared type.
LINKED_DATA_SCHEMA = {
    "type": "object",
    "properties": {
        # A JSON-LD context may be a single IRI, a list of contexts, or an
        # inline context object.
        "@context": {"type": ["string", "array", "object"]},
        "id": {"type": "string"},
        "prefLabel": {"type": "object"},
        "altLabel": {"type": "object"},
        "dateOfBirth": {"type": "array"},
        "dateOfDeath": {"type": "array"},
        "description": {"type": "object"}
    },
    "required": ["@context", "id", "prefLabel"]
}

# ==============================
# 7. Helper Functions
# ==============================

def validate_uri(uri):
    """
    Check that a URI is well-formed enough to attempt a network request.

    Args:
        uri (str): The candidate URI.

    Returns:
        bool: True when the URI parses and has both a scheme and a network
        location; False otherwise, including when parsing raises.
    """
    try:
        parsed = urlparse(uri)
    except Exception:
        # Anything urlparse cannot handle is treated as "not a valid URI".
        return False
    return bool(parsed.scheme and parsed.netloc)

def fetch_with_retries(session, uri, headers=None, max_retries=3, backoff_factor=0.3):
    """
    Fetch JSON from a URI, retrying transient HTTP failures.

    A retrying adapter is mounted on ``session`` for both http:// and
    https://, so the session keeps that retry policy after this call returns.

    Args:
        session (requests.Session): Session object for connection pooling.
        uri (str): The URI to fetch data from.
        headers (dict): HTTP headers to include in the request.
        max_retries (int): Maximum number of retries.
        backoff_factor (float): Backoff factor for retries.

    Returns:
        The decoded JSON body if the request succeeds and the body is valid
        JSON, else None. Failures are logged, never raised.
    """
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"]  # Changed from 'method_whitelist' to 'allowed_methods'
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)

    # Transport / HTTP-status failures are handled separately from body
    # decoding so that each is reported with the right message.
    try:
        response = session.get(uri, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed for {uri}: {e}")
        return None

    try:
        return response.json()
    except ValueError:
        # Every JSON decode error requests can raise (its own JSONDecodeError,
        # the stdlib one, or simplejson's) is a ValueError subclass.
        logging.error(f"Failed to decode JSON from {uri}")
        return None

# Guards this function's reads and writes of the shared cache, because the
# function may be called from several worker threads at once.
_LINKED_DATA_CACHE_LOCK = threading.Lock()


def fetch_linked_data(uri):
    """
    Fetch a linked data entity from a URI, with validation and caching.

    Only successfully fetched and validated entities are cached, so a
    transient failure is retried on the next call instead of being remembered
    for the cache's TTL. Cache keys carry a 'linked_data' prefix so they cannot
    collide with other data cached under the same URI.

    Args:
        uri (str): URI of the linked data entity.

    Returns:
        dict or None: The entity if it was fetched and matches
        LINKED_DATA_SCHEMA, else None. The returned dict is the cached object;
        callers should not mutate it.
    """
    if not validate_uri(uri):
        logging.error(f"Invalid URI: {uri}")
        return None

    cache_key = ('linked_data', uri)
    with _LINKED_DATA_CACHE_LOCK:
        cached_entity = cache.get(cache_key)
    if cached_entity is not None:
        return cached_entity

    with requests.Session() as session:
        data = fetch_with_retries(session, uri, headers={'Accept': 'application/ld+json'})
    if not data:
        return None

    try:
        validate(instance=data, schema=LINKED_DATA_SCHEMA)
    except ValidationError as ve:
        logging.error(f"Linked data from {uri} failed validation: {ve.message}")
        return None

    logging.info(f"Successfully fetched and validated linked data from {uri}")
    with _LINKED_DATA_CACHE_LOCK:
        cache[cache_key] = data
    return data

# Guards this function's reads and writes of the shared cache, because the
# function may be called from several worker threads at once.
_IIIF_MANIFEST_CACHE_LOCK = threading.Lock()


def fetch_iiif_manifest(uri):
    """
    Fetch an IIIF manifest from a URI, with validation and caching.

    Only successfully fetched and validated manifests are cached, so a
    transient failure is retried on the next call instead of being remembered
    for the cache's TTL. Cache keys carry an 'iiif_manifest' prefix so they
    cannot collide with other data cached under the same URI.

    Args:
        uri (str): URI of the IIIF manifest.

    Returns:
        dict or None: The manifest if it was fetched and matches
        IIIF_MANIFEST_SCHEMA, else None. The returned dict is the cached
        object; callers should not mutate it.
    """
    if not validate_uri(uri):
        logging.error(f"Invalid URI: {uri}")
        return None

    cache_key = ('iiif_manifest', uri)
    with _IIIF_MANIFEST_CACHE_LOCK:
        cached_manifest = cache.get(cache_key)
    if cached_manifest is not None:
        return cached_manifest

    with requests.Session() as session:
        data = fetch_with_retries(session, uri)
    if not data:
        return None

    try:
        validate(instance=data, schema=IIIF_MANIFEST_SCHEMA)
    except ValidationError as ve:
        logging.error(f"IIIF manifest from {uri} failed validation: {ve.message}")
        return None

    logging.info(f"Successfully fetched and validated IIIF manifest from {uri}")
    with _IIIF_MANIFEST_CACHE_LOCK:
        cache[cache_key] = data
    return data

def normalize_name(name):
    """
    Return a canonical form of a personal name for comparison.

    Steps, in order:
    - transliterate to ASCII (drops accents)
    - turn 'Last, First' into 'First Last' when the name holds exactly one comma
    - strip surrounding whitespace
    - lowercase
    """
    ascii_name = unidecode(name)
    # Exactly one comma is read as 'Last, First'; names with zero or several
    # commas are left in their original order.
    if ascii_name.count(',') == 1:
        surname, given = (piece.strip() for piece in ascii_name.split(','))
        ascii_name = f"{given} {surname}"
    return ascii_name.strip().lower()

def _first_text(value, default):
    """Collapse a string, a list of strings, or a missing value into one string."""
    if isinstance(value, (list, tuple)):
        value = value[0] if value else None
    if value is None:
        return default
    return value if isinstance(value, str) else str(value)


def _pick_localized(language_map, default):
    """Return the first English ('en*') value of a language map, else the first value, else default."""
    if not isinstance(language_map, dict):
        return default
    for language, value in language_map.items():
        if isinstance(language, str) and language.startswith('en'):
            return _first_text(value, default)
    for value in language_map.values():
        return _first_text(value, default)
    return default


def _extract_date(linked_data_entity, key):
    """Return the first date under ``key`` cut to its first 10 characters, or 'Unknown'."""
    entries = linked_data_entity.get(key) or []
    first_entry = entries[0] if isinstance(entries, (list, tuple)) else entries
    if isinstance(first_entry, dict):
        stamp = first_entry.get('time:inXSDDateTimeStamp')
        raw_date = stamp.get('@value') if isinstance(stamp, dict) else stamp
    else:
        # A bare string entry is used as the date itself.
        raw_date = first_entry
    if not isinstance(raw_date, str) or not raw_date:
        return 'Unknown'
    return raw_date[:10]


def extract_artist_info(linked_data_entity):
    """
    Extract artist information from a linked data entity.

    Every field is optional in the input: a missing, empty or unexpectedly
    shaped field yields the documented fallback instead of raising.

    Args:
        linked_data_entity (dict): The linked data JSON object.

    Returns:
        dict: Extracted artist information with keys
            'preferred_name' (normalized; 'unknown' if no prefLabel),
            'alternative_names' (list of normalized English altLabels),
            'date_of_birth' / 'date_of_death' (first 10 characters of the
            first date entry, or 'Unknown'),
            'description' (English if available, else any language, else
            'No description available.'),
            'id' (the entity's 'id', or 'Unknown').
    """
    artist_info = {}

    # Preferred name: first English label, falling back to any language.
    preferred_name = _pick_localized(linked_data_entity.get('prefLabel'), 'Unknown')
    artist_info['preferred_name'] = normalize_name(preferred_name)

    # Alternative names: every label stored under an English-related key.
    alt_labels = linked_data_entity.get('altLabel')
    alternative_names = []
    if isinstance(alt_labels, dict):
        for language, labels in alt_labels.items():
            if not (isinstance(language, str) and language.startswith('en')):
                continue
            if not isinstance(labels, (list, tuple)):
                labels = [labels]
            alternative_names.extend(
                normalize_name(label) for label in labels if isinstance(label, str)
            )
    artist_info['alternative_names'] = alternative_names

    artist_info['date_of_birth'] = _extract_date(linked_data_entity, 'dateOfBirth')
    artist_info['date_of_death'] = _extract_date(linked_data_entity, 'dateOfDeath')

    # Description: first English text, falling back to any language.
    artist_info['description'] = _pick_localized(
        linked_data_entity.get('description'), 'No description available.'
    )

    artist_info['id'] = linked_data_entity.get('id', 'Unknown')

    logging.debug(f"Extracted Artist Info: {artist_info}")
    return artist_info

def remove_duplicate_metadata(metadata_list):
    """
    Remove duplicate metadata entries based on 'label' and 'value'.

    Entries are compared case-insensitively on the string form of their label
    and value, so entries with a missing key or a non-string label/value
    (list, dict, number) are handled instead of raising. The first occurrence
    of each pair is kept and the original order is preserved.

    Args:
        metadata_list (list): List of metadata dictionaries.

    Returns:
        list: List with duplicates removed.
    """
    seen = set()
    unique_metadata = []
    for item in metadata_list:
        if not isinstance(item, dict):
            # Not a label/value entry; keep it untouched rather than guess.
            unique_metadata.append(item)
            continue
        key = (
            str(item.get('label', '')).lower(),
            str(item.get('value', '')).lower(),
        )
        if key not in seen:
            seen.add(key)
            unique_metadata.append(item)
    return unique_metadata

# Metadata labels (compared lowercased) whose value names the maker of a work.
_CREATOR_LABELS = ('creator', 'artist', 'author')


def _artist_metadata_entries(artist_info):
    """Build the 'Enriched: ...' metadata entries that describe one artist."""
    alternative_names = artist_info['alternative_names']
    return [
        {'label': 'Enriched: Artist Preferred Name', 'value': artist_info['preferred_name']},
        {'label': 'Enriched: Artist Alternative Names', 'value': ', '.join(alternative_names) if alternative_names else 'N/A'},
        {'label': 'Enriched: Artist Date of Birth', 'value': artist_info['date_of_birth']},
        {'label': 'Enriched: Artist Date of Death', 'value': artist_info['date_of_death']},
        {'label': 'Enriched: Artist Description', 'value': artist_info['description']},
        {'label': 'Enriched: Artist Linked Data ID', 'value': artist_info['id']}
    ]


def _creator_values(metadata):
    """Return the string values of the creator/artist/author entries of a metadata list."""
    values = []
    for item in metadata or []:
        if not isinstance(item, dict):
            continue
        label = item.get('label', '')
        value = item.get('value', '')
        if (isinstance(label, str) and isinstance(value, str)
                and label.strip().lower() in _CREATOR_LABELS):
            values.append(value)
    return values


def _match_creator(value, normalized_artist_names, fuzzy_threshold):
    """Return (is_exact, similarity) if ``value`` matches one of the names, else None."""
    normalized_value = normalize_name(value)
    if not normalized_value:
        return None
    if normalized_value in normalized_artist_names:
        return True, 100
    for name in normalized_artist_names:
        similarity = fuzz.partial_ratio(name, normalized_value)
        if similarity >= fuzzy_threshold:
            return False, similarity
    return None


def enrich_iiif_manifests(iiif_manifests, artists_info, fuzzy_threshold=85):
    """
    Enrich IIIF manifests with artist information.

    Each manifest is enriched with at most one artist: the first artist (in
    ``artists_info`` order) one of whose names matches a creator/artist/author
    metadata value, exactly after normalization or with a fuzzy partial-ratio
    score of at least ``fuzzy_threshold``. The input manifests are not
    modified; enriched deep copies are returned, so cached manifests stay
    pristine across runs.

    Args:
        iiif_manifests (list): List of IIIF manifest dictionaries.
        artists_info (list): List of artist information dictionaries.
        fuzzy_threshold (int): Threshold for fuzzy matching.

    Returns:
        list: Enriched IIIF manifests (copies), in input order. Manifests with
        no matching artist are returned as unchanged copies.
    """
    # Normalize every artist's names once instead of once per manifest.
    candidates = []
    for artist_info in artists_info:
        artist_names = [artist_info['preferred_name']] + artist_info['alternative_names']
        normalized_artist_names = [
            normalized for normalized in (normalize_name(name) for name in artist_names)
            if normalized  # an empty name would match any empty creator value
        ]
        candidates.append((artist_info, artist_names, normalized_artist_names))

    enriched_manifests = []
    for source_manifest in iiif_manifests:
        manifest = copy.deepcopy(source_manifest)
        manifest_id = manifest.get('@id', 'Unknown ID')
        manifest_label = manifest.get('label', 'Unknown Label')
        logging.info(f"Processing Manifest: {manifest_label} ({manifest_id})")

        creator_values = _creator_values(manifest.get('metadata'))

        # Find the first (artist, creator value) pair that matches, then stop:
        # a manifest is enriched with one artist only.
        match = None
        for artist_info, artist_names, normalized_artist_names in candidates:
            logging.debug(f"Checking Artist: {artist_info['preferred_name']} with names {artist_names}")
            for value in creator_values:
                outcome = _match_creator(value, normalized_artist_names, fuzzy_threshold)
                if outcome is not None:
                    match = (artist_info, value) + outcome
                    break
            if match is not None:
                break

        if match is not None:
            artist_info, value, is_exact, similarity = match
            if is_exact:
                logging.info(f"Exact match found: '{value}' matches artist '{artist_info['preferred_name']}'")
            else:
                logging.info(f"Fuzzy match found: '{value}' matches artist '{artist_info['preferred_name']}' with similarity {similarity}")

            manifest['metadata'] = remove_duplicate_metadata(
                list(manifest['metadata']) + _artist_metadata_entries(artist_info)
            )

            if is_exact:
                logging.info(f"Enriched manifest '{manifest_label}' with artist info '{artist_info['preferred_name']}'")
            else:
                logging.info(f"Enriched manifest '{manifest_label}' with artist info '{artist_info['preferred_name']}' via fuzzy matching")

        enriched_manifests.append(manifest)
    return enriched_manifests

def _manifest_node_label(label, default='Unknown'):
    """Turn a manifest label (string, object or list) into a hashable string usable as a node name."""
    if isinstance(label, str):
        return label
    if label is None:
        return default
    if isinstance(label, dict):
        candidates = [label['@value']] if '@value' in label else list(label.values())
    elif isinstance(label, (list, tuple)):
        candidates = list(label)
    else:
        return str(label)
    for candidate in candidates:
        text = _manifest_node_label(candidate, '')
        if text:
            return text
    return default


def create_relationship_graph(iiif_manifests, artists_info, fuzzy_threshold=85):
    """
    Create and display a relationship graph between artists and artworks.

    Artists become blue nodes (hover shows their description), manifests
    become orange nodes named after their label, and an edge links an artist
    to every manifest whose creator/artist/author metadata matches one of the
    artist's names, exactly or fuzzily.

    Args:
        iiif_manifests (list): List of enriched IIIF manifest dictionaries.
        artists_info (list): List of artist information dictionaries.
        fuzzy_threshold (int): Minimum partial-ratio score (0-100) for a fuzzy
            name match to create an edge. Defaults to 85.

    Returns:
        None. The figure is shown with the 'colab' renderer.
    """
    G = nx.Graph()

    # Add artist nodes
    for artist_info in artists_info:
        G.add_node(artist_info['preferred_name'], type='artist', description=artist_info['description'])
        logging.debug(f"Added artist node: {artist_info['preferred_name']}")

    # Add artwork nodes and edges
    for manifest in iiif_manifests:
        # A manifest label may be an object rather than a string; graph nodes
        # must be hashable, so reduce it to plain text first.
        manifest_label = _manifest_node_label(manifest.get('label', 'Unknown'))
        G.add_node(manifest_label, type='artwork')
        logging.debug(f"Added artwork node: {manifest_label}")

        # Collect all creators/artists from metadata
        creators = []
        for item in manifest.get('metadata', []):
            if not isinstance(item, dict):
                continue
            label = item.get('label', '')
            value = item.get('value', '')
            if not (isinstance(label, str) and isinstance(value, str)):
                continue
            if label.strip().lower() in ('creator', 'artist', 'author'):
                creators.append(normalize_name(value))

        # Create edges between each creator and the artwork
        for creator in creators:
            for artist_info in artists_info:
                artist_names = [artist_info['preferred_name']] + artist_info['alternative_names']
                normalized_artist_names = [name.lower() for name in artist_names]
                if creator in normalized_artist_names:
                    G.add_edge(artist_info['preferred_name'], manifest_label)
                    logging.debug(f"Created edge between '{artist_info['preferred_name']}' and '{manifest_label}' (Exact Match)")
                else:
                    # Fuzzy matching
                    for name in normalized_artist_names:
                        similarity = fuzz.partial_ratio(name, creator)
                        if similarity >= fuzzy_threshold:
                            G.add_edge(artist_info['preferred_name'], manifest_label)
                            logging.debug(f"Created edge between '{artist_info['preferred_name']}' and '{manifest_label}' (Fuzzy Match: {similarity})")
                            break  # Stop after first sufficient match

    # Remove edges between artists if any (shouldn't happen)
    for node1, node2 in list(G.edges()):
        if G.nodes[node1].get('type') == 'artist' and G.nodes[node2].get('type') == 'artist':
            G.remove_edge(node1, node2)
            logging.debug(f"Removed unintended edge between artists: {node1} - {node2}")

    # Layout
    pos = nx.kamada_kawai_layout(G)

    # Create edge traces; the None entries break the line between edges so
    # that all edges can be drawn as one Scatter trace.
    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=2, color='#888'),
        hoverinfo='none',
        mode='lines'
    )

    # Create node traces
    node_x = []
    node_y = []
    node_text = []
    node_color = []
    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_type = G.nodes[node].get('type', '')
        if node_type == 'artist':
            node_color.append('blue')
            hover_text = f"{node}<br>{G.nodes[node].get('description', '')}"
        else:
            node_color.append('orange')
            hover_text = node
        node_text.append(hover_text)
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=[node for node in G.nodes()],
        textposition='top center',
        hoverinfo='text',
        hovertext=node_text,
        marker=dict(
            color=node_color,
            size=20,
            line_width=2
        )
    )

    # Create the figure
    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title='Artist and Artwork Relationships',
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(
                showgrid=False,
                showticklabels=False,
                zeroline=False
            ),
            yaxis=dict(
                showgrid=False,
                showticklabels=False,
                zeroline=False
            )
        )
    )

    # Display the figure explicitly with the 'colab' renderer
    fig.show(renderer='colab')

def validate_json(data, schema):
    """
    Report whether JSON data conforms to a JSON schema.

    A validation failure is logged at error level rather than raised.

    Args:
        data (dict): JSON data to validate.
        schema (dict): JSON schema.

    Returns:
        bool: True if valid, False otherwise.
    """
    try:
        validate(instance=data, schema=schema)
    except ValidationError as validation_error:
        logging.error(f"JSON validation error: {validation_error.message}")
        return False
    else:
        return True

# ==============================
# 8. User Interface Components
# ==============================

# Textarea for IIIF Manifest URIs
# The widgets below are module-level globals: the click callback reads the two
# text areas and the slider, and process_uris() writes to progress_bar,
# summary_html and output_area.

# Left-hand text area: one IIIF manifest URI per line.
iiif_manifest_uris_input = Textarea(
    value='',
    placeholder='Enter IIIF Manifest URIs (one per line)',
    description='IIIF Manifests:',
    layout=Layout(width='48%', height='200px')
)

# Textarea for Linked Data Entity URIs
# (right-hand text area: one entity URI per line)
linked_data_uris_input = Textarea(
    value='',
    placeholder='Enter Linked Data Entity URIs (one per line)',
    description='Linked Data URIs:',
    layout=Layout(width='48%', height='200px')
)

# Slider for Fuzzy Matching Threshold
# Range 50-100 in steps of 5, default 85; continuous_update=False means the
# value only changes when the handle is released.
fuzzy_threshold_slider = IntSlider(
    value=85,
    min=50,
    max=100,
    step=5,
    description='Fuzzy Threshold:',
    continuous_update=False,
    readout=True,
    readout_format='d',
    layout=Layout(width='50%')
)

# Button to Start Processing
process_button = Button(
    description='Process',
    button_style='success',
    layout=Layout(width='200px')
)

# Output Area for Logs and Messages
# (used as a context manager by the callback and by process_uris)
output_area = Output()

# Progress Bar
# Holds a fraction between 0.0 and 1.0.
progress_bar = widgets.FloatProgress(
    value=0.0,
    min=0.0,
    max=1.0,
    step=0.01,
    description='Progress:',
    bar_style='info',
    layout=Layout(width='100%')
)

# Summary HTML
# Starts empty; process_uris() appends one <p> element per status message.
summary_html = HTML(
    value="",
    placeholder='',
    description='',
    layout=Layout(width='100%')
)

# Display the UI
# Layout, top to bottom: the two text areas side by side, slider + button,
# progress bar, status summary, log output.
display(
    VBox([
        HBox([iiif_manifest_uris_input, linked_data_uris_input]),
        HBox([fuzzy_threshold_slider, process_button]),
        progress_bar,
        summary_html,
        output_area
    ])
)

# ==============================
# 9. Main Processing Function
# ==============================

def fetch_data_concurrently(uris, fetch_function, max_workers=8):
    """
    Fetch several URIs in parallel and return the successful results.

    Args:
        uris (list): URIs to fetch.
        fetch_function (callable): Called with one URI; returns the fetched
            data, or None when the fetch failed.
        max_workers (int): Upper bound on the number of worker threads.

    Returns:
        list: The non-None results, in the same relative order as ``uris``.
        A URI whose fetch raises is logged and skipped.
    """
    uris = list(uris)
    if not uris:
        return []

    results = [None] * len(uris)
    worker_count = max(1, min(max_workers, len(uris)))
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        future_to_index = {
            executor.submit(fetch_function, uri): index
            for index, uri in enumerate(uris)
        }
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                # Store by input position so output order does not depend on
                # which request happens to finish first.
                results[index] = future.result()
            except Exception as exc:
                logging.error(f"Fetching {uris[index]} raised an unexpected error: {exc}")
    return [result for result in results if result is not None]


def process_uris(iiif_manifest_uris, linked_data_uris, fuzzy_threshold):
    """
    Main function to process IIIF manifests and Linked Data URIs.

    Runs six phases: fetch linked data, fetch manifests, extract artist
    information, enrich manifests, save them as enriched_manifest_<n>.json in
    the working directory, and draw the relationship graph. Progress and
    status messages go to the progress bar and summary widget; logs and the
    figure are rendered inside the output area. Errors are reported in the UI
    rather than raised.

    Args:
        iiif_manifest_uris (list): List of IIIF Manifest URIs.
        linked_data_uris (list): List of Linked Data Entity URIs.
        fuzzy_threshold (int): Threshold for fuzzy matching.
    """
    if not iiif_manifest_uris or not linked_data_uris:
        logging.error("Please enter at least one IIIF manifest URI and one linked data entity URI.")
        return

    # One progress increment per completed phase: fetch linked data, fetch
    # manifests, extract artists, enrich, save, visualize.
    total_steps = 6
    current_step = 0
    progress_bar.value = 0.0
    summary_html.value = ""

    def update_progress(step, message, advance=True):
        """Append a status line; move the bar only when a phase has completed."""
        nonlocal current_step
        if advance:
            current_step += 1
            progress_bar.value = min(current_step / total_steps, 1.0)
        # Messages are plain text; escape them so they cannot inject markup.
        summary_html.value += f"<p><strong>{html.escape(step)}:</strong> {html.escape(message)}</p>"

    with output_area:
        clear_output()
        try:
            # Step 1: Fetch Linked Data Entities
            update_progress(
                "Fetching Linked Data Entities",
                f"Starting to fetch {len(linked_data_uris)} linked data entities...",
                advance=False
            )
            linked_data_entities = fetch_data_concurrently(linked_data_uris, fetch_linked_data)
            logging.info(f"Fetched {len(linked_data_entities)} linked data entities.")
            update_progress(
                "Fetching Linked Data Entities",
                f"Successfully fetched {len(linked_data_entities)} linked data entities."
            )

            # Step 2: Fetch IIIF Manifests
            update_progress(
                "Fetching IIIF Manifests",
                f"Starting to fetch {len(iiif_manifest_uris)} IIIF manifests...",
                advance=False
            )
            iiif_manifests = fetch_data_concurrently(iiif_manifest_uris, fetch_iiif_manifest)
            logging.info(f"Fetched {len(iiif_manifests)} IIIF manifests.")
            update_progress(
                "Fetching IIIF Manifests",
                f"Successfully fetched {len(iiif_manifests)} IIIF manifests."
            )

            if not linked_data_entities:
                logging.error("No linked data entities fetched successfully. Aborting process.")
                update_progress("Error", "No linked data entities fetched successfully. Aborting process.", advance=False)
                return
            if not iiif_manifests:
                logging.error("No IIIF manifests fetched successfully. Aborting process.")
                update_progress("Error", "No IIIF manifests fetched successfully. Aborting process.", advance=False)
                return

            # Step 3: Extract Artist Information
            update_progress(
                "Extracting Artist Information",
                "Extracting artist information from linked data entities...",
                advance=False
            )
            artists_info = [extract_artist_info(entity) for entity in linked_data_entities]
            logging.info(f"Extracted information for {len(artists_info)} artists.")
            update_progress(
                "Extracting Artist Information",
                f"Extracted information for {len(artists_info)} artists."
            )

            # Step 4: Enrich IIIF Manifests
            update_progress(
                "Enriching IIIF Manifests",
                "Enriching IIIF manifests with artist information...",
                advance=False
            )
            enriched_manifests = enrich_iiif_manifests(iiif_manifests, artists_info, fuzzy_threshold)
            logging.info(f"Enriched {len(enriched_manifests)} IIIF manifests.")
            update_progress(
                "Enriching IIIF Manifests",
                f"Enriched {len(enriched_manifests)} IIIF manifests."
            )

            # Step 5: Save Enriched Manifests
            update_progress(
                "Saving Enriched Manifests",
                "Saving enriched IIIF manifests to JSON files...",
                advance=False
            )
            saved_count = 0
            for idx, manifest in enumerate(enriched_manifests):
                filename = f'enriched_manifest_{idx+1}.json'
                try:
                    with open(filename, 'w', encoding='utf-8') as file:
                        json.dump(manifest, file, ensure_ascii=False, indent=2)
                except (OSError, TypeError, ValueError) as e:
                    # One unwritable or unserializable manifest must not stop
                    # the others from being saved.
                    logging.error(f"Failed to save enriched manifest to {filename}: {e}")
                else:
                    saved_count += 1
                    logging.info(f"Enriched manifest saved to {filename}")
            update_progress(
                "Saving Enriched Manifests",
                f"Saved {saved_count} enriched IIIF manifests."
            )

            # Step 6: Create Visualization
            update_progress(
                "Creating Visualization",
                "Generating relationship graph...",
                advance=False
            )
            create_relationship_graph(enriched_manifests, artists_info)
            update_progress(
                "Creating Visualization",
                "Relationship graph generated successfully."
            )

            # Completion Message
            summary_html.value += "<p><strong>Process Completed Successfully!</strong></p>"

        except Exception as e:
            # Top-level boundary of a UI callback: report instead of raising,
            # but keep the traceback in the log output.
            logging.exception(f"An unexpected error occurred: {e}")
            summary_html.value += f"<p style='color:red;'><strong>Error:</strong> {html.escape(str(e))}</p>"

# ==============================
# 10. Link the Button to the Callback Function
# ==============================

def on_process_button_clicked(b):
    """
    Handle a click on the process button.

    Clears the output area, reads the URI lists and the fuzzy threshold from
    the input widgets, and hands them to process_uris().

    Args:
        b: The clicked button (unused).
    """
    def _entered_uris(textarea):
        # One URI per line; surrounding whitespace and blank lines are dropped.
        stripped_lines = (line.strip() for line in textarea.value.strip().split('\n'))
        return [line for line in stripped_lines if line]

    with output_area:
        clear_output()
        manifest_uris = _entered_uris(iiif_manifest_uris_input)
        entity_uris = _entered_uris(linked_data_uris_input)
        threshold = fuzzy_threshold_slider.value
        logging.info("Starting the processing of URIs...")
        process_uris(manifest_uris, entity_uris, threshold)


process_button.on_click(on_process_button_clicked)
