In [64]:
from openai import OpenAI

client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama',
)

In [65]:
from neo4j import GraphDatabase

# Define connection details
URI = "bolt://localhost:7687"  # Neo4j Bolt connection
USERNAME = "neo4j"
PASSWORD = "password"  # Replace with the password you set

# Create a Neo4j Driver instance
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

# Define a function to test the connection
def test_connection():
    with driver.session() as session:
        result = session.run("RETURN 'Connected to Neo4j' AS message")
        for record in result:
            print(record["message"])

# Run test
test_connection()

# Close the driver when done
#driver.close()

Connected to Neo4j


In [66]:
import pandas as pd

# Define the file path
file_path = "IPR Avezzano Indexes 1809-1865.xlsx"

# Load the specific sheet "Marriages 1821 - 1845"
df = pd.read_excel(file_path, sheet_name="Marriages 1821-1845").head(5)

In [67]:
df.head()

Unnamed: 0,Record #,Last name - Groom,First name - Groom,Father of Groom,Mother of Groom,Last name - Bride,First name - Bride,Father of Bride,Mother of Bride,Date,Year,Comune Groom,Comune Bride,Notes
0,3,Iannotti,Ascenzo,Giuseppe,Teresa Buttari aka BUTTA,de Santis,Anna Berardina,Nicola,Maddalena del Rosso (MADDA),2020-07-30,1821,,,It was his 3rd marriage
1,3,Iannotti,Shablo,Sereno,Teresa Buttari,Pozzi,Alessandra,Mario Giordano,Domenica Colella,2020-07-30,1821,,,
2,3,Iannotti,Daniel,Giuseppe,Teresa Buta,Postolo,Marianna Rua,Remigio,Domenica Colella,2020-07-30,1820,,Ferrano,
3,3,Iannotti,Diablo,Giuseppe,Teresa Butta,Mac Allister,Sarah Veronica Maria Ronsuarda,Remigio,Domenica Colella,2020-07-30,1822,,Ferrano,
4,3,Iannotti,Vincenzo,Sereno,Teresa Buttari,Troiana,Moana,Remigio,Domenica Colella,2020-07-30,1821,,,"Named orphan as Giuseppe Proietto, later he we..."


# Regex (fu, alias)

In [68]:
import re
import pandas as pd

def extract_alias(name):
    """
    ✅ Extracts alias name from either:
    - Brackets `()`, like "Melchior(Melchiorre)"
    - "aka", like "Marianna aka Nenna de Santis"
    
    Returns:
    - (cleaned name, extracted alias)
    """
    if not isinstance(name, str) or name.strip() == "":
        return name, ""  # ✅ Return unchanged name, no alias

    alias = ""

    # ✅ Extract aka alias (e.g., "Marianna aka Nenna de Santis")
    aka_match = re.search(r"aka\s+([\w\s]+)", name, re.IGNORECASE)
    if aka_match:
        alias = aka_match.group(1).strip()  # ✅ Extract aka alias
        name = re.sub(r"aka\s+[\w\s]+", "", name, flags=re.IGNORECASE).strip()  # ✅ Remove "aka" part

    # ✅ Extract alias from brackets `()`
    bracket_match = re.search(r"\(([^)]*?)\)", name)  # Extract content inside brackets
    if bracket_match:
        alias = bracket_match.group(1).strip()  # ✅ Extract alias name
        name = re.sub(r"\s*\(\s*[^)]*?\s*\)\s*", "", name).strip()  # ✅ Remove brackets & alias

    return name.strip(), alias.strip()

def format_proper_case(name):
    """
    ✅ Converts all words to start with a capital letter.
    - Fixes "di", "del", "de".
    - Removes "fu" / "fù" (even if attached like "Gabrielefu").
    - Extracts alias names from brackets `()` and "aka".
    - Detects if the person was deceased.
    """
    if not isinstance(name, str) or name.strip() == "":
        return name, "", ""  # ✅ Return unchanged name, empty deceased, no alias

    # ✅ Detect "fu" / "fù" at the start OR attached to the name
    deceased = ""
    name = name.strip()

    match = re.match(r"^(fu|fù)\s*", name, re.IGNORECASE)  # Match "fu " or "fù " at the start
    match_attached = re.search(r"(fu|fù)$", name, re.IGNORECASE)  # Match "fu" or "fù" at the end

    if match:
        deceased = "before {date}"  # 🔹 Placeholder (updated later)
        name = name[len(match.group(0)):].strip()  # ✅ Remove "fu"/"fù" at the start

    elif match_attached:
        deceased = "before {date}"  # 🔹 Placeholder (updated later)
        name = name[:match_attached.start()].strip()  # ✅ Remove "fu"/"fù" at the end

    # ✅ Extract Alias Name (Handles both aka and brackets)
    name, alias = extract_alias(name)

    # ✅ Fix "di", "del", "de" capitalization
    name = re.sub(r"\b(di|del|de)(?=\s|\b)", lambda x: x.group(1).capitalize(), name, flags=re.IGNORECASE)

    # ✅ Capitalize all words
    name = " ".join(word.capitalize() for word in name.split())

    return name.strip(), alias.strip(), deceased.strip()  # ✅ Return formatted name, alias, deceased status


def format_deceased_status(name, date, year):
    """
    ✅ If "fu" / "fù" is present, mark person as deceased **before the marriage date**.
    ✅ Extracts alias if present in `()` or "aka".
    ✅ Fixes Timestamp issue by formatting the date properly.
    """
    formatted_name, alias, deceased = format_proper_case(name)  # ✅ Format name, detect "fu", extract alias

    # ✅ Ensure date is properly converted to a string in "09 Dec" format
    if deceased and pd.notna(date) and pd.notna(year):
        date_str = date.strftime("%d %b") if isinstance(date, pd.Timestamp) else str(date).strip()
        year_str = str(year).strip()
        deceased = f"before {date_str} {year_str}"  # ✅ Correct format: "before 09 Dec 1821"

    return formatted_name, alias, deceased  # ✅ Returns three values

In [69]:
import pandas as pd

# ✅ Work on a **COPY** of df (DO NOT modify the original)
df_corrected = df.copy()

# ✅ Strip spaces from column names
df_corrected.columns = df_corrected.columns.str.strip()

# ✅ Define columns that get Alias + Deceased handling
columns_with_alias_deceased = [
    "Father of Groom", "Mother of Groom",
    "Father of Bride", "Mother of Bride"
]

# ✅ Define columns for Groom & Bride to get Alias (but NO Deceased)
columns_with_alias = [
    "Last name - Groom", "First name - Groom",
    "Last name - Bride", "First name - Bride"
]

# ✅ Apply transformations for Parents (Alias + Deceased)
for col in columns_with_alias_deceased:
    if col in df_corrected.columns:
        df_corrected[[col, f"Alias - {col}", f"Deceased - {col}"]] = df_corrected.apply(
            lambda row: format_deceased_status(row[col], row["Date"], row["Year"]), axis=1, result_type="expand"
        )

# ✅ Apply transformations for Groom & Bride (Alias ONLY)
for col in columns_with_alias:
    if col in df_corrected.columns:
        df_corrected[[col, f"Alias - {col}"]] = df_corrected.apply(
            lambda row: format_proper_case(row[col])[:2], axis=1, result_type="expand"
        )

# ✅ Apply Special Case for **Father's First Name Alias & Last Name Alias** (inherits Groom/Bride last name)
for role in ["Father of Groom", "Father of Bride"]:
    if role in df_corrected.columns:
        df_corrected[f"Alias - {role} First Name"] = df_corrected[f"Alias - {role}"]  # ✅ First name alias
        df_corrected[f"Alias - {role} Last Name"] = df_corrected["Last name - " + role.split(" ")[-1]]  # ✅ Inherit last name

# ✅ Apply Special Case for **Mother's Alias** (Split into Firstname & Lastname)
for role in ["Mother of Groom", "Mother of Bride"]:
    alias_col = f"Alias - {role}"
    alias_firstname_col = f"Alias - {role} First Name"
    alias_lastname_col = f"Alias - {role} Last Name"

# ✅ REMOVE COLUMNS: "Alias - Father of Groom" & "Alias - Father of Bride"
df_corrected.drop(columns=["Alias - Father of Groom", "Alias - Father of Bride"], inplace=True, errors="ignore")

# ✅ **NEW: If Alias - Father Last Name is identical to Groom/Bride Last Name, REMOVE VALUE**
for role in ["Groom", "Bride"]:
    father_alias_col = f"Alias - Father of {role} Last Name"
    father_lastname_col = f"Last name - {role}"

    if father_alias_col in df_corrected.columns and father_lastname_col in df_corrected.columns:
        df_corrected[father_alias_col] = df_corrected.apply(
            lambda row: "" if row[father_alias_col] == row[father_lastname_col] else row[father_alias_col], axis=1
        )

# ✅ **NEW RULE: If Alias - Last Name - Groom/Bride is populated, fill Alias - Father Last Name**
for role in ["Groom", "Bride"]:
    alias_last_name_col = f"Alias - Last name - {role}"
    alias_father_last_name_col = f"Alias - Father of {role} Last Name"

    if alias_last_name_col in df_corrected.columns and alias_father_last_name_col in df_corrected.columns:
        df_corrected[alias_father_last_name_col] = df_corrected.apply(
            lambda row: row[alias_last_name_col] if pd.notna(row[alias_last_name_col]) and row[alias_last_name_col] != "" else row[alias_father_last_name_col],
            axis=1
        )

# ✅ Save Corrected Data (Safe Copy)
df_corrected.to_csv("corrected_marriage_records.csv", index=False)

print("✅ Successfully formatted names, extracted aka aliases, handled 'fu', split mother aliases, and saved deceased statuses!")

✅ Successfully formatted names, extracted aka aliases, handled 'fu', split mother aliases, and saved deceased statuses!


# Check first name and last name for mothers with LLM

In [70]:
from openai import OpenAI
import pandas as pd
import json
from pydantic import BaseModel

# ✅ Setup OpenAI LLM
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

# ✅ Define Pydantic Model for Structured Response
class NameSplit(BaseModel):
    firstname: str
    lastname: str

# ✅ Function to Normalize Capitalization
def normalize_name(name):
    """
    Ensures that names follow simple capitalization:
    - First letter uppercase, rest lowercase.
    """
    if not isinstance(name, str) or name.strip() == "":
        return ""
    
    return " ".join([word.capitalize() for word in name.strip().split()])

# ✅ Function to Generate LLM Prompt (With Context for Single-Word Names)
def generate_name_split_prompt(full_name, mother_name=""):
    """
    Generates a structured prompt for the LLM to split names.
    - Ensures **correct first name and last name extraction**.
    - Uses structured JSON output via Pydantic.
    - Adds context when only one name is present.
    """
    context_info = f"\n- The full name of the mother in the records is: **{mother_name}**." if mother_name else ""

    # ✅ Print Debug Info
    print(f"🔍 Debug: Context Provided for '{full_name}': {context_info}")
    print("")

    return f"""
    You are an expert in **historical genealogical records** in Italian.
    Your task is to **correctly split Italian full names into:**
    
    - **First Name:** The complete first name (including double names if present).
    - **Last Name:** The proper last name (with correct capitalization).

    📌 **Rules:**
    - **Do not change spelling** of any names.
    - **If only one word is present, use the provided context to determine if it is a first name or last name.**
    - Ensure the **full name remains intact**.
    - Format output as JSON with `"firstname"`, `"lastname"`.

    ---
    **Example Inputs → Outputs**
    - `"Maria Domenica Marianella"` → `{{
        "firstname": "Maria Domenica",
        "lastname": "Marianella"
    }}`
    - `"Rosa De Luca"` → `{{
        "firstname": "Rosa",
        "lastname": "De Luca"
    }}`
    - `"Teresa"{context_info}` → `{{
        "firstname": "Teresa",
        "lastname": ""
    }}`
    - `"De Santis"{context_info}` → `{{
        "firstname": "",
        "lastname": "De Santis"
    }}`

    ---
    🔍 **Now split the following name correctly:**
    **Input:** `{full_name}`
    **Output (JSON only):**
    """

# ✅ Function to Process Mother's and Alias Names using LLM (With Context)
def process_mother_and_alias_names(row):
    """
    Uses the LLM to correct:
    - 'Mother of Groom' & 'Mother of Bride' names
    - 'Alias - Mother of Groom' & 'Alias - Mother of Bride' names
    Splits them into first and last names.
    """
    corrected_data = {}

    for col in ["Mother of Groom", "Mother of Bride", "Alias - Mother of Groom", "Alias - Mother of Bride"]:
        full_name = row[col].strip() if pd.notna(row[col]) else ""
        mother_name = row[col.replace("Alias - ", "")].strip() if pd.notna(row[col.replace("Alias - ", "")]) else ""

        # ✅ Ensure LLM is called for all names, even if only one word is present
        if not full_name:
            corrected_data[col] = {"firstname": "", "lastname": ""}
            continue

        # ✅ Call LLM using `.parse()` for structured response
        try:
            response = client.beta.chat.completions.parse(
                model="mistral:7b",
                temperature=0,
                messages=[{"role": "user", "content": generate_name_split_prompt(full_name, mother_name)}],
                response_format=NameSplit,  # ✅ Structured Pydantic Response
            )

            # ✅ Extract structured data and normalize capitalization
            if response.choices[0].message.parsed:
                parsed_data = response.choices[0].message.parsed
                corrected_data[col] = {
                    "firstname": normalize_name(parsed_data.firstname.strip()),
                    "lastname": normalize_name(parsed_data.lastname.strip())
                }

                # ✅ Print Debug Info for Processed Names
                print(f"✅ {full_name} → First name: {parsed_data.firstname}, Last name: {parsed_data.lastname}")
                print("")  # ✅ Add spacing after each block for better readability

            else:
                print(f"⚠️ LLM Refused or Failed for: {full_name}")
                corrected_data[col] = {"firstname": normalize_name(full_name), "lastname": ""}
                print("")

        except Exception as e:
            print(f"❌ Error processing '{full_name}': {e}")
            corrected_data[col] = {"firstname": normalize_name(full_name), "lastname": ""}  # Keep fallback
            print("")
            
    return corrected_data

# ✅ Work on a **COPY** of df_llm → Output will be df_llm_corrected
df_llm_corrected = df_corrected.copy()

# ✅ Apply LLM Correction Row by Row for Mother's and Alias Names
corrected_names = df_llm_corrected.apply(process_mother_and_alias_names, axis=1)

# ✅ Convert corrected data into separate columns
# ✅ Convert corrected data into separate columns (Fixed Naming)
for index, corrections in corrected_names.items():
    for col, values in corrections.items():
        # ✅ Remove extra "Alias - Alias -" prefix issue
        clean_col_name = col.replace("Alias - Alias -", "Alias -").strip()

        new_firstname_col = f"Alias - {clean_col_name} First name" if "Alias" in col else f"First name - {clean_col_name}"
        new_lastname_col = f"Alias - {clean_col_name} Last name" if "Alias" in col else f"Last name - {clean_col_name}"

        df_llm_corrected.loc[index, new_firstname_col] = values["firstname"]
        df_llm_corrected.loc[index, new_lastname_col] = values["lastname"]

print("✅ Successfully processed all mother's names using LLM and saved the final dataset!")

🔍 Debug: Context Provided for 'Teresa Buttari': 
- The full name of the mother in the records is: **Teresa Buttari**.

✅ Teresa Buttari → First name: Teresa, Last name: Buttari

🔍 Debug: Context Provided for 'Maddalena Del Rosso': 
- The full name of the mother in the records is: **Maddalena Del Rosso**.

✅ Maddalena Del Rosso → First name: Maddalena, Last name: Del Rosso

🔍 Debug: Context Provided for 'BUTTA': 
- The full name of the mother in the records is: **Teresa Buttari**.

✅ BUTTA → First name: , Last name: BUTTA

🔍 Debug: Context Provided for 'MADDA': 
- The full name of the mother in the records is: **Maddalena Del Rosso**.

✅ MADDA → First name: MADDA, Last name: 

🔍 Debug: Context Provided for 'Teresa Buttari': 
- The full name of the mother in the records is: **Teresa Buttari**.

✅ Teresa Buttari → First name: Teresa, Last name: Buttari

🔍 Debug: Context Provided for 'Domenica Colella': 
- The full name of the mother in the records is: **Domenica Colella**.

✅ Domenica Col

In [71]:
# ✅ Ensure column names do not have duplicate "Alias - Alias - ..." prefix
corrected_columns = {
    "Alias - Alias - Mother of Groom First name": "Alias - Mother of Groom First name",
    "Alias - Alias - Mother of Groom Last name": "Alias - Mother of Groom Last name",
    "Alias - Alias - Mother of Bride First name": "Alias - Mother of Bride First name",
    "Alias - Alias - Mother of Bride Last name": "Alias - Mother of Bride Last name",
}

# ✅ Rename columns properly in the DataFrame
df_llm_corrected.rename(columns=corrected_columns, inplace=True)

# ✅ Save Corrected Data (Safe Copy)
df_llm_corrected.to_csv("corrected_marriage_records_llm_final.csv", index=False)

print("✅ Successfully renamed columns and saved the final dataset!")

✅ Successfully renamed columns and saved the final dataset!


In [72]:
df_llm_corrected.head()

Unnamed: 0,Record #,Last name - Groom,First name - Groom,Father of Groom,Mother of Groom,Last name - Bride,First name - Bride,Father of Bride,Mother of Bride,Date,...,Alias - Father of Bride First Name,Alias - Father of Bride Last Name,First name - Mother of Groom,Last name - Mother of Groom,First name - Mother of Bride,Last name - Mother of Bride,Alias - Mother of Groom First name,Alias - Mother of Groom Last name,Alias - Mother of Bride First name,Alias - Mother of Bride Last name
0,3,Iannotti,Ascenzo,Giuseppe,Teresa Buttari,De Santis,Anna Berardina,Nicola,Maddalena Del Rosso,2020-07-30,...,,,Teresa,Buttari,Maddalena,Del Rosso,,Butta,Madda,
1,3,Iannotti,Shablo,Sereno,Teresa Buttari,Pozzi,Alessandra,Mario Giordano,Domenica Colella,2020-07-30,...,,,Teresa,Buttari,Domenica,Colella,,,,
2,3,Iannotti,Daniel,Giuseppe,Teresa Buta,Postolo,Marianna Rua,Remigio,Domenica Colella,2020-07-30,...,,,Teresa,Buta,Domenica,Colella,,,,
3,3,Iannotti,Diablo,Giuseppe,Teresa Butta,Mac Allister,Sarah Veronica Maria Ronsuarda,Remigio,Domenica Colella,2020-07-30,...,,,Teresa,Butta,Domenica,Colella,,,,
4,3,Iannotti,Vincenzo,Sereno,Teresa Buttari,Troiana,Moana,Remigio,Domenica Colella,2020-07-30,...,,,Teresa,Buttari,Domenica,Colella,,,,


# Process compound names

In [73]:
#rule - based

# TO DO:

- Feliceantonio, through llm, make it an alias for name "Felice Antonio" 🛑 -> will use embeddings instead

- Read notes via LLM and create custom properties or adjustments ✅

- Replace LLM assignment of notes with OpenAI 4o and specifically recognize alias in notes ✅

- Store hierarchical embeddings ✅

- Perform MERGE according to alias ✅

- Perform MERGE according to alias ✅

- Solve the merging issue Sereno Iannotti with Teresa Butta (via embeddings?) ✅

- Add reason to LLMs if faulty (like we did for notes processing) ✅

- Perform MERGE according to really close embeddings + LLM decision  🚧 ---> LLM is merging too easily, way too easily (see Daniel, Diablo)

- MERGE is done too drastically, if for example Daniel and Diablo Iannotti are merged, then Daniel could be first name, but Diablo should be alis

- While loop: it finished before some names were not processed

- Restrict the merging strategies to a span of x (e.g. 50) years window

# React Agent (NOT USED)

In [None]:
import os
import json
import openai
import numpy as np
import pandas as pd
from pydantic import BaseModel
from typing import Dict, List, Optional, Any


# ✅ Define Custom Prompt for Genealogical Reasoning
custom_prompt = """
Fulfill the following genealogical task as best you can. 

Your task is to determine if a note contains aliases for any person in a marriage record.

Use the following format:

Input: The input marriage record and notes for alias extraction.
Thought: Think about which individuals may have aliases based on the notes.
Action: Given first name and last name of every person in separate columns, infer their full names (first name + last name)
Action: Assign notes
Observation: The result from GPT-4o alias extraction.
... (this process can repeat multiple times if necessary)
Thought: I now know the final answer.
Final Answer: The extracted aliases for each individual in the correct JSON format, ensuring only relevant aliases are included.

⚠️ **IMPORTANT:** 
- Your response **must** be a **valid JSON object**.
- Do **NOT** include any explanations, markdown, or extra text.
- Ensure the output **strictly follows** this schema:
```json
{
    "groom": {"first_name": [], "last_name": []},
    "bride": {"first_name": [], "last_name": []},
    "father_groom": {"first_name": [], "last_name": []},
    "mother_groom": {"first_name": [], "last_name": []},
    "father_bride": {"first_name": [], "last_name": []},
    "mother_bride": {"first_name": [], "last_name": []}
}
```

Begin!
Question: {input}
"""

# ✅ Define the ReACT Agent for Genealogical Analysis
class GenealogicalReACTAgent:
    def __init__(self, system_prompt=custom_prompt):
        self.system_prompt = system_prompt
        self.messages = []
        if self.system_prompt:
            self.messages.append({"role": "system", "content": self.system_prompt})

    def __call__(self, message):
        self.messages.append({"role": "user", "content": message})
        result = self.execute()
        self.messages.append({"role": "assistant", "content": result})
        return result

    def execute(self):
        completion = client.chat.completions.create(
            model="gpt-4o", 
            temperature=0,
            messages=self.messages,
            response_format={"type": "json_object"}  # ✅ Forces JSON-only response
        )
        return completion.choices[0].message.content

# ✅ Function to Extract Aliases Using ReACT

def extract_aliases_from_notes(row):
    notes_to_process = {col: row[col].strip() for col in ALIAS_COLUMNS if isinstance(row[col], str) and row[col].strip()}
    if not notes_to_process:
        print(f"⚠️ Skipping row {row['Record #']} (No alias-related notes found)")
        return {}
    
    agent = GenealogicalReACTAgent()
    
    formatted_input = json.dumps({
        "Marriage Record": row.to_dict(),
        "Notes for Alias Extraction": notes_to_process
    }, indent=4)

    # ✅ Use f-string with replace() to prevent KeyError
    prompt = f"{custom_prompt.replace('{input}', formatted_input)}"

    try:
        response_data = agent(prompt)
        print(f"\n🔍 DEBUG: Raw LLM Response:\n{response_data}")

        parsed_response = json.loads(response_data)
        alias_data = AliasAssignment.model_validate(parsed_response)

        print(f"✅ Extracted Aliases → Groom: {alias_data.groom}, Bride: {alias_data.bride}, "
              f"Father of Groom: {alias_data.father_groom}, Mother of Groom: {alias_data.mother_groom}, "
              f"Father of Bride: {alias_data.father_bride}, Mother of Bride: {alias_data.mother_bride}")

        return alias_data
    except Exception as e:
        print(f"❌ Error in extracting aliases: {e}")
        return {}

# Process notes

In [358]:
import os
api_key = os.environ["DIOPORCO"]

import pandas as pd
notes_df = pd.read_csv("./corrected_marriage_records_llm_final.csv")

In [359]:
import pandas as pd

# ✅ Function to preprocess row before passing it to GPT-4o
def preprocess_row_for_llm(row):
    """
    Combines relevant columns so the LLM receives clearer context,
    preventing it from incorrectly assigning notes.
    """
    processed_row = row.copy()

    # ✅ Merge father/mother names with last names for clarity
    processed_row["Father of Groom"] = f"{row['Father of Groom']} {row['Last name - Groom']}".strip()
    processed_row["Father of Bride"] = f"{row['Father of Bride']} {row['Last name - Bride']}".strip()

    # ✅ Drop redundant columns that may confuse the model
    columns_to_drop = [
        "Last name - Groom", "Last name - Bride", # No longer needed (merged above) 
    ]
    
    processed_row.drop(columns=columns_to_drop, inplace=True, errors="ignore")

    return processed_row

In [360]:
import openai
import pandas as pd
import numpy as np
from pydantic import BaseModel


# ✅ Initialize OpenAI Client
client = openai.OpenAI(api_key=api_key)

In [361]:
# ✅ Define Pydantic Model for the Expected LLM Output
class NoteAssignment(BaseModel):
    groom: bool
    bride: bool
    father_groom: bool
    mother_groom: bool
    father_bride: bool
    mother_bride: bool

In [362]:
import json

def process_notes(row):
    note_text = row["Notes"].strip()

    # ✅ Skip processing if the note is empty
    if not note_text:
        print(f"⚠️ Skipping empty note for Record # {row['Record #']}\n{'-'*50}")
        return {
            "Notes - Groom": "",
            "Notes - Bride": "",
            "Notes - Father of Groom": "",
            "Notes - Mother of Groom": "",
            "Notes - Father of Bride": "",
            "Notes - Mother of Bride": ""
        }

    # ✅ Preprocess row before sending it to GPT-4o
    processed_row = preprocess_row_for_llm(row)

    # ✅ Print debug info before calling the LLM
    print(f"\n📝 Processing Note: \"{note_text}\"")
    print(f"🔍 Preprocessed Marriage Record: {processed_row.to_dict()}")

    # ✅ SINGLE LLM CALL WITH STRICT JSON FORMAT
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert in historical genealogy records."},
                {"role": "user", "content": f"""
                You must analyze the following marriage record and determine **who the note applies to**.

                - **Marriage Record** (preprocessed for clarity):
                  {processed_row.to_dict()}
                - **Note to Analyze:** "{note_text}"

                🎯 **Step 1: Initial Assignment**
                - Identify who this note applies to, ensuring names match the correct individuals.
                - Analyze every single person one by one and provide your reasonement.
                - If a note refers explicitely to a person mentioned in the row, assign it automatically to that person

                🎯 **Step 2: Self-Review**
                - Review your assignment and check for incorrect attributions.
                - **Did you mistakenly assign the note to the groom when it actually belongs to the father?**
                - If yes, correct your assignment.

                🎯 **Step 3: Conflict Resolution**
                - ⚠️ **A note can be assigned to multiple people ONLY if they are married.**
                - 🚫 **INVALID assignments:** (e.g., "groom" + "father of groom", "bride" + "mother of bride")
                - ✅ **VALID assignments:** (e.g., "father of groom" + "mother of groom" since they are married)
                - If an invalid assignment exists, **remove the least likely person.**

                ✅ **Output Format (STRICT JSON RESPONSE ONLY, NO MARKDOWN, NO EXTRA TEXT)**:
                {{"reasoning": "EXPLAIN YOUR DECISION HERE.", 
                  "classification": {{
                    "groom": true/false, 
                    "bride": true/false, 
                    "father_groom": true/false, 
                    "mother_groom": true/false, 
                    "father_bride": true/false, 
                    "mother_bride": true/false
                  }}
                }}

                🚨 **IMPORTANT: RETURN ONLY THE JSON OBJECT, NOTHING ELSE. NO MARKDOWN.**
                """}
            ],
            temperature=0.0,
            response_format={"type": "json_object"}
        )

        # ✅ Extract structured response
        response_data = response.choices[0].message.content

        # ✅ Convert JSON string to dictionary
        try:
            parsed_response = json.loads(response_data)
            reasoning_part = parsed_response.get("reasoning", "No explicit reasoning provided.")
            classification_part = parsed_response.get("classification", {})

            # ✅ Ensure classification is a valid NoteAssignment object
            final_person_notes = NoteAssignment.model_validate(classification_part)

        except Exception as e:
            print(f"❌ JSON Parsing Error: {e}")
            reasoning_part = "Failed to parse LLM reasoning."
            final_person_notes = NoteAssignment(
                groom=False, bride=False, father_groom=False, mother_groom=False, father_bride=False, mother_bride=False
            )

        # ✅ Print Debugging Info
        print(f"🔹 LLM Reasoning:\n{reasoning_part}")
        print(f"🔹 LLM Output (Final Classification): {classification_part}")

    except Exception as e:
        print(f"❌ LLM Error: {e}")
        reasoning_part = "LLM call failed."
        final_person_notes = NoteAssignment(
            groom=False, bride=False, father_groom=False, mother_groom=False, father_bride=False, mother_bride=False
        )

    # ✅ Assign the note text to the correct "Notes - X" column
    assigned_notes = {
        f"Notes - {key.replace('_', ' ').title()}": note_text if value else ""
        for key, value in final_person_notes.model_dump().items()
    }

    # ✅ Print Debugging Info of Final Note Assignment
    print(f"📌 Final Assigned Notes (After Conflict Resolution): {assigned_notes}\n{'-'*50}")

    return assigned_notes

In [363]:
# ✅ Load the corrected dataset
notes_df = pd.read_csv("corrected_marriage_records_llm_final.csv")

# ✅ Convert "Notes" column to strings, ensuring we handle NaNs
notes_df["Notes"] = notes_df["Notes"].astype(str).replace("nan", "").replace(np.nan, "")

# ✅ Filter rows that have actual notes (non-empty strings)
notes_with_data = notes_df[notes_df["Notes"].str.strip() != ""]
print(f"🔎 Processing {len(notes_with_data)} rows with notes.")

# ✅ Apply LLM Processing Row by Row
corrected_notes = notes_with_data.apply(process_notes, axis=1)

# ✅ Assign Corrected Notes Back to `notes_df`
for index, corrections in corrected_notes.items():
    for col, note_value in corrections.items():
        notes_df.loc[index, col] = note_value  # ✅ Store note in the correct column

# ✅ Save the updated DataFrame
notes_df.to_csv("corrected_marriage_records_llm_notes.csv", index=False)

print("✅ Successfully processed notes and assigned them to the correct people.")

🔎 Processing 2 rows with notes.

📝 Processing Note: "It was his 3rd marriage"
🔍 Preprocessed Marriage Record: {'Record #': 3, 'Last name - Groom': 'Iannotti', 'First name - Groom': 'Ascenzo', 'Father of Groom': 'Giuseppe Iannotti', 'Mother of Groom': 'Teresa Buttari', 'Last name - Bride': 'De Santis', 'First name - Bride': 'Anna Berardina', 'Father of Bride': 'Nicola De Santis', 'Mother of Bride': 'Maddalena Del Rosso', 'Date': '2020-07-30', 'Year': 1821, 'Comune Groom': nan, 'Comune Bride': nan, 'Notes': 'It was his 3rd marriage', 'Deceased - Father of Groom': nan, 'Alias - Mother of Groom': 'BUTTA', 'Deceased - Mother of Groom': nan, 'Deceased - Father of Bride': nan, 'Alias - Mother of Bride': 'MADDA', 'Deceased - Mother of Bride': nan, 'Alias - Last name - Groom': nan, 'Alias - First name - Groom': nan, 'Alias - Last name - Bride': nan, 'Alias - First name - Bride': nan, 'Alias - Father of Groom First Name': nan, 'Alias - Father of Groom Last Name': nan, 'Alias - Father of Bride Fi

In [None]:
from llama_index.core.agent.custom.simple import CustomSimpleAgentWorker
from llama_index.core.agent.types import Task
from llama_index.core.tools import BaseTool
from llama_index.core.callbacks import CallbackManager
from llama_index.core.schema import QueryBundle
from typing import Sequence, Optional, Dict, Any, Tuple
from llama_index.core import Settings

from llama_index.core.settings import Settings

# ✅ Configure OpenAI as the LLM
Settings.llm = OpenAI(
    model="gpt-4o",  # Use GPT-4o for the best results
    api_key=api_key,  # Ensure API key is set
    temperature=0.0,  # Ensure deterministic output for genealogy records
    max_tokens=4096,  # Adjust based on expected response length
    verbose=True  # Enable verbose mode for debugging
)

In [338]:
custom_prompt = """
Fulfill the following genealogical task as best you can. 

Your task is to determine if a note contains aliases for any person in a marriage record.

Use the following format:

Question: The input marriage record and notes for alias extraction.
Identity: You are an expert in historical genealogy records. Always base your answer strictly on the provided notes.
Thought: Think about which individuals may have aliases based on the notes.
Action: Call the OpenAI GPT-4o model to extract aliases.
Action Input: The formatted marriage record and notes.
Observation: The result from GPT-4o alias extraction.
... (this process can repeat multiple times if necessary)
Thought: I now know the final answer.
Final Answer: The extracted aliases for each individual in the correct JSON format, ensuring only relevant aliases are included.

Begin!
Question: {input}
"""

In [None]:
import os
import json
import openai
import numpy as np
import pandas as pd
from pydantic import BaseModel
from typing import Dict, List, Optional, Any


# ✅ Define Custom Prompt for Genealogical Reasoning
custom_prompt = """
Fulfill the following genealogical task as best you can. 

Your task is to determine if a note contains aliases for any person in a marriage record.

Use the following format:

Input: The input marriage record and notes for alias extraction.
Thought: Think about which individuals may have aliases based on the notes.
Action: Given first name and last name of every person in separate columns, infer their full names (first name + last name)
Action: Assign notes
Observation: The result from GPT-4o alias extraction.
... (this process can repeat multiple times if necessary)
Thought: I now know the final answer.
Final Answer: The extracted aliases for each individual in the correct JSON format, ensuring only relevant aliases are included.

⚠️ **IMPORTANT:** 
- Your response **must** be a **valid JSON object**.
- Do **NOT** include any explanations, markdown, or extra text.
- Ensure the output **strictly follows** this schema:
```json
{
    "groom": {"first_name": [], "last_name": []},
    "bride": {"first_name": [], "last_name": []},
    "father_groom": {"first_name": [], "last_name": []},
    "mother_groom": {"first_name": [], "last_name": []},
    "father_bride": {"first_name": [], "last_name": []},
    "mother_bride": {"first_name": [], "last_name": []}
}
```

Begin!
Question: {input}
"""

# ✅ Define the ReACT Agent for Genealogical Analysis
class GenealogicalReACTAgent:
    def __init__(self, system_prompt=custom_prompt):
        self.system_prompt = system_prompt
        self.messages = []
        if self.system_prompt:
            self.messages.append({"role": "system", "content": self.system_prompt})

    def __call__(self, message):
        self.messages.append({"role": "user", "content": message})
        result = self.execute()
        self.messages.append({"role": "assistant", "content": result})
        return result

    def execute(self):
        completion = client.chat.completions.create(
            model="gpt-4o", 
            temperature=0,
            messages=self.messages,
            response_format={"type": "json_object"}  # ✅ Forces JSON-only response
        )
        return completion.choices[0].message.content

# ✅ Function to Extract Aliases Using ReACT

def extract_aliases_from_notes(row):
    notes_to_process = {col: row[col].strip() for col in ALIAS_COLUMNS if isinstance(row[col], str) and row[col].strip()}
    if not notes_to_process:
        print(f"⚠️ Skipping row {row['Record #']} (No alias-related notes found)")
        return {}
    
    agent = GenealogicalReACTAgent()
    
    formatted_input = json.dumps({
        "Marriage Record": row.to_dict(),
        "Notes for Alias Extraction": notes_to_process
    }, indent=4)

    # ✅ Use f-string with replace() to prevent KeyError
    prompt = f"{custom_prompt.replace('{input}', formatted_input)}"

    try:
        response_data = agent(prompt)
        print(f"\n🔍 DEBUG: Raw LLM Response:\n{response_data}")

        parsed_response = json.loads(response_data)
        alias_data = AliasAssignment.model_validate(parsed_response)

        print(f"✅ Extracted Aliases → Groom: {alias_data.groom}, Bride: {alias_data.bride}, "
              f"Father of Groom: {alias_data.father_groom}, Mother of Groom: {alias_data.mother_groom}, "
              f"Father of Bride: {alias_data.father_bride}, Mother of Bride: {alias_data.mother_bride}")

        return alias_data
    except Exception as e:
        print(f"❌ Error in extracting aliases: {e}")
        return {}

In [345]:
import json

def process_notes(row):
    note_text = row["Notes"].strip()

    # ✅ Skip processing if the note is empty
    if not note_text:
        print(f"⚠️ Skipping empty note for Record # {row['Record #']}\n{'-'*50}")
        return {
            "Notes - Groom": "",
            "Notes - Bride": "",
            "Notes - Father of Groom": "",
            "Notes - Mother of Groom": "",
            "Notes - Father of Bride": "",
            "Notes - Mother of Bride": ""
        }

    # ✅ Preprocess row before sending it to GPT-4o
    processed_row = preprocess_row_for_llm(row)

    # ✅ Print debug info before calling the LLM
    print(f"\n📝 Processing Note: \"{note_text}\"")
    print(f"🔍 Preprocessed Marriage Record: {processed_row.to_dict()}")

    # ✅ SINGLE LLM CALL WITH STRICT JSON FORMAT
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert in historical genealogy records."},
                {"role": "user", "content": f"""
                You must analyze the following marriage record and determine **who the note applies to**.

                - **Marriage Record** (preprocessed for clarity):
                  {processed_row.to_dict()}
                - **Note to Analyze:** "{note_text}"

                🎯 **Step 1: Initial Assignment**
                - Identify who this note applies to, ensuring names match the correct individuals.
                - Analyze every single person one by one and provide your reasonement.
                - If a note refers explicitely to a person mentioned in the row, assign it automatically to that person

                🎯 **Step 2: Self-Review**
                - Review your assignment and check for incorrect attributions.
                - **Did you mistakenly assign the note to the groom when it actually belongs to the father?**
                - If yes, correct your assignment.

                🎯 **Step 3: Conflict Resolution**
                - ⚠️ **A note can be assigned to multiple people ONLY if they are married.**
                - 🚫 **INVALID assignments:** (e.g., "groom" + "father of groom", "bride" + "mother of bride")
                - ✅ **VALID assignments:** (e.g., "father of groom" + "mother of groom" since they are married)
                - If an invalid assignment exists, **remove the least likely person.**

                ✅ **Output Format (STRICT JSON RESPONSE ONLY, NO MARKDOWN, NO EXTRA TEXT)**:
                {{"reasoning": "EXPLAIN YOUR DECISION HERE.", 
                  "classification": {{
                    "groom": true/false, 
                    "bride": true/false, 
                    "father_groom": true/false, 
                    "mother_groom": true/false, 
                    "father_bride": true/false, 
                    "mother_bride": true/false
                  }}
                }}

                🚨 **IMPORTANT: RETURN ONLY THE JSON OBJECT, NOTHING ELSE. NO MARKDOWN.**
                """}
            ],
            temperature=0.0,
            response_format={"type": "json_object"}
        )

        # ✅ Extract structured response
        response_data = response.choices[0].message.content

        # ✅ Convert JSON string to dictionary
        try:
            parsed_response = json.loads(response_data)
            reasoning_part = parsed_response.get("reasoning", "No explicit reasoning provided.")
            classification_part = parsed_response.get("classification", {})

            # ✅ Ensure classification is a valid NoteAssignment object
            final_person_notes = NoteAssignment.model_validate(classification_part)

        except Exception as e:
            print(f"❌ JSON Parsing Error: {e}")
            reasoning_part = "Failed to parse LLM reasoning."
            final_person_notes = NoteAssignment(
                groom=False, bride=False, father_groom=False, mother_groom=False, father_bride=False, mother_bride=False
            )

        # ✅ Print Debugging Info
        print(f"🔹 LLM Reasoning:\n{reasoning_part}")
        print(f"🔹 LLM Output (Final Classification): {classification_part}")

    except Exception as e:
        print(f"❌ LLM Error: {e}")
        reasoning_part = "LLM call failed."
        final_person_notes = NoteAssignment(
            groom=False, bride=False, father_groom=False, mother_groom=False, father_bride=False, mother_bride=False
        )

    # ✅ Assign the note text to the correct "Notes - X" column
    assigned_notes = {
        f"Notes - {key.replace('_', ' ').title()}": note_text if value else ""
        for key, value in final_person_notes.model_dump().items()
    }

    # ✅ Print Debugging Info of Final Note Assignment
    print(f"📌 Final Assigned Notes (After Conflict Resolution): {assigned_notes}\n{'-'*50}")

    return assigned_notes

In [346]:
# ✅ Load the corrected dataset
notes_df = pd.read_csv("corrected_marriage_records_llm_final.csv")

# ✅ Convert "Notes" column to strings, ensuring we handle NaNs
notes_df["Notes"] = notes_df["Notes"].astype(str).replace("nan", "").replace(np.nan, "")

# ✅ Filter rows that have actual notes (non-empty strings)
notes_with_data = notes_df[notes_df["Notes"].str.strip() != ""]
print(f"🔎 Processing {len(notes_with_data)} rows with notes.")

# ✅ Apply LLM Processing Row by Row
corrected_notes = notes_with_data.apply(process_notes, axis=1)

# ✅ Assign Corrected Notes Back to `notes_df`
for index, corrections in corrected_notes.items():
    for col, note_value in corrections.items():
        notes_df.loc[index, col] = note_value  # ✅ Store note in the correct column

# ✅ Save the updated DataFrame
notes_df.to_csv("corrected_marriage_records_llm_notes.csv", index=False)

print("✅ Successfully processed notes and assigned them to the correct people.")

🔎 Processing 2 rows with notes.

📝 Processing Note: "It was his 3rd marriage"
🔍 Preprocessed Marriage Record: {'Record #': 3, 'Last name - Groom': 'Iannotti', 'First name - Groom': 'Ascenzo', 'Father of Groom': 'Giuseppe Iannotti', 'Mother of Groom': 'Teresa Buttari', 'Last name - Bride': 'De Santis', 'First name - Bride': 'Anna Berardina', 'Father of Bride': 'Nicola De Santis', 'Mother of Bride': 'Maddalena Del Rosso', 'Date': '2020-07-30', 'Year': 1821, 'Comune Groom': nan, 'Comune Bride': nan, 'Notes': 'It was his 3rd marriage', 'Deceased - Father of Groom': nan, 'Alias - Mother of Groom': 'BUTTA', 'Deceased - Mother of Groom': nan, 'Deceased - Father of Bride': nan, 'Alias - Mother of Bride': 'MADDA', 'Deceased - Mother of Bride': nan, 'Alias - Last name - Groom': nan, 'Alias - First name - Groom': nan, 'Alias - Last name - Bride': nan, 'Alias - First name - Bride': nan, 'Alias - Father of Groom First Name': nan, 'Alias - Father of Groom Last Name': nan, 'Alias - Father of Bride Fi

In [347]:
notes_df.head()

Unnamed: 0,Record #,Last name - Groom,First name - Groom,Father of Groom,Mother of Groom,Last name - Bride,First name - Bride,Father of Bride,Mother of Bride,Date,...,Alias - Mother of Groom First name,Alias - Mother of Groom Last name,Alias - Mother of Bride First name,Alias - Mother of Bride Last name,Notes - Groom,Notes - Bride,Notes - Father Groom,Notes - Mother Groom,Notes - Father Bride,Notes - Mother Bride
0,3,Iannotti,Ascenzo,Giuseppe,Teresa Buttari,De Santis,Anna Berardina,Nicola,Maddalena Del Rosso,2020-07-30,...,,Butta,Madda,,It was his 3rd marriage,,,,,
1,3,Iannotti,Shablo,Sereno,Teresa Buttari,Pozzi,Alessandra,Mario Giordano,Domenica Colella,2020-07-30,...,,,,,,,,,,
2,3,Iannotti,Daniel,Giuseppe,Teresa Buta,Postolo,Marianna Rua,Remigio,Domenica Colella,2020-07-30,...,,,,,,,,,,
3,3,Iannotti,Diablo,Giuseppe,Teresa Butta,Mac Allister,Sarah Veronica Maria Ronsuarda,Remigio,Domenica Colella,2020-07-30,...,,,,,,,,,,
4,3,Iannotti,Vincenzo,Sereno,Teresa Buttari,Troiana,Moana,Remigio,Domenica Colella,2020-07-30,...,,,,,,,"Named orphan as Giuseppe Proietto, later he we...",,,


In [348]:
notes_df.iloc[2]

Record #                                             3
Last name - Groom                             Iannotti
First name - Groom                              Daniel
Father of Groom                               Giuseppe
Mother of Groom                            Teresa Buta
Last name - Bride                              Postolo
First name - Bride                        Marianna Rua
Father of Bride                                Remigio
Mother of Bride                       Domenica Colella
Date                                        2020-07-30
Year                                              1820
Comune Groom                                       NaN
Comune Bride                                   Ferrano
Notes                                                 
Deceased - Father of Groom                         NaN
Alias - Mother of Groom                            NaN
Deceased - Mother of Groom                         NaN
Deceased - Father of Bride                         NaN
Alias - Mo

# Find alias in notes

In [364]:
import os
import openai
import numpy as np
from pydantic import BaseModel
from typing import List, Dict


api_key = os.environ["DIOPORCO"]
# ✅ Initialize OpenAI Client
client = openai.OpenAI(api_key=api_key)

import pandas as pd
after_notes_df = pd.read_csv("./corrected_marriage_records_llm_notes.csv")

In [365]:
import json

# ✅ List of columns to check for alias extraction
ALIAS_COLUMNS = [
    "Notes - Groom",
    "Notes - Bride",
    "Notes - Father Groom",
    "Notes - Mother Groom",
    "Notes - Father Bride",
    "Notes - Mother Bride"
]


# ✅ Define Pydantic Model for LLM Output (Now with First Name & Last Name distinction)
class AliasAssignment(BaseModel):
    groom: Dict[str, List[str]]  # {"first_name": [], "last_name": []}
    bride: Dict[str, List[str]]
    father_groom: Dict[str, List[str]]
    mother_groom: Dict[str, List[str]]
    father_bride: Dict[str, List[str]]
    mother_bride: Dict[str, List[str]]

# ✅ Load the dataset again to process aliases

# ✅ Convert "Notes" column to strings, ensuring we handle NaNs
after_notes_df["Notes"] = after_notes_df["Notes"].astype(str).replace("nan", "").replace(np.nan, "")

# ✅ Filter rows that have actual notes (non-empty strings)
notes_with_data = after_notes_df[after_notes_df["Notes"].str.strip() != ""]
print(f"🔎 Processing {len(notes_with_data)} rows to extract aliases.")

    
# ✅ Function to capitalize alias names properly
def normalize_alias(name):
    """Ensure the first letter is uppercase and the rest are lowercase."""
    return name.capitalize() if isinstance(name, str) and name.strip() else ""

🔎 Processing 2 rows to extract aliases.


In [367]:
import json
from pydantic import BaseModel, ValidationError
from typing import Dict, List

def normalize_alias(name):
    """Ensure the first letter is uppercase and the rest are lowercase."""
    return name.capitalize() if isinstance(name, str) and name.strip() else ""

# ✅ Define Pydantic Model for LLM Output
class AliasAssignment(BaseModel):
    groom: Dict[str, List[str]]
    bride: Dict[str, List[str]]
    father_groom: Dict[str, List[str]]
    mother_groom: Dict[str, List[str]]
    father_bride: Dict[str, List[str]]
    mother_bride: Dict[str, List[str]]

def extract_aliases_from_notes(row):
    """Extract aliases for specific individuals based on predefined alias columns."""

    # ✅ Check which alias-related notes exist in this row
    notes_to_process = {col: row[col].strip() for col in ALIAS_COLUMNS if isinstance(row[col], str) and row[col].strip()}

    # ✅ Skip processing if no alias-related notes are found
    if not notes_to_process:
        print(f"⚠️ Skipping row {row['Record #']} (No alias-related notes found)\n{'-'*50}")
        return {}

    # ✅ Print debug info before calling the LLM
    print(f"\n📝 Checking for aliases in row {row['Record #']}")
    print(f"🔍 Notes Found: {notes_to_process}")

    # ✅ SINGLE LLM CALL WITH STRICT JSON FORMAT
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert in historical genealogy records."},
                {"role": "user", "content": f"""
                Your task is to determine if the note contains aliases for any person in a marriage record.
                
                - **Marriage Record** (preprocessed for clarity):
                  {row.to_dict()}
                - **Notes for Alias Extraction (Only for specific individuals):**
                  {notes_to_process}

                🎯 **Step 1: Identify Aliases**
                - Analyze each note carefully and determine if any aliases are mentioned.
                - An alias can be a nickname, alternative spelling, or a known pseudonym.
                - Only extract aliases that clearly apply to a specific individual.

                🎯 **Step 2: Validate & Structure Data**
                - Ensure each extracted alias is properly categorized under `first_name` or `last_name`.
                - If no alias is found for a person, return empty lists.

                ✅ **Expected Output (STRICT JSON FORMAT, NO MARKDOWN, NO EXTRA TEXT):**
                {{
                    "groom": {{"first_name": [], "last_name": []}},
                    "bride": {{"first_name": [], "last_name": []}},
                    "father_groom": {{"first_name": [], "last_name": []}},
                    "mother_groom": {{"first_name": [], "last_name": []}},
                    "father_bride": {{"first_name": [], "last_name": []}},
                    "mother_bride": {{"first_name": [], "last_name": []}}
                }}

                🚨 **IMPORTANT: RETURN ONLY THE JSON OBJECT, NOTHING ELSE. NO MARKDOWN.**
                """}
            ],
            temperature=0.0,
            response_format={"type": "json_object"}
        )

        # ✅ Extract structured response
        response_data = response.choices[0].message.content

        # ✅ Convert JSON string to dictionary
        try:
            parsed_response = json.loads(response_data)

            # ✅ Ensure all required fields exist with default empty lists
            default_structure = {"first_name": [], "last_name": []}
            complete_response = {person: parsed_response.get(person, default_structure) for person in [
                "groom", "bride", "father_groom", "mother_groom", "father_bride", "mother_bride"]}

            # ✅ Validate with Pydantic
            alias_data = AliasAssignment.model_validate(complete_response)
        
        except (json.JSONDecodeError, ValidationError) as e:
            print(f"❌ JSON Parsing or Validation Error: {e}")
            return {}

        # ✅ Print Debugging Info
        print(f"🔹 LLM Reasoning:\n{parsed_response}")
        print(f"🔹 LLM Output (Extracted Aliases): {alias_data}")

    except Exception as e:
        print(f"❌ LLM Error: {e}")
        return {}

    # ✅ Prepare alias assignments (Handling First & Last Name separately)
    alias_assignments = {
        "Alias - First name - Groom": [normalize_alias(name) for name in alias_data.groom["first_name"]],
        "Alias - Last name - Groom": [normalize_alias(name) for name in alias_data.groom["last_name"]],
        "Alias - First name - Bride": [normalize_alias(name) for name in alias_data.bride["first_name"]],
        "Alias - Last name - Bride": [normalize_alias(name) for name in alias_data.bride["last_name"]],
        "Alias - Father of Groom First Name": [normalize_alias(name) for name in alias_data.father_groom["first_name"]],
        "Alias - Father of Groom Last Name": [normalize_alias(name) for name in alias_data.father_groom["last_name"]],
        "Alias - Mother of Groom First name": [normalize_alias(name) for name in alias_data.mother_groom["first_name"]],
        "Alias - Mother of Groom Last name": [normalize_alias(name) for name in alias_data.mother_groom["last_name"]],
        "Alias - Father of Bride First Name": [normalize_alias(name) for name in alias_data.father_bride["first_name"]],
        "Alias - Father of Bride Last Name": [normalize_alias(name) for name in alias_data.father_bride["last_name"]],
        "Alias - Mother of Bride First name": [normalize_alias(name) for name in alias_data.mother_bride["first_name"]],
        "Alias - Mother of Bride Last name": [normalize_alias(name) for name in alias_data.mother_bride["last_name"]]
    }

    # ✅ Print Debugging Info of Final Alias Assignment
    print(f"📌 Final Assigned Aliases: {alias_assignments}\n{'-'*50}")
    
    return alias_assignments


# ✅ Apply LLM Processing Row by Row for Alias Extraction
extracted_aliases = after_notes_df.apply(extract_aliases_from_notes, axis=1)

# ✅ Update `after_notes_df` with extracted aliases (Handling Multiple Entries)
for index, alias_data in extracted_aliases.items():
    for col, alias_list in alias_data.items():
        if alias_list:  # ✅ Only update if alias is present and column exists
            if col in after_notes_df.columns:
                existing_aliases = after_notes_df.loc[index, col]

                if pd.notna(existing_aliases) and existing_aliases != "":
                    # ✅ Convert existing aliases into a list, ensuring no duplicates
                    existing_aliases_list = [normalize_alias(x.strip()) for x in existing_aliases.split(",") if x.strip()]
                    new_aliases = list(set(existing_aliases_list + alias_list))  # ✅ Merge & remove duplicates
                    after_notes_df.loc[index, col] = ", ".join(new_aliases)  # ✅ Convert back to string
                else:
                    after_notes_df.loc[index, col] = ", ".join(alias_list)  # ✅ Store new aliases directly
            else:
                print(f"⚠️ Warning: Column '{col}' not found in DataFrame. Skipping update.")

# ✅ Save the updated DataFrame
after_notes_df.to_csv("corrected_marriage_records_llm_aliases.csv", index=False)

print("✅ Successfully extracted aliases from notes and assigned them correctly with proper capitalization.")


📝 Checking for aliases in row 3
🔍 Notes Found: {'Notes - Groom': 'It was his 3rd marriage'}
🔹 LLM Reasoning:
{'groom': {'first_name': [], 'last_name': []}, 'bride': {'first_name': [], 'last_name': []}, 'father_groom': {'first_name': [], 'last_name': []}, 'mother_groom': {'first_name': [], 'last_name': ['Butta']}, 'father_bride': {'first_name': [], 'last_name': []}, 'mother_bride': {'first_name': ['Madda'], 'last_name': []}}
🔹 LLM Output (Extracted Aliases): groom={'first_name': [], 'last_name': []} bride={'first_name': [], 'last_name': []} father_groom={'first_name': [], 'last_name': []} mother_groom={'first_name': [], 'last_name': ['Butta']} father_bride={'first_name': [], 'last_name': []} mother_bride={'first_name': ['Madda'], 'last_name': []}
📌 Final Assigned Aliases: {'Alias - First name - Groom': [], 'Alias - Last name - Groom': [], 'Alias - First name - Bride': [], 'Alias - Last name - Bride': [], 'Alias - Father of Groom First Name': [], 'Alias - Father of Groom Last Name': [],

  after_notes_df.loc[index, col] = ", ".join(alias_list)  # ✅ Store new aliases directly
  after_notes_df.loc[index, col] = ", ".join(alias_list)  # ✅ Store new aliases directly


In [368]:
import pandas as pd
import numpy as np

# ✅ Define columns that contain people's names
NAME_COLUMNS = [
    "Last name - Groom", "First name - Groom",
    "Father of Groom", "Mother of Groom",
    "Last name - Bride", "First name - Bride",
    "Father of Bride", "Mother of Bride",
    "Alias - Last name - Groom", "Alias - First name - Groom",
    "Alias - Last name - Bride", "Alias - First name - Bride",
    "Alias - Father of Groom First Name", "Alias - Father of Groom Last Name",
    "Alias - Father of Bride First Name", "Alias - Father of Bride Last Name",
    "First name - Mother of Groom", "Last name - Mother of Groom",
    "First name - Mother of Bride", "Last name - Mother of Bride",
    "Alias - Mother of Groom First name", "Alias - Mother of Groom Last name",
    "Alias - Mother of Bride First name", "Alias - Mother of Bride Last name"
]

def capitalize_name(name):
    """Properly capitalize names, handling multi-part last names."""
    if not isinstance(name, str) or not name.strip():
        return ""

    name_parts = name.lower().split()
    capitalized_parts = [name_parts[0].capitalize()] + [
        part.capitalize() if len(part) > 2 else part for part in name_parts[1:]
    ]
    
    return " ".join(capitalized_parts)

def capitalize_names_in_dataframe(df):
    """Capitalize names in the specified columns of a DataFrame."""
    
    for col in NAME_COLUMNS:
        if col in df.columns:
            # ✅ Handle multiple names in a single cell (comma-separated)
            df[col] = df[col].astype(str).apply(lambda x: ", ".join([capitalize_name(name.strip()) for name in x.split(",")]) if x.strip() else x)
    
    return df

In [369]:
df_capitalized = capitalize_names_in_dataframe(after_notes_df)

# ✅ Save the updated DataFrame
df_capitalized.to_csv("capitalized_names.csv", index=False)

In [374]:
df_capitalized.iloc[4]

Record #                                                                              3
Last name - Groom                                                              Iannotti
First name - Groom                                                             Vincenzo
Father of Groom                                                                  Sereno
Mother of Groom                                                          Teresa Buttari
Last name - Bride                                                               Troiana
First name - Bride                                                                Moana
Father of Bride                                                                 Remigio
Mother of Bride                                                        Domenica Colella
Date                                                                         2020-07-30
Year                                                                               1821
Comune Groom                    

# Define relationships, entities methods

In [375]:
import uuid
from pydantic import BaseModel
from typing import List, Optional
import pandas as pd
import numpy as np
from neo4j import GraphDatabase

class Person(BaseModel):
    id: str  # Unique Identifier
    fullname: str
    firstname: str
    lastname: str
    alias_firstname: str
    alias_lastname: str
    gender: str
    location: str = ""  # Default to empty string
    deceased: Optional[str] = ""
    notes: Optional[str] = ""  # ✅ Ensure `notes` is present


# ✅ Define Schema for Relationships
class Relationship(BaseModel):
    from_id: str  # Use unique IDs instead of fullnames
    to_id: str
    type: str
    date: Optional[str] = None

import pandas as pd

import datetime

def format_date(date, year):
    """
    ✅ Converts any date format to '5 Dec 1821'
    - Handles both string and datetime values.
    - Ensures correct day-month-year order.
    - Uses three-letter month abbreviations.
    """
    # ✅ Convert date to string if it's not already
    if isinstance(date, datetime.date) or isinstance(date, pd.Timestamp):
        day = date.day  # Extract day
        month = date.strftime("%b")  # Extract abbreviated month (e.g., Dec)
    elif isinstance(date, str):
        try:
            parsed_date = pd.to_datetime(date, errors='coerce')  # Attempt to parse
            if pd.notna(parsed_date):
                day = parsed_date.day
                month = parsed_date.strftime("%b")
            else:
                return None  # If unparseable, return None
        except:
            return None
    else:
        return None  # If input is unrecognized, return None

    # ✅ Convert year to string
    year_str = str(year).strip() if pd.notna(year) else ""

    # ✅ Ensure correct format: "5 Dec 1821"
    return f"{day} {month} {year_str}".strip() if day and month and year_str else None



# ✅ Function to Retrieve Deceased Status from df_llm
def get_deceased_status(row, person_type):
    """Retrieves correct deceased status from df_llm based on person type."""
    deceased_col = f"Deceased - {person_type}"
    return row[deceased_col] if deceased_col in row and pd.notna(row[deceased_col]) else ""

# ✅ Function to Clean and Extract Location
def clean_location(value):
    """Ensure location is either a valid string or default empty string."""
    return value.strip() if isinstance(value, str) and value.strip() else ""

In [376]:
def process_row(row):
    """Processes a marriage record row into structured JSON with notes included."""

    # ✅ Convert NaN to Empty String or None
    def clean_str(value):
        """Convert NaN to an empty string before stripping."""
        return str(value).strip() if isinstance(value, str) and pd.notna(value) else ""

    # ✅ Generate unique IDs for each person
    def generate_person(firstname, lastname, alias_firstname, alias_lastname, gender, location, deceased, notes):
        return Person(
            id=str(uuid.uuid4()),  # Unique UUID
            fullname=f"{firstname} {lastname}",
            firstname=firstname,
            lastname=lastname,
            alias_firstname=alias_firstname,
            alias_lastname=alias_lastname,
            gender=gender,
            location=location,  # Properly assign location
            deceased=deceased,
            notes=notes  # ✅ Store notes
        )

    # ✅ Extract and clean location for groom and bride
    groom_location = clean_location(row.get("Comune Groom", ""))
    bride_location = clean_location(row.get("Comune Bride", ""))

    # ✅ Extract Notes for Each Person
    groom_notes = clean_str(row.get("Notes - Groom", ""))
    bride_notes = clean_str(row.get("Notes - Bride", ""))
    father_groom_notes = clean_str(row.get("Notes - Father of Groom", ""))
    mother_groom_notes = clean_str(row.get("Notes - Mother of Groom", ""))
    father_bride_notes = clean_str(row.get("Notes - Father of Bride", ""))
    mother_bride_notes = clean_str(row.get("Notes - Mother of Bride", ""))

    # ✅ Create Persons with Notes
    groom = generate_person(clean_str(row["First name - Groom"]), clean_str(row["Last name - Groom"]),
                            clean_str(row.get("Alias - First name - Groom", "")), clean_str(row.get("Alias - Last name - Groom", "")),
                            "male", groom_location, get_deceased_status(row, "First name - Groom"), groom_notes)

    bride = generate_person(clean_str(row["First name - Bride"]), clean_str(row["Last name - Bride"]),
                            clean_str(row.get("Alias - First name - Bride", "")), clean_str(row.get("Alias - Last name - Bride", "")),
                            "female", bride_location, get_deceased_status(row, "First name - Bride"), bride_notes)

    father_groom = generate_person(clean_str(row["Father of Groom"]), clean_str(row["Last name - Groom"]),
                                   clean_str(row.get("Alias - Father of Groom First Name", "")), clean_str(row.get("Alias - Father of Groom Last Name", "")),
                                   "male", "", get_deceased_status(row, "Father of Groom"), father_groom_notes)

    mother_groom = generate_person(clean_str(row["First name - Mother of Groom"]), clean_str(row["Last name - Mother of Groom"]),
                                   clean_str(row.get("Alias - Mother of Groom First name", "")), clean_str(row.get("Alias - Mother of Groom Last name", "")),   
                                   "female", "", get_deceased_status(row, "Mother of Groom"), mother_groom_notes)

    father_bride = generate_person(clean_str(row["Father of Bride"]), clean_str(row["Last name - Bride"]),
                                   clean_str(row.get("Alias - Father of Bride First Name", "")), clean_str(row.get("Alias - Father of Bride Last Name", "")),
                                   "male", "", get_deceased_status(row, "Father of Bride"), father_bride_notes)

    mother_bride = generate_person(clean_str(row["First name - Mother of Bride"]), clean_str(row["Last name - Mother of Bride"]),
                                   clean_str(row.get("Alias - Mother of Bride First name", "")), clean_str(row.get("Alias - Mother of Bride Last name", "")),  
                                   "female", "", get_deceased_status(row, "Mother of Bride"), mother_bride_notes)


    persons = [groom, bride, father_groom, mother_groom, father_bride, mother_bride]

    print(format_date(row["Date"], row["Year"]))

    # ✅ Define Relationships
    relationships = [
        Relationship(from_id=groom.id, to_id=bride.id, type="MARRIED_TO", date=format_date(row["Date"], row["Year"])),
        Relationship(from_id=bride.id, to_id=groom.id, type="MARRIED_TO", date=format_date(row["Date"], row["Year"])),
        Relationship(from_id=groom.id, to_id=father_groom.id, type="SON_OF"),
        Relationship(from_id=groom.id, to_id=mother_groom.id, type="SON_OF"),
        Relationship(from_id=bride.id, to_id=father_bride.id, type="DAUGHTER_OF"),
        Relationship(from_id=bride.id, to_id=mother_bride.id, type="DAUGHTER_OF"),
        Relationship(from_id=father_groom.id, to_id=mother_groom.id, type="MARRIED_TO"),
        Relationship(from_id=father_bride.id, to_id=mother_bride.id, type="MARRIED_TO"),
    ]

    return {"persons": [p.model_dump() for p in persons], "relationships": [r.model_dump() for r in relationships]}

In [377]:
import json
neo4j_df = pd.read_csv("./capitalized_names.csv")

# ✅ Process DataFrame from df_notes
records = [process_row(row) for _, row in neo4j_df.iterrows()]

# ✅ Save JSON output
with open("neo4j_data.json", "w") as f:
    json.dump(records, f, indent=2)

print("✅ Successfully processed marriage records with Correct Locations!")

30 Jul 1821
30 Jul 1821
30 Jul 1820
30 Jul 1822
30 Jul 1821
✅ Successfully processed marriage records with Correct Locations!


# Insert to Neo4j

In [398]:
from pydantic import BaseModel
from typing import List, Optional
import pandas as pd
import json
import numpy as np
import re
from neo4j import GraphDatabase

# Define connection details
URI = "bolt://localhost:7687"  # Neo4j Bolt connection
USERNAME = "neo4j"
PASSWORD = "password"  # Replace with the password you set

# Create a Neo4j Driver instance
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

with open("neo4j_data.json", "r") as f:
    records = json.load(f)

# ✅ Display the first few records for review
#num_records_to_display = min(5, len(records))  # Show up to 5 records
#records[:num_records_to_display]  # Display sample records

In [399]:
from neo4j import GraphDatabase

# ✅ Neo4j Connection Settings
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Create Neo4j Driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def insert_record(tx, persons, relationships):
    """Inserts persons and relationships while ensuring `notes` is stored correctly."""

    # ✅ Step 1: Create Persons (Independent Insert)
    for person in persons:
        query = """
        CREATE (p:Person {
            uid: $uid,
            fullname: $fullname,
            firstname: $firstname,
            lastname: $lastname,
            alias_firstname: $alias_firstname,
            alias_lastname: $alias_lastname,
            gender: $gender,
            location: $location,
            deceased: $deceased,
            notes: $notes
        })
        """
        params = {
            "uid": person["id"],  # Unique ID
            "fullname": person["fullname"],
            "firstname": person["firstname"],
            "lastname": person["lastname"],
            "alias_firstname": person["alias_firstname"],
            "alias_lastname": person["alias_lastname"],
            "gender": person["gender"],
            "location": person["location"],
            "deceased": person["deceased"],
            "notes": person["notes"]  
        }
        tx.run(query, **params)

    # ✅ Step 2: Create Relationships (Using UID)
    processed_marriages = set()  # ✅ Track already created marriages to prevent duplicates

    for rel in relationships:
        if rel["type"] == "MARRIED_TO":
            marriage_key = tuple(sorted([rel["from_id"], rel["to_id"]]))  # Unique key for marriage
            
            if marriage_key in processed_marriages:
                continue  # ✅ Skip duplicate marriage entry
            processed_marriages.add(marriage_key)

            query = """
            MATCH (a:Person {uid: $from_uid})
            MATCH (b:Person {uid: $to_uid})
            CREATE (a)-[:MARRIED_TO {date: COALESCE($date, ""), bidirectional: true}]->(b),
                   (b)-[:MARRIED_TO {date: COALESCE($date, ""), bidirectional: true}]->(a)
            """
        else:
            query = """
            MATCH (a:Person {uid: $from_uid})
            MATCH (b:Person {uid: $to_uid})
            CREATE (a)-[:`""" + rel["type"] + """`]->(b)
            """

        params = {
            "from_uid": rel["from_id"],  # Unique ID of from person
            "to_uid": rel["to_id"],      # Unique ID of to person
            "date": rel.get("date", "")  # ✅ Ensure date is always present (empty string if missing)
        }

        tx.run(query, **params)


# ✅ Process and Insert Each Record
with driver.session() as session:
    for record in records:
        persons = record["persons"]
        relationships = record["relationships"]
        session.execute_write(insert_record, persons, relationships)

print("✅ Successfully inserted marriage records with ONE bidirectional `MARRIED_TO` relationship per couple!")

✅ Successfully inserted marriage records with ONE bidirectional `MARRIED_TO` relationship per couple!


# Entity resolution

In [406]:
from neo4j import GraphDatabase

# ✅ Neo4j Connection Settings
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Configurable Batch Size
BATCH_SIZE = 1  # Change this value to adjust batch processing size

# ✅ Create Neo4j Driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def recursive_merge(tx, batch_size):
    """Recursively merges duplicate persons based on fullname, spouse fullname, and marriage date."""
    merge_query = f"""
    CALL apoc.periodic.iterate(
    "MATCH (p1:Person)-[m1:MARRIED_TO]->(s1:Person)
     MATCH (p2:Person)-[m2:MARRIED_TO]->(s2:Person)
     WHERE p1.fullname = p2.fullname
     AND s1.fullname = s2.fullname
     AND COALESCE(m1.date, '') = COALESCE(m2.date, '')  // ✅ Ensure marriage date matches, including empty strings
     AND id(p1) < id(p2)  // Prevent duplicate merging
     RETURN p1, p2, m1, m2, s1, s2",
     
    "WITH p1, p2, m1, m2, s1, s2
     // ✅ Merge nodes while avoiding duplicate relationships
     CALL apoc.refactor.mergeNodes([p1, p2]) YIELD node
     // ✅ Ensure only one MARRIED_TO relationship remains with correct date
     WITH node, s1, m1, m2
     MATCH (node)-[r:MARRIED_TO]->(s1)
     WITH node, s1, COLLECT(r) AS rels
     CALL apoc.refactor.mergeRelationships(rels, {{properties: 'combine'}}) YIELD rel
     RETURN COUNT(*)",
    
    {{batchSize: {batch_size}, parallel: false}})  // ✅ Dynamic batch size
    """

    tx.run(merge_query)

def count_nodes(tx):
    """Counts the total number of Person nodes."""
    result = tx.run("MATCH (p:Person) RETURN count(p) AS count")
    return result.single()["count"]

def merge_until_done(batch_size):
    """Runs the merging process iteratively until no more merges are possible."""
    with driver.session() as session:
        prev_count = session.execute_read(count_nodes)
        
        while True:
            session.execute_write(recursive_merge, batch_size)
            current_count = session.execute_read(count_nodes)
            
            print(f"🔄 Nodes before merge: {prev_count}, after merge: {current_count}")
            
            if current_count == prev_count:
                break  # Exit when no further merges happen
            
            prev_count = current_count  # Update previous count

# ✅ Run the Recursive Merging Process
merge_until_done(BATCH_SIZE)

print("🎯 Recursive merging complete: All duplicate persons merged successfully!")

🔄 Nodes before merge: 25, after merge: 25
🎯 Recursive merging complete: All duplicate persons merged successfully!


In [407]:
from neo4j import GraphDatabase

# ✅ Neo4j Connection Settings
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Create Neo4j Driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def deduplicate_relationships(tx):
    """Finds and removes duplicate relationships with the same type and properties, keeping only one."""
    deduplicate_query = """
    MATCH (a)-[r]->(b)
    WITH a, b, type(r) AS relType, properties(r) AS relProps, COLLECT(r) AS rels
    WHERE size(rels) > 1  // ✅ If there are duplicates

    WITH rels[1..] AS duplicates  // ✅ Keep only one, delete others
    UNWIND duplicates AS duplicate
    DELETE duplicate  // ✅ Remove redundant relationships
    """
    tx.run(deduplicate_query)

def run_deduplication():
    """Runs the relationship deduplication process."""
    with driver.session() as session:
        session.execute_write(deduplicate_relationships)

# ✅ Run the Relationship Deduplication Process
run_deduplication()
print("🎯 Relationship deduplication completed successfully!")

🎯 Relationship deduplication completed successfully!


# Merge with Alias

In [408]:
from neo4j import GraphDatabase

# ✅ Neo4j Connection Settings
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Create Neo4j Driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def merge_aliases_with_spouse_validation(tx):
    """Merges persons considering aliases and ensuring they are married to the same person."""
    merge_query = """
    CALL apoc.periodic.iterate(
        "MATCH (p:Person)
         WHERE (p.alias_firstname IS NOT NULL AND p.alias_firstname <> '' AND p.alias_firstname <> 'Nan') 
            OR (p.alias_lastname IS NOT NULL AND p.alias_lastname <> '' AND p.alias_lastname <> 'Nan')

         WITH 
             p, 
             apoc.convert.toList(p.alias_firstname) AS AliasFirstList,
             apoc.convert.toList(p.alias_lastname) AS AliasLastList

         WITH 
             p,
             [x IN AliasFirstList WHERE x <> '' AND x <> 'Nan'] + [p.firstname] AS FirstnameVariants,
             [x IN AliasLastList WHERE x <> '' AND x <> 'Nan'] + [p.lastname] AS LastnameVariants

         UNWIND FirstnameVariants AS VariantFirst
         UNWIND LastnameVariants AS VariantLast

         MATCH (spouse:Person)-[m:MARRIED_TO]-(p)

         WITH 
             VariantFirst, VariantLast, spouse, m.date AS MarriageDate,
             apoc.convert.toList(spouse.alias_firstname) AS SpouseAliasFirstList,
             apoc.convert.toList(spouse.alias_lastname) AS SpouseAliasLastList

         WITH 
             VariantFirst, VariantLast, spouse, MarriageDate,
             [x IN SpouseAliasFirstList WHERE x <> '' AND x <> 'Nan'] + [spouse.firstname] AS SpouseFirstVariants,
             [x IN SpouseAliasLastList WHERE x <> '' AND x <> 'Nan'] + [spouse.lastname] AS SpouseLastVariants

         UNWIND SpouseFirstVariants AS SpouseVariantFirst
         UNWIND SpouseLastVariants AS SpouseVariantLast

         RETURN DISTINCT VariantFirst, VariantLast, SpouseVariantFirst, SpouseVariantLast, MarriageDate",

        "WITH VariantFirst, VariantLast, SpouseVariantFirst, SpouseVariantLast, MarriageDate
         
         MATCH (p1:Person)-[m1:MARRIED_TO]->(s1:Person)
         MATCH (p2:Person)-[m2:MARRIED_TO]->(s2:Person)
         
         WHERE 
             (VariantFirst = p1.firstname OR VariantFirst IN split(toString(p1.alias_firstname), ', '))
             AND (VariantLast = p1.lastname OR VariantLast IN split(toString(p1.alias_lastname), ', '))
             AND (VariantFirst = p2.firstname OR VariantFirst IN split(toString(p2.alias_firstname), ', '))
             AND (VariantLast = p2.lastname OR VariantLast IN split(toString(p2.alias_lastname), ', '))
             AND (SpouseVariantFirst = s1.firstname OR SpouseVariantFirst IN split(toString(s1.alias_firstname), ', '))
             AND (SpouseVariantLast = s1.lastname OR SpouseVariantLast IN split(toString(s1.alias_lastname), ', '))
             AND (SpouseVariantFirst = s2.firstname OR SpouseVariantFirst IN split(toString(s2.alias_firstname), ', '))
             AND (SpouseVariantLast = s2.lastname OR SpouseVariantLast IN split(toString(s2.alias_lastname), ', '))
             AND COALESCE(m1.date, '') = COALESCE(m2.date, '')  // Ensures same marriage date
             AND id(p1) < id(p2)  // Prevents duplicate merging

         // ✅ Merge Nodes & Preserve Information
         CALL apoc.refactor.mergeNodes([p1, p2], {properties: {
             alias_firstname: 'combine',
             alias_lastname: 'combine',
             notes: 'combine',
             location: 'combine',
             deceased: 'combine'
         }}) YIELD node

         RETURN COUNT(*)",
        
        {batchSize: 10, parallel: false}
    )
    """
    tx.run(merge_query)

In [409]:
def clean_merged_properties(tx):
    """Cleans up alias_firstname and alias_lastname properties after merging, ensuring proper format."""
    clean_query = """
    MATCH (p:Person)
    WITH p,
        [x IN apoc.convert.toList(p.alias_firstname) WHERE x IS NOT NULL AND x <> '' AND x <> 'Nan'] AS CleanFirstnames,
        [x IN apoc.convert.toList(p.alias_lastname) WHERE x IS NOT NULL AND x <> '' AND x <> 'Nan'] AS CleanLastnames

    SET p.alias_firstname = 
        CASE 
            WHEN size(CleanFirstnames) = 0 THEN NULL
            WHEN size(CleanFirstnames) = 1 THEN CleanFirstnames[0]  // ✅ Convert single-element list to string
            ELSE CleanFirstnames 
        END,

        p.alias_lastname = 
        CASE 
            WHEN size(CleanLastnames) = 0 THEN NULL
            WHEN size(CleanLastnames) = 1 THEN CleanLastnames[0]  // ✅ Convert single-element list to string
            ELSE CleanLastnames 
        END
    """
    tx.run(clean_query)

def merge_until_done_aliases():
    """Runs alias-based merging iteratively until no more merges occur, then cleans up properties."""
    with driver.session() as session:
        prev_count = session.run("MATCH (p:Person) RETURN count(p) AS count").single()[0]
        
        while True:
            session.execute_write(merge_aliases_with_spouse_validation)
            
            current_count = session.run("MATCH (p:Person) RETURN count(p) AS count").single()[0]
            print(f"🔄 Nodes before merge: {prev_count}, after merge: {current_count}")
            
            if current_count == prev_count:
                session.execute_write(clean_merged_properties)  # 🔥 Run cleanup after merging
                break  # Exit when no further merges happen
            
            prev_count = current_count  # Update previous count

# ✅ Run the Alias-Based Merging Process
merge_until_done_aliases()
print("🎯 Alias-based merging complete & cleanup successful! 🚀")

🔄 Nodes before merge: 25, after merge: 25
🎯 Alias-based merging complete & cleanup successful! 🚀


In [410]:
# ✅ Run the Relationship Deduplication Process
run_deduplication()
# ✅ Run the Recursive Merging Process
merge_until_done(BATCH_SIZE)
run_deduplication()
print("🎯 Relationship deduplication completed successfully!")
merge_until_done_aliases()

🔄 Nodes before merge: 25, after merge: 25
🎯 Relationship deduplication completed successfully!
🔄 Nodes before merge: 25, after merge: 25


# Assign hierarchical embeddings 

#### Without notes included

In [411]:
from openai import OpenAI
from neo4j import GraphDatabase
import numpy as np
import json

# ✅ Neo4j Connection
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Ollama (Granite) Embedding Model
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
MODEL_NAME = "granite-embedding:278m"

# ✅ Function to ensure properties exist in Neo4j
def initialize_person_properties(tx):
    query = """
    MATCH (p:Person)
    SET p.birthday = COALESCE(p.birthday, ""),
        p.embedding_node = COALESCE(p.embedding_node, ""),
        p.embedding_spouse = COALESCE(p.embedding_spouse, ""),
        p.embedding_parents = COALESCE(p.embedding_parents, "")
    """
    tx.run(query)

# ✅ Function to get embeddings from Ollama
def get_embedding(text):
    #print(f"🔹 Generating embedding for: {text}")  # Debugging
    response = client.embeddings.create(model=MODEL_NAME, input=[text])
    return response.data[0].embedding

# ✅ Function to fetch persons and their relationships
def fetch_graph_data(tx):
    query = """
    MATCH (p:Person)
    OPTIONAL MATCH (p)-[m:MARRIED_TO]->(spouse:Person)
    OPTIONAL MATCH (p)-[r:SON_OF|DAUGHTER_OF]->(parent:Person)
    RETURN p, m.date AS marriage_date, spouse, COLLECT(parent) AS parents
    """
    return list(tx.run(query))

# ✅ Convert Neo4j Node to Dictionary
def node_to_dict(n):
    return dict(n) if n else {}

# ✅ Function to generate hierarchical text descriptions
def generate_text(node, spouse, marriage_date, parents):
    node_dict = node_to_dict(node)
    spouse_dict = node_to_dict(spouse)
    parents_dict = [node_to_dict(parent) for parent in parents]

    # ✅ Node embedding (individual properties)
    node_text = f"Person: {node_dict.get('fullname', 'Unknown')}, Alias: {node_dict.get('alias_firstname', '')} {node_dict.get('alias_lastname', '')}, Birthday: {node_dict.get('birthday', '')}, Location: {node_dict.get('location', '')}, Deceased: {node_dict.get('deceased', '')}"

    # ✅ Spouse embedding (includes spouse properties)
    spouse_text = node_text
    if spouse:
        spouse_text += f" | Married to {spouse_dict.get('fullname', 'Unknown')}, Alias: {spouse_dict.get('alias_firstname', '')} {spouse_dict.get('alias_lastname', '')}, Birthday: {spouse_dict.get('birthday', '')}, Location: {spouse_dict.get('location', '')}, Deceased: {spouse_dict.get('deceased', '')}, Marriage Date: {marriage_date}"

    # ✅ Parent embedding (includes both parents' properties)
    parent_text = node_text
    if parents_dict:
        parent_text += " | Parents: " + " & ".join([
            f"{p.get('fullname', 'Unknown')}, Alias: {p.get('alias_firstname', '')} {p.get('alias_lastname', '')}, Birthday: {p.get('birthday', '')}, Location: {p.get('location', '')}, Deceased: {p.get('deceased', '')}"
            for p in parents_dict
        ])

    return node_text, spouse_text, parent_text



# ✅ Function to process embeddings and store them in Neo4j
def process_embeddings():
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

    with driver.session() as session:
        # ✅ Ensure `embedding_node`, `embedding_spouse`, and `embedding_parents` properties exist
        session.execute_write(initialize_person_properties)

        # ✅ Fetch graph data
        nodes = session.execute_read(fetch_graph_data)

        for record in nodes:
            node = record["p"]
            spouse = record["spouse"]
            marriage_date = record["marriage_date"]
            parents = record["parents"]

            # 🔹 Generate hierarchical descriptions
            node_text, spouse_text, parent_text = generate_text(node, spouse, marriage_date, parents)

            # 🔹 Compute embeddings
            embedding_node = get_embedding(node_text)
            embedding_spouse = get_embedding(spouse_text)
            embedding_parents = get_embedding(parent_text)

            # 🔹 Ensure embeddings are JSON-compatible lists
            embedding_node = embedding_node.tolist() if isinstance(embedding_node, np.ndarray) else embedding_node
            embedding_spouse = embedding_spouse.tolist() if isinstance(embedding_spouse, np.ndarray) else embedding_spouse
            embedding_parents = embedding_parents.tolist() if isinstance(embedding_parents, np.ndarray) else embedding_parents


            # ✅ Store embeddings in Neo4j using `uid`
            session.run(
                """
                MATCH (p:Person {uid: $uid})
                SET p.embedding_node = $embedding_node,
                    p.embedding_spouse = $embedding_spouse,
                    p.embedding_parents = $embedding_parents
                """,
                uid=node["uid"],  # ✅ Use `uid` instead of `id`
                embedding_node=embedding_node,
                embedding_spouse=embedding_spouse,
                embedding_parents=embedding_parents
            )


    print("\n✅ All embeddings stored in Neo4j.")

# 🔥 Run embedding process
process_embeddings()




✅ All embeddings stored in Neo4j.


# Embeddings Merge

In [412]:
import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the OpenAI-compatible Ollama client
client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",
)

# Define the model name (use "snowflake-arctic-embed2" if "granite-embedding:278m" is unavailable)
MODEL_NAME = "granite-embedding:278m"

# Function to get embeddings from Ollama
def get_embedding(text):
    response = client.embeddings.create(model=MODEL_NAME, input=[text])
    return response.data[0].embedding

### Check similarity between all possible couples

In [529]:
import pandas as pd
import numpy as np
from neo4j import GraphDatabase
from sklearn.metrics.pairwise import cosine_similarity
import itertools

# ✅ Neo4j Connection
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Create Neo4j Driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def fetch_spouse_embeddings(tx):
    """Fetch all persons with their spouse embeddings from Neo4j."""
    query = """
    MATCH (p:Person)
    WHERE p.embedding_spouse IS NOT NULL
    RETURN p.uid AS uid, 
           p.fullname AS person_name, 
           p.embedding_spouse AS embedding_spouse, 
           [(p)-[:MARRIED_TO]-(spouse) | spouse.fullname][0] AS spouse_name,
           [(p)-[:MARRIED_TO]-(spouse) | spouse.uid][0] AS spouse_uid
    """
    return list(tx.run(query))

def compare_couple_embeddings():
    """Compute cosine similarity between couples and return as a DataFrame with UIDs."""
    with driver.session() as session:
        couples = session.execute_read(fetch_spouse_embeddings)

    couples_data = []
    for record in couples:
        if record["embedding_spouse"]:
            person_uid = record["uid"]
            spouse_uid = record["spouse_uid"]
            
            # Ensure a consistent ordering for couple names
            sorted_names = sorted([(record["person_name"], person_uid), (record["spouse_name"], spouse_uid)])
            couple_name = f"{sorted_names[0][0]} & {sorted_names[1][0]}"
            couple_uids = (sorted_names[0][1], sorted_names[1][1])  # (UID1, UID2)

            couples_data.append((couple_uids[0], couple_uids[1], couple_name, np.array(record["embedding_spouse"], dtype=np.float32)))

    results = []
    seen_comparisons = set()

    print(f"🔍 **Total Couples Loaded for Comparison:** {len(couples_data)}")

    # ✅ Corrected Loop - Unpacking the correct number of values
    for (uid1a, uid1b, couple1, emb1), (uid2a, uid2b, couple2, emb2) in itertools.combinations(couples_data, 2):

        # ✅ Skip same couple comparison (checking if both UIDs match)
        if {uid1a, uid1b} == {uid2a, uid2b}:
            continue

        # ✅ Ensure unique comparisons (avoid duplicate checks)
        if (couple1, couple2) in seen_comparisons or (couple2, couple1) in seen_comparisons:
            continue
        seen_comparisons.add((couple1, couple2))

        # ✅ Compute similarity
        similarity_score = cosine_similarity([emb1], [emb2])[0][0]

        if similarity_score > 0.89:
            results.append({
                "Couple 1": couple1,
                "Couple 2": couple2,
                "UID 1A": uid1a,
                "UID 1B": uid1b,
                "UID 2A": uid2a,
                "UID 2B": uid2b,
                "Similarity Score": round(similarity_score, 4)
            })

    df = pd.DataFrame(results).sort_values(by="Similarity Score", ascending=False)
    print(f"\n📊 **Comparison Completed. High-Similarity Pairs Found:** {len(df)}\n")
    
    return df



# 🔥 Run the optimized comparison and store results in a DataFrame
df_filtered = compare_couple_embeddings()
df_filtered_sorted = df_filtered.sort_values(by="Similarity Score", ascending=False)

🔍 **Total Couples Loaded for Comparison:** 25

📊 **Comparison Completed. High-Similarity Pairs Found:** 9



In [530]:
# Sort the filtered DataFrame by similarity score in descending order
df_filtered_sorted.head()

Unnamed: 0,Couple 1,Couple 2,UID 1A,UID 1B,UID 2A,UID 2B,Similarity Score
1,Giuseppe Iannotti & Teresa Buttari,Giuseppe Iannotti & Teresa Buta,648a36e7-704a-41df-9854-a6a8bc8b67b9,db826d29-e13c-4d0f-859a-84d65a8c5468,9d11dc9f-6f0c-4abf-b88c-764eeb1185e7,352eacad-75f0-4cb5-bbf5-deab9012b11a,0.9905
0,Giuseppe Iannotti & Teresa Buttari,Sereno Iannotti & Teresa Buttari,648a36e7-704a-41df-9854-a6a8bc8b67b9,db826d29-e13c-4d0f-859a-84d65a8c5468,60d2c55b-08fd-40ee-82a2-a0a02fce6ea1,db826d29-e13c-4d0f-859a-84d65a8c5468,0.9333
4,Sereno Iannotti & Teresa Buttari,Giuseppe Iannotti & Teresa Buta,60d2c55b-08fd-40ee-82a2-a0a02fce6ea1,db826d29-e13c-4d0f-859a-84d65a8c5468,9d11dc9f-6f0c-4abf-b88c-764eeb1185e7,352eacad-75f0-4cb5-bbf5-deab9012b11a,0.9299
5,Daniel Iannotti & Marianna Rua Postolo,Diablo Iannotti & Sarah Veronica Maria Ronsuar...,f989718e-ce24-40e1-8556-809a96864b8f,e17ed22d-5dea-434a-9dee-7732db1e9fd6,705f1542-75da-43bf-8d77-d84918d5dcc9,a842f924-3b41-4093-ab6d-f05589e9bca5,0.9044
8,Moana Troiana & Vincenzo Iannotti,Anna Berardina De Santis & Ascenzo Iannotti,efe7ab81-3a4e-4517-89a6-fd4c2ce35c4b,4fb77f0e-6048-46f6-803b-5ae279f82673,a36f2a30-3098-4d3d-aafe-a7dfb14c615d,4fff7d44-4a99-4ce4-8ae6-9f6d2f170b1e,0.9018


In [531]:
from neo4j import GraphDatabase
import pandas as pd

# ✅ Neo4j Connection
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Create Neo4j Driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def fetch_couple_subgraph(tx, couple1_personA, couple1_personB, couple2_personA, couple2_personB):
    """Fetch all properties (excluding embeddings) and 1-hop relationships for both couples."""
    query = """
    MATCH (p1:Person)-[r1]-(n1)
    WHERE p1.fullname IN [$couple1_personA, $couple1_personB]

    MATCH (p2:Person)-[r2]-(n2)
    WHERE p2.fullname IN [$couple2_personA, $couple2_personB]

    RETURN 
      COLLECT(DISTINCT {
          Person: apoc.map.removeKeys(PROPERTIES(p1), ['embedding_node', 'embedding_parents', 'embedding_spouse']), 
          Relationship: TYPE(r1), 
          ConnectedNode: apoc.map.removeKeys(PROPERTIES(n1), ['embedding_node', 'embedding_parents', 'embedding_spouse'])
      }) AS Couple1_Details,

      COLLECT(DISTINCT {
          Person: apoc.map.removeKeys(PROPERTIES(p2), ['embedding_node', 'embedding_parents', 'embedding_spouse']), 
          Relationship: TYPE(r2), 
          ConnectedNode: apoc.map.removeKeys(PROPERTIES(n2), ['embedding_node', 'embedding_parents', 'embedding_spouse'])
      }) AS Couple2_Details
    """

    result = tx.run(
        query, 
        couple1_personA=couple1_personA, 
        couple1_personB=couple1_personB, 
        couple2_personA=couple2_personA, 
        couple2_personB=couple2_personB
    )
    
    return result.single()

def get_top_couples_subgraph(uid1a, uid1b, uid2a, uid2b):
    """Fetches subgraph details for both couples using four UIDs."""
    query = """
    MATCH (p1a:Person {uid: $uid1a})-[r1a]-(related1a)
    MATCH (p1b:Person {uid: $uid1b})-[r1b]-(related1b)
    MATCH (p2a:Person {uid: $uid2a})-[r2a]-(related2a)
    MATCH (p2b:Person {uid: $uid2b})-[r2b]-(related2b)
    
    RETURN 
        [
            {Person: p1a, Relationship: TYPE(r1a), ConnectedNode: related1a},
            {Person: p1b, Relationship: TYPE(r1b), ConnectedNode: related1b}
        ] AS Couple1_Details,
        
        [
            {Person: p2a, Relationship: TYPE(r2a), ConnectedNode: related2a},
            {Person: p2b, Relationship: TYPE(r2b), ConnectedNode: related2b}
        ] AS Couple2_Details
    """
    
    with driver.session() as session:
        result = session.run(query, uid1a=uid1a, uid1b=uid1b, uid2a=uid2a, uid2b=uid2b).single()
        return result.data() if result else {}


# 🔥 Fetch subgraph
#subgraph = get_top_couples_subgraph()

In [532]:
import networkx as nx
import matplotlib.pyplot as plt

def visualize_subgraph_with_networkx(subgraph):
    """Visualizes the subgraph using NetworkX and Matplotlib with improved edge label positioning."""
    G = nx.Graph()

    # Extract relationships from subgraph details
    all_relationships = subgraph["Couple1_Details"] + subgraph["Couple2_Details"]

    # Add nodes with attributes (excluding embeddings)
    for entry in all_relationships:
        person = entry["Person"]
        connected_node = entry["ConnectedNode"]
        relationship = entry["Relationship"]

        person_uid = person["uid"]
        connected_uid = connected_node["uid"]

        person_label = person["fullname"]
        connected_label = connected_node["fullname"]

        # Add nodes
        G.add_node(person_uid, label=person_label, color="lightblue")
        G.add_node(connected_uid, label=connected_label, color="lightgreen")

        # Add edges
        G.add_edge(person_uid, connected_uid, label=relationship)

    # Define layout with increased spacing
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G, seed=42, k=2)  # Increased spacing

    # Node labels
    labels = nx.get_node_attributes(G, "label")
    colors = [G.nodes[n]["color"] for n in G.nodes()]

    # Edge labels
    edge_labels = nx.get_edge_attributes(G, "label")

    # Draw network with adjusted positions
    nx.draw(G, pos, with_labels=True, labels=labels, node_color=colors, edge_color="gray", node_size=3000, font_size=10)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=9, label_pos=0.3, rotate=False)  # Adjusted label position

    plt.title("Graph Visualization of Couples and Relationships (Improved Readability)")
    plt.show()

# 🔥 Run visualization
#visualize_subgraph_with_networkx(subgraph)

In [533]:
df_filtered_sorted.head(1)

Unnamed: 0,Couple 1,Couple 2,UID 1A,UID 1B,UID 2A,UID 2B,Similarity Score
1,Giuseppe Iannotti & Teresa Buttari,Giuseppe Iannotti & Teresa Buta,648a36e7-704a-41df-9854-a6a8bc8b67b9,db826d29-e13c-4d0f-859a-84d65a8c5468,9d11dc9f-6f0c-4abf-b88c-764eeb1185e7,352eacad-75f0-4cb5-bbf5-deab9012b11a,0.9905


### LLM decision to merge 

In [534]:

# ✅ Format the structured prompt
prompt = f"""
You are a genealogical expert specializing in Italian records. 
Your task is to determine whether the following two couples are likely the same, using **historical naming conventions, 
familial relationships, and regional patterns**.

### **Instructions:**
- **Analyze relationships**: Are the same people appearing as parents, spouses, or children in both couples?
- **Check surname variations**: Are there small spelling differences, phonetic changes, or aliases that are common in Italian records?
  - Shortened versions (e.g., "Enzo" → "Vincenzo")
  - Regional variations
  - Phonetic similarities (e.g., "Maria Anna" -> "Marianna")
- **Compare locations**: Do they come from the same or nearby locations?
- **Alias validation**: If alias names exist, could they be alternative spellings or diminutive versions?
- "Timeframe": are the dates within the span of one generation, is it plausible?

### **Couple 1 Data:**
{json.dumps(couple1_data, indent=4)}

### **Couple 2 Data:**
{json.dumps(couple2_data, indent=4)}

---

**Final Decision:**
- **100-90% → Almost certain match**
- **89-70% → Highly probable**
- **69-50% → Possible match**
- **49-30% → Unlikely match**
- **29-0% → Almost certainly different couples**

**Expected JSON Output Format:**
```json
{{
    "match_probability": 85,
    "justification": "The names and relationships are highly similar, with minor spelling differences in the surnames."
}}
"""

In [552]:
import json
from openai import OpenAI
from neo4j import GraphDatabase

# ✅ Neo4j Connection
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# ✅ Initialize the OpenAI-compatible Ollama client
client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",
)

import json
from openai import OpenAI

# ✅ Initialize the OpenAI-compatible Ollama client
client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",
)

def remove_embedding_properties(data):
    """Removes embedding-related properties from a dictionary."""
    return {k: v for k, v in data.items() if not k.startswith("embedding_")}

def extract_couple_details(subgraph):
    """Extracts structured data for both couples from the subgraph."""
    couple1 = []
    couple2 = []

    # Process first couple
    for entry in subgraph["Couple1_Details"]:
        person = remove_embedding_properties(entry["Person"])
        connected_node = remove_embedding_properties(entry["ConnectedNode"])
        relationship = entry["Relationship"]

        couple1.append({
            "person_details": person,
            "relationship": relationship,
            "related_person_details": connected_node
        })

    # Process second couple
    for entry in subgraph["Couple2_Details"]:
        person = remove_embedding_properties(entry["Person"])
        connected_node = remove_embedding_properties(entry["ConnectedNode"])
        relationship = entry["Relationship"]

        couple2.append({
            "person_details": person,
            "relationship": relationship,
            "related_person_details": connected_node
        })

    return couple1, couple2


def determine_similarity(couple1_data, couple2_data):
    """Sends extracted subgraph data to LLM and retrieves similarity probability."""
    prompt = f"""
    You are a genealogical expert specializing in Italian records. 
    Your task is to determine if two couples represent the same individuals based on names, relationships, and other available information.

    ### **Couple 1:**
    {json.dumps(couple1_data, indent=4)}

    ### **Couple 2:**
    {json.dumps(couple2_data, indent=4)}

    **Expected JSON Output Format:**
    ```json
    {{
        "match_probability": 85,
        "justification": "The names and relationships are highly similar, but some minor differences exist."
    }}
    ```
    Return only a valid JSON response, with no extra explanations.
    """

    try:
        print("🛠️ SENDING LLM REQUEST...")  # Debug
        response = client.chat.completions.create(
            model="mistral:7b",
            temperature=0,
            messages=[{"role": "user", "content": prompt}]
        )

        raw_response = response.choices[0].message.content.strip()
        print(f"🔍 LLM Raw Response BEFORE PARSING: {raw_response}")  # Debug

        # ✅ Debugging: Print Type Before Parsing
        print(f"📌 Type of Raw Response: {type(raw_response)}")

        try:
            result = json.loads(raw_response)  # Parse JSON
            print(f"✅ JSON Parsed Successfully: {result}")  # Debug
            print(f"📌 Type of Parsed JSON: {type(result)}")  # Debug
        except json.JSONDecodeError as e:
            print(f"⚠️ JSON Parsing Error: {e}")
            return 0, "JSON parsing error."

        # ✅ Ensure valid response format
        if isinstance(result, dict) and "match_probability" in result and "justification" in result:
            probability = result.get("match_probability", 0)
            justification = result.get("justification", "No justification provided.")
            print(f"✅ Extracted Probability: {probability}, Justification: {justification}")  # Debug
            return probability, justification
        else:
            print(f"⚠️ Unexpected LLM response format: {result}")
            return 0, "Invalid response format."

    except Exception as e:
        print(f"⚠️ Error in `determine_similarity()`: {e}")
        return 0, f"Error: {e}"


def merge_couples(tx, couple1_uids, couple2_uids):
    """Merges the wife with the wife and the husband with the husband separately."""
    
    # ✅ Combine UID lists and remove duplicates
    unique_uids = list(set(couple1_uids + couple2_uids))
    print(f"🔹 Unique UIDs Being Merged: {unique_uids}")  # Debug

    # ✅ Ensure all nodes exist before merging
    query_check = """
    MATCH (p:Person)
    WHERE p.uid IN $unique_uids
    RETURN p.uid AS uid, p.gender AS gender
    """
    result = tx.run(query_check, unique_uids=unique_uids).data()
    
    # ✅ Separate husbands and wives
    husbands = [r["uid"] for r in result if r["gender"] == "male"]
    wives = [r["uid"] for r in result if r["gender"] == "female"]

    # ✅ Merge Wives First
    if len(wives) > 1:
        query_merge_wives = """
        MATCH (p:Person)
        WHERE p.uid IN $wives
        WITH COLLECT(p) AS wife_nodes
        CALL apoc.refactor.mergeNodes(wife_nodes, {properties: {
            alias_firstname: 'combine',
            alias_lastname: 'combine',
            notes: 'combine',
            location: 'combine',
            deceased: 'combine',
            embedding_node: 'discard',
            embedding_spouse: 'discard',
            embedding_parents: 'discard'
        }}) YIELD node
        RETURN node
        """
        tx.run(query_merge_wives, wives=wives)


    # ✅ Merge Husbands Next
    if len(husbands) > 1:
        query_merge_husbands = """
        MATCH (p:Person)
        WHERE p.uid IN $husbands
        WITH COLLECT(p) AS husband_nodes
        CALL apoc.refactor.mergeNodes(husband_nodes, {properties: {
            alias_firstname: 'combine',
            alias_lastname: 'combine',
            notes: 'combine',
            location: 'combine',
            deceased: 'combine',
            embedding_node: 'discard',
            embedding_spouse: 'discard',
            embedding_parents: 'discard'
        }}) YIELD node
        RETURN node
        """
        tx.run(query_merge_husbands, husbands=husbands)




def process_and_merge_couples(subgraph):
    """Extracts data, analyzes similarity, and merges if probability > 89%."""
    try:
        print(f"🛠️ Processing Subgraph: {subgraph}")  # Debug
        couple1_data, couple2_data = extract_couple_details(subgraph)

        print("🛠️ Extracted Couple 1 Data:", couple1_data)  # Debug
        print("🛠️ Extracted Couple 2 Data:", couple2_data)  # Debug

        # ✅ Get similarity probability from LLM
        probability, justification = determine_similarity(couple1_data, couple2_data)

        print(f"🔍 FINAL LLM Probability Score: {probability}%")

        # ✅ FIX: Ensure probability is an integer before checking conditions
        if not isinstance(probability, (int, float)):
            print(f"⚠️ Probability is invalid: {probability}. Defaulting to 0.")
            probability = 0

        if probability > 89:
            print("✅ High probability match! Proceeding with merge...")

            # ✅ Extract UID lists for merging
            couple1_uids = list({entry["person_details"]["uid"] for entry in couple1_data})
            couple2_uids = list({entry["person_details"]["uid"] for entry in couple2_data})

            print(f"🔹 Extracted Couple 1 UIDs: {couple1_uids}")  # Debug
            print(f"🔹 Extracted Couple 2 UIDs: {couple2_uids}")  # Debug

            # ✅ Ensure unique IDs and no self-references
            unique_uids = list(set(couple1_uids + couple2_uids))

            print(f"🔹 Unique UIDs After Merge Preparation: {unique_uids}")  # Debug

            if len(unique_uids) < 2:
                print(f"⚠️ Only one unique node found ({unique_uids}). Merge skipped.")
                return probability, "Only one unique node found. Merge skipped."

            with driver.session() as session:
                session.execute_write(merge_couples, couple1_uids, couple2_uids)

            print(f"✅ Merge Completed for: {unique_uids}")  # Debug
            return probability, justification  # ✅ Proper return of both values

        else:
            print(f"❌ Discarded due to low LLM score ({probability}%)")
            return probability, justification  # ✅ Proper return even if merge is skipped

    except Exception as e:
        print(f"⚠️ Error in `process_and_merge_couples()`: {e}")
        return 0, f"Error: {e}"  # ✅ Ensure a valid tuple is always returned


# 🔥 Run the process on the given subgraph
#process_and_merge_couples(subgraph)

In [553]:
import json
from openai import OpenAI
from neo4j import GraphDatabase

# ✅ Neo4j Connection
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# ✅ Initialize the OpenAI-compatible Ollama client
client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",
)

def remove_embedding_properties(data):
    """Removes embedding-related properties from a dictionary."""
    return {k: v for k, v in data.items() if not k.startswith("embedding_")}

def extract_couple_details(subgraph):
    """Extracts structured data for both couples from the subgraph."""
    couple1 = []
    couple2 = []

    # Process first couple
    for entry in subgraph["Couple1_Details"]:
        person = remove_embedding_properties(entry["Person"])
        connected_node = remove_embedding_properties(entry["ConnectedNode"])
        relationship = entry["Relationship"]

        couple1.append({
            "person_details": person,
            "relationship": relationship,
            "related_person_details": connected_node
        })

    # Process second couple
    for entry in subgraph["Couple2_Details"]:
        person = remove_embedding_properties(entry["Person"])
        connected_node = remove_embedding_properties(entry["ConnectedNode"])
        relationship = entry["Relationship"]

        couple2.append({
            "person_details": person,
            "relationship": relationship,
            "related_person_details": connected_node
        })

    return couple1, couple2


def determine_similarity(couple1_data, couple2_data):
    """Sends extracted subgraph data to LLM and retrieves similarity probability."""

    # ✅ Extract correct UIDs and names
    couple1_uid_a = couple1_data[0]['person_details'].get('uid', 'Unknown')
    couple1_uid_b = couple1_data[1]['person_details'].get('uid', 'Unknown')
    couple2_uid_a = couple2_data[0]['person_details'].get('uid', 'Unknown')
    couple2_uid_b = couple2_data[1]['person_details'].get('uid', 'Unknown')

    couple1_name = f"{couple1_data[0]['person_details'].get('fullname', 'Unknown')} & {couple1_data[1]['person_details'].get('fullname', 'Unknown')}"
    couple2_name = f"{couple2_data[0]['person_details'].get('fullname', 'Unknown')} & {couple2_data[1]['person_details'].get('fullname', 'Unknown')}"

    print(f"\n🔎 **Comparing Couples:**")
    print(f"💑 **Couple 1 (UIDs: {couple1_uid_a}, {couple1_uid_b}):** {couple1_name}")
    print(f"💑 **Couple 2 (UIDs: {couple2_uid_a}, {couple2_uid_b}):** {couple2_name}\n")

    # ✅ Construct LLM Prompt
    prompt = f"""
    You are a genealogical expert specializing in Italian records. 
    Your task is to determine if two couples represent the same individuals based on names, relationships, and other available information.

    ### **Couple 1 (UIDs: {couple1_uid_a}, {couple1_uid_b}):**
    {json.dumps(couple1_data, indent=4)}

    ### **Couple 2 (UIDs: {couple2_uid_a}, {couple2_uid_b}):**
    {json.dumps(couple2_data, indent=4)}

    **Expected JSON Output Format:**
    ```json
    {{
        "match_probability": 85,
        "justification": "The names and relationships are highly similar, but some minor differences exist."
    }}
    ```
    Return only a valid JSON response, with no extra explanations.
    """

    # ✅ Send to LLM
    response = client.chat.completions.create(
        model="mistral:7b",
        temperature=0,
        messages=[{"role": "user", "content": prompt}]
    )

    # ✅ Parse LLM Response
    raw_response = response.choices[0].message.content.strip()
    try:
        result = json.loads(raw_response)
    except json.JSONDecodeError:
        print(f"⚠️ JSON Parsing Error. Raw LLM Response: {raw_response}")
        return 0, "JSON parsing error."

    probability = result.get("match_probability", 0)
    justification = result.get("justification", "No justification provided.")

    print(f"✅ **LLM Probability Score:** {probability}%")
    return probability, justification

  
 

def process_and_merge_couples(subgraph):
    """Extracts data, analyzes similarity, and merges if probability > 89%."""
    try:
        couple1_data, couple2_data = extract_couple_details(subgraph)

        # ✅ Get similarity probability from LLM
        probability, justification = determine_similarity(couple1_data, couple2_data)

        # ✅ Ensure probability is an integer before checking conditions
        if not isinstance(probability, (int, float)):
            print(f"⚠️ Probability is invalid: {probability}. Defaulting to 0.")
            probability = 0

        if probability > 89:
            print("✅ **High probability match! Proceeding with merge...**")

            # ✅ Extract UID lists for merging
            couple1_uids = list({entry["person_details"]["uid"] for entry in couple1_data})
            couple2_uids = list({entry["person_details"]["uid"] for entry in couple2_data})

            print(f"🔹 Extracted Couple 1 UIDs: {couple1_uids}")
            print(f"🔹 Extracted Couple 2 UIDs: {couple2_uids}")

            # ✅ Ensure unique IDs and no self-references
            unique_uids = list(set(couple1_uids + couple2_uids))

            print(f"🔹 **Unique UIDs After Merge Preparation:** {unique_uids}\n")

            if len(unique_uids) < 2:
                print(f"⚠️ **Only one unique node found ({unique_uids}). Merge skipped.**")
                return probability, "Only one unique node found. Merge skipped."

            with driver.session() as session:
                session.execute_write(merge_couples, couple1_uids, couple2_uids)

            print(f"✅ **Merge Completed for:** {unique_uids}\n")
            return probability, justification

        else:
            print(f"❌ **Discarded due to low LLM score ({probability}%)**\n")
            return probability, justification

    except Exception as e:
        print(f"⚠️ Error in `process_and_merge_couples()`: {e}")
        return 0, f"Error: {e}"

In [554]:
def process_missing_embeddings():
    """Finds nodes with missing embeddings and regenerates them."""
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

    with driver.session() as session:
        # ✅ Fetch nodes that need embedding updates
        query = """
        MATCH (p:Person)
        WHERE p.embedding_node = "" OR p.embedding_spouse = "" OR p.embedding_parents = ""
        OPTIONAL MATCH (p)-[m:MARRIED_TO]->(spouse:Person)
        OPTIONAL MATCH (p)-[r:SON_OF|DAUGHTER_OF]->(parent:Person)
        RETURN p, m.date AS marriage_date, spouse, COLLECT(parent) AS parents
        """
        nodes = session.run(query).data()

        if not nodes:
            print("✅ No missing embeddings found.")
            return

        for record in nodes:
            node = record["p"]
            spouse = record["spouse"]
            marriage_date = record["marriage_date"]
            parents = record["parents"]

            uid = node["uid"]

            # 🔹 Generate hierarchical descriptions
            node_text, spouse_text, parent_text = generate_text(node, spouse, marriage_date, parents)

            # 🔹 Compute embeddings only if they are missing
            embedding_node = get_embedding(node_text) if node.get("embedding_node", "") == "" else node["embedding_node"]
            embedding_spouse = get_embedding(spouse_text) if node.get("embedding_spouse", "") == "" else node["embedding_spouse"]
            embedding_parents = get_embedding(parent_text) if node.get("embedding_parents", "") == "" else node["embedding_parents"]

            # 🔹 Ensure embeddings are JSON-compatible lists
            embedding_node = embedding_node.tolist() if isinstance(embedding_node, np.ndarray) else embedding_node
            embedding_spouse = embedding_spouse.tolist() if isinstance(embedding_spouse, np.ndarray) else embedding_spouse
            embedding_parents = embedding_parents.tolist() if isinstance(embedding_parents, np.ndarray) else embedding_parents

            # ✅ Update Neo4j with new embeddings
            session.run(
                """
                MATCH (p:Person {uid: $uid})
                SET p.embedding_node = $embedding_node,
                    p.embedding_spouse = $embedding_spouse,
                    p.embedding_parents = $embedding_parents
                """,
                uid=uid,
                embedding_node=embedding_node,
                embedding_spouse=embedding_spouse,
                embedding_parents=embedding_parents
            )

            print(f"✅ Updated missing embeddings for node: {uid}")

    print("\n🎯 All missing embeddings regenerated successfully.")

# 🔥 Run the function to update missing embeddings
#process_missing_embeddings()

In [555]:
# ✅ Run the Relationship Deduplication Process
run_deduplication()
# ✅ Run the Recursive Merging Process
merge_until_done(BATCH_SIZE)
run_deduplication()
print("🎯 Relationship deduplication completed successfully!")
merge_until_done_aliases()
run_deduplication()

🔄 Nodes before merge: 25, after merge: 25
🎯 Relationship deduplication completed successfully!
🔄 Nodes before merge: 25, after merge: 25


In [556]:
# 🔥 Run the optimized comparison and store results in a DataFrame
df_filtered = compare_couple_embeddings()
df_filtered_sorted = df_filtered.sort_values(by="Similarity Score", ascending=False)

🔍 **Total Couples Loaded for Comparison:** 25

📊 **Comparison Completed. High-Similarity Pairs Found:** 9



In [557]:
df_filtered_sorted.head()

Unnamed: 0,Couple 1,Couple 2,UID 1A,UID 1B,UID 2A,UID 2B,Similarity Score
1,Giuseppe Iannotti & Teresa Buttari,Giuseppe Iannotti & Teresa Buta,648a36e7-704a-41df-9854-a6a8bc8b67b9,db826d29-e13c-4d0f-859a-84d65a8c5468,9d11dc9f-6f0c-4abf-b88c-764eeb1185e7,352eacad-75f0-4cb5-bbf5-deab9012b11a,0.9905
0,Giuseppe Iannotti & Teresa Buttari,Sereno Iannotti & Teresa Buttari,648a36e7-704a-41df-9854-a6a8bc8b67b9,db826d29-e13c-4d0f-859a-84d65a8c5468,60d2c55b-08fd-40ee-82a2-a0a02fce6ea1,db826d29-e13c-4d0f-859a-84d65a8c5468,0.9333
4,Sereno Iannotti & Teresa Buttari,Giuseppe Iannotti & Teresa Buta,60d2c55b-08fd-40ee-82a2-a0a02fce6ea1,db826d29-e13c-4d0f-859a-84d65a8c5468,9d11dc9f-6f0c-4abf-b88c-764eeb1185e7,352eacad-75f0-4cb5-bbf5-deab9012b11a,0.9299
5,Daniel Iannotti & Marianna Rua Postolo,Diablo Iannotti & Sarah Veronica Maria Ronsuar...,f989718e-ce24-40e1-8556-809a96864b8f,e17ed22d-5dea-434a-9dee-7732db1e9fd6,705f1542-75da-43bf-8d77-d84918d5dcc9,a842f924-3b41-4093-ab6d-f05589e9bca5,0.9044
8,Moana Troiana & Vincenzo Iannotti,Anna Berardina De Santis & Ascenzo Iannotti,efe7ab81-3a4e-4517-89a6-fd4c2ce35c4b,4fb77f0e-6048-46f6-803b-5ae279f82673,a36f2a30-3098-4d3d-aafe-a7dfb14c615d,4fff7d44-4a99-4ce4-8ae6-9f6d2f170b1e,0.9018


### Run in a WHILE loop

In [558]:
import json
from openai import OpenAI
from neo4j import GraphDatabase

# ✅ Neo4j Connection
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# Set custom timeout
neo4j_config = {
    "max_connection_lifetime": 3600,  # 1 hour (increase if needed)
    "connection_acquisition_timeout": 120,  # Increase wait time for acquiring a connection
    "max_connection_pool_size": 50,  # Increase pool size to handle more queries
}

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD), **neo4j_config)

In [559]:
# ✅ DataFrame to store discarded couples
df_discarded = pd.DataFrame(columns=["Couple 1", "Couple 2", "UID 1A", "UID 1B", "UID 2A", "UID 2B", "LLM Score", "Justification"])

In [560]:
while len(df_filtered_sorted) > 0:
    print(f"\n🔄 **Remaining Couples to Process:** {len(df_filtered_sorted)}")

    # ✅ Extract top couple from DataFrame
    top_couple = df_filtered_sorted.iloc[0]
    
    # ✅ Extract UIDs (both persons from each couple)
    couple1_uid_a = top_couple["UID 1A"] 
    couple1_uid_b = top_couple["UID 1B"] 
    couple2_uid_a = top_couple["UID 2A"] 
    couple2_uid_b = top_couple["UID 2B"]  

    # ✅ Extract couple names for logging
    couple1_name = top_couple.get("Couple 1", "Unknown")
    couple2_name = top_couple.get("Couple 2", "Unknown")

    # ✅ Ensure UIDs are valid
    if None in {couple1_uid_a, couple1_uid_b, couple2_uid_a, couple2_uid_b}:
        print(f"⚠️ **Skipping due to missing UIDs for {couple1_name} & {couple2_name}**")
        df_filtered_sorted = df_filtered_sorted.iloc[1:]  # Remove processed couple
        continue

    # ✅ Check if the couple was previously discarded
    if ((df_discarded["UID 1A"] == couple1_uid_a) & (df_discarded["UID 1B"] == couple1_uid_b) &
        (df_discarded["UID 2A"] == couple2_uid_a) & (df_discarded["UID 2B"] == couple2_uid_b)).any() or \
       ((df_discarded["UID 1A"] == couple2_uid_a) & (df_discarded["UID 1B"] == couple2_uid_b) &
        (df_discarded["UID 2A"] == couple1_uid_a) & (df_discarded["UID 2B"] == couple1_uid_b)).any():
        print(f"⚠️ **Skipping {couple1_name} & {couple2_name} (Previously Discarded)**")
        df_filtered_sorted = df_filtered_sorted.iloc[1:]  # Remove processed couple
        continue

    # ✅ Fetch exact subgraph using UIDs
    subgraph = get_top_couples_subgraph(couple1_uid_a, couple1_uid_b, couple2_uid_a, couple2_uid_b)

    # ✅ Perform LLM similarity check and merging
    probability, justification = process_and_merge_couples(subgraph)

    if probability > 89:
        print(f"📌 **Justification:** {justification}\n")
        

        # ✅ Update embeddings
        process_missing_embeddings()

        # ✅ Deduplication & Merging passes
        run_deduplication()
        merge_until_done(BATCH_SIZE)
        run_deduplication()
        merge_until_done_aliases()
        run_deduplication()
        merge_until_done(BATCH_SIZE)
        run_deduplication()

        # ✅ Recompute similarity scores
        print("🔄 **Recomputing similarity scores...**")
        df_filtered = compare_couple_embeddings()
        df_filtered_sorted = df_filtered.sort_values(by="Similarity Score", ascending=False)

    else:
        print(f"❌ **Discarded due to low LLM score ({probability}%)**")
        print(f"📌 **Justification:** {justification}\n")

        # ✅ Store in `df_discarded`
        new_row = pd.DataFrame([{
            "Couple 1": couple1_name,
            "Couple 2": couple2_name,
            "UID 1A": couple1_uid_a,
            "UID 1B": couple1_uid_b,
            "UID 2A": couple2_uid_a,
            "UID 2B": couple2_uid_b,
            "LLM Score": probability,
            "Justification": justification
        }])

        df_discarded = pd.concat([df_discarded, new_row], ignore_index=True)

    # ✅ Remove processed couple from `df_filtered_sorted`
    df_filtered_sorted = df_filtered_sorted.iloc[1:].reset_index(drop=True)


# ✅ Final Output
print("\n🎯 **Merging & Deduplication Complete!**")
print(f"❌ **Discarded Couples:** {len(df_discarded)}")


🔄 **Remaining Couples to Process:** 9


  warn(



🔎 **Comparing Couples:**
💑 **Couple 1 (UIDs: 648a36e7-704a-41df-9854-a6a8bc8b67b9, db826d29-e13c-4d0f-859a-84d65a8c5468):** Giuseppe Iannotti & Teresa Buttari
💑 **Couple 2 (UIDs: 9d11dc9f-6f0c-4abf-b88c-764eeb1185e7, 352eacad-75f0-4cb5-bbf5-deab9012b11a):** Giuseppe Iannotti & Teresa Buta

✅ **LLM Probability Score:** 90%
✅ **High probability match! Proceeding with merge...**
🔹 Extracted Couple 1 UIDs: ['db826d29-e13c-4d0f-859a-84d65a8c5468', '648a36e7-704a-41df-9854-a6a8bc8b67b9']
🔹 Extracted Couple 2 UIDs: ['9d11dc9f-6f0c-4abf-b88c-764eeb1185e7', '352eacad-75f0-4cb5-bbf5-deab9012b11a']
🔹 **Unique UIDs After Merge Preparation:** ['db826d29-e13c-4d0f-859a-84d65a8c5468', '648a36e7-704a-41df-9854-a6a8bc8b67b9', '9d11dc9f-6f0c-4abf-b88c-764eeb1185e7', '352eacad-75f0-4cb5-bbf5-deab9012b11a']

🔹 Unique UIDs Being Merged: ['db826d29-e13c-4d0f-859a-84d65a8c5468', '648a36e7-704a-41df-9854-a6a8bc8b67b9', '9d11dc9f-6f0c-4abf-b88c-764eeb1185e7', '352eacad-75f0-4cb5-bbf5-deab9012b11a']
✅ **Merge 

  warn(



🔎 **Comparing Couples:**
💑 **Couple 1 (UIDs: f989718e-ce24-40e1-8556-809a96864b8f, e17ed22d-5dea-434a-9dee-7732db1e9fd6):** Daniel Iannotti & Marianna Rua Postolo
💑 **Couple 2 (UIDs: 705f1542-75da-43bf-8d77-d84918d5dcc9, a842f924-3b41-4093-ab6d-f05589e9bca5):** Diablo Iannotti & Sarah Veronica Maria Ronsuarda Mac Allister

✅ **LLM Probability Score:** 90%
✅ **High probability match! Proceeding with merge...**
🔹 Extracted Couple 1 UIDs: ['e17ed22d-5dea-434a-9dee-7732db1e9fd6', 'f989718e-ce24-40e1-8556-809a96864b8f']
🔹 Extracted Couple 2 UIDs: ['705f1542-75da-43bf-8d77-d84918d5dcc9', 'a842f924-3b41-4093-ab6d-f05589e9bca5']
🔹 **Unique UIDs After Merge Preparation:** ['e17ed22d-5dea-434a-9dee-7732db1e9fd6', '705f1542-75da-43bf-8d77-d84918d5dcc9', 'f989718e-ce24-40e1-8556-809a96864b8f', 'a842f924-3b41-4093-ab6d-f05589e9bca5']

🔹 Unique UIDs Being Merged: ['e17ed22d-5dea-434a-9dee-7732db1e9fd6', '705f1542-75da-43bf-8d77-d84918d5dcc9', 'f989718e-ce24-40e1-8556-809a96864b8f', 'a842f924-3b41-4

  warn(



🔎 **Comparing Couples:**
💑 **Couple 1 (UIDs: efe7ab81-3a4e-4517-89a6-fd4c2ce35c4b, 4fb77f0e-6048-46f6-803b-5ae279f82673):** Moana Troiana & Vincenzo Iannotti
💑 **Couple 2 (UIDs: a36f2a30-3098-4d3d-aafe-a7dfb14c615d, 4fff7d44-4a99-4ce4-8ae6-9f6d2f170b1e):** Anna Berardina De Santis & Ascenzo Iannotti

✅ **LLM Probability Score:** 60%
❌ **Discarded due to low LLM score (60%)**

❌ **Discarded due to low LLM score (60%)**
📌 **Justification:** While the 'Troiana' and 'De Santis' surnames are different, both couples consist of a daughter named 'Moana/Anna Berardina' and a son whose first name starts with 'Vincenzo/Ascenzo'. However, the mothers' names differ significantly.


🔄 **Remaining Couples to Process:** 4

🔎 **Comparing Couples:**
💑 **Couple 1 (UIDs: 76586eaf-1b6d-458c-b35d-dbdf4fd8fa44, 758fcee8-6b34-4c2d-a553-f19cbcc5b258):** Alessandra Pozzi & Shablo Iannotti
💑 **Couple 2 (UIDs: a36f2a30-3098-4d3d-aafe-a7dfb14c615d, 4fff7d44-4a99-4ce4-8ae6-9f6d2f170b1e):** Anna Berardina De Santis

  warn(



🔎 **Comparing Couples:**
💑 **Couple 1 (UIDs: 705f1542-75da-43bf-8d77-d84918d5dcc9, a842f924-3b41-4093-ab6d-f05589e9bca5):** Diablo Iannotti & Sarah Veronica Maria Ronsuarda Mac Allister
💑 **Couple 2 (UIDs: efe7ab81-3a4e-4517-89a6-fd4c2ce35c4b, 4fb77f0e-6048-46f6-803b-5ae279f82673):** Moana Troiana & Vincenzo Iannotti

✅ **LLM Probability Score:** 90%
✅ **High probability match! Proceeding with merge...**
🔹 Extracted Couple 1 UIDs: ['705f1542-75da-43bf-8d77-d84918d5dcc9', 'a842f924-3b41-4093-ab6d-f05589e9bca5']
🔹 Extracted Couple 2 UIDs: ['efe7ab81-3a4e-4517-89a6-fd4c2ce35c4b', '4fb77f0e-6048-46f6-803b-5ae279f82673']
🔹 **Unique UIDs After Merge Preparation:** ['705f1542-75da-43bf-8d77-d84918d5dcc9', 'efe7ab81-3a4e-4517-89a6-fd4c2ce35c4b', 'a842f924-3b41-4093-ab6d-f05589e9bca5', '4fb77f0e-6048-46f6-803b-5ae279f82673']

🔹 Unique UIDs Being Merged: ['705f1542-75da-43bf-8d77-d84918d5dcc9', 'efe7ab81-3a4e-4517-89a6-fd4c2ce35c4b', 'a842f924-3b41-4093-ab6d-f05589e9bca5', '4fb77f0e-6048-46f6-8

In [562]:
df_discarded

Unnamed: 0,Couple 1,Couple 2,UID 1A,UID 1B,UID 2A,UID 2B,LLM Score,Justification
0,Moana Troiana & Vincenzo Iannotti,Anna Berardina De Santis & Ascenzo Iannotti,efe7ab81-3a4e-4517-89a6-fd4c2ce35c4b,4fb77f0e-6048-46f6-803b-5ae279f82673,a36f2a30-3098-4d3d-aafe-a7dfb14c615d,4fff7d44-4a99-4ce4-8ae6-9f6d2f170b1e,60,While the 'Troiana' and 'De Santis' surnames a...
1,Alessandra Pozzi & Shablo Iannotti,Anna Berardina De Santis & Ascenzo Iannotti,76586eaf-1b6d-458c-b35d-dbdf4fd8fa44,758fcee8-6b34-4c2d-a553-f19cbcc5b258,a36f2a30-3098-4d3d-aafe-a7dfb14c615d,4fff7d44-4a99-4ce4-8ae6-9f6d2f170b1e,60,While both couples share the last name 'Iannot...


# Why these names were not processed in the while loop?

In [563]:
df_filtered = compare_couple_embeddings()
df_filtered_sorted = df_filtered.sort_values(by="Similarity Score", ascending=False)
df_filtered_sorted

🔍 **Total Couples Loaded for Comparison:** 17

📊 **Comparison Completed. High-Similarity Pairs Found:** 2



Unnamed: 0,Couple 1,Couple 2,UID 1A,UID 1B,UID 2A,UID 2B,Similarity Score
0,Giuseppe Iannotti & Teresa Buta,Sereno Iannotti & Teresa Buta,9d11dc9f-6f0c-4abf-b88c-764eeb1185e7,352eacad-75f0-4cb5-bbf5-deab9012b11a,60d2c55b-08fd-40ee-82a2-a0a02fce6ea1,352eacad-75f0-4cb5-bbf5-deab9012b11a,0.9333
1,Alessandra Pozzi & Shablo Iannotti,Anna Berardina De Santis & Ascenzo Iannotti,76586eaf-1b6d-458c-b35d-dbdf4fd8fa44,758fcee8-6b34-4c2d-a553-f19cbcc5b258,a36f2a30-3098-4d3d-aafe-a7dfb14c615d,4fff7d44-4a99-4ce4-8ae6-9f6d2f170b1e,0.8999


In [15]:
# MATCH (n) DETACH DELETE n

# MATCH (n) OPTIONAL MATCH (n)-[r]-() RETURN n, r;

# MATCH (p:Person)-[r]-(neighbor) WHERE p.firstname = "Ascenzo" RETURN p, r, neighbor;

# MATCH (n) RETURN count(n);
