In [1]:
from openai import OpenAI

client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama',
)

In [2]:
from neo4j import GraphDatabase

# Define connection details
URI = "bolt://localhost:7687"  # Neo4j Bolt connection
USERNAME = "neo4j"
PASSWORD = "password"  # Replace with the password you set

# Create a Neo4j Driver instance
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

# Define a function to test the connection
def test_connection():
    with driver.session() as session:
        result = session.run("RETURN 'Connected to Neo4j' AS message")
        for record in result:
            print(record["message"])

# Run test
test_connection()

# Close the driver when done
#driver.close()

Connected to Neo4j


In [3]:
import pandas as pd

# Define the file path
file_path = "IPR Avezzano Indexes 1809-1865.xlsx"

# Load the specific sheet "Marriages 1821 - 1845"
df = pd.read_excel(file_path, sheet_name="Marriages 1821-1845").head(10)

In [4]:
df.head()

Unnamed: 0,Record #,Last name - Groom,First name - Groom,Father of Groom,Mother of Groom,Last name - Bride,First name - Bride,Father of Bride,Mother of Bride,Date,Year,Comune Groom,Comune Bride,Notes
0,3,Iannotti,Ascenzo,Giuseppe,Teresa Buttari aka BUTTA,de Santis,Anna Berardina,Nicola,Maddalena del Rosso (MADDA),2020-07-30,1821,,,It was his 3rd marriage
1,6,Lolli,Anselmo,fu Andrea,Camilla Liberati,Collalto,Anna Giuseppa,Gabrielefu,fu Elisabetta Savina,2020-12-09,1821,,,They were living before in Parma
2,2,Paciotti,Francesco,Pietro,Giacinta di Simone,Lolli,Maria,Costanzo,Domenica Colella,2020-05-20,1821,,,Costanzo was bord in 1789
3,1,Pennazza,Giuseppe,Gennaro,Gesualda di Clemente,Novelli,Marianna,Francesco,Maria Domenica Marianella,2020-01-21,1821,,,"her name was Alfetta, then changed"
4,5,Rodorigo,Giacomo,Felice,Rosa di Crescenzo,Donsante,Aurora,Arcangelo,Clementina Cosimo,2020-11-25,1821,,,"Named orphan as Angela Proietta, later she wen..."


# Regex (fu, alias)

In [5]:
import re
import pandas as pd

def extract_alias(name):
    """
    ✅ Extracts alias name from either:
    - Brackets `()`, like "Melchior(Melchiorre)"
    - "aka", like "Marianna aka Nenna de Santis"
    
    Returns:
    - (cleaned name, extracted alias)
    """
    if not isinstance(name, str) or name.strip() == "":
        return name, ""  # ✅ Return unchanged name, no alias

    alias = ""

    # ✅ Extract aka alias (e.g., "Marianna aka Nenna de Santis")
    aka_match = re.search(r"aka\s+([\w\s]+)", name, re.IGNORECASE)
    if aka_match:
        alias = aka_match.group(1).strip()  # ✅ Extract aka alias
        name = re.sub(r"aka\s+[\w\s]+", "", name, flags=re.IGNORECASE).strip()  # ✅ Remove "aka" part

    # ✅ Extract alias from brackets `()`
    bracket_match = re.search(r"\(([^)]*?)\)", name)  # Extract content inside brackets
    if bracket_match:
        alias = bracket_match.group(1).strip()  # ✅ Extract alias name
        name = re.sub(r"\s*\(\s*[^)]*?\s*\)\s*", "", name).strip()  # ✅ Remove brackets & alias

    return name.strip(), alias.strip()

def format_proper_case(name):
    """
    ✅ Converts all words to start with a capital letter.
    - Fixes "di", "del", "de".
    - Removes "fu" / "fù" (even if attached like "Gabrielefu").
    - Extracts alias names from brackets `()` and "aka".
    - Detects if the person was deceased.
    """
    if not isinstance(name, str) or name.strip() == "":
        return name, "", ""  # ✅ Return unchanged name, empty deceased, no alias

    # ✅ Detect "fu" / "fù" at the start OR attached to the name
    deceased = ""
    name = name.strip()

    match = re.match(r"^(fu|fù)\s*", name, re.IGNORECASE)  # Match "fu " or "fù " at the start
    match_attached = re.search(r"(fu|fù)$", name, re.IGNORECASE)  # Match "fu" or "fù" at the end

    if match:
        deceased = "before {date}"  # 🔹 Placeholder (updated later)
        name = name[len(match.group(0)):].strip()  # ✅ Remove "fu"/"fù" at the start

    elif match_attached:
        deceased = "before {date}"  # 🔹 Placeholder (updated later)
        name = name[:match_attached.start()].strip()  # ✅ Remove "fu"/"fù" at the end

    # ✅ Extract Alias Name (Handles both aka and brackets)
    name, alias = extract_alias(name)

    # ✅ Fix "di", "del", "de" capitalization
    name = re.sub(r"\b(di|del|de)(?=\s|\b)", lambda x: x.group(1).capitalize(), name, flags=re.IGNORECASE)

    # ✅ Capitalize all words
    name = " ".join(word.capitalize() for word in name.split())

    return name.strip(), alias.strip(), deceased.strip()  # ✅ Return formatted name, alias, deceased status

def format_deceased_status(name, date, year):
    """
    ✅ If "fu" / "fù" is present, mark person as deceased **before the marriage date**.
    ✅ Extracts alias if present in `()` or "aka".
    ✅ Fixes Timestamp issue by formatting the date properly.
    """
    formatted_name, alias, deceased = format_proper_case(name)  # ✅ Format name, detect "fu", extract alias

    # ✅ Ensure date is properly converted to a string in "09 Dec" format
    if deceased and pd.notna(date) and pd.notna(year):
        date_str = date.strftime("%d %b") if isinstance(date, pd.Timestamp) else str(date).strip()
        year_str = str(year).strip()
        deceased = f"before {date_str} {year_str}"  # ✅ Correct format: "before 09 Dec 1821"

    return formatted_name, alias, deceased  # ✅ Returns three values

In [6]:
import pandas as pd

# ✅ Work on a **COPY** of df (DO NOT modify the original)
df_corrected = df.copy()

# ✅ Strip spaces from column names
df_corrected.columns = df_corrected.columns.str.strip()

# ✅ Define columns that get Alias + Deceased handling
columns_with_alias_deceased = [
    "Father of Groom", "Mother of Groom",
    "Father of Bride", "Mother of Bride"
]

# ✅ Define columns for Groom & Bride to get Alias (but NO Deceased)
columns_with_alias = [
    "Last name - Groom", "First name - Groom",
    "Last name - Bride", "First name - Bride"
]

# ✅ Apply transformations for Parents (Alias + Deceased)
for col in columns_with_alias_deceased:
    if col in df_corrected.columns:
        df_corrected[[col, f"Alias - {col}", f"Deceased - {col}"]] = df_corrected.apply(
            lambda row: format_deceased_status(row[col], row["Date"], row["Year"]), axis=1, result_type="expand"
        )

# ✅ Apply transformations for Groom & Bride (Alias ONLY)
for col in columns_with_alias:
    if col in df_corrected.columns:
        df_corrected[[col, f"Alias - {col}"]] = df_corrected.apply(
            lambda row: format_proper_case(row[col])[:2], axis=1, result_type="expand"
        )

# ✅ Apply Special Case for **Father's First Name Alias & Last Name Alias** (inherits Groom/Bride last name)
for role in ["Father of Groom", "Father of Bride"]:
    if role in df_corrected.columns:
        df_corrected[f"Alias - {role} First Name"] = df_corrected[f"Alias - {role}"]  # ✅ First name alias
        df_corrected[f"Alias - {role} Last Name"] = df_corrected["Last name - " + role.split(" ")[-1]]  # ✅ Inherit last name

# ✅ Apply Special Case for **Mother's Alias** (Split into Firstname & Lastname)
for role in ["Mother of Groom", "Mother of Bride"]:
    alias_col = f"Alias - {role}"
    alias_firstname_col = f"Alias - {role} First Name"
    alias_lastname_col = f"Alias - {role} Last Name"

# ✅ REMOVE COLUMNS: "Alias - Father of Groom" & "Alias - Father of Bride"
df_corrected.drop(columns=["Alias - Father of Groom", "Alias - Father of Bride"], inplace=True, errors="ignore")

# ✅ **NEW: If Alias - Father Last Name is identical to Groom/Bride Last Name, REMOVE VALUE**
for role in ["Groom", "Bride"]:
    father_alias_col = f"Alias - Father of {role} Last Name"
    father_lastname_col = f"Last name - {role}"

    if father_alias_col in df_corrected.columns and father_lastname_col in df_corrected.columns:
        df_corrected[father_alias_col] = df_corrected.apply(
            lambda row: "" if row[father_alias_col] == row[father_lastname_col] else row[father_alias_col], axis=1
        )

# ✅ **NEW RULE: If Alias - Last Name - Groom/Bride is populated, fill Alias - Father Last Name**
for role in ["Groom", "Bride"]:
    alias_last_name_col = f"Alias - Last name - {role}"
    alias_father_last_name_col = f"Alias - Father of {role} Last Name"

    if alias_last_name_col in df_corrected.columns and alias_father_last_name_col in df_corrected.columns:
        df_corrected[alias_father_last_name_col] = df_corrected.apply(
            lambda row: row[alias_last_name_col] if pd.notna(row[alias_last_name_col]) and row[alias_last_name_col] != "" else row[alias_father_last_name_col],
            axis=1
        )

# ✅ Save Corrected Data (Safe Copy)
df_corrected.to_csv("corrected_marriage_records.csv", index=False)

print("✅ Successfully formatted names, extracted aka aliases, handled 'fu', split mother aliases, and saved deceased statuses!")

✅ Successfully formatted names, extracted aka aliases, handled 'fu', split mother aliases, and saved deceased statuses!


# Check first name and last name for mothers with LLM

In [7]:
from openai import OpenAI
import pandas as pd
import json
from pydantic import BaseModel

# ✅ Setup OpenAI LLM
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

# ✅ Define Pydantic Model for Structured Response
class NameSplit(BaseModel):
    firstname: str
    lastname: str

# ✅ Function to Normalize Capitalization
def normalize_name(name):
    """
    Ensures that names follow simple capitalization:
    - First letter uppercase, rest lowercase.
    """
    if not isinstance(name, str) or name.strip() == "":
        return ""
    
    return " ".join([word.capitalize() for word in name.strip().split()])

# ✅ Function to Generate LLM Prompt (With Context for Single-Word Names)
def generate_name_split_prompt(full_name, mother_name=""):
    """
    Generates a structured prompt for the LLM to split names.
    - Ensures **correct first name and last name extraction**.
    - Uses structured JSON output via Pydantic.
    - Adds context when only one name is present.
    """
    context_info = f"\n- The full name of the mother in the records is: **{mother_name}**." if mother_name else ""

    # ✅ Print Debug Info
    print(f"🔍 Debug: Context Provided for '{full_name}': {context_info}")
    print("")

    return f"""
    You are an expert in **historical genealogical records** in Italian.
    Your task is to **correctly split Italian full names into:**
    
    - **First Name:** The complete first name (including double names if present).
    - **Last Name:** The proper last name (with correct capitalization).

    📌 **Rules:**
    - **Do not change spelling** of any names.
    - **If only one word is present, use the provided context to determine if it is a first name or last name.**
    - Ensure the **full name remains intact**.
    - Format output as JSON with `"firstname"`, `"lastname"`.

    ---
    **Example Inputs → Outputs**
    - `"Maria Domenica Marianella"` → `{{
        "firstname": "Maria Domenica",
        "lastname": "Marianella"
    }}`
    - `"Rosa De Luca"` → `{{
        "firstname": "Rosa",
        "lastname": "De Luca"
    }}`
    - `"Teresa"{context_info}` → `{{
        "firstname": "Teresa",
        "lastname": ""
    }}`
    - `"De Santis"{context_info}` → `{{
        "firstname": "",
        "lastname": "De Santis"
    }}`

    ---
    🔍 **Now split the following name correctly:**
    **Input:** `{full_name}`
    **Output (JSON only):**
    """

# ✅ Function to Process Mother's and Alias Names using LLM (With Context)
def process_mother_and_alias_names(row):
    """
    Uses the LLM to correct:
    - 'Mother of Groom' & 'Mother of Bride' names
    - 'Alias - Mother of Groom' & 'Alias - Mother of Bride' names
    Splits them into first and last names.
    """
    corrected_data = {}

    for col in ["Mother of Groom", "Mother of Bride", "Alias - Mother of Groom", "Alias - Mother of Bride"]:
        full_name = row[col].strip() if pd.notna(row[col]) else ""
        mother_name = row[col.replace("Alias - ", "")].strip() if pd.notna(row[col.replace("Alias - ", "")]) else ""

        # ✅ Ensure LLM is called for all names, even if only one word is present
        if not full_name:
            corrected_data[col] = {"firstname": "", "lastname": ""}
            continue

        # ✅ Call LLM using `.parse()` for structured response
        try:
            response = client.beta.chat.completions.parse(
                model="mistral:7b",
                temperature=0,
                messages=[{"role": "user", "content": generate_name_split_prompt(full_name, mother_name)}],
                response_format=NameSplit,  # ✅ Structured Pydantic Response
            )

            # ✅ Extract structured data and normalize capitalization
            if response.choices[0].message.parsed:
                parsed_data = response.choices[0].message.parsed
                corrected_data[col] = {
                    "firstname": normalize_name(parsed_data.firstname.strip()),
                    "lastname": normalize_name(parsed_data.lastname.strip())
                }

                # ✅ Print Debug Info for Processed Names
                print(f"✅ {full_name} → First name: {parsed_data.firstname}, Last name: {parsed_data.lastname}")
                print("")  # ✅ Add spacing after each block for better readability

            else:
                print(f"⚠️ LLM Refused or Failed for: {full_name}")
                corrected_data[col] = {"firstname": normalize_name(full_name), "lastname": ""}
                print("")

        except Exception as e:
            print(f"❌ Error processing '{full_name}': {e}")
            corrected_data[col] = {"firstname": normalize_name(full_name), "lastname": ""}  # Keep fallback
            print("")
            
    return corrected_data

# ✅ Work on a **COPY** of df_llm → Output will be df_llm_corrected
df_llm_corrected = df_corrected.copy()

# ✅ Apply LLM Correction Row by Row for Mother's and Alias Names
corrected_names = df_llm_corrected.apply(process_mother_and_alias_names, axis=1)

# ✅ Convert corrected data into separate columns
# ✅ Convert corrected data into separate columns (Fixed Naming)
for index, corrections in corrected_names.items():
    for col, values in corrections.items():
        # ✅ Remove extra "Alias - Alias -" prefix issue
        clean_col_name = col.replace("Alias - Alias -", "Alias -").strip()

        new_firstname_col = f"Alias - {clean_col_name} First name" if "Alias" in col else f"First name - {clean_col_name}"
        new_lastname_col = f"Alias - {clean_col_name} Last name" if "Alias" in col else f"Last name - {clean_col_name}"

        df_llm_corrected.loc[index, new_firstname_col] = values["firstname"]
        df_llm_corrected.loc[index, new_lastname_col] = values["lastname"]

print("✅ Successfully processed all mother's names using LLM and saved the final dataset!")

🔍 Debug: Context Provided for 'Teresa Buttari': 
- The full name of the mother in the records is: **Teresa Buttari**.

✅ Teresa Buttari → First name: Teresa, Last name: Buttari

🔍 Debug: Context Provided for 'Maddalena Del Rosso': 
- The full name of the mother in the records is: **Maddalena Del Rosso**.

✅ Maddalena Del Rosso → First name: Maddalena, Last name: Del Rosso

🔍 Debug: Context Provided for 'BUTTA': 
- The full name of the mother in the records is: **Teresa Buttari**.

✅ BUTTA → First name: , Last name: BUTTA

🔍 Debug: Context Provided for 'MADDA': 
- The full name of the mother in the records is: **Maddalena Del Rosso**.

✅ MADDA → First name: MADDA, Last name: 

🔍 Debug: Context Provided for 'Camilla Liberati': 
- The full name of the mother in the records is: **Camilla Liberati**.

✅ Camilla Liberati → First name: Camilla, Last name: Liberati

🔍 Debug: Context Provided for 'Elisabetta Savina': 
- The full name of the mother in the records is: **Elisabetta Savina**.

✅ El

In [8]:
# ✅ Ensure column names do not have duplicate "Alias - Alias - ..." prefix
corrected_columns = {
    "Alias - Alias - Mother of Groom First name": "Alias - Mother of Groom First name",
    "Alias - Alias - Mother of Groom Last name": "Alias - Mother of Groom Last name",
    "Alias - Alias - Mother of Bride First name": "Alias - Mother of Bride First name",
    "Alias - Alias - Mother of Bride Last name": "Alias - Mother of Bride Last name",
}

# ✅ Rename columns properly in the DataFrame
df_llm_corrected.rename(columns=corrected_columns, inplace=True)

# ✅ Save Corrected Data (Safe Copy)
df_llm_corrected.to_csv("corrected_marriage_records_llm_final.csv", index=False)

print("✅ Successfully renamed columns and saved the final dataset!")

✅ Successfully renamed columns and saved the final dataset!


# Process compound names

In [12]:
#rule - based

# TO DO:

- Feliceantonio, through llm, make it an alias for name "Felice Antonio" 🛑 -> will use embeddings instead

- Read notes via LLM and create custom properties or adjustments ✅

- Replace LLM assignment of notes with OpenAI 4o and specifically recognize alias in notes

- Store hierarchical embeddings ✅

- Perform MERGE according to alias

- Perform MERGE according to really close embeddings + LLM decision 🚧

# Process notes

In [9]:
import os
api_key = os.environ["DIOPORCO"]

import pandas as pd
notes_df = pd.read_csv("./corrected_marriage_records_llm_final.csv")

In [10]:
import openai
import pandas as pd
import numpy as np
from pydantic import BaseModel


# ✅ Initialize OpenAI Client
client = openai.OpenAI(api_key=api_key)

# ✅ Define Pydantic Model for the Expected LLM Output
class NoteAssignment(BaseModel):
    groom: bool
    bride: bool
    father_groom: bool
    mother_groom: bool
    father_bride: bool
    mother_bride: bool

# ✅ Load the corrected dataset
notes_df = pd.read_csv("corrected_marriage_records_llm_final.csv")

# ✅ Convert "Notes" column to strings, ensuring we handle NaNs
notes_df["Notes"] = notes_df["Notes"].astype(str).replace("nan", "").replace(np.nan, "")

# ✅ Filter rows that have actual notes (non-empty strings)
notes_with_data = notes_df[notes_df["Notes"].str.strip() != ""]
print(f"🔎 Processing {len(notes_with_data)} rows with notes.")

# ✅ Function to process notes and assign them to the correct columns
def process_notes(row):
    note_text = row["Notes"].strip()

    # ✅ Skip processing if the note is empty
    if not note_text:
        print(f"⚠️ Skipping empty note for Record # {row['Record #']}\n{'-'*50}")
        return {
            "Notes - Groom": "",
            "Notes - Bride": "",
            "Notes - Father of Groom": "",
            "Notes - Mother of Groom": "",
            "Notes - Father of Bride": "",
            "Notes - Mother of Bride": ""
        }

    # ✅ Print debug info before calling the LLM
    print(f"\n📝 Processing Note: \"{note_text}\"")
    print(f"🔍 Full Marriage Record: {row.to_dict()}")

    # ✅ Call OpenAI GPT-4o to determine which person(s) the note belongs to
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert in historical genealogy records."},
                {"role": "user", "content": f"""
                Your task is to determine to whom this note belongs in a marriage record.
                
                - Here is the full marriage record:
                  {row.to_dict()}
                - Here is the note: "{note_text}"
                
                🎯 **Your goal:** Identify who this note applies to.
                
                ✅ **Allowed Outputs (JSON format only)**:
                {{
                    "groom": true/false, 
                    "bride": true/false, 
                    "father_groom": true/false, 
                    "mother_groom": true/false, 
                    "father_bride": true/false, 
                    "mother_bride": true/false
                }}
                
                ⚠️ Important:
                - If the note applies to multiple people, mark them as **true**.
                - The note must always apply to at least one person.
                
                **Output (JSON only):**
                """}
            ],
            temperature=0.0,  # ✅ Ensure consistent results
            response_format={"type": "json_object"}  # ✅ FIXED: Correct JSON response format
        )

        # ✅ Extract structured response
        response_data = response.choices[0].message.content
        person_notes = NoteAssignment.model_validate_json(response_data)

        # ✅ Print Debugging Info of LLM Response
        print(f"🔹 LLM Raw Output: {response_data}")
        print(f"✅ LLM Decision → Groom: {person_notes.groom}, Bride: {person_notes.bride}, "
              f"Father of Groom: {person_notes.father_groom}, Mother of Groom: {person_notes.mother_groom}, "
              f"Father of Bride: {person_notes.father_bride}, Mother of Bride: {person_notes.mother_bride}")

    except Exception as e:
        print(f"❌ LLM Error: {e}")
        person_notes = NoteAssignment(
            groom=False, bride=False, father_groom=False, mother_groom=False, father_bride=False, mother_bride=False
        )


    # ✅ Assign the note text to the correct "Notes - X" column
    assigned_notes = {
        "Notes - Groom": note_text if person_notes.groom else "",
        "Notes - Bride": note_text if person_notes.bride else "",
        "Notes - Father of Groom": note_text if person_notes.father_groom else "",
        "Notes - Mother of Groom": note_text if person_notes.mother_groom else "",
        "Notes - Father of Bride": note_text if person_notes.father_bride else "",
        "Notes - Mother of Bride": note_text if person_notes.mother_bride else ""
    }

    # ✅ Print Debugging Info of Final Note Assignment
    print(f"📌 Final Assigned Notes: {assigned_notes}\n{'-'*50}")

    return assigned_notes

# ✅ Apply LLM Processing Row by Row
corrected_notes = notes_with_data.apply(process_notes, axis=1)

# ✅ Assign Corrected Notes Back to `notes_df`
for index, corrections in corrected_notes.items():
    for col, note_value in corrections.items():
        notes_df.loc[index, col] = note_value  # ✅ Store note in the correct column

# ✅ Save the updated DataFrame
notes_df.to_csv("corrected_marriage_records_llm_notes.csv", index=False)

print("✅ Successfully processed notes and assigned them to the correct people.")

🔎 Processing 5 rows with notes.

📝 Processing Note: "It was his 3rd marriage"
🔍 Full Marriage Record: {'Record #': 3, 'Last name - Groom': 'Iannotti', 'First name - Groom': 'Ascenzo', 'Father of Groom': 'Giuseppe', 'Mother of Groom': 'Teresa Buttari', 'Last name - Bride': 'De Santis', 'First name - Bride': 'Anna Berardina', 'Father of Bride': 'Nicola', 'Mother of Bride': 'Maddalena Del Rosso', 'Date': '2020-07-30', 'Year': 1821, 'Comune Groom': nan, 'Comune Bride': nan, 'Notes': 'It was his 3rd marriage', 'Deceased - Father of Groom': nan, 'Alias - Mother of Groom': 'BUTTA', 'Deceased - Mother of Groom': nan, 'Deceased - Father of Bride': nan, 'Alias - Mother of Bride': 'MADDA', 'Deceased - Mother of Bride': nan, 'Alias - Last name - Groom': nan, 'Alias - First name - Groom': nan, 'Alias - Last name - Bride': nan, 'Alias - First name - Bride': nan, 'Alias - Father of Groom First Name': nan, 'Alias - Father of Groom Last Name': nan, 'Alias - Father of Bride First Name': nan, 'Alias - Fa

#### Find alias in notes

In [29]:
import os
api_key = os.environ["DIOPORCO"]
# ✅ Initialize OpenAI Client
client = openai.OpenAI(api_key=api_key)

import pandas as pd
notes_df = pd.read_csv("./corrected_marriage_records_llm_notes.csv")

In [30]:
import openai
import pandas as pd
import numpy as np
from pydantic import BaseModel
from typing import List, Dict


# ✅ Define Pydantic Model for LLM Output (Now with First Name & Last Name distinction)
class AliasAssignment(BaseModel):
    groom: Dict[str, List[str]]  # {"first_name": [], "last_name": []}
    bride: Dict[str, List[str]]
    father_groom: Dict[str, List[str]]
    mother_groom: Dict[str, List[str]]
    father_bride: Dict[str, List[str]]
    mother_bride: Dict[str, List[str]]

# ✅ Load the dataset again to process aliases
notes_df = pd.read_csv("corrected_marriage_records_llm_notes.csv")

# ✅ Convert "Notes" column to strings, ensuring we handle NaNs
notes_df["Notes"] = notes_df["Notes"].astype(str).replace("nan", "").replace(np.nan, "")

# ✅ Filter rows that have actual notes (non-empty strings)
notes_with_data = notes_df[notes_df["Notes"].str.strip() != ""]
print(f"🔎 Processing {len(notes_with_data)} rows to extract aliases.")

# ✅ Function to capitalize alias names properly
def normalize_alias(name):
    """Ensure the first letter is uppercase and the rest are lowercase."""
    return name.capitalize() if isinstance(name, str) and name.strip() else ""

# ✅ Function to process notes and extract aliases
def extract_aliases_from_notes(row):
    note_text = row["Notes"].strip()

    # ✅ Skip processing if the note is empty
    if not note_text:
        print(f"⚠️ Skipping empty note for Record # {row['Record #']}\n{'-'*50}")
        return {}

    # ✅ Print debug info before calling the LLM
    print(f"\n📝 Checking for aliases in note: \"{note_text}\"")
    print(f"🔍 Full Marriage Record: {row.to_dict()}")

    # ✅ Call OpenAI GPT-4o to determine aliases and assign them to the correct person
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert in historical genealogy records."},
                {"role": "user", "content": f"""
                Your task is to determine if the note contains aliases for any person in a marriage record.
                
                - Here is the full marriage record:
                  {row.to_dict()}
                - Here is the note: "{note_text}"
                
                🎯 **Your goal:** Identify if any alias is mentioned and assign it to the correct person.
                
                ✅ **Expected Output (JSON format only, distinguishing first & last name aliases)**:
                {{
                    "groom": {{"first_name": ["Alias1", "Alias2"], "last_name": ["Alias1"]}},
                    "bride": {{"first_name": ["Alias1"], "last_name": []}},
                    "father_groom": {{"first_name": [], "last_name": []}},
                    "mother_groom": {{"first_name": ["Alias1"], "last_name": ["Alias2"]}},
                    "father_bride": {{"first_name": [], "last_name": []}},
                    "mother_bride": {{"first_name": [], "last_name": []}}
                }}
                
                ⚠️ Important:
                - If no alias is found for a person, return **empty lists**.
                - If an alias is found, return **only the alias name(s)** inside lists (no extra text).
                
                **Output (JSON only):**
                """}
            ],
            temperature=0.0,  # ✅ Ensure consistent results
            response_format={"type": "json_object"}  # ✅ Correct JSON response format
        )

        # ✅ Extract structured response
        response_data = response.choices[0].message.content
        alias_data = AliasAssignment.model_validate_json(response_data)

        # ✅ Print Debugging Info of LLM Response
        print(f"🔹 LLM Raw Output: {response_data}")
        print(f"✅ LLM Decision → Groom: {alias_data.groom}, Bride: {alias_data.bride}, "
              f"Father of Groom: {alias_data.father_groom}, Mother of Groom: {alias_data.mother_groom}, "
              f"Father of Bride: {alias_data.father_bride}, Mother of Bride: {alias_data.mother_bride}")

    except Exception as e:
        print(f"❌ LLM Error: {e}")
        return {}

    # ✅ Prepare alias assignments (Handling First & Last Name separately)
    alias_assignments = {
        "Alias - First name - Groom": [normalize_alias(name) for name in alias_data.groom["first_name"]],
        "Alias - Last name - Groom": [normalize_alias(name) for name in alias_data.groom["last_name"]],
        "Alias - First name - Bride": [normalize_alias(name) for name in alias_data.bride["first_name"]],
        "Alias - Last name - Bride": [normalize_alias(name) for name in alias_data.bride["last_name"]],
        "Alias - Father of Groom First Name": [normalize_alias(name) for name in alias_data.father_groom["first_name"]],
        "Alias - Father of Groom Last Name": [normalize_alias(name) for name in alias_data.father_groom["last_name"]],
        "Alias - Mother of Groom First name": [normalize_alias(name) for name in alias_data.mother_groom["first_name"]],
        "Alias - Mother of Groom Last name": [normalize_alias(name) for name in alias_data.mother_groom["last_name"]],
        "Alias - Father of Bride First Name": [normalize_alias(name) for name in alias_data.father_bride["first_name"]],
        "Alias - Father of Bride Last Name": [normalize_alias(name) for name in alias_data.father_bride["last_name"]],
        "Alias - Mother of Bride First name": [normalize_alias(name) for name in alias_data.mother_bride["first_name"]],
        "Alias - Mother of Bride Last name": [normalize_alias(name) for name in alias_data.mother_bride["last_name"]]
    }

    # ✅ Print Debugging Info of Final Alias Assignment
    print(f"📌 Final Assigned Aliases: {alias_assignments}\n{'-'*50}")

    return alias_assignments

# ✅ Apply LLM Processing Row by Row for Alias Extraction
extracted_aliases = notes_with_data.apply(extract_aliases_from_notes, axis=1)

# ✅ Update `notes_df` with extracted aliases (Handling Multiple Entries)
for index, alias_data in extracted_aliases.items():
    for col, alias_list in alias_data.items():
        if alias_list:  # ✅ Only update if alias is present and column exists
            if col in notes_df.columns:
                existing_aliases = notes_df.loc[index, col]

                if pd.notna(existing_aliases) and existing_aliases != "":
                    # ✅ Convert existing aliases into a list, ensuring no duplicates
                    existing_aliases_list = [normalize_alias(x.strip()) for x in existing_aliases.split(",") if x.strip()]
                    new_aliases = list(set(existing_aliases_list + alias_list))  # ✅ Merge & remove duplicates
                    notes_df.loc[index, col] = ", ".join(new_aliases)  # ✅ Convert back to string
                else:
                    notes_df.loc[index, col] = ", ".join(alias_list)  # ✅ Store new aliases directly
            else:
                print(f"⚠️ Warning: Column '{col}' not found in DataFrame. Skipping update.")

# ✅ Save the updated DataFrame
notes_df.to_csv("corrected_marriage_records_llm_aliases.csv", index=False)

print("✅ Successfully extracted aliases from notes and assigned them correctly with proper capitalization.")


🔎 Processing 5 rows to extract aliases.

📝 Checking for aliases in note: "It was his 3rd marriage"
🔍 Full Marriage Record: {'Record #': 3, 'Last name - Groom': 'Iannotti', 'First name - Groom': 'Ascenzo', 'Father of Groom': 'Giuseppe', 'Mother of Groom': 'Teresa Buttari', 'Last name - Bride': 'De Santis', 'First name - Bride': 'Anna Berardina', 'Father of Bride': 'Nicola', 'Mother of Bride': 'Maddalena Del Rosso', 'Date': '2020-07-30', 'Year': 1821, 'Comune Groom': nan, 'Comune Bride': nan, 'Notes': 'It was his 3rd marriage', 'Deceased - Father of Groom': nan, 'Alias - Mother of Groom': 'BUTTA', 'Deceased - Mother of Groom': nan, 'Deceased - Father of Bride': nan, 'Alias - Mother of Bride': 'MADDA', 'Deceased - Mother of Bride': nan, 'Alias - Last name - Groom': nan, 'Alias - First name - Groom': nan, 'Alias - Last name - Bride': nan, 'Alias - First name - Bride': nan, 'Alias - Father of Groom First Name': nan, 'Alias - Father of Groom Last Name': nan, 'Alias - Father of Bride First Na

  notes_df.loc[index, col] = ", ".join(alias_list)  # ✅ Store new aliases directly
  notes_df.loc[index, col] = ", ".join(alias_list)  # ✅ Store new aliases directly


In [32]:
notes_df.iloc[0]

Record #                                                    3
Last name - Groom                                    Iannotti
First name - Groom                                    Ascenzo
Father of Groom                                      Giuseppe
Mother of Groom                                Teresa Buttari
Last name - Bride                                   De Santis
First name - Bride                             Anna Berardina
Father of Bride                                        Nicola
Mother of Bride                           Maddalena Del Rosso
Date                                               2020-07-30
Year                                                     1821
Comune Groom                                              NaN
Comune Bride                                              NaN
Notes                                 It was his 3rd marriage
Deceased - Father of Groom                                NaN
Alias - Mother of Groom                                 BUTTA
Deceased

# Define relationships, entities methods

In [33]:
notes_df = pd.read_csv("corrected_marriage_records_llm_aliases.csv")

In [34]:
import uuid
from pydantic import BaseModel
from typing import List, Optional
import pandas as pd
import numpy as np
from neo4j import GraphDatabase

class Person(BaseModel):
    id: str  # Unique Identifier
    fullname: str
    firstname: str
    lastname: str
    alias_firstname: str
    alias_lastname: str
    gender: str
    location: str = ""  # Default to empty string
    deceased: Optional[str] = ""
    notes: Optional[str] = ""  # ✅ Ensure `notes` is present


# ✅ Define Schema for Relationships
class Relationship(BaseModel):
    from_id: str  # Use unique IDs instead of fullnames
    to_id: str
    type: str
    date: Optional[str] = None

# ✅ Function to Format Date Correctly
def format_date(date_value, year_value):
    """Ensures the date is always in 'dd MMM yyyy' format."""
    if isinstance(date_value, pd.Timestamp):
        date_value = date_value.strftime("%d %b")  # Convert Timestamp
    elif isinstance(date_value, str):
        date_value = date_value.strip()

    year_value = str(year_value).strip() if pd.notna(year_value) else ""

    return f"{date_value} {year_value}" if date_value and year_value else None

# ✅ Function to Retrieve Deceased Status from df_llm
def get_deceased_status(row, person_type):
    """Retrieves correct deceased status from df_llm based on person type."""
    deceased_col = f"Deceased - {person_type}"
    return row[deceased_col] if deceased_col in row and pd.notna(row[deceased_col]) else ""

# ✅ Function to Clean and Extract Location
def clean_location(value):
    """Ensure location is either a valid string or default empty string."""
    return value.strip() if isinstance(value, str) and value.strip() else ""

In [35]:
def process_row(row):
    """Processes a marriage record row into structured JSON with notes included."""

    # ✅ Convert NaN to Empty String or None
    def clean_str(value):
        """Convert NaN to an empty string before stripping."""
        return str(value).strip() if isinstance(value, str) and pd.notna(value) else ""

    # ✅ Generate unique IDs for each person
    def generate_person(firstname, lastname, alias_firstname, alias_lastname, gender, location, deceased, notes):
        return Person(
            id=str(uuid.uuid4()),  # Unique UUID
            fullname=f"{firstname} {lastname}",
            firstname=firstname,
            lastname=lastname,
            alias_firstname=alias_firstname,
            alias_lastname=alias_lastname,
            gender=gender,
            location=location,  # Properly assign location
            deceased=deceased,
            notes=notes  # ✅ Store notes
        )

    # ✅ Extract and clean location for groom and bride
    groom_location = clean_location(row.get("Comune Groom", ""))
    bride_location = clean_location(row.get("Comune Bride", ""))

    # ✅ Extract Notes for Each Person
    groom_notes = clean_str(row.get("Notes - Groom", ""))
    bride_notes = clean_str(row.get("Notes - Bride", ""))
    father_groom_notes = clean_str(row.get("Notes - Father of Groom", ""))
    mother_groom_notes = clean_str(row.get("Notes - Mother of Groom", ""))
    father_bride_notes = clean_str(row.get("Notes - Father of Bride", ""))
    mother_bride_notes = clean_str(row.get("Notes - Mother of Bride", ""))

    # ✅ Create Persons with Notes
    groom = generate_person(clean_str(row["First name - Groom"]), clean_str(row["Last name - Groom"]),
                            clean_str(row.get("Alias - First name - Groom", "")), clean_str(row.get("Alias - Last name - Groom", "")),
                            "male", groom_location, get_deceased_status(row, "First name - Groom"), groom_notes)

    bride = generate_person(clean_str(row["First name - Bride"]), clean_str(row["Last name - Bride"]),
                            clean_str(row.get("Alias - First name - Bride", "")), clean_str(row.get("Alias - Last name - Bride", "")),
                            "female", bride_location, get_deceased_status(row, "First name - Bride"), bride_notes)

    father_groom = generate_person(clean_str(row["Father of Groom"]), clean_str(row["Last name - Groom"]),
                                   clean_str(row.get("Alias - Father of Groom First Name", "")), clean_str(row.get("Alias - Father of Groom Last Name", "")),
                                   "male", "", get_deceased_status(row, "Father of Groom"), father_groom_notes)

    mother_groom = generate_person(clean_str(row["First name - Mother of Groom"]), clean_str(row["Last name - Mother of Groom"]),
                                   clean_str(row.get("Alias - Mother of Groom First name", "")), clean_str(row.get("Alias - Mother of Groom Last name", "")),   
                                   "female", "", get_deceased_status(row, "Mother of Groom"), mother_groom_notes)

    father_bride = generate_person(clean_str(row["Father of Bride"]), clean_str(row["Last name - Bride"]),
                                   clean_str(row.get("Alias - Father of Bride First Name", "")), clean_str(row.get("Alias - Father of Bride Last Name", "")),
                                   "male", "", get_deceased_status(row, "Father of Bride"), father_bride_notes)

    mother_bride = generate_person(clean_str(row["First name - Mother of Bride"]), clean_str(row["Last name - Mother of Bride"]),
                                   clean_str(row.get("Alias - Mother of Bride First name", "")), clean_str(row.get("Alias - Mother of Bride Last name", "")),  
                                   "female", "", get_deceased_status(row, "Mother of Bride"), mother_bride_notes)

    persons = [groom, bride, father_groom, mother_groom, father_bride, mother_bride]

    # ✅ Define Relationships
    relationships = [
        Relationship(from_id=groom.id, to_id=bride.id, type="MARRIED_TO", date=format_date(row["Date"], row["Year"])),
        Relationship(from_id=bride.id, to_id=groom.id, type="MARRIED_TO", date=format_date(row["Date"], row["Year"])),
        Relationship(from_id=groom.id, to_id=father_groom.id, type="SON_OF"),
        Relationship(from_id=groom.id, to_id=mother_groom.id, type="SON_OF"),
        Relationship(from_id=bride.id, to_id=father_bride.id, type="DAUGHTER_OF"),
        Relationship(from_id=bride.id, to_id=mother_bride.id, type="DAUGHTER_OF"),
        Relationship(from_id=father_groom.id, to_id=mother_groom.id, type="MARRIED_TO"),
        Relationship(from_id=father_bride.id, to_id=mother_bride.id, type="MARRIED_TO"),
    ]

    return {"persons": [p.model_dump() for p in persons], "relationships": [r.model_dump() for r in relationships]}


# ✅ Process DataFrame from df_notes
records = [process_row(row) for _, row in notes_df.iterrows()]

# ✅ Save JSON output
with open("neo4j_data.json", "w") as f:
    json.dump(records, f, indent=2)

print("✅ Successfully processed marriage records with Correct Locations!")

✅ Successfully processed marriage records with Correct Locations!


# Insert to Neo4j

In [36]:
from pydantic import BaseModel
from typing import List, Optional
import pandas as pd
import json
import numpy as np
import re
from neo4j import GraphDatabase

# Define connection details
URI = "bolt://localhost:7687"  # Neo4j Bolt connection
USERNAME = "neo4j"
PASSWORD = "password"  # Replace with the password you set

# Create a Neo4j Driver instance
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

with open("neo4j_data.json", "r") as f:
    records = json.load(f)

# ✅ Display the first few records for review
#num_records_to_display = min(5, len(records))  # Show up to 5 records
#records[:num_records_to_display]  # Display sample records

In [37]:
from neo4j import GraphDatabase

# ✅ Neo4j Connection Settings
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Create Neo4j Driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def insert_record(tx, persons, relationships):
    """Inserts persons and relationships while ensuring `notes` is stored correctly."""

    # ✅ Step 1: Create Persons (Independent Insert)
    for person in persons:
        query = """
        CREATE (p:Person {
            uid: $uid,
            fullname: $fullname,
            firstname: $firstname,
            lastname: $lastname,
            alias_firstname: $alias_firstname,
            alias_lastname: $alias_lastname,
            gender: $gender,
            location: $location,
            deceased: $deceased,
            notes: $notes  // ✅ NEW: Assign `notes` directly
        })
        """
        params = {
            "uid": person["id"],  # Unique ID
            "fullname": person["fullname"],
            "firstname": person["firstname"],
            "lastname": person["lastname"],
            "alias_firstname": person["alias_firstname"],
            "alias_lastname": person["alias_lastname"],
            "gender": person["gender"],
            "location": person["location"],
            "deceased": person["deceased"],
            "notes": person["notes"]  # ✅ Ensure `notes` is correctly assigned
        }
        tx.run(query, **params)

    # ✅ Step 2: Create Relationships (Using UID)
    processed_marriages = set()  # ✅ Track already created marriages to prevent duplicates

    for rel in relationships:
        if rel["type"] == "MARRIED_TO":
            marriage_key = tuple(sorted([rel["from_id"], rel["to_id"]]))  # Unique key for marriage
            
            if marriage_key in processed_marriages:
                continue  # ✅ Skip duplicate marriage entry
            processed_marriages.add(marriage_key)

            query = """
            MATCH (a:Person {uid: $from_uid})
            MATCH (b:Person {uid: $to_uid})
            CREATE (a)-[:MARRIED_TO {date: $date, bidirectional: true}]->(b),
                   (b)-[:MARRIED_TO {date: $date, bidirectional: true}]->(a)
            """
        else:
            query = """
            MATCH (a:Person {uid: $from_uid})
            MATCH (b:Person {uid: $to_uid})
            CREATE (a)-[:`""" + rel["type"] + """`]->(b)
            """

        params = {
            "from_uid": rel["from_id"],  # Unique ID of from person
            "to_uid": rel["to_id"],      # Unique ID of to person
            "date": rel.get("date", None)  # Optional date
        }

        tx.run(query, **params)



# ✅ Process and Insert Each Record
with driver.session() as session:
    for record in records:
        persons = record["persons"]
        relationships = record["relationships"]
        session.execute_write(insert_record, persons, relationships)

print("✅ Successfully inserted marriage records with ONE bidirectional `MARRIED_TO` relationship per couple!")

✅ Successfully inserted marriage records with ONE bidirectional `MARRIED_TO` relationship per couple!


# Entity resolution

In [38]:
from neo4j import GraphDatabase

# ✅ Neo4j Connection Settings
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Configurable Batch Size
BATCH_SIZE = 2  # Change this value to adjust batch processing size

# ✅ Create Neo4j Driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def recursive_merge(tx, batch_size):
    """Recursively merges duplicate persons based on fullname, spouse fullname, and marriage date."""
    merge_query = f"""
    CALL apoc.periodic.iterate(
    "MATCH (p1:Person)-[m1:MARRIED_TO]->(s1:Person)
     MATCH (p2:Person)-[m2:MARRIED_TO]->(s2:Person)
     WHERE p1.fullname = p2.fullname
     AND s1.fullname = s2.fullname
     AND COALESCE(m1.date, '') = COALESCE(m2.date, '')  // ✅ Ensure marriage date matches, including empty strings
     AND id(p1) < id(p2)  // Prevent duplicate merging
     RETURN p1, p2, m1, m2, s1, s2",
     
    "WITH p1, p2, m1, m2, s1, s2
     // ✅ Merge nodes while avoiding duplicate relationships
     CALL apoc.refactor.mergeNodes([p1, p2]) YIELD node
     // ✅ Ensure only one MARRIED_TO relationship remains with correct date
     WITH node, s1, m1, m2
     MATCH (node)-[r:MARRIED_TO]->(s1)
     WITH node, s1, COLLECT(r) AS rels
     CALL apoc.refactor.mergeRelationships(rels, {{properties: 'combine'}}) YIELD rel
     RETURN COUNT(*)",
    
    {{batchSize: {batch_size}, parallel: false}})  // ✅ Dynamic batch size
    """

    tx.run(merge_query)

def count_nodes(tx):
    """Counts the total number of Person nodes."""
    result = tx.run("MATCH (p:Person) RETURN count(p) AS count")
    return result.single()["count"]

def merge_until_done(batch_size):
    """Runs the merging process iteratively until no more merges are possible."""
    with driver.session() as session:
        prev_count = session.execute_read(count_nodes)
        
        while True:
            session.execute_write(recursive_merge, batch_size)
            current_count = session.execute_read(count_nodes)
            
            print(f"🔄 Nodes before merge: {prev_count}, after merge: {current_count}")
            
            if current_count == prev_count:
                print("✅ No more merges available. Stopping process.")
                break  # Exit when no further merges happen
            
            prev_count = current_count  # Update previous count

# ✅ Run the Recursive Merging Process
merge_until_done(BATCH_SIZE)

print("🎯 Recursive merging complete: All duplicate persons merged successfully!")

🔄 Nodes before merge: 60, after merge: 58
🔄 Nodes before merge: 58, after merge: 58
✅ No more merges available. Stopping process.
🎯 Recursive merging complete: All duplicate persons merged successfully!


# Assign hierarchical embeddings 

#### Without notes included

In [40]:
from openai import OpenAI
from neo4j import GraphDatabase
import numpy as np
import json

# ✅ Neo4j Connection
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Ollama (Granite) Embedding Model
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
MODEL_NAME = "granite-embedding:278m"

# ✅ Function to ensure properties exist in Neo4j
def initialize_person_properties(tx):
    query = """
    MATCH (p:Person)
    SET p.birthday = COALESCE(p.birthday, ""),
        p.embedding_node = COALESCE(p.embedding_node, ""),
        p.embedding_spouse = COALESCE(p.embedding_spouse, ""),
        p.embedding_parents = COALESCE(p.embedding_parents, "")
    """
    tx.run(query)

# ✅ Function to get embeddings from Ollama
def get_embedding(text):
    print(f"🔹 Generating embedding for: {text}")  # Debugging
    response = client.embeddings.create(model=MODEL_NAME, input=[text])
    return response.data[0].embedding

# ✅ Function to fetch persons and their relationships
def fetch_graph_data(tx):
    query = """
    MATCH (p:Person)
    OPTIONAL MATCH (p)-[m:MARRIED_TO]->(spouse:Person)
    OPTIONAL MATCH (p)-[r:SON_OF|DAUGHTER_OF]->(parent:Person)
    RETURN p, m.date AS marriage_date, spouse, COLLECT(parent) AS parents
    """
    return list(tx.run(query))

# ✅ Convert Neo4j Node to Dictionary
def node_to_dict(n):
    return dict(n) if n else {}

# ✅ Function to generate hierarchical text descriptions
def generate_text(node, spouse, marriage_date, parents):
    node_dict = node_to_dict(node)
    spouse_dict = node_to_dict(spouse)
    parents_dict = [node_to_dict(parent) for parent in parents]

    # ✅ Node embedding (individual properties)
    node_text = f"Person: {node_dict.get('fullname', 'Unknown')}, Alias: {node_dict.get('alias_firstname', '')} {node_dict.get('alias_lastname', '')}, Birthday: {node_dict.get('birthday', '')}, Location: {node_dict.get('location', '')}, Deceased: {node_dict.get('deceased', '')}"

    # ✅ Spouse embedding (includes spouse properties)
    spouse_text = node_text
    if spouse:
        spouse_text += f" | Married to {spouse_dict.get('fullname', 'Unknown')}, Alias: {spouse_dict.get('alias_firstname', '')} {spouse_dict.get('alias_lastname', '')}, Birthday: {spouse_dict.get('birthday', '')}, Location: {spouse_dict.get('location', '')}, Deceased: {spouse_dict.get('deceased', '')}, Marriage Date: {marriage_date}"

    # ✅ Parent embedding (includes both parents' properties)
    parent_text = node_text
    if parents_dict:
        parent_text += " | Parents: " + " & ".join([
            f"{p.get('fullname', 'Unknown')}, Alias: {p.get('alias_firstname', '')} {p.get('alias_lastname', '')}, Birthday: {p.get('birthday', '')}, Location: {p.get('location', '')}, Deceased: {p.get('deceased', '')}"
            for p in parents_dict
        ])

    return node_text, spouse_text, parent_text



# ✅ Function to process embeddings and store them in Neo4j
def process_embeddings():
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

    with driver.session() as session:
        # ✅ Ensure `embedding_node`, `embedding_spouse`, and `embedding_parents` properties exist
        session.execute_write(initialize_person_properties)

        # ✅ Fetch graph data
        nodes = session.execute_read(fetch_graph_data)

        for record in nodes:
            node = record["p"]
            spouse = record["spouse"]
            marriage_date = record["marriage_date"]
            parents = record["parents"]

            # 🔹 Generate hierarchical descriptions
            node_text, spouse_text, parent_text = generate_text(node, spouse, marriage_date, parents)

            # 🔹 Compute embeddings
            embedding_node = get_embedding(node_text)
            embedding_spouse = get_embedding(spouse_text)
            embedding_parents = get_embedding(parent_text)

            # 🔹 Ensure embeddings are JSON-compatible lists
            embedding_node = embedding_node.tolist() if isinstance(embedding_node, np.ndarray) else embedding_node
            embedding_spouse = embedding_spouse.tolist() if isinstance(embedding_spouse, np.ndarray) else embedding_spouse
            embedding_parents = embedding_parents.tolist() if isinstance(embedding_parents, np.ndarray) else embedding_parents


            # ✅ Store embeddings in Neo4j using `uid`
            session.run(
                """
                MATCH (p:Person {uid: $uid})
                SET p.embedding_node = $embedding_node,
                    p.embedding_spouse = $embedding_spouse,
                    p.embedding_parents = $embedding_parents
                """,
                uid=node["uid"],  # ✅ Use `uid` instead of `id`
                embedding_node=embedding_node,
                embedding_spouse=embedding_spouse,
                embedding_parents=embedding_parents
            )


    print("\n✅ All embeddings stored in Neo4j.")

# 🔥 Run embedding process
process_embeddings()



🔹 Generating embedding for: Person: Giuseppe Iannotti, Alias:  , Birthday: , Location: , Deceased: 
🔹 Generating embedding for: Person: Giuseppe Iannotti, Alias:  , Birthday: , Location: , Deceased:  | Married to Teresa Buttari, Alias:  Butta, Birthday: , Location: , Deceased: , Marriage Date: None
🔹 Generating embedding for: Person: Giuseppe Iannotti, Alias:  , Birthday: , Location: , Deceased: 
🔹 Generating embedding for: Person: Teresa Buttari, Alias:  Butta, Birthday: , Location: , Deceased: 
🔹 Generating embedding for: Person: Teresa Buttari, Alias:  Butta, Birthday: , Location: , Deceased:  | Married to Giuseppe Iannotti, Alias:  , Birthday: , Location: , Deceased: , Marriage Date: None
🔹 Generating embedding for: Person: Teresa Buttari, Alias:  Butta, Birthday: , Location: , Deceased: 
🔹 Generating embedding for: Person: Nicola De Santis, Alias:  , Birthday: , Location: , Deceased: 
🔹 Generating embedding for: Person: Nicola De Santis, Alias:  , Birthday: , Location: , Deceased:

#### With notes included (DO NOT USE)

In [None]:
from openai import OpenAI
from neo4j import GraphDatabase
import numpy as np
import json

# ✅ Neo4j Connection
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Ollama (Granite) Embedding Model
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
MODEL_NAME = "granite-embedding:278m"

# ✅ Function to ensure properties exist in Neo4j
def initialize_person_properties(tx):
    query = """
    MATCH (p:Person)
    SET p.birthday = COALESCE(p.birthday, ""),
        p.embedding_node = COALESCE(p.embedding_node, ""),
        p.embedding_spouse = COALESCE(p.embedding_spouse, ""),
        p.embedding_parents = COALESCE(p.embedding_parents, ""),
        p.notes = COALESCE(p.notes, ""),  // ✅ Ensure 'notes' property exists
        p.embedding_notes = COALESCE(p.embedding_notes, "")  // ✅ Ensure 'embedding_notes' property exists
    """
    tx.run(query)

# ✅ Function to get embeddings from Ollama
def get_embedding(text):
    #print(f"🔹 Generating embedding for: {text}")  # Debugging
    response = client.embeddings.create(model=MODEL_NAME, input=[text])
    return response.data[0].embedding

# ✅ Function to fetch persons and their relationships
def fetch_graph_data(tx):
    query = """
    MATCH (p:Person)
    OPTIONAL MATCH (p)-[m:MARRIED_TO]->(spouse:Person)
    OPTIONAL MATCH (p)-[r:SON_OF|DAUGHTER_OF]->(parent:Person)
    RETURN p, m.date AS marriage_date, spouse, COLLECT(parent) AS parents, p.notes AS notes  // ✅ Fetch notes
    """
    return list(tx.run(query))

# ✅ Convert Neo4j Node to Dictionary
def node_to_dict(n):
    return dict(n) if n else {}

# ✅ Function to generate hierarchical text descriptions
def generate_text(node, spouse, marriage_date, parents, notes):
    node_dict = node_to_dict(node)
    spouse_dict = node_to_dict(spouse)
    parents_dict = [node_to_dict(parent) for parent in parents]

    # ✅ Extract notes
    notes_text = f"Notes: {notes}" if notes else "Notes: None"

    # ✅ Node embedding (individual properties + notes)
    node_text = f"Person: {node_dict.get('fullname', 'Unknown')}, Alias: {node_dict.get('alias_firstname', '')} {node_dict.get('alias_lastname', '')}, Birthday: {node_dict.get('birthday', '')}, Location: {node_dict.get('location', '')}, Deceased: {node_dict.get('deceased', '')}, {notes_text}"

    # ✅ Spouse embedding (includes spouse properties + notes)
    spouse_text = node_text
    if spouse:
        spouse_text += f" | Married to {spouse_dict.get('fullname', 'Unknown')}, Alias: {spouse_dict.get('alias_firstname', '')} {spouse_dict.get('alias_lastname', '')}, Birthday: {spouse_dict.get('birthday', '')}, Location: {spouse_dict.get('location', '')}, Deceased: {spouse_dict.get('deceased', '')}, Marriage Date: {marriage_date}, {notes_text}"

    # ✅ Parent embedding (includes both parents' properties + notes)
    parent_text = node_text
    if parents_dict:
        parent_text += " | Parents: " + " & ".join([
            f"{p.get('fullname', 'Unknown')}, Alias: {p.get('alias_firstname', '')} {p.get('alias_lastname', '')}, Birthday: {p.get('birthday', '')}, Location: {p.get('location', '')}, Deceased: {p.get('deceased', '')}"
            for p in parents_dict
        ]) + f", {notes_text}"

    return node_text, spouse_text, parent_text, notes_text

# ✅ Function to process embeddings and store them in Neo4j
def process_embeddings():
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

    with driver.session() as session:
        # ✅ Ensure `embedding_node`, `embedding_spouse`, `embedding_parents`, and `embedding_notes` properties exist
        session.execute_write(initialize_person_properties)

        # ✅ Fetch graph data
        nodes = session.execute_read(fetch_graph_data)

        for record in nodes:
            node = record["p"]
            spouse = record["spouse"]
            marriage_date = record["marriage_date"]
            parents = record["parents"]
            notes = record["notes"]  # ✅ Include notes

            # 🔹 Generate hierarchical descriptions
            node_text, spouse_text, parent_text, notes_text = generate_text(node, spouse, marriage_date, parents, notes)

            # 🔹 Compute embeddings
            embedding_node = get_embedding(node_text)
            embedding_spouse = get_embedding(spouse_text)
            embedding_parents = get_embedding(parent_text)
            embedding_notes = get_embedding(notes_text)  # ✅ Compute embedding for notes

            # 🔹 Ensure embeddings are JSON-compatible lists
            embedding_node = embedding_node.tolist() if isinstance(embedding_node, np.ndarray) else embedding_node
            embedding_spouse = embedding_spouse.tolist() if isinstance(embedding_spouse, np.ndarray) else embedding_spouse
            embedding_parents = embedding_parents.tolist() if isinstance(embedding_parents, np.ndarray) else embedding_parents
            embedding_notes = embedding_notes.tolist() if isinstance(embedding_notes, np.ndarray) else embedding_notes

            # 🔹 Debugging: Print before storing
            #print(f"\n🔹 Storing embeddings for: {node['fullname']} (ID: {node['id']})")

            # ✅ Store embeddings in Neo4j using `uid`
            session.run(
                """
                MATCH (p:Person {uid: $uid})
                SET p.embedding_node = $embedding_node,
                    p.embedding_spouse = $embedding_spouse,
                    p.embedding_parents = $embedding_parents,
                    p.embedding_notes = $embedding_notes  // ✅ Store notes embedding
                """,
                uid=node["uid"],  # ✅ Use `uid` instead of `id`
                embedding_node=embedding_node,
                embedding_spouse=embedding_spouse,
                embedding_parents=embedding_parents,
                embedding_notes=embedding_notes  # ✅ Store notes embedding
            )

    print("\n✅ All embeddings stored in Neo4j.")

# 🔥 Run embedding process
process_embeddings()

# Embeddings Merge

In [1]:
import os
import openai
#personal_openai_key = os.environ["PERSONAL_OPENAI_KEY"]
#personal_openai_key
openai.api_key = os.getenv("PERSONAL_OPENAI_API_KEY")
print(os.getenv("PERSONAL_OPENAI_API_KEY"))

None


In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


# List of names
names = ["Giovanni Tricarico", "GioBatta Tricarico", "Giovanni Battista Tricarico", "Gio Batta Tricarico"]

# Function to get embeddings using the new OpenAI API
def get_embedding(text, model="text-embedding-3-large"):
    response = openai.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# Compute embeddings for each name
embeddings = np.array([get_embedding(name) for name in names])

# Calculate cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings)

# Convert to DataFrame for better readability
df_similarity = pd.DataFrame(similarity_matrix, index=names, columns=names)

# Display the similarity matrix
print(df_similarity)

                   Giovanni  GioBatta  Giovanni Battista  Gio Batta
Giovanni           1.000000  0.670855           0.757999   0.610009
GioBatta           0.670855  1.000000           0.669497   0.867514
Giovanni Battista  0.757999  0.669497           1.000000   0.652523
Gio Batta          0.610009  0.867514           0.652523   1.000000


In [14]:
# List of names
names = ["Giovanni Tricarico", "GioBatta Tricarico", "Giovanni Battista Tricarico", "Gio Batta Tricarico"]

# Function to get embeddings using the new OpenAI API
def get_embedding(text, model="text-embedding-3-large"):
    response = openai.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# Compute embeddings for each name
embeddings = np.array([get_embedding(name) for name in names])

# Calculate cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings)

# Convert to DataFrame for better readability
df_similarity = pd.DataFrame(similarity_matrix, index=names, columns=names)

# Display the similarity matrix
print(df_similarity)

                             Giovanni Tricarico  GioBatta Tricarico  \
Giovanni Tricarico                     1.000000            0.819354   
GioBatta Tricarico                     0.819354            1.000000   
Giovanni Battista Tricarico            0.820493            0.785798   
Gio Batta Tricarico                    0.763882            0.914558   

                             Giovanni Battista Tricarico  Gio Batta Tricarico  
Giovanni Tricarico                              0.820493             0.763882  
GioBatta Tricarico                              0.785798             0.914558  
Giovanni Battista Tricarico                     1.000000             0.769057  
Gio Batta Tricarico                             0.769057             1.000000  


In [28]:
# List of names
names = [
    "Francesco Tricarico",
    "GioBatta Tricarico",
    "Marco Tricarico",
    "Gio Batta Tricarico",
    "Giuseppe Tricarico",
    "Samuele Tricarico",
    "Angelo Tricarico",
    "Giovanni Tricarico",
    "Giovanni Battista Tricarico",
    "Giovanni Battista Castoldi"
]


# Function to get embeddings using the new OpenAI API
def get_embedding(text, model="text-embedding-3-large"):
    response = openai.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# Compute embeddings for each name
embeddings = np.array([get_embedding(name) for name in names])

# Calculate cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings)

# Convert to DataFrame for better readability
df_similarity = pd.DataFrame(similarity_matrix, index=names, columns=names)

In [29]:
df_similarity

Unnamed: 0,Francesco Tricarico,GioBatta Tricarico,Marco Tricarico,Gio Batta Tricarico,Giuseppe Tricarico,Samuele Tricarico,Angelo Tricarico,Giovanni Tricarico,Giovanni Battista Tricarico,Giovanni Battista Castoldi
Francesco Tricarico,1.0,0.69401,0.728203,0.670867,0.773177,0.721264,0.606272,0.744846,0.646157,0.297893
GioBatta Tricarico,0.69401,1.0,0.623108,0.914462,0.777523,0.684822,0.635687,0.819313,0.785837,0.440798
Marco Tricarico,0.728203,0.623108,1.0,0.589992,0.671922,0.621264,0.54813,0.626173,0.530276,0.256126
Gio Batta Tricarico,0.670867,0.914462,0.589992,1.0,0.766961,0.659993,0.658352,0.76395,0.769119,0.425645
Giuseppe Tricarico,0.773177,0.777523,0.671922,0.766961,1.0,0.797099,0.715264,0.802879,0.760382,0.3984
Samuele Tricarico,0.721264,0.684822,0.621264,0.659993,0.797099,1.0,0.638092,0.714856,0.649502,0.329799
Angelo Tricarico,0.606272,0.635687,0.54813,0.658352,0.715264,0.638092,1.0,0.71959,0.658104,0.331574
Giovanni Tricarico,0.744846,0.819313,0.626173,0.76395,0.802879,0.714856,0.71959,1.0,0.820606,0.442206
Giovanni Battista Tricarico,0.646157,0.785837,0.530276,0.769119,0.760382,0.649502,0.658104,0.820606,1.0,0.529121
Giovanni Battista Castoldi,0.297893,0.440798,0.256126,0.425645,0.3984,0.329799,0.331574,0.442206,0.529121,1.0


In [None]:
# Opensource Ollama

In [32]:
import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the OpenAI-compatible Ollama client
client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",
)

# Define the model name (use "snowflake-arctic-embed2" if "granite-embedding:278m" is unavailable)
MODEL_NAME = "granite-embedding:278m"

# List of names to compare
names = [
    "Francesco Tricarico",
    "GioBatta Tricarico",
    "Marco Tricarico",
    "Gio Batta Tricarico",
    "Giuseppe Tricarico",
    "Samuele Tricarico",
    "Angelo Tricarico",
    "Giovanni Tricarico",
    "Giovanni Battista Tricarico",
    "Giovanni Battista Castoldi"
]

# Function to get embeddings from Ollama
def get_embedding(text):
    response = client.embeddings.create(model=MODEL_NAME, input=[text])
    return response.data[0].embedding

# Compute embeddings for each name
embeddings = np.array([get_embedding(name) for name in names])

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings)

# Convert to DataFrame for better readability
df_similarity = pd.DataFrame(similarity_matrix, index=names, columns=names)

# Save results to a CSV file
df_similarity.to_csv("ollama_embedding_similarity.csv")

# Print results
df_similarity

Unnamed: 0,Francesco Tricarico,GioBatta Tricarico,Marco Tricarico,Gio Batta Tricarico,Giuseppe Tricarico,Samuele Tricarico,Angelo Tricarico,Giovanni Tricarico,Giovanni Battista Tricarico,Giovanni Battista Castoldi
Francesco Tricarico,1.0,0.780938,0.883688,0.785554,0.874762,0.810909,0.842123,0.865042,0.811937,0.531871
GioBatta Tricarico,0.780938,1.0,0.792147,0.964006,0.832647,0.757723,0.782007,0.857478,0.885072,0.663624
Marco Tricarico,0.883688,0.792147,1.0,0.794121,0.830963,0.836522,0.852882,0.835993,0.790042,0.505922
Gio Batta Tricarico,0.785554,0.964006,0.794121,1.0,0.835634,0.76006,0.796135,0.853442,0.906988,0.695141
Giuseppe Tricarico,0.874762,0.832647,0.830963,0.835634,1.0,0.823639,0.854375,0.934586,0.874504,0.609797
Samuele Tricarico,0.810909,0.757723,0.836522,0.76006,0.823639,1.0,0.839948,0.804932,0.776651,0.511222
Angelo Tricarico,0.842123,0.782007,0.852882,0.796135,0.854375,0.839948,1.0,0.858966,0.824856,0.544978
Giovanni Tricarico,0.865042,0.857478,0.835993,0.853442,0.934586,0.804932,0.858966,1.0,0.928692,0.650092
Giovanni Battista Tricarico,0.811937,0.885072,0.790042,0.906988,0.874504,0.776651,0.824856,0.928692,1.0,0.75223
Giovanni Battista Castoldi,0.531871,0.663624,0.505922,0.695141,0.609797,0.511222,0.544978,0.650092,0.75223,1.0


In [33]:
#0.80 threshold
# Define threshold
THRESHOLD = 0.80

# Extract name pairs with similarity above threshold
name_pairs = []
for i in range(len(df_similarity)):
    for j in range(i + 1, len(df_similarity)):  # Avoid redundant pairs and self-matches
        similarity_score = df_similarity.iloc[i, j]
        if similarity_score >= THRESHOLD:
            name_pairs.append({"name_1": df_similarity.index[i], "name_2": df_similarity.columns[j]})

name_pairs

[{'name_1': 'Francesco Tricarico', 'name_2': 'Marco Tricarico'},
 {'name_1': 'Francesco Tricarico', 'name_2': 'Giuseppe Tricarico'},
 {'name_1': 'Francesco Tricarico', 'name_2': 'Samuele Tricarico'},
 {'name_1': 'Francesco Tricarico', 'name_2': 'Angelo Tricarico'},
 {'name_1': 'Francesco Tricarico', 'name_2': 'Giovanni Tricarico'},
 {'name_1': 'Francesco Tricarico', 'name_2': 'Giovanni Battista Tricarico'},
 {'name_1': 'GioBatta Tricarico', 'name_2': 'Gio Batta Tricarico'},
 {'name_1': 'GioBatta Tricarico', 'name_2': 'Giuseppe Tricarico'},
 {'name_1': 'GioBatta Tricarico', 'name_2': 'Giovanni Tricarico'},
 {'name_1': 'GioBatta Tricarico', 'name_2': 'Giovanni Battista Tricarico'},
 {'name_1': 'Marco Tricarico', 'name_2': 'Giuseppe Tricarico'},
 {'name_1': 'Marco Tricarico', 'name_2': 'Samuele Tricarico'},
 {'name_1': 'Marco Tricarico', 'name_2': 'Angelo Tricarico'},
 {'name_1': 'Marco Tricarico', 'name_2': 'Giovanni Tricarico'},
 {'name_1': 'Gio Batta Tricarico', 'name_2': 'Giuseppe Tri

In [35]:
import json
from openai import OpenAI

# Initialize the OpenAI-compatible Ollama client
client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",
)


# Structured prompt to guide the model
prompt = f"""
You are a genealogical expert specializing in Italian records. 
Your task is to determine the probability that two names belong to the same person, based purely on **historical naming conventions, phonetics, and linguistic rules**.

### **Instructions:**
- **Ignore cosine similarity scores** and use expert knowledge.
- **Assess name relationships** based on common Italian naming patterns:
  - Shortened versions (e.g., "Enzo" → "Vincenzo")
  - Regional variations
  - Phonetic similarities (e.g., "Maria Anna" -> "Marianna")
- **Provide a probability (%)** that the two names are the same person, using this scale:
  - **90-100%** → Almost certain match
  - **70-89%** → Highly probable
  - **50-69%** → Possible match
  - **30-49%** → Unlikely match
  - **0-29%** → Almost certainly different individuals

---

### **Name Pairs for Analysis:**
{json.dumps(name_pairs, indent=4)}

**Expected JSON Output Format:**
```json
[
    {{"name_1": "Francesco Tricarico", "name_2": "Marco Tricarico", "probability": 15}},
]
Return only a valid JSON response, with no extra explanations. """

response = client.chat.completions.create( model="mistral:7b", temperature=0, messages=[{"role": "user", "content": prompt}] )

print(response.choices[0].message.content)

 [
        {"name_1": "Francesco Tricarico", "name_2": "Marco Tricarico", "probability": 30},
        {"name_1": "Francesco Tricarico", "name_2": "Giuseppe Tricarico", "probability": 30},
        {"name_1": "Francesco Tricarico", "name_2": "Samuele Tricarico", "probability": 30},
        {"name_1": "Francesco Tricarico", "name_2": "Angelo Tricarico", "probability": 30},
        {"name_1": "Francesco Tricarico", "name_2": "Giovanni Tricarico", "probability": 50},
        {"name_1": "Francesco Tricarico", "name_2": "Giovanni Battista Tricarico", "probability": 70},
        {"name_1": "GioBatta Tricarico", "name_2": "Gio Batta Tricarico", "probability": 90},
        {"name_1": "GioBatta Tricarico", "name_2": "Giuseppe Tricarico", "probability": 30},
        {"name_1": "GioBatta Tricarico", "name_2": "Giovanni Tricarico", "probability": 50},
        {"name_1": "GioBatta Tricarico", "name_2": "Giovanni Battista Tricarico", "probability": 90},
        {"name_1": "Marco Tricarico", "name_2": 

In [15]:
# MATCH (n) DETACH DELETE n

# MATCH (n) OPTIONAL MATCH (n)-[r]-() RETURN n, r;

# MATCH (p:Person)-[r]-(neighbor) WHERE p.firstname = "Paolantonio" RETURN p, r, neighbor;

# MATCH (n) RETURN count(n);
