In [1]:
from openai import OpenAI

client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama',
)

In [2]:
from neo4j import GraphDatabase

# Define connection details
URI = "bolt://localhost:7687"  # Neo4j Bolt connection
USERNAME = "neo4j"
PASSWORD = "password"  # Replace with the password you set

# Create a Neo4j Driver instance
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

# Define a function to test the connection
def test_connection():
    with driver.session() as session:
        result = session.run("RETURN 'Connected to Neo4j' AS message")
        for record in result:
            print(record["message"])

# Run test
test_connection()

# Close the driver when done
#driver.close()

Connected to Neo4j


In [3]:
import pandas as pd

# Define the file path
file_path = "IPR Avezzano Indexes 1809-1865.xlsx"

# Load the specific sheet "Marriages 1821 - 1845"
df = pd.read_excel(file_path, sheet_name="Marriages 1821-1845").head(5)

In [4]:
df.head()

Unnamed: 0,Record #,Last name - Groom,First name - Groom,Father of Groom,Mother of Groom,Last name - Bride,First name - Bride,Father of Bride,Mother of Bride,Date,Year,Comune Groom,Comune Bride,Notes
0,3,Iannotti,Ascenzo,Giuseppe,Teresa Buttari(BUTTANA),de Santis,Anna Berardina,Nicola,Maddalena del Rosso,2020-07-30,1821,,,
1,6,Lolli,Anselmo,fu Andrea,Camilla Liberati,Collalto,Anna Giuseppa,Gabrielefu,fu Elisabetta Savina,2020-12-09,1821,,,
2,2,Paciotti,Francesco,Pietro,Giacinta di Simone,Lolli,Maria,Costanzo,Domenica Colella,2020-05-20,1821,,,
3,1,Pennazza,Giuseppe,Gennaro,Gesualda di Clemente,Novelli,Marianna,Francesco,Maria Domenica Marianella,2020-01-21,1821,,,
4,5,Rodorigo,Giacomo,Felice,Rosa di Crescenzo,Donsante,Aurora,Arcangelo,Clementina Cosimo,2020-11-25,1821,,,


# Regex (fu, alias)

In [42]:
import pandas as pd
import re

def extract_alias(name):
    """
    ✅ Extracts alias name from brackets `()`, removing it from the main name.
    - Handles inconsistent spacing like `Melchior(Melchiorre)`, `Melchior   (Melchiorre)`.
    """
    if not isinstance(name, str) or name.strip() == "":
        return name, ""  # ✅ Return unchanged name, no alias

    match = re.search(r"\s*\(\s*([^)]*?)\s*\)\s*", name)  # Extract content inside brackets
    alias = match.group(1).strip() if match else ""  # ✅ Extract alias name

    name = re.sub(r"\s*\(\s*[^)]*?\s*\)\s*", "", name).strip()  # ✅ Remove brackets & alias

    return name.strip(), alias.strip()


def format_proper_case(name):
    """
    ✅ Converts all words to start with a capital letter.
    - Fixes "di", "del", "de".
    - Removes "fu" / "fù" (even if attached like "Gabrielefu").
    - Extracts alias names from brackets `()`.
    - Detects if the person was deceased.
    """
    if not isinstance(name, str) or name.strip() == "":
        return name, "", ""  # ✅ Return unchanged name, empty deceased, no alias

    # ✅ Detect "fu" / "fù" at the start OR attached to the name (e.g., "fu Costanza", "Gabrielefu")
    deceased = ""
    name = name.strip()

    match = re.match(r"^(fu|fù)\s*", name, re.IGNORECASE)  # Match "fu " or "fù " at the start
    match_attached = re.search(r"(fu|fù)$", name, re.IGNORECASE)  # Match "fu" or "fù" at the end

    if match:
        deceased = "before {date}"  # 🔹 Placeholder (updated later)
        name = name[len(match.group(0)):].strip()  # ✅ Remove "fu"/"fù" at the start

    elif match_attached:
        deceased = "before {date}"  # 🔹 Placeholder (updated later)
        name = name[:match_attached.start()].strip()  # ✅ Remove "fu"/"fù" at the end

    # ✅ Extract Alias Name
    name, alias = extract_alias(name)

    # ✅ Fix "di", "del", "de" capitalization
    name = re.sub(r"\b(di|del|de)(?=\s|\b)", lambda x: x.group(1).capitalize(), name, flags=re.IGNORECASE)

    # ✅ Capitalize all words
    name = " ".join(word.capitalize() for word in name.split())

    return name, alias, deceased  # ✅ Return formatted name, alias, deceased status


def format_deceased_status(name, date, year):
    """
    ✅ If "fu" / "fù" is present, mark person as deceased **before the marriage date**.
    ✅ Extracts alias if present in `()` and formats properly.
    ✅ Fixes Timestamp issue by formatting the date properly.
    """
    formatted_name, alias, deceased = format_proper_case(name)  # ✅ Format name, detect "fu", extract alias

    # ✅ Ensure date is properly converted to a string in "09 Dec" format
    if deceased and pd.notna(date) and pd.notna(year):
        date_str = date.strftime("%d %b") if isinstance(date, pd.Timestamp) else str(date).strip()
        year_str = str(year).strip()
        deceased = f"before {date_str} {year_str}"  # ✅ Correct format: "before 09 Dec 1821"

    return formatted_name, alias, deceased  # ✅ Returns three values


# ✅ Work on a **COPY** of df (DO NOT modify the original)
df_corrected = df.copy()

# ✅ Strip spaces from column names
df_corrected.columns = df_corrected.columns.str.strip()

# ✅ Define columns that get Alias + Deceased handling
columns_with_alias_deceased = [
    "Father of Groom", "Mother of Groom",
    "Father of Bride", "Mother of Bride"
]

# ✅ Define columns for Groom & Bride to get Alias (but NO Deceased)
columns_with_alias = [
    "Last name - Groom", "First name - Groom",
    "Last name - Bride", "First name - Bride"
]

# ✅ Apply transformations for Parents (Alias + Deceased)
for col in columns_with_alias_deceased:
    if col in df_corrected.columns:
        df_corrected[[col, f"Alias - {col}", f"Deceased - {col}"]] = df_corrected.apply(
            lambda row: format_deceased_status(row[col], row["Date"], row["Year"]), axis=1, result_type="expand"
        )

# ✅ Apply transformations for Groom & Bride (Alias ONLY)
for col in columns_with_alias:
    if col in df_corrected.columns:
        df_corrected[[col, f"Alias - {col}"]] = df_corrected.apply(
            lambda row: format_proper_case(row[col])[:2], axis=1, result_type="expand"
        )

# ✅ Apply Special Case for **Father's First Name Alias & Last Name Alias** (inherits Groom/Bride last name)
for role in ["Father of Groom", "Father of Bride"]:
    if role in df_corrected.columns:
        df_corrected[f"Alias - {role} First Name"] = df_corrected[f"Alias - {role}"]  # ✅ First name alias
        df_corrected[f"Alias - {role} Last Name"] = df_corrected["Last name - " + role.split(" ")[-1]]  # ✅ Inherit last name

# ✅ Apply Special Case for **Mother's Alias** (Full Name Alias)
for role in ["Mother of Groom", "Mother of Bride"]:
    if role in df_corrected.columns:
        df_corrected[f"Alias - {role}"] = df_corrected[f"Alias - {role}"]  # ✅ Full alias, not split

# ✅ REMOVE COLUMNS: "Alias - Father of Groom" & "Alias - Father of Bride"
df_corrected.drop(columns=["Alias - Father of Groom", "Alias - Father of Bride"], inplace=True, errors="ignore")

# ✅ **NEW: If Alias - Father Last Name is identical to Groom/Bride Last Name, REMOVE VALUE**
for role in ["Groom", "Bride"]:
    father_alias_col = f"Alias - Father of {role} Last Name"
    father_lastname_col = f"Last name - {role}"

    if father_alias_col in df_corrected.columns and father_lastname_col in df_corrected.columns:
        df_corrected[father_alias_col] = df_corrected.apply(
            lambda row: "" if row[father_alias_col] == row[father_lastname_col] else row[father_alias_col], axis=1
        )

# ✅ **NEW RULE: If Alias - Last Name - Groom/Bride is populated, fill Alias - Father Last Name**
for role in ["Groom", "Bride"]:
    alias_last_name_col = f"Alias - Last name - {role}"
    alias_father_last_name_col = f"Alias - Father of {role} Last Name"

    if alias_last_name_col in df_corrected.columns and alias_father_last_name_col in df_corrected.columns:
        df_corrected[alias_father_last_name_col] = df_corrected.apply(
            lambda row: row[alias_last_name_col] if pd.notna(row[alias_last_name_col]) and row[alias_last_name_col] != "" else row[alias_father_last_name_col],
            axis=1
        )

# ✅ Save Corrected Data (Safe Copy)
df_corrected.to_csv("corrected_marriage_records.csv", index=False)

print("✅ Successfully formatted names, extracted aliases, handled 'fu', fixed date format, removed extra alias values when redundant, and saved deceased statuses!")

✅ Successfully formatted names, extracted aliases, handled 'fu', fixed date format, removed extra alias values when redundant, and saved deceased statuses!


In [43]:
df_corrected.tail()

Unnamed: 0,Record #,Last name - Groom,First name - Groom,Father of Groom,Mother of Groom,Last name - Bride,First name - Bride,Father of Bride,Mother of Bride,Date,...,Alias - Mother of Bride,Deceased - Mother of Bride,Alias - Last name - Groom,Alias - First name - Groom,Alias - Last name - Bride,Alias - First name - Bride,Alias - Father of Groom First Name,Alias - Father of Groom Last Name,Alias - Father of Bride First Name,Alias - Father of Bride Last Name
95,5,Ricci,Angelo Raffaele,Vincenzo,Maria Nunziata Fantauzzi,Sorge,Domenica,Domenico,Clementina Cipollone,2020-02-04,...,,,,,,,,,,
96,8,Fiasca,Angelo Nicola,Michelangelo,Francesca Borelli,Cipollone,Rosaria,Francesco,Domenica Lolli,2020-05-04,...,,,,,,,,,,
97,13,Lolli,Antonio,Diego,Maria Luigia Ricci,Giovanni Iori,Angela Maria,Benedetto,Teresa Lanciani,2020-10-28,...,,before 28 Oct 1827,,,Giovagnorio,,,,,Giovagnorio
98,1,Seritti,Bartolomeo,Epifanio,Barbara Collalto,Incari,Benedetta,Vincenzo,Anna Felice Giffi,2020-01-26,...,,,,,,,,,,
99,3,Del Rosso,Domenicantonio,Paolo,Maria Giuseppa Collalto,Carlucci,Anna Francesca,Berardino,Rosalba Ricci,2020-02-03,...,,before 03 Feb 1827,,,,,,,,


In [45]:
df_corrected.iloc[91]

Record #                                                2
Last name - Groom                                 Iannini
First name - Groom                                Tommaso
Father of Groom                                  Raffaele
Mother of Groom                           Agnese Melchior
Last name - Bride                                  Fiasca
First name - Bride                              Maddalena
Father of Bride                                    Nicola
Mother of Bride                             Cecilia Spina
Date                                  2020-03-19 00:00:00
Year                                                 1826
Comune Groom                                       Aquila
Comune Bride                                          NaN
Notes                                                 NaN
Deceased - Father of Groom                               
Alias - Mother of Groom                        Melchiorre
Deceased - Mother of Groom                               
Deceased - Fat

# Check first name and last name for mothers with LLM

In [46]:
from openai import OpenAI
import pandas as pd
import json
from pydantic import BaseModel

# ✅ Setup DeepSeek LLM
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

# ✅ Define Pydantic Model for Structured Response
class NameSplit(BaseModel):
    firstname: str
    lastname: str

# ✅ Function to Generate LLM Prompt
def generate_name_split_prompt(full_name):
    """
    Generates a structured prompt for the LLM to split names.
    - Keeps **full names intact** (no missing middle names).
    - Uses structured JSON output via Pydantic.
    """
    return f"""
    You are an expert in **historical genealogical records** in Italian.
    Your task is to **correctly split Italian full names of mothers** into:
    
    - **First Name:** The complete first name (including double names if present).
    - **Last Name:** The proper last name (with correct capitalization).

    📌 **Rules:**
    - **Do not change spelling** of any names.
    - Ensure the **full name remains intact**.
    - Format output as JSON with `"firstname"`, `"lastname"`.

    ---
    **Example Inputs → Outputs**
    - `"Maria Domenica Marianella"` → `{{
        "firstname": "Maria Domenica",
        "lastname": "Marianella"
    }}`
    - `"Rosa De Luca"` → `{{
        "firstname": "Rosa",
        "lastname": "De Luca"
    }}`

    ---
    🔍 **Now split the following name correctly:**
    **Input:** `{full_name}`
    **Output (JSON only):**
    """

# ✅ Function to Process Mother Names using LLM
def process_mother_names(row):
    """
    Uses the LLM to correct 'Mother of Groom' and 'Mother of Bride' names,
    and splits them into first and last names.
    """
    corrected_data = {}

    for col in ["Mother of Groom", "Mother of Bride"]:
        full_name = row[col].strip() if pd.notna(row[col]) else ""

        if not full_name or " " not in full_name:  # Skip if no valid name
            corrected_data[col] = {"firstname": full_name, "lastname": ""}
            continue

        # ✅ Call LLM using `.parse()` for structured response
        try:
            response = client.beta.chat.completions.parse(
                model="mistral:7b",
                temperature=0,
                messages=[{"role": "user", "content": generate_name_split_prompt(full_name)}],
                response_format=NameSplit,  # ✅ Structured Pydantic Response
            )

            # ✅ Extract structured data
            if response.choices[0].message.parsed:
                parsed_data = response.choices[0].message.parsed
                corrected_data[col] = {
                    "firstname": parsed_data.firstname.strip(),
                    "lastname": parsed_data.lastname.strip()
                }
                print(f"✅ {full_name} → First name: {parsed_data.firstname}, Last name: {parsed_data.lastname}")

            else:
                print(f"⚠️ LLM Refused or Failed for: {full_name}")
                corrected_data[col] = {"firstname": full_name, "lastname": ""}

        except Exception as e:
            print(f"❌ Error processing '{full_name}': {e}")
            corrected_data[col] = {"firstname": full_name, "lastname": ""}  # Keep fallback

    return corrected_data

# ✅ Work on a **COPY** of df_corrected → Output will be df_llm
df_llm = df_corrected.copy()

# ✅ Apply LLM Correction Row by Row
corrected_mother_names = df_llm.apply(process_mother_names, axis=1)

# ✅ Convert corrected data into separate columns
for index, corrections in corrected_mother_names.items():
    for col, values in corrections.items():
        df_llm.loc[index, f"First name - {col}"] = values["firstname"]
        df_llm.loc[index, f"Last name - {col}"] = values["lastname"]

# ✅ Save Corrected Data (Safe Copy)
df_llm.to_csv("corrected_marriage_records_llm.csv", index=False)

print("✅ Successfully corrected mother names using LLM and saved the new dataset!")

✅ Teresa Buttari → First name: Teresa, Last name: Buttari
✅ Maddalena Del Rosso → First name: Maddalena, Last name: Del Rosso
✅ Camilla Liberati → First name: Camilla, Last name: Liberati
✅ Elisabetta Savina → First name: Elisabetta, Last name: Savina
✅ Giacinta Di Simone → First name: Giacinta, Last name: Di Simone
✅ Domenica Colella → First name: Domenica, Last name: Colella
✅ Gesualda Di Clemente → First name: Gesualda, Last name: Di Clemente
✅ Maria Domenica Marianella → First name: Maria Domenica, Last name: Marianella
✅ Rosa Di Crescenzo → First name: Rosa, Last name: Di Crescenzo
✅ Clementina Cosimo → First name: Clementina, Last name: Cosimo
✅ Vittoria Gatto → First name: Vittoria, Last name: Gatto
✅ Felicia Di Pietro → First name: Felicia, Last name: Di Pietro
✅ Caterina Iacoboni → First name: Caterina, Last name: Iacoboni
✅ Gesualda Di Clemente → First name: Gesualda, Last name: Di Clemente
✅ Maria Felice Paciotti → First name: Maria Felice, Last name: Paciotti
✅ Maria Sorge 

# TO DO:

- Feliceantonio, through llm, make it an alias for name "Felice Antonio"

- Use llm to distinguish the field "alias lastname" and "alias name" for mother of groom / bride

- Create field for "comune"

- Read notes via LLM and create custom properties or adjustments

In [47]:
df_llm.head()

Unnamed: 0,Record #,Last name - Groom,First name - Groom,Father of Groom,Mother of Groom,Last name - Bride,First name - Bride,Father of Bride,Mother of Bride,Date,...,Alias - Last name - Bride,Alias - First name - Bride,Alias - Father of Groom First Name,Alias - Father of Groom Last Name,Alias - Father of Bride First Name,Alias - Father of Bride Last Name,First name - Mother of Groom,Last name - Mother of Groom,First name - Mother of Bride,Last name - Mother of Bride
0,3,Iannotti,Ascenzo,Giuseppe,Teresa Buttari,De Santis,Anna Berardina,Nicola,Maddalena Del Rosso,2020-07-30,...,,,,,,,Teresa,Buttari,Maddalena,Del Rosso
1,6,Lolli,Anselmo,Andrea,Camilla Liberati,Collalto,Anna Giuseppa,Gabriele,Elisabetta Savina,2020-12-09,...,,,,,,,Camilla,Liberati,Elisabetta,Savina
2,2,Paciotti,Francesco,Pietro,Giacinta Di Simone,Lolli,Maria,Costanzo,Domenica Colella,2020-05-20,...,,,,,,,Giacinta,Di Simone,Domenica,Colella
3,1,Pennazza,Giuseppe,Gennaro,Gesualda Di Clemente,Novelli,Marianna,Francesco,Maria Domenica Marianella,2020-01-21,...,,,,,,,Gesualda,Di Clemente,Maria Domenica,Marianella
4,5,Rodorigo,Giacomo,Felice,Rosa Di Crescenzo,Donsante,Aurora,Arcangelo,Clementina Cosimo,2020-11-25,...,,,,,,,Rosa,Di Crescenzo,Clementina,Cosimo


# Create relationships, entities

In [11]:
df_llm = pd.read_csv("corrected_marriage_records_llm.csv")
import json

In [12]:
import uuid
from pydantic import BaseModel
from typing import List, Optional
import pandas as pd
import numpy as np
from neo4j import GraphDatabase

# ✅ Define Schema for Persons
class Person(BaseModel):
    id: str  # Unique Identifier
    fullname: str
    firstname: str
    lastname: str
    alias_firstname: str
    alias_lastname: str
    gender: str
    location: str = ""  # Default to empty string
    deceased: Optional[str] = ""

# ✅ Define Schema for Relationships
class Relationship(BaseModel):
    from_id: str  # Use unique IDs instead of fullnames
    to_id: str
    type: str
    date: Optional[str] = None

# ✅ Function to Format Date Correctly
def format_date(date_value, year_value):
    """Ensures the date is always in 'dd MMM yyyy' format."""
    if isinstance(date_value, pd.Timestamp):
        date_value = date_value.strftime("%d %b")  # Convert Timestamp
    elif isinstance(date_value, str):
        date_value = date_value.strip()

    year_value = str(year_value).strip() if pd.notna(year_value) else ""

    return f"{date_value} {year_value}" if date_value and year_value else None

# ✅ Function to Retrieve Deceased Status from df_llm
def get_deceased_status(row, person_type):
    """Retrieves correct deceased status from df_llm based on person type."""
    deceased_col = f"Deceased - {person_type}"
    return row[deceased_col] if deceased_col in row and pd.notna(row[deceased_col]) else ""

# ✅ Function to Clean and Extract Location
def clean_location(value):
    """Ensure location is either a valid string or default empty string."""
    return value.strip() if isinstance(value, str) and value.strip() else ""

In [14]:
# ✅ Process Each Marriage Record Row
def process_row(row):
    """Processes a marriage record row into structured JSON with unique IDs and correct location handling."""

    # ✅ Convert NaN to Empty String or None
    def clean_str(value):
        """Convert NaN to an empty string before stripping."""
        return str(value).strip() if isinstance(value, str) else ""

    # ✅ Generate unique IDs for each person
    def generate_person(firstname, lastname, alias_firstname, alias_lastname, gender, location, deceased):
        return Person(
            id=str(uuid.uuid4()),  # Unique UUID
            fullname=f"{firstname} {lastname}",
            firstname=firstname,
            lastname=lastname,
            alias_firstname=alias_firstname,
            alias_lastname=alias_lastname,
            gender=gender,
            location=location,  # Properly assign location
            deceased=deceased,
        )

    # ✅ Extract and clean location for groom and bride
    groom_location = clean_location(row.get("Comune Groom", ""))
    bride_location = clean_location(row.get("Comune Bride", ""))

    # ✅ Create Persons
    groom = generate_person(clean_str(row["First name - Groom"]), clean_str(row["Last name - Groom"]),
                            clean_str(row.get("Alias - First name - Groom", "")), clean_str(row.get("Alias - Last name - Groom", "")),
                            "male", groom_location, get_deceased_status(row, "First name - Groom"))

    bride = generate_person(clean_str(row["First name - Bride"]), clean_str(row["Last name - Bride"]),
                            clean_str(row.get("Alias - First name - Bride", "")), clean_str(row.get("Alias - Last name - Bride", "")),
                            "female", bride_location, get_deceased_status(row, "First name - Bride"))

    father_groom = generate_person(clean_str(row["Father of Groom"]), clean_str(row["Last name - Groom"]),
                                   clean_str(row.get("Alias - Father of Groom First Name", "")), clean_str(row.get("Alias - Father of Groom Last Name", "")),
                                   "male", "", get_deceased_status(row, "Father of Groom"))

    mother_groom = generate_person(clean_str(row["First name - Mother of Groom"]), clean_str(row["Last name - Mother of Groom"]),
                                   "", clean_str(row.get("Alias - Mother of Groom", "")),
                                   "female", "", get_deceased_status(row, "Mother of Groom"))

    father_bride = generate_person(clean_str(row["Father of Bride"]), clean_str(row["Last name - Bride"]),
                                   clean_str(row.get("Alias - Father of Bride First Name", "")), clean_str(row.get("Alias - Father of Bride Last Name", "")),
                                   "male", "", get_deceased_status(row, "Father of Bride"))

    mother_bride = generate_person(clean_str(row["First name - Mother of Bride"]), clean_str(row["Last name - Mother of Bride"]),
                                   "", clean_str(row.get("Alias - Mother of Bride", "")),
                                   "female", "", get_deceased_status(row, "Mother of Bride"))

    persons = [groom, bride, father_groom, mother_groom, father_bride, mother_bride]

    
    # ✅ Restore Relationships (Includes Marriage Date & Parents' Marriages)
    relationships = [
        Relationship(from_id=groom.id, to_id=bride.id, type="MARRIED_TO", date=format_date(row["Date"], row["Year"])),
        Relationship(from_id=bride.id, to_id=groom.id, type="MARRIED_TO", date=format_date(row["Date"], row["Year"])),
        Relationship(from_id=groom.id, to_id=father_groom.id, type="SON_OF"),
        Relationship(from_id=groom.id, to_id=mother_groom.id, type="SON_OF"),
        Relationship(from_id=bride.id, to_id=father_bride.id, type="DAUGHTER_OF"),
        Relationship(from_id=bride.id, to_id=mother_bride.id, type="DAUGHTER_OF"),
        Relationship(from_id=father_groom.id, to_id=mother_groom.id, type="MARRIED_TO"),
        Relationship(from_id=father_bride.id, to_id=mother_bride.id, type="MARRIED_TO"),
    ]

    return {"persons": [p.model_dump() for p in persons], "relationships": [r.model_dump() for r in relationships]}


# ✅ Process DataFrame from df_llm
records = [process_row(row) for _, row in df_llm.iterrows()]

# ✅ Save JSON output
with open("neo4j_data.json", "w") as f:
    json.dump(records, f, indent=2)

print("✅ Successfully processed marriage records with Correct Locations!")

✅ Successfully processed marriage records with Correct Locations!


# Insert to Neo4j

In [15]:
from pydantic import BaseModel
from typing import List, Optional
import pandas as pd
import json
import numpy as np
import re
from neo4j import GraphDatabase

# Define connection details
URI = "bolt://localhost:7687"  # Neo4j Bolt connection
USERNAME = "neo4j"
PASSWORD = "password"  # Replace with the password you set

# Create a Neo4j Driver instance
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

df_llm = pd.read_csv("marriage_records.csv")
print("✅ Loaded df_llm from 'marriage_records.csv'")  

with open("neo4j_data.json", "r") as f:
    records = json.load(f)

# ✅ Display the first few records for review
#num_records_to_display = min(5, len(records))  # Show up to 5 records
#records[:num_records_to_display]  # Display sample records

✅ Loaded df_llm from 'marriage_records.csv'


In [128]:
from neo4j import GraphDatabase

# ✅ Neo4j Connection Settings
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Create Neo4j Driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def insert_record(tx, persons, relationships):
    """Inserts persons and relationships in a single transaction, WITHOUT merging."""

    # ✅ Step 1: Create Persons (Independent Insert)
    for person in persons:
        query = """
        CREATE (p:Person {
            uid: $uid,
            fullname: $fullname,
            firstname: $firstname,
            lastname: $lastname,
            alias_firstname: $alias_firstname,
            alias_lastname: $alias_lastname,
            gender: $gender,
            location: $location,
            deceased: $deceased
        })
        """
        params = {
            "uid": person["id"],  # Unique ID
            "fullname": person["fullname"],
            "firstname": person["firstname"],
            "lastname": person["lastname"],
            "alias_firstname": person["alias_firstname"],
            "alias_lastname": person["alias_lastname"],
            "gender": person["gender"],
            "location": person["location"],
            "deceased": person["deceased"]
        }
        tx.run(query, **params)

    # ✅ Step 2: Create Relationships (Using UID)
    processed_marriages = set()  # ✅ Track already created marriages to prevent duplicates

    for rel in relationships:
        if rel["type"] == "MARRIED_TO":
            marriage_key = tuple(sorted([rel["from_id"], rel["to_id"]]))  # Unique key for marriage
            
            if marriage_key in processed_marriages:
                continue  # ✅ Skip duplicate marriage entry
            processed_marriages.add(marriage_key)

            query = """
            MATCH (a:Person {uid: $from_uid})
            MATCH (b:Person {uid: $to_uid})
            CREATE (a)-[:MARRIED_TO {date: $date, bidirectional: true}]->(b),
                   (b)-[:MARRIED_TO {date: $date, bidirectional: true}]->(a)
            """
        else:
            # ✅ Insert other relationships normally
            query = """
            MATCH (a:Person {uid: $from_uid})
            MATCH (b:Person {uid: $to_uid})
            CREATE (a)-[:`""" + rel["type"] + """`]->(b)
            """

        params = {
            "from_uid": rel["from_id"],  # Unique ID of from person
            "to_uid": rel["to_id"],      # Unique ID of to person
            "date": rel.get("date", None)  # Optional date
        }

        tx.run(query, **params)

# ✅ Process and Insert Each Record
with driver.session() as session:
    for record in records:
        persons = record["persons"]
        relationships = record["relationships"]
        session.execute_write(insert_record, persons, relationships)

print("✅ Successfully inserted marriage records with ONE bidirectional `MARRIED_TO` relationship per couple!")

✅ Successfully inserted marriage records with ONE bidirectional `MARRIED_TO` relationship per couple!


# Entity resolution

In [129]:
from neo4j import GraphDatabase

# ✅ Neo4j Connection Settings
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# ✅ Configurable Batch Size
BATCH_SIZE = 2  # Change this value to adjust batch processing size

# ✅ Create Neo4j Driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def recursive_merge(tx, batch_size):
    """Recursively merges duplicate persons based on fullname, spouse fullname, and marriage date."""
    merge_query = f"""
    CALL apoc.periodic.iterate(
    "MATCH (p1:Person)-[m1:MARRIED_TO]->(s1:Person)
     MATCH (p2:Person)-[m2:MARRIED_TO]->(s2:Person)
     WHERE p1.fullname = p2.fullname
     AND s1.fullname = s2.fullname
     AND COALESCE(m1.date, '') = COALESCE(m2.date, '')  // ✅ Ensure marriage date matches, including empty strings
     AND id(p1) < id(p2)  // Prevent duplicate merging
     RETURN p1, p2, m1, m2, s1, s2",
     
    "WITH p1, p2, m1, m2, s1, s2
     // ✅ Merge nodes while avoiding duplicate relationships
     CALL apoc.refactor.mergeNodes([p1, p2]) YIELD node
     // ✅ Ensure only one MARRIED_TO relationship remains with correct date
     WITH node, s1, m1, m2
     MATCH (node)-[r:MARRIED_TO]->(s1)
     WITH node, s1, COLLECT(r) AS rels
     CALL apoc.refactor.mergeRelationships(rels, {{properties: 'combine'}}) YIELD rel
     RETURN COUNT(*)",
    
    {{batchSize: {batch_size}, parallel: false}})  // ✅ Dynamic batch size
    """

    tx.run(merge_query)

def count_nodes(tx):
    """Counts the total number of Person nodes."""
    result = tx.run("MATCH (p:Person) RETURN count(p) AS count")
    return result.single()["count"]

def merge_until_done(batch_size):
    """Runs the merging process iteratively until no more merges are possible."""
    with driver.session() as session:
        prev_count = session.execute_read(count_nodes)
        
        while True:
            session.execute_write(recursive_merge, batch_size)
            current_count = session.execute_read(count_nodes)
            
            print(f"🔄 Nodes before merge: {prev_count}, after merge: {current_count}")
            
            if current_count == prev_count:
                print("✅ No more merges available. Stopping process.")
                break  # Exit when no further merges happen
            
            prev_count = current_count  # Update previous count

# ✅ Run the Recursive Merging Process
merge_until_done(BATCH_SIZE)

print("🎯 Recursive merging complete: All duplicate persons merged successfully!")

🔄 Nodes before merge: 600, after merge: 556
🔄 Nodes before merge: 556, after merge: 550
🔄 Nodes before merge: 550, after merge: 548
🔄 Nodes before merge: 548, after merge: 548
✅ No more merges available. Stopping process.
🎯 Recursive merging complete: All duplicate persons merged successfully!


In [15]:
# MATCH (n) DETACH DELETE n

# MATCH (n) OPTIONAL MATCH (n)-[r]-() RETURN n, r;

# MATCH (p:Person)-[r]-(neighbor) WHERE p.firstname = "Paolantonio" RETURN p, r, neighbor;

# MATCH (n) RETURN count(n);
