In [1]:
from pathlib import Path
from pydantic import BaseModel
from typing import Any
from neo4j import GraphDatabase
from dotenv import load_dotenv
import os
import duckdb
load_dotenv()

True

In [2]:
class Neo4jGraph:
    """Small convenience wrapper around a Neo4j driver bound to one database.

    The driver is created eagerly in ``__init__`` and holds network resources,
    so call :meth:`close` when finished, or use the instance as a context
    manager (``with Neo4jGraph(...) as graph: ...``).
    """

    def __init__(self, neo4j_uri:str, neo4j_username:str, neo4j_password:str, db:str)->None:
        """Create the driver.

        Args:
            neo4j_uri: Connection URI handed to ``GraphDatabase.driver``.
            neo4j_username: User name for basic authentication.
            neo4j_password: Password for basic authentication.
            db: Name of the database every session is opened against.
        """
        self.uri  = neo4j_uri
        self.auth = (neo4j_username, neo4j_password)
        self.db = db
        self.driver = GraphDatabase.driver(self.uri, auth=self.auth)

    def query(self, query:str, params:"dict | None"=None):
        """Run a Cypher statement and return all of its records as a list.

        Args:
            query: Cypher text.
            params: Query parameters; may be omitted for parameterless queries.

        Returns:
            A list with every record produced by the statement.
        """
        with self.driver.session(database=self.db) as session:
            result = session.run(query, params)
            # Materialise the records while the session is still open.
            return list(result)

    def close(self)->None:
        """Close the underlying driver and release its connections."""
        self.driver.close()

    def __enter__(self):
        """Return ``self`` so the graph can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Close the driver on leaving the ``with`` block; exceptions propagate."""
        self.close()

class Node(BaseModel):
    """A graph node described by an integer id, a list of labels and a property map."""
    # Integer identifier of the node.
    id:int
    # Labels of the node; the field name is singular but the value is a list of strings.
    label: list[str]
    # Property name -> value mapping; values are unconstrained (Any).
    properties:dict[str, Any]

class Relation(BaseModel):
    """A graph relationship described by a string id, a single label and a property map.

    NOTE(review): not referenced by any of the visible cells.
    """
    # String identifier of the relationship (Node uses an int id instead).
    id:str
    # Single label of the relationship.
    label:str
    # Property name -> value mapping; values are unconstrained (Any).
    properties:dict[str, Any]


In [3]:
def merge_node(graph:Neo4jGraph, nodes:list[Node]):
    """Upsert a batch of nodes with a single ``apoc.merge.node`` statement.

    Every node becomes one row of the ``$data`` parameter. For each row the
    statement passes ``row.label`` as the labels, ``{id: row.id}`` as the
    identifying properties, and ``row.properties`` as both remaining property
    arguments of ``apoc.merge.node``.

    Args:
        graph: Object whose ``query`` method executes the Cypher statement.
        nodes: Nodes to merge; each one is sent as its attribute dictionary.

    Returns:
        Whatever ``graph.query`` returns for the statement.
    """
    cypher = (
        "UNWIND $data as row "
        "CALL apoc.merge.node(row.label, {id:row.id}, row.properties, row.properties ) "
        "YIELD node RETURN node"
    )
    # One plain dict per node, taken straight from the instance attributes.
    payload = {"data": [vars(node) for node in nodes]}
    return graph.query(cypher, payload)

In [4]:
# The data directory is resolved relative to the parent of the current working directory.
base_path = Path.cwd().parent
source_path = base_path / 'gold/anilist/dim-anime.parquet'

# Neo4j connection settings are read from the environment (load_dotenv() ran in
# the imports cell); a missing variable raises KeyError here.
neo4j_uri, neo4j_username, neo4j_password, neo4j_dbname = (
    os.environ[key]
    for key in ('neo4j_uri', 'neo4j_username', 'neo4j_password', 'neo4j_dbname')
)

In [5]:
# Build the Neo4j wrapper used by the loading cells, from the settings read above.
graph = Neo4jGraph(
    neo4j_uri=neo4j_uri,
    neo4j_username=neo4j_username,
    neo4j_password=neo4j_password,
    db=neo4j_dbname,
)

In [6]:
def make_node_list_from_table(rows:list[Any], labels:list[str], columns:list[str])->list[Node]:
    """Turn tabular rows into ``Node`` objects.

    The first field of each row is used as the node id. The remaining fields
    are paired positionally with ``columns[1:]`` to build the property map.

    Args:
        rows: Row sequences whose field order matches ``columns``.
        labels: Labels assigned to every node produced.
        columns: Column names of the table; the first entry names the id column.

    Returns:
        One ``Node`` per input row, in input order.
    """
    return [
        Node(
            id=row[0],
            label=labels,
            properties=dict(zip(columns[1:], row[1:])),
        )
        for row in rows
    ]

In [9]:
# Open the source parquet file with DuckDB and display its (rows, columns) shape.
parquet_file = str(source_path)
tb = duckdb.read_parquet(parquet_file)
tb.shape

(759, 24)

In [10]:
# Loader settings derived from the source table.
labels = ['Anime']               # labels given to every node created from this table
columns = tb.columns             # column names, in table order
num_rows, num_column = tb.shape
BATCH_SIZE = 10_000              # maximum rows fetched and merged per batch

In [11]:
# Pull the table in chunks of BATCH_SIZE rows and merge each chunk into Neo4j.
# The loop ends when fetchmany() returns an empty (falsy) batch.
n = 1
while batch := tb.fetchmany(size=BATCH_SIZE):
    nodes = make_node_list_from_table(batch, labels, columns)
    res = merge_node(graph, nodes)
    print(f"Inserted batch {n} ({len(res)} rows) into neo4j")
    n += 1

Inserted batch 1 (759 rows) into neo4j
