In [1]:
import asyncio
import json
import os
import sys

import nest_asyncio
import rdflib

from pymilvus import model, MilvusClient
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider

from bluecore_models.utils.graph import load_jsonld, init_graph, BF

# Location of the Blue Core Agents source tree that provides `bluecore_ai`.
# Set the BLUECORE_AGENTS_SRC environment variable to point at your own
# checkout; the fallback is the path this notebook was originally run with,
# so existing runs behave exactly as before.
BLUECORE_AGENTS_SRC = os.environ.get(
    "BLUECORE_AGENTS_SRC",
    "/Users/jpnelson/30-39 Sinopia, Blue-Core, FOLIO, and PCC/32.10 Blue Core Agents/src",
)
# Re-running this cell in the same kernel must not stack duplicate entries.
if BLUECORE_AGENTS_SRC not in sys.path:
    sys.path.append(BLUECORE_AGENTS_SRC)

from bluecore_ai.agents.duplicate import SupportDependencies, agent as dedup_agent

# Patch asyncio so the asyncio.run(...) calls in later cells can be made from
# inside the notebook, where an event loop is already running.
nest_asyncio.apply()

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Milvus client opened on the "dcmi_demo.db" file.
milvus_client = MilvusClient("dcmi_demo.db")

# The API key is read from the environment so it never appears in the notebook.
openai_provider = OpenAIProvider(api_key=os.environ.get("OPENAI_API_KEY"))

# Chat model the de-duplication agent will be pointed at.
openai_model = OpenAIChatModel("gpt-5", provider=openai_provider)

  from pkg_resources import DistributionNotFound, get_distribution


In [3]:
# Point the imported de-duplication agent at the `openai_model` built earlier
# in this notebook.
dedup_agent.model = openai_model

In [4]:
# Dependencies handed to every agent run: the Milvus client plus pymilvus'
# default embedding function.
embedding_func = model.DefaultEmbeddingFunction()
dependencies = SupportDependencies(
    milvus_client=milvus_client,
    embedding_func=embedding_func,
)

## Known Work Example

In [5]:
# Prompt the agent with just the URL of a Work and wait for the run to finish.
known_work_url = "https://dev.bcld.info/works/285829ac-236d-4623-bdc4-f9725dbcfc77"
known_work_result = asyncio.run(
    dedup_agent.run(known_work_url, deps=dependencies)
)

In [6]:
# Display the run result; its `output` is a DeDupResult with `score` and `best_match`.
known_work_result

AgentRunResult(output=DeDupResult(score=0.995, best_match='https://dev.bcld.info/works/285829ac-236d-4623-bdc4-f9725dbcfc77'))

## Similar Main Title

In [10]:
# Build a minimal BIBFRAME Work: a typed Work resource linked to a blank-node
# Title that carries only a mainTitle literal.
work_graph = init_graph()
work_uri = rdflib.URIRef("https://dev.bcld.info/works/ed48222e-25ae-45ae-b1a8-bc53c3ab6388")
title_bnode = rdflib.BNode()

work_triples = [
    (work_uri, rdflib.RDF.type, BF.Work),
    (work_uri, BF.title, title_bnode),
    (title_bnode, rdflib.RDF.type, BF.Title),
    (title_bnode, BF.mainTitle, rdflib.Literal("Scalar Fields in General Relativity")),
]
for triple in work_triples:
    work_graph.add(triple)

# Last expression: show the populated graph as the cell output.
work_graph

<Graph identifier=N28498e516a70458aac232a0469eb3d7b (<class 'rdflib.graph.Graph'>)>

In [11]:
# Serialize the hand-built Work graph to JSON-LD and use that text as the prompt.
# NOTE: the result name keeps its original spelling ("similiar") because later
# cells refer to it by that name.
work_jsonld = work_graph.serialize(format="json-ld")
similiar_main_title_result = asyncio.run(
    dedup_agent.run(work_jsonld, deps=dependencies)
)

In [12]:
# Display the run result for the Work that carries only a main title.
similiar_main_title_result

AgentRunResult(output=DeDupResult(score=0.65, best_match='https://dev.bcld.info/works/9bcfce81-fa73-41ab-8066-c11a591be2f3'))

In [22]:
# Walk the full message history of the run: print each message's position and
# class name, then one tab-indented line per part. Parts of a ModelRequest are
# printed with their content; parts of any other message show the class name only.
for index, message in enumerate(similiar_main_title_result.all_messages()):
    message_kind = type(message).__name__
    print(index, message_kind)
    show_content = message_kind == "ModelRequest"
    for part in message.parts:
        part_kind = type(part).__name__
        if show_content:
            print(f"\t{part_kind}, {part.content}")
        else:
            print(f"\t{part_kind}")

0 ModelRequest
	UserPromptPart, [
  {
    "@id": "https://dev.bcld.info/works/ed48222e-25ae-45ae-b1a8-bc53c3ab6388",
    "@type": [
      "http://id.loc.gov/ontologies/bibframe/Work"
    ],
    "http://id.loc.gov/ontologies/bibframe/title": [
      {
        "@id": "_:Neeacdd197520424fb3daaf5d5f150798"
      }
    ]
  },
  {
    "@id": "_:Neeacdd197520424fb3daaf5d5f150798",
    "@type": [
      "http://id.loc.gov/ontologies/bibframe/Title"
    ],
    "http://id.loc.gov/ontologies/bibframe/mainTitle": [
      {
        "@value": "Scalar Fields in General Relativity"
      }
    ]
  }
]
1 ModelResponse
	ToolCallPart
2 ModelRequest
	ToolReturnPart, {'https://dev.bcld.info/works/18b1dc21-53ab-4a55-832c-e500d1466967': [{'distance': 0.965573251247406, 'triple': '<https://dev.bcld.info/works/18b1dc21-53ab-4a55-832c-e500d1466967> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://id.loc.gov/ontologies/bibframe/Work> '}, {'distance': 0.9618780612945557, 'triple': '<https://dev.bcld.info/

## Dissimilar Instance Title

In [23]:
# Build a minimal BIBFRAME Instance: a typed Instance resource linked to a
# blank-node Title that carries only a mainTitle literal.
instance_graph = init_graph()
instance_uri = rdflib.URIRef("https://dev.bcld.info/instances/93ddae0d-3ae5-4e16-bb94-b263e0493ce3")
title_bnode = rdflib.BNode()

instance_triples = [
    (instance_uri, rdflib.RDF.type, BF.Instance),
    (title_bnode, rdflib.RDF.type, BF.Title),
    (instance_uri, BF.title, title_bnode),
    (title_bnode, BF.mainTitle, rdflib.Literal("DCMI 2025 Presentation")),
]
for triple in instance_triples:
    instance_graph.add(triple)

# Last expression: show the populated graph as the cell output.
instance_graph

<Graph identifier=Nacc59e0f9a244c11b2a8caf8a88163ee (<class 'rdflib.graph.Graph'>)>

In [24]:
# Serialize the hand-built Instance graph to JSON-LD and use that text as the prompt.
instance_jsonld = instance_graph.serialize(format="json-ld")
instance_main_title_result = asyncio.run(
    dedup_agent.run(instance_jsonld, deps=dependencies)
)

In [25]:
# Display the run result for the Instance that carries only a main title.
instance_main_title_result

AgentRunResult(output=DeDupResult(score=0.4792, best_match='https://dev.bcld.info/instances/004e1f19-5920-45f2-ae53-d115499e330f'))

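Resolving https://dev.bcld.info/instances/004e1f19-5920-45f2-ae53-d115499e330f and extracting its mainTitle yields "Volume 2", which has nothing in common with "DCMI 2025 Presentation"; this is consistent with the low score of 0.4792.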