In [28]:
import weaviate
from weaviate.classes.init import Auth
import os
from dotenv import load_dotenv

In [2]:
# Load settings via python-dotenv before any cell reads them with os.getenv
# (presumably from a local .env file holding the WEAVIATE_* values — confirm).
# The call is the cell's last expression, so its return value is displayed.
load_dotenv()

True

In [None]:
# Best practice: keep credentials in environment variables, not in the notebook.
weaviate_url = os.getenv("WEAVIATE_URL")
weaviate_key = os.getenv("WEAVIATE_API_KEY")

# os.getenv returns None for an unset variable. Fail fast with a readable
# message instead of passing None/"" on to the Weaviate client.
missing_settings = [
    env_name
    for env_name, env_value in (
        ("WEAVIATE_URL", weaviate_url),
        ("WEAVIATE_API_KEY", weaviate_key),
    )
    if not env_value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,                     # Weaviate URL: "REST Endpoint" in Weaviate Cloud console
    auth_credentials=Auth.api_key(weaviate_key),  # Weaviate API key: "ADMIN" API key in Weaviate Cloud console
)

print(client.is_ready())  # Should print: `True`

True


In [7]:
from weaviate.classes.config import Configure, Property, DataType

In [None]:
# Schema for "DemoCollection": every property is stored as plain text.
text_property_names = ("text", "source", "source_id", "title")
demo_properties = [
    Property(name=property_name, data_type=DataType.TEXT)
    for property_name in text_property_names
]

# A single named vector that embeds the "text" property.
# NOTE: the vector name "title_vector" is a known typo (it was meant to be
# "text_vector"); it is kept unchanged here.
# Further options accepted by text2vec_weaviate (not used here):
#   dimensions=256
#   base_url="<custom_weaviate_embeddings_url>"
text_vector_config = Configure.Vectors.text2vec_weaviate(
    name="title_vector",
    source_properties=["text"],
    model="Snowflake/snowflake-arctic-embed-l-v2.0",
)

# Create the collection (additional creation parameters are not shown).
# The call is the cell's last expression, so the returned collection object
# is displayed as the cell output.
client.collections.create(
    "DemoCollection",
    properties=demo_properties,
    vector_config=[text_vector_config],
)

<weaviate.collections.collection.sync.Collection at 0x118d7fa10>

In [11]:
import pandas as pd

In [None]:
# Load the preprocessed entities and batch-import them into "DemoCollection".
#
# The CSV (../data_preprocessing/processed_entities.csv) has the columns
# Source, ID, Name, Description. They map onto the collection properties as:
#   Source      -> source
#   ID          -> source_id
#   Name        -> title
#   Description -> text
CSV_TO_PROPERTY = {
    "Source": "source",
    "ID": "source_id",
    "Name": "title",
    "Description": "text",
}

df_processed_entities = pd.read_csv("../data_preprocessing/processed_entities.csv")

# Rename once in pandas instead of popping keys row by row. Selecting the
# mapped columns first raises a KeyError if the CSV lacks one of them, and
# guarantees each record holds exactly the properties defined in the collection.
# NOTE(review): empty CSV cells load as NaN by default — confirm how such
# values should be stored in TEXT properties.
source_objects = (
    df_processed_entities[list(CSV_TO_PROPERTY)]
    .rename(columns=CSV_TO_PROPERTY)
    .to_dict(orient="records")
)

print(len(source_objects), "source objects to insert")

collection = client.collections.get("DemoCollection")

with collection.batch.fixed_size(batch_size=200) as batch:
    for src_obj in source_objects:
        # The model provider integration will automatically vectorize the object.
        # (A pre-obtained vector could be passed with `vector=...` instead.)
        batch.add_object(properties=src_obj)
        # Give up early rather than pushing the whole file through a failing import.
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

# Report objects that failed to import, if any.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

# Importing the 22,685 source objects is fast: it took only about 53 seconds.

22685 source objects to insert


In [None]:
# Weaviate is capable of many types of searches:
# similarity searches, keyword searches, hybrid searches, and filtered searches.

In [16]:
# Vector (semantic) search using the name exactly as it is stored.
# The model provider integration will automatically vectorize the query.
name_query = "Francesca Paola Albanese"

collection = client.collections.get("DemoCollection")
response = collection.query.near_text(query=name_query, limit=2)

# Print each complete result object (uuid, metadata, properties, ...).
for hit in response.objects:
    print(hit)

Object(uuid=_WeaviateUUIDInt('95aa1340-8eae-42a7-80d1-42d3d496f1ba'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'source': 'US OFAC Specially Designated Nationals (SDN)', 'title': 'Francesca Paola Albanese', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: Francesca Paola Albanese\nType: Person\nGender: Female\nLast Name: Albanese\nFirst Name: Francesca\nMiddle Name: Paola\nDate of Birth: 1977-03-30\nBirth Place: Ariano Irpino, Italy\nPassport Number: YA4652441\nSource URL: https://sanctionssearch.ofac.treas.gov/Details.aspx?id=54707\nAddress: Tunis\nCountry: IT / TN\nNationality: IT\nProgram: US-ICC\nFirst seen: 2025-07-09T18:10:02\nLast update: 2025-07-09T18:10:02', 'source_id': 'ofac-54707'}, references=None, vector={}, collection='DemoCollection')
Object(uuid=_WeaviateUUIDInt('5aa18d4f-cb9e-47ea-9ec9-255f5bbf068e'), metadata=Metadata

In [17]:
# Vector search with the middle name shortened to an initial.
# The model provider integration will automatically vectorize the query.
name_query = "Francesca P Albanese"

collection = client.collections.get("DemoCollection")
response = collection.query.near_text(query=name_query, limit=2)

# Only the stored properties are printed for each hit.
for hit in response.objects:
    print(hit.properties)

{'source': 'US OFAC Specially Designated Nationals (SDN)', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: Francesca Paola Albanese\nType: Person\nGender: Female\nLast Name: Albanese\nFirst Name: Francesca\nMiddle Name: Paola\nDate of Birth: 1977-03-30\nBirth Place: Ariano Irpino, Italy\nPassport Number: YA4652441\nSource URL: https://sanctionssearch.ofac.treas.gov/Details.aspx?id=54707\nAddress: Tunis\nCountry: IT / TN\nNationality: IT\nProgram: US-ICC\nFirst seen: 2025-07-09T18:10:02\nLast update: 2025-07-09T18:10:02', 'title': 'Francesca Paola Albanese', 'source_id': 'ofac-54707'}
{'source': 'US OFAC Specially Designated Nationals (SDN)', 'title': 'Elena Mikhaylovna Afanasyeva', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: Елена Михайловна Афанасьева\nType: Person\nGender: Female\nLast Name: Afanasyeva / Pesti / Shudra / Афанасьева\nFirst Name: Helena / Elena / Елена / Victoria\nMiddle Name: Михайловна / Mikhaylovna\nDate of Birth: 1997-06-22\nBirth Plac

In [18]:
# Vector search with a shortened first name and a split, upper-cased surname.
# The model provider integration will automatically vectorize the query.
name_query = "Frances P ALBAN ESE"

collection = client.collections.get("DemoCollection")
response = collection.query.near_text(query=name_query, limit=2)

# Only the stored properties are printed for each hit.
for hit in response.objects:
    print(hit.properties)

{'source': 'US OFAC Specially Designated Nationals (SDN)', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: Francesca Paola Albanese\nType: Person\nGender: Female\nLast Name: Albanese\nFirst Name: Francesca\nMiddle Name: Paola\nDate of Birth: 1977-03-30\nBirth Place: Ariano Irpino, Italy\nPassport Number: YA4652441\nSource URL: https://sanctionssearch.ofac.treas.gov/Details.aspx?id=54707\nAddress: Tunis\nCountry: IT / TN\nNationality: IT\nProgram: US-ICC\nFirst seen: 2025-07-09T18:10:02\nLast update: 2025-07-09T18:10:02', 'title': 'Francesca Paola Albanese', 'source_id': 'ofac-54707'}
{'source': 'US OFAC Specially Designated Nationals (SDN)', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: PALESTINE LIBERATION FRONT - ABU ABBAS FACTION\nType: Organization\nSource URL: https://sanctionssearch.ofac.treas.gov/Details.aspx?id=4708\nAlias: PLF / PALESTINE LIBERATION FRONT / PLF-ABU ABBAS\nProgram: US-TERR\nFirst seen: 2023-04-20T10:27:20\nLast update: 2025-06-02T12:

In [None]:
# Keyword (BM25) search with the same misspelled name.
# Unlike near_text, this is a keyword search: the query is not vectorized.
keyword_query = "Frances P ALBAN ESE"

collection = client.collections.get("DemoCollection")
response = collection.query.bm25(query=keyword_query, limit=2)

for hit in response.objects:
    print(hit.properties)

# Observation: bm25 seems to rely mainly on exact matches, not fuzzy matches.

{'source': 'US OFAC Specially Designated Nationals (SDN)', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: Sally-Anne Frances Jones\nType: Person\nLast Name: Hussain / Jones\nFirst Name: Sally / Sally-Anne / Sakinah\nMiddle Name: Anne / Frances\nDate of Birth: 1968-11-17\nBirth Place: Greenwich, England\nPassport Number: 519408086\nSource URL: https://sanctionssearch.ofac.treas.gov/Details.aspx?id=18630\nAlias: Sally Jones / Sakinah Hussain / Sally Anne Jones\nCountry: GB / SY\nNationality: GB\nProgram: US-TERR\nFirst seen: 2023-04-20T10:27:20\nLast update: 2025-06-02T12:10:03', 'title': 'Sally-Anne Frances Jones', 'source_id': 'Q27995864'}
{'source': 'US FBI Most Wanted', 'title': 'SHAILESHKUMAR P. JAIN', 'text': 'US FBI Most Wanted\nName: SHAILESHKUMAR P. JAIN\nType: Person\nGender: Male\nDate of Birth: 1970-02-10\nBirth Place: India\nHeight: 5\'8"\nWeight: 175 to 180 pounds\nEye Color: Brown\nHair Color: Black\nSource URL: https://www.fbi.gov/wanted/cyber/shaileshkumar-

In [20]:
# Keyword (BM25) search with a shortened first name and the correct surname.
# Unlike near_text, this is a keyword search: the query is not vectorized.
keyword_query = "Frances Albanese"

collection = client.collections.get("DemoCollection")
response = collection.query.bm25(query=keyword_query, limit=2)

for hit in response.objects:
    print(hit.properties)

# Observation: bm25 seems to rely mainly on exact matches, not fuzzy matches.

{'source': 'US OFAC Specially Designated Nationals (SDN)', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: Francesca Paola Albanese\nType: Person\nGender: Female\nLast Name: Albanese\nFirst Name: Francesca\nMiddle Name: Paola\nDate of Birth: 1977-03-30\nBirth Place: Ariano Irpino, Italy\nPassport Number: YA4652441\nSource URL: https://sanctionssearch.ofac.treas.gov/Details.aspx?id=54707\nAddress: Tunis\nCountry: IT / TN\nNationality: IT\nProgram: US-ICC\nFirst seen: 2025-07-09T18:10:02\nLast update: 2025-07-09T18:10:02', 'title': 'Francesca Paola Albanese', 'source_id': 'ofac-54707'}
{'source': 'US OFAC Specially Designated Nationals (SDN)', 'title': 'Sally-Anne Frances Jones', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: Sally-Anne Frances Jones\nType: Person\nLast Name: Hussain / Jones\nFirst Name: Sally / Sally-Anne / Sakinah\nMiddle Name: Anne / Frances\nDate of Birth: 1968-11-17\nBirth Place: Greenwich, England\nPassport Number: 519408086\nSource URL: 

In [None]:
# Vector search where the misspelled name is followed by unrelated text
# (a paragraph about hybrid search), to see how noise affects the match.
# The model provider integration will automatically vectorize the query.
noisy_query = """Frances P ALBAN ESE Hybrid search
Hybrid search combines the results of a vector search and a keyword (BM25F) search by fusing the two result sets.
The fusion method and the relative weights are configurable."""

collection = client.collections.get("DemoCollection")
response = collection.query.near_text(query=noisy_query, limit=2)

for hit in response.objects:
    print(hit.properties)

# Observation: with polluted content in the query, the embedding is more
# likely to pick up the noise than the name.

{'source': 'Canadian Consolidated Autonomous Sanctions', 'title': 'Bishr al-Sabban', 'text': 'Canadian Consolidated Autonomous Sanctions\nName: Bishr al-Sabban\nType: Person\nLast Name: al-Sabban\nFirst Name: Bishr\nCountry: SY\nProgram: CA-SEMA\nFirst seen: 2024-08-06T09:43:02\nLast update: 2025-05-19T18:43:02', 'source_id': 'Q12198429'}
{'source': 'Canadian Consolidated Autonomous Sanctions', 'text': 'Canadian Consolidated Autonomous Sanctions\nName: Fatemeh Ghorban-Hosseini\nType: Person\nLast Name: Ghorban-Hosseini \nFirst Name: Fatemeh \nDate of Birth: 1995\nAlias: فاطمه قربان حسینی\nCountry: IR\nProgram: CA-SEMA\nFirst seen: 2024-08-06T09:43:02\nLast update: 2025-05-19T18:43:02', 'title': 'Fatemeh Ghorban-Hosseini', 'source_id': 'NK-jFZUUfWkEHG2qdpfvYcXvX'}


In [23]:
# Same noisy query as the top-2 experiment, but asking for the 10 nearest
# results to check whether the intended record appears further down.
# The model provider integration will automatically vectorize the query.
noisy_query = """Frances P ALBAN ESE Hybrid search
Hybrid search combines the results of a vector search and a keyword (BM25F) search by fusing the two result sets.
The fusion method and the relative weights are configurable."""

collection = client.collections.get("DemoCollection")
response = collection.query.near_text(query=noisy_query, limit=10)

for hit in response.objects:
    print(hit.properties)

# Observation: with polluted content in the query, the embedding is more
# likely to pick up the noise than the name.

{'source': 'Canadian Consolidated Autonomous Sanctions', 'text': 'Canadian Consolidated Autonomous Sanctions\nName: Bishr al-Sabban\nType: Person\nLast Name: al-Sabban\nFirst Name: Bishr\nCountry: SY\nProgram: CA-SEMA\nFirst seen: 2024-08-06T09:43:02\nLast update: 2025-05-19T18:43:02', 'title': 'Bishr al-Sabban', 'source_id': 'Q12198429'}
{'source': 'Canadian Consolidated Autonomous Sanctions', 'title': 'Fatemeh Ghorban-Hosseini', 'text': 'Canadian Consolidated Autonomous Sanctions\nName: Fatemeh Ghorban-Hosseini\nType: Person\nLast Name: Ghorban-Hosseini \nFirst Name: Fatemeh \nDate of Birth: 1995\nAlias: فاطمه قربان حسینی\nCountry: IR\nProgram: CA-SEMA\nFirst seen: 2024-08-06T09:43:02\nLast update: 2025-05-19T18:43:02', 'source_id': 'NK-jFZUUfWkEHG2qdpfvYcXvX'}
{'source': 'Canadian Consolidated Autonomous Sanctions', 'text': 'Canadian Consolidated Autonomous Sanctions\nName: The Law Enforcement Forces (LEF\nType: LegalEntity\nAlias: فرماندهی انتظامی جمهوری اسلامی ایران\nCountry: IR\n

In [None]:
# Noisy query again, this time prefixed with an explicit instruction to
# focus on the name only.
# The model provider integration will automatically vectorize the query.
hinted_query = """
    Find information about Frances P ALBAN ESE. Only consider this name.

Hybrid search
Hybrid search combines the results of a vector search and a keyword (BM25F) search by fusing the two result sets.
The fusion method and the relative weights are configurable."""

collection = client.collections.get("DemoCollection")
response = collection.query.near_text(query=hinted_query, limit=10)

for hit in response.objects:
    print(hit.properties)

# Observation: even when the query hints that the name should be the focus,
# the search still returns noise.

{'source': 'US OFAC Specially Designated Nationals (SDN)', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: مصطفى حبيب حرب\nType: Person\nGender: Male\nLast Name: حرب / Harb\nFirst Name: مصطفى / Mustafa / Mustapha / Mostafa\nMiddle Name: حبيب / Habib\nDate of Birth: 1973-08-06\nBirth Place: Haruf, Lebanon\nSource URL: https://sanctionssearch.ofac.treas.gov/Details.aspx?id=31771\nAlias: Mostafa Habib Harb / Mustapha Harb\nAddress: Haruf\nCountry: LB\nProgram: US-TERR\nFirst seen: 2023-04-20T10:27:20\nLast update: 2025-06-02T12:10:03', 'title': 'Mustafa Habib Harb', 'source_id': 'NK-2ZYZgqNdJg9JVcxoWv6PP9'}
{'source': 'US OFAC Specially Designated Nationals (SDN)', 'title': 'Abbas Hassan Gharib', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: عباس حسن غريب\nType: Person\nGender: Male\nLast Name: غريب / Gharib\nFirst Name: عباس / Abbas\nMiddle Name: Hassan / حسن\nDate of Birth: 1969-09-25\nBirth Place: Tayir Harfa, Lebanon\nSource URL: https://sanctionssearch.ofa

In [None]:
# Misspelled name followed by text that is about the same person
# (biography excerpt and links) rather than unrelated noise.
# The model provider integration will automatically vectorize the query.
related_context_query = """
    Name: Frances P ALBAN ESE
    (born 1977) is an Italian legal scholar and expert on human rights who has served as the United Nations (UN) Special Rapporteur on the situation of human rights in the Palestinian territories occupied since 1967 since 1 May 2022; initially appointed for a three-year term, in April 2025 she was confirmed for another three years. She is the first woman to hold the position. In response the United States Department of the Treasury under the Trump administration imposed sanctions on Albanese under Executive Order naming her a xxx, thus forbidding all related persons and companies from doing business with her.
-website: https://en.wikipedia.org/wiki/
-website: https://www.state.gov/releases/
    UN calls for reversal of US sanctions for Francesca Paola Albanese: https://news.un.org/en/story/2025/07/1165359
    """

collection = client.collections.get("DemoCollection")
response = collection.query.near_text(query=related_context_query, limit=2)

for hit in response.objects:
    print(hit.properties)

# Observations:
# - polluted content makes the embedding more likely to pick up the noise;
# - content related to the target seems to be OK; it is unclear whether
#   additional related content that the record does not mention is also OK.

{'source': 'US OFAC Specially Designated Nationals (SDN)', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: Francesca Paola Albanese\nType: Person\nGender: Female\nLast Name: Albanese\nFirst Name: Francesca\nMiddle Name: Paola\nDate of Birth: 1977-03-30\nBirth Place: Ariano Irpino, Italy\nPassport Number: YA4652441\nSource URL: https://sanctionssearch.ofac.treas.gov/Details.aspx?id=54707\nAddress: Tunis\nCountry: IT / TN\nNationality: IT\nProgram: US-ICC\nFirst seen: 2025-07-09T18:10:02\nLast update: 2025-07-09T18:10:02', 'title': 'Francesca Paola Albanese', 'source_id': 'ofac-54707'}
{'source': 'US OFAC Specially Designated Nationals (SDN)', 'title': 'PALESTINE LIBERATION FRONT - ABU ABBAS FACTION', 'text': 'US OFAC Specially Designated Nationals (SDN)\nName: PALESTINE LIBERATION FRONT - ABU ABBAS FACTION\nType: Organization\nSource URL: https://sanctionssearch.ofac.treas.gov/Details.aspx?id=4708\nAlias: PLF / PALESTINE LIBERATION FRONT / PLF-ABU ABBAS\nProgram: US-TERR\nF

In [27]:
# Finished working with Weaviate: close the client connection.
client.close()