In [5]:
import weaviate
from datasets import load_dataset
import time
import traceback
from weaviate import WeaviateClient

In [3]:
# Fetch the English quotes corpus from the Hugging Face Hub
quotes_repo = 'Abirate/english_quotes'
dataset = load_dataset(quotes_repo)

# Show the first training record to confirm which fields each row carries
first_record = dataset['train'][0]
print(first_record)

{'quote': '“Be yourself; everyone else is already taken.”', 'author': 'Oscar Wilde', 'tags': ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']}


In [6]:
# Open a client against the Weaviate server running on this machine
weaviate_url = "http://localhost:8080"
client = weaviate.Client(url=weaviate_url)

In [7]:
# Start from a clean slate: drop any previous "Quotes" class before re-creating it.
# The server answering with an unexpected status code is treated as "class not present".
try:
    client.schema.delete_class("Quotes")
except weaviate.exceptions.UnexpectedStatusCodeException:
    print("Quotes class did not exist, proceeding with creation.")
else:
    print("Quotes class deleted.")


Quotes class deleted.


In [8]:
# Describe the "Quotes" class: one property per dataset field, each given as
# (property name, Weaviate data type), vectorized by the text2vec-transformers module.
quote_properties = [
    {"name": field_name, "dataType": [field_type]}
    for field_name, field_type in (
        ("quote", "string"),
        ("author", "string"),
        ("tags", "string[]"),
    )
]

schema = {
    "class": "Quotes",
    "vectorizer": "text2vec-transformers",
    "properties": quote_properties,
}

# Register the class with the Weaviate server
client.schema.create_class(schema)

In [9]:
# Define a generator function to yield data objects one at a time
def generate_quotes_data(dataset):
    """Lazily yield one record per row of ``dataset['train']``.

    Each yielded dict contains only the ``quote``, ``author`` and ``tags``
    entries of the source row, in that order; any other fields are dropped.
    """
    wanted_fields = ("quote", "author", "tags")
    for row in dataset['train']:
        yield {field: row[field] for field in wanted_fields}

# Import the data into Weaviate using the client's auto-flushing batch context.
# BATCH_SIZE is both the number of objects per batch and the progress-report interval.
BATCH_SIZE = 100
start_time = time.time()
import_succeeded = False
queued = 0  # number of objects handed to the batch so far
try:
    with client.batch as batch:
        batch.batch_size = BATCH_SIZE  # Adjust batch size for better performance
        # generate_quotes_data already yields dicts holding exactly the
        # quote/author/tags properties, so each one is passed through as-is.
        for queued, obj in enumerate(generate_quotes_data(dataset), start=1):
            batch.add_data_object(obj, "Quotes")
            if queued % BATCH_SIZE == 0:
                print(f"{queued} objects imported...")
    # Only reached when the batch context exited without raising.
    import_succeeded = True
except Exception:
    print("An error occurred:")
    traceback.print_exc()

end_time = time.time()
if import_succeeded:
    print(f"Data import completed in {end_time - start_time:.2f} seconds.")
else:
    # Do not claim completion when the import was aborted by an exception.
    print(
        f"Data import failed after {end_time - start_time:.2f} seconds "
        f"({queued} objects were queued before the error)."
    )


100 objects imported...
200 objects imported...
300 objects imported...
400 objects imported...
500 objects imported...
600 objects imported...
700 objects imported...
800 objects imported...
900 objects imported...
1000 objects imported...
1100 objects imported...
1200 objects imported...
1300 objects imported...
1400 objects imported...
1500 objects imported...
1600 objects imported...
1700 objects imported...
1800 objects imported...
1900 objects imported...
2000 objects imported...
2100 objects imported...
2200 objects imported...
2300 objects imported...
2400 objects imported...
2500 objects imported...
Data import completed in 576.87 seconds.


Test

In [10]:
# Sanity check: run one near-text query and print what comes back
try:
    near_text_filter = {"concepts": ["inspiration"]}
    quotes_query = client.query.get("Quotes", ["quote", "author", "tags"])
    quotes_query = quotes_query.with_near_text(near_text_filter).with_limit(5)
    response = quotes_query.do()

    # Dump the raw payload first so its structure can be inspected
    print("Response from Weaviate:", response)

    # The hits are expected under data -> Get -> Quotes
    has_hits = (
        'data' in response
        and 'Get' in response['data']
        and 'Quotes' in response['data']['Get']
    )
    if not has_hits:
        print("The response structure is not as expected. Please check the response format.")
    else:
        for hit in response['data']['Get']['Quotes']:
            quote_text = hit['quote']
            author_name = hit['author']
            tag_text = ', '.join(hit['tags'])
            print(f"Quote: {quote_text} - Author: {author_name} - Tags: {tag_text}")
except Exception as err:
    print(f"An error occurred: {err}")

Response from Weaviate: {'data': {'Get': {'Quotes': [{'author': 'Jack London', 'quote': "“You can't wait for inspiration. You have to go after it with a club.”", 'tags': ['inspiration', 'on-writing', 'writing']}, {'author': 'Pablo Picasso,', 'quote': '“Others have seen what is and asked why. I have seen what could be and asked why not. ”', 'tags': ['creativity', 'inspirational']}, {'author': 'Walt Disney', 'quote': '“The way to get started is to quit talking and begin doing. ”', 'tags': ['motivation', 'success']}, {'author': 'George Bernard Shaw', 'quote': "“Life isn't about finding yourself. Life is about creating yourself.”", 'tags': ['inspirational', 'life', 'yourself']}, {'author': 'Maya Angelou', 'quote': '“The desire to reach for the stars is ambitious. The desire to reach hearts is wise.”', 'tags': ['ambition', 'inspirational', 'wisdom']}]}}}
Quote: “You can't wait for inspiration. You have to go after it with a club.” - Author: Jack London - Tags: inspiration, on-writing, writi

Search Functionality

In [11]:
# Function to perform vector search
def perform_vector_search(client, query, limit=5):
    """Run a near-text (vector) search over the ``Quotes`` class and print the hits.

    Args:
        client: Object exposing the ``client.query.get(...)`` query builder
            (the Weaviate client created earlier in the notebook).
        query: Free-text concept to search for.
        limit: Maximum number of hits to request. Defaults to 5.

    Returns:
        None. Results, or a description of what went wrong, are printed.
        Unexpected exceptions are caught, reported and their traceback printed.
    """
    try:
        response = (
            client.query
            .get("Quotes", ["quote", "author", "tags"])
            .with_near_text({"concepts": [query]})
            .with_limit(limit)
            .do()
        )

        # Print the search results
        print("\nVector Search results:")

        # A failed GraphQL query can come back as a normal payload carrying an
        # 'errors' list (with the hit list set to None) rather than raising,
        # so surface those messages instead of iterating over None.
        errors = response.get('errors')
        if errors:
            for error in errors:
                message = error.get('message', error) if isinstance(error, dict) else error
                print(f"Weaviate reported an error: {message}")
            return

        get_section = (response.get('data') or {}).get('Get') or {}
        if 'Quotes' not in get_section:
            print("The response structure is not as expected. Please check the response format.")
            return

        hits = get_section['Quotes'] or []
        if not hits:
            print("No matching quotes found.")
        for obj in hits:
            quote = obj.get('quote', 'N/A')
            author = obj.get('author', 'N/A')
            # A property stored as null comes back as None; show it as empty.
            tags = obj.get('tags') or []
            if isinstance(tags, list):
                tags = ', '.join(tags)
            print(f"Quote: {quote} - Author: {author} - Tags: {tags}")
    except Exception as e:
        print(f"An error occurred during the vector search: {e}")
        traceback.print_exc()

In [12]:
# Function to perform hybrid search
def perform_hybrid_search(client, query, limit=5, alpha=0.5):
    """Run a hybrid search over the ``Quotes`` class and print the hits.

    Args:
        client: Object exposing the ``client.query.get(...)`` query builder
            (the Weaviate client created earlier in the notebook).
        query: Free-text search string.
        limit: Maximum number of hits to request. Defaults to 5.
        alpha: Value forwarded to ``with_hybrid(alpha=...)``. Defaults to 0.5.

    Returns:
        None. Results, or a description of what went wrong, are printed.
        Unexpected exceptions are caught, reported and their traceback printed.
    """
    try:
        response = (
            client.query
            .get("Quotes", ["quote", "author", "tags"])
            .with_hybrid(query=query, alpha=alpha)
            .with_limit(limit)
            .do()
        )

        # Print the search results
        print("\nHybrid Search results:")

        # A failed GraphQL query can come back as a normal payload carrying an
        # 'errors' list (with the hit list set to None) rather than raising,
        # so surface those messages instead of iterating over None.
        errors = response.get('errors')
        if errors:
            for error in errors:
                message = error.get('message', error) if isinstance(error, dict) else error
                print(f"Weaviate reported an error: {message}")
            return

        get_section = (response.get('data') or {}).get('Get') or {}
        if 'Quotes' not in get_section:
            print("The response structure is not as expected. Please check the response format.")
            return

        hits = get_section['Quotes'] or []
        if not hits:
            print("No matching quotes found.")
        for obj in hits:
            quote = obj.get('quote', 'N/A')
            author = obj.get('author', 'N/A')
            # A property stored as null comes back as None; show it as empty.
            tags = obj.get('tags') or []
            if isinstance(tags, list):
                tags = ', '.join(tags)
            print(f"Quote: {quote} - Author: {author} - Tags: {tags}")
    except Exception as e:
        print(f"An error occurred during the hybrid search: {e}")
        traceback.print_exc()


In [13]:
# Get search term from the user. The indexed data is a collection of quotes,
# so the prompt suggests quote topics and authors; surrounding whitespace is dropped.
user_query = input(
    "Enter a search term (e.g., 'inspiration', 'friendship', or an author such as 'Oscar Wilde'): "
).strip()

if user_query:
    # Perform vector search
    perform_vector_search(client, user_query)

    # Perform hybrid search
    perform_hybrid_search(client, user_query)
else:
    # Do not send an empty query to the server.
    print("No search term entered; skipping search.")


Vector Search results:
Quote: “Some birds are not meant to be caged, that's all. Their feathers are too bright, their songs too sweet and wild. So you let them go, or when you open the cage to feed them they somehow fly out past you. And the part of you that knows it was wrong to imprison them in the first place rejoices, but still, the place where you live is that much more drab and empty for their departure.” - Author: Stephen King, - Tags: birds, freedom, friends, friendship, letting-go
Quote: “You see, cuckoos are parasites. They lay their eggs in other birds' nests. When the egg hatches, the baby cuckoo pushes the other baby birds out of the nest. The poor parent birds work themselves to death trying to find enough food to feed the enormous cuckoo child who has murdered their babies and taken their places.""Enormous?" said Jace. "Did you just call me fat?""It was an analogy.""I am not fat.” - Author: Cassandra Clare, - Tags: imposters
Quote: “Jane, be still; don't struggle so lik