In [None]:
import requests
import json

# Download the data
resp = requests.get(
    'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json',
    timeout=30,  # do not hang the notebook forever if the host is unreachable
)
# Fail right here on an HTTP error (404, 5xx, ...) instead of letting an error
# page reach json.loads and surface as a confusing decode error.
resp.raise_for_status()
data = json.loads(resp.text)  # Load data

# Preview the parsed JSON: container type, number of records, and the first record
print(type(data), len(data))
print(json.dumps(data[0], indent=2))

def jprint(data, indent=2):
    """Pretty-print ``data`` as indented JSON on stdout.

    Args:
        data: The object to print. Values the ``json`` module cannot encode
            natively (for example dates or UUIDs) are rendered with ``str()``
            instead of raising ``TypeError``.
        indent: Number of spaces used per nesting level (default 2, which
            matches the previous hard-coded value).
    """
    print(json.dumps(data, indent=indent, default=str))

In [None]:
import weaviate
import os

# Environment passed to the embedded Weaviate instance.
embedded_env = {"LOG_LEVEL": "error"}

# The OpenAI key is read from the process environment (KeyError if it is unset)
# and sent as a request header; it is deliberately not stored in its own variable.
client = weaviate.connect_to_embedded(
    version="1.30.0",
    environment_variables=embedded_env,
    headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)

In [None]:
# Fetch the metadata reported through the client and pretty-print it.
server_meta = client.get_meta()
jprint(server_meta)

In [None]:
# Remove a pre-existing "Question" collection, if there is one, and report it.
question_exists = client.collections.exists("Question")
if question_exists:
    client.collections.delete("Question")
    print("Collection 'Question' deleted successfully.")

In [None]:
#Create the schema that will house our data
from weaviate.classes.config import Property, DataType, Configure

# All three properties of a "Question" object are plain text fields.
text_property_names = ("question", "answer", "category")

question_collection = client.collections.create(
    name="Question",
    # Uses OpenAI for vectorization
    vector_config=Configure.Vectors.text2vec_openai(),
    properties=[
        Property(name=prop_name, data_type=DataType.TEXT)
        for prop_name in text_property_names
    ],
)
print("Collection 'Question' created successfully with explicit schema.")

In [None]:
collection = client.collections.get("Question")

# Import every downloaded record through a fixed-size batch (size 5).
with collection.batch.fixed_size(5) as batch:
    for position, record in enumerate(data, start=1):
        print(f"importing question: {position}")

        # Map the capitalised keys of the source record onto the lower-case
        # property names of the collection, then queue the object.
        batch.add_object(
            properties={
                "answer": record["Answer"],
                "question": record["Question"],
                "category": record["Category"],
            }
        )

# Report any objects the batch recorded as failed.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

In [None]:
# Count all objects in the collection.
response = collection.aggregate.over_all(total_count=True)
# `json_print` was never defined in this notebook; the helper is named `jprint`.
jprint(response.total_count)

In [None]:
#Extract and show any 3 questions and answers
response = collection.query.fetch_objects(limit=3, return_properties=["question", "answer"])
# `json_print` was never defined in this notebook; the helper is named `jprint`.
jprint([question.properties for question in response.objects])

----

## Let's extract the vector that represents each question!

In [None]:
# write a query to extract the vector for a question

# ADD CODE HERE

In [None]:
#This is the question corresponding to this vector
# NOTE(review): `result` is not assigned in any cell shown above; presumably the
# exercise cell that extracts the vector is expected to define it — confirm.

result.objects[0].properties['question']

In [None]:
#This is the answer to this question
# NOTE(review): `result` is not assigned in any cell shown above; presumably the
# exercise cell that extracts the vector is expected to define it — confirm.

result.objects[0].properties['answer']

In [None]:
#Now display the vector representation of the above question and answer

# ADD CODE HERE

In [None]:
#How many numbers are there in this vector?

# ADD CODE HERE

## We have successfully extracted the vector for this datapoint!

## Let's see if we can search for a relevant answer using vector search!

In [None]:
#Build a vector search query to extract questions, answers and categories related to "biology"

# ADD CODE HERE

# A `collection.query.*` call returns a result object, not a plain dict, so it
# cannot be handed to json.dumps directly; print each hit's properties instead.
print(json.dumps([obj.properties for obj in response.objects], indent=4, default=str))

## What is the distance between the `query`: `biology` and the returned objects?

In [None]:
#Write code to extract the distance between the query and returned object vectors

# ADD CODE HERE

# The query result object cannot be handed to json.dumps directly, so print each
# hit's properties together with its distance. `distance` is only filled in when
# the query asks for it via `return_metadata`; otherwise it prints as null.
print(json.dumps(
    [
        {"properties": obj.properties, "distance": obj.metadata.distance}
        for obj in response.objects
    ],
    indent=4,
    default=str,
))

In [None]:
#Extract all 10 questions and analyze them based on distance/similarity to the query vector
response = collection.query.near_text(
    "animals",
    limit=10,
    return_metadata=["distance"],
)

# The query result object cannot be handed to json.dumps directly, so print each
# hit's properties together with its distance from the query.
print(json.dumps(
    [
        {"properties": obj.properties, "distance": obj.metadata.distance}
        for obj in response.objects
    ],
    indent=2,
    default=str,
))

## Notice how, as the responses become less relevant to the query, the distance between the `query`: `"animals"` and the response increases! - *The vectors are getting farther from each other!*

---

## We can let the vector database know to remove results after a threshold distance!

In [None]:
#Set a max distance threshold - What should the max distance be?

# ADD CODE HERE

## Now we have prevented irrelevant results by removing vectors farther than `max_distance` away!