In [None]:
# Install the required packages
!pip install weaviate-client
!pip install -U langchain-community
!pip install PyPDF2

In [None]:
# Import all required packages
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
import weaviate
from weaviate.classes.config import Configure
from sentence_transformers import SentenceTransformer
from google.colab import userdata # Import userdata to access secrets
from weaviate.classes.init import Auth
import weaviate.classes as wvc
import weaviate
import PyPDF2
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Read secrets/credentials from a local .env file so that later cells can
# look them up through os.environ.
import os
from dotenv import find_dotenv, load_dotenv

dotenv_path = find_dotenv()  # path of the local .env file
_ = load_dotenv(dotenv_path)

In [None]:
# Open a client connection to the Weaviate Cloud cluster.
# The cluster URL and the API key are both taken from environment variables;
# a missing variable raises KeyError here.
cluster_url = os.environ['WEAVIATE_URL']
credentials = Auth.api_key(os.environ['WEAVIATE_API_KEY'])
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=cluster_url,
    auth_credentials=credentials,
)

In [None]:
# Obtain a handle to the "TestCollection" collection through the connected client.
# NOTE(review): the following cells delete this collection and fetch the handle
# again before it is used, so this early lookup looks redundant — confirm.
collections_api = client.collections
collection = collections_api.get("TestCollection")

In [None]:
# Start from a clean slate: if the collection is still around from a previous
# experiment, drop it. Any error is reported but does not stop the notebook.
collection_name = "TestCollection"
try:
    if not client.collections.exists(collection_name):
        print(f"Collection '{collection_name}' does not exist, no need to delete.")
    else:
        client.collections.delete(collection_name)
        print(f"Collection '{collection_name}' deleted.")
except Exception as exc:
    print(f"Error deleting collection: {exc}")

In [None]:
# Define the collection schema and create the collection.
# The vectorizer is set to "none" because this notebook computes the vectors
# itself and uploads them together with each object.
collection_name = "TestCollection"
collection_schema = {
    "name": collection_name,
    "description": "Collection for storing document chunks with pre-calculated vectors",
    "vectorizer": "none",  # Specify that we will provide pre-calculated vectors
    "properties": [
        {
            "name": "text",
            "dataType": ["text"],
            "description": "Text content of the document chunk",
        }
    ]
}

# Create the collection explicitly. The previous cell deletes any existing
# collection, so without this step the collection would not exist with the
# intended "no vectorizer" configuration when objects are inserted.
# The exists() guard keeps this cell safe to re-run.
if client.collections.exists(collection_name):
    print(f"Collection '{collection_name}' already exists, skipping creation.")
else:
    # The v4 client takes keyword arguments rather than a schema dict, so the
    # dict above is translated into the typed configuration objects here.
    # NOTE(review): newer client releases may prefer
    # `vector_config=Configure.Vectors.self_provided()` — confirm against the
    # installed weaviate-client version.
    client.collections.create(
        name=collection_schema["name"],
        description=collection_schema["description"],
        vectorizer_config=Configure.Vectorizer.none(),
        properties=[
            wvc.config.Property(
                name=prop["name"],
                data_type=wvc.config.DataType(prop["dataType"][0]),
                description=prop["description"],
            )
            for prop in collection_schema["properties"]
        ],
    )
    print(f"Collection '{collection_name}' created successfully with vectorizer='none'.")

In [None]:
# Load the sentence-transformers model that later cells use to embed both the
# document chunks and the query text.
embedding_model_name = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding_model_name)

In [None]:
# Read the input PDF and gather the text of all pages into `complete_pdf`.
pdf_path = 'little_prince.pdf'

# Open the PDF file in read-binary mode; the reader is only used while the
# file is open.
with open(pdf_path, 'rb') as file:
    reader = PyPDF2.PdfReader(file)

    num_pages = len(reader.pages)
    print(f"Number of pages: {num_pages}")

    # Extract the text page by page, iterating over the pages directly.
    page_texts = [page.extract_text() for page in reader.pages]

# Pages for which no text was extracted are skipped; every kept page is
# terminated by a newline. Joining once avoids repeated string concatenation.
complete_pdf = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

# Show only a short preview: printing the whole document floods the cell
# output and bloats the saved notebook.
preview_chars = 500
print(f"Extracted {len(complete_pdf)} characters of text.")
print(complete_pdf[:preview_chars])

In [None]:
# Split the full document text into chunks that will be embedded individually.
chunk_separators = ["\n\n", "\n", "."]
splitter_options = dict(
    chunk_size=300,
    chunk_overlap=20,
    separators=chunk_separators,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
documents = text_splitter.create_documents([complete_pdf])

In [None]:
# Embed the text of every chunk in one call; the vectors are normalized by
# the encoder (normalize_embeddings=True).
texts_to_encode = [doc.page_content for doc in documents]
embeddings = embedding_model.encode(texts_to_encode, normalize_embeddings=True)

# Pair each chunk's text with its vector in the shape the insert cell expects:
# a "properties" dict holding the text, plus the matching "vector".
data_objects_with_vectors = [
    {"properties": {"text": chunk_text}, "vector": chunk_vector}
    for chunk_text, chunk_vector in zip(texts_to_encode, embeddings)
]

In [None]:
# Get a reference to the collection that will receive the chunks
collection = client.collections.get(collection_name)

# Push the data with their vectors to the collection
print(f"Inserting {len(documents)} objects into '{collection_name}'...")
with collection.batch.dynamic() as batch:
    for obj in data_objects_with_vectors:
        batch.add_object(
            properties=obj["properties"],
            vector=obj["vector"]
        )

# Individual objects can be rejected without the batch context raising, so
# inspect the failures recorded by the client before claiming success.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"{len(failed_objects)} of {len(data_objects_with_vectors)} objects failed to import.")
    print(f"First failure: {failed_objects[0]}")
else:
    print("Vector data pushed to the collection successfully.")

In [None]:
# Query section. It is written to be readable on its own, so the collection
# handle is fetched again here.
collection_name = "TestCollection" # Or the actual collection name
collection = client.collections.get(collection_name)

# Define the query text
query_text = "What are the two pictures the narrator knows how to draw?"
print(f"Generating embedding for query: '{query_text}'...")

# Embed the query the same way the document chunks were embedded
# (normalize_embeddings=True), then convert to a plain list for the client.
query_vector = embedding_model.encode(query_text, normalize_embeddings=True).tolist()

# Perform the vector search. Only the query itself is guarded: the error is
# reported and then re-raised, because the following cells need `response`
# and must not continue with a missing or stale value.
try:
    response = collection.query.near_vector(
        near_vector=query_vector,
        return_properties=["text"], # Specify the properties to return
        # Request distance using return_metadata
        return_metadata=wvc.query.MetadataQuery(distance=True)
    )
except Exception as e:
    print(f"Error querying data: {e}")
    raise

print("Search results: ", response)
for obj in response.objects:
    print(f"  Text: {obj.properties['text']}")
    # Look for the distance on the object first, then on its metadata.
    # A value of None is treated as "not available" instead of being formatted,
    # which would raise TypeError.
    distance = getattr(obj, 'distance', None)
    if distance is None:
        distance = getattr(getattr(obj, 'metadata', None), 'distance', None)
    if distance is not None:
        print(f"  Distance: {distance:.4f}")
    else:
        print("  Distance information not available on object.")
    print("-" * 20)

In [None]:
# Reduce the search response to a plain list holding the text of each hit,
# in the order the hits were returned.
response_documents = [hit.properties['text'] for hit in response.objects]

In [None]:
# Rerank the documents returned by the vector search against the query.
# 1. Load the reranker (a sequence-classification model) and its tokenizer
model_name = 'BAAI/bge-reranker-v2-m3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# 2. (Optional) Move model to GPU if available; inference only, so use eval mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# 3. Create pairs of query and document for the reranker
pairs = [[query_text, doc] for doc in response_documents]

if pairs:
    # 4. Tokenize the input pairs
    inputs = tokenizer(
        pairs,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512  # Adjust max_length as needed by the model
    ).to(device)

    # 5. Get relevance scores from the model (no gradients needed).
    #    The logits are flattened to a 1-D array; this assumes the model
    #    emits one logit per pair — TODO confirm for other reranker models.
    with torch.no_grad():
        outputs = model(**inputs)
        scores = outputs.logits.view(-1).float().cpu().numpy()
else:
    # Nothing was retrieved: skip tokenization/inference, which are not
    # meaningful (and may fail) on an empty batch.
    print("No documents were retrieved; nothing to rerank.")
    scores = []

# 6. Combine documents with their scores and sort them, best score first
scored_documents = sorted(zip(response_documents, scores), key=lambda x: x[1], reverse=True)

# 7. Print the reranked documents
print("Reranked Documents:")
for doc, score in scored_documents:
    print(f"Score: {score:.4f} - Document: {doc}")
    print("=="*20)

In [None]:
# Build the context from the top 3 reranked documents.
# The chunks are separated by a blank line: joining them with an empty string
# would glue the end of one chunk directly onto the start of the next.
top_k = 3
context = "\n\n".join(doc_text for doc_text, _score in scored_documents[:top_k])
print(context)

In [None]:
# Build the generation prompt: instructions first, then the retrieved context,
# then the user's question. The template is filled with the current values of
# `context` and `query_text`.
prompt_template = """
Answer the following question based only on the provided context. If the answer
cannot be found in the context, respond with "I cannot answer this question based on the provided information."

Context:
{context}

Question: {query_text}
Answer:
"""
prompt = prompt_template.format(context=context, query_text=query_text)

In [None]:
# Time for text generation!

# Specify the model ID for Llama 3.2 (e.g., a 1B instruct model)
model_id = "meta-llama/Llama-3.2-1B-Instruct"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# Move the model to GPU if available
if torch.cuda.is_available():
    model = model.to("cuda")

# Apply the chat template. return_dict=True yields the attention mask together
# with the input IDs, so both can be handed to generate().
messages = [{"role": "user", "content": prompt}]
chat_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)

# Put the inputs on the same device as the model
chat_inputs = chat_inputs.to(model.device)
input_ids = chat_inputs["input_ids"]

# Generate text
outputs = model.generate(**chat_inputs, max_new_tokens=200, use_cache=True)

# The generated sequence starts with the prompt tokens; drop them so that only
# the newly generated answer is decoded and printed.
generated_tokens = outputs[:, input_ids.shape[-1]:]
decoded_output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
print(decoded_output)