# Pronova LLM V1 #
## Use this notebook to do the following ##
- Create Qdrant collections
- Chunk text files into smaller chunks
- Create embeddings for data chunks and querys
- Delete qdrant vectors
- Run the current model on a query

In [39]:
# Load require librarys
import os
from qdrant_client import QdrantClient
from qdrant_client.http import models
from openai import OpenAI
from dotenv import load_dotenv
from IPython.display import Markdown, display

# Load environment variables from .env file
load_dotenv()

True

### Setup Qdrant connection ###

In [40]:
# Get the Qdrant API key from the environment variable
Qdrant_api_key = os.getenv('Qdrant_API_KEY')
if not Qdrant_api_key:
    raise ValueError("No Qdrant API key found in environment variables")
Qdrant_url = os.getenv('Qdrant_URL')
if not Qdrant_url:
    raise ValueError("No Qdrant URL found in environment variables")


# Initialize Qdrant client
try:
    Qclient = QdrantClient(
        url= Qdrant_url,
        api_key=Qdrant_api_key
    )
    print("Successfully connected to Qdrant")
except Exception as e:
    print(f"Failed to connect to Qdrant: {e}")
    raise

Successfully connected to Qdrant


### Setup OpenAI connection ###

In [41]:
# Get the OpenAI API key from the environment variable
OpenAI_api_key = os.getenv('OPENAI_API_KEY')
if not OpenAI_api_key:
    raise ValueError("No OpenAI API key found in environment variables")

OpenAI.api_key = OpenAI_api_key

### Creating a Qdrant Collection (function) ###

In [42]:
#Create collection
def create_qdrant_collection(collection_name):
    try:
        Qclient.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
        )
        print(f"Collection '{collection_name}' created successfully")
    except Exception as e:
        print(f"Failed to create collection '{collection_name}': {e}")
        raise

### Get an OpenAI embedding from a text segment (Function) ###

In [43]:
# Function to get the embedding of a text
def get_embedding(text):
    client = OpenAI()
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )
    return response.data[0].embedding

### Turn a file into chunks (Function) ###

In [24]:
# Function to read a text file and chunk its content
def chunk_text_from_file(file_path, chunk_size=400):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

### Get embedding from a list of chunks (Function) ###

In [44]:
# Function to get embeddings for a list of text chunks
def get_embeddings_for_chunks(chunks):
    embeddings = []
    for chunk in chunks:
        embedding = get_embedding(chunk)
        embeddings.append(embedding)
    return embeddings

### Upsert lists of documents to Qdrant (Function) ###
#### Mind the parameters ####

In [45]:
# function to upsert embeddings into Qdrant
# def upsert_embeddings(collection_name, embeddings, chunks):
def upsert_embeddings(collection_name, embeddings, chunks, files):

    points = []
    for i in range(len(embeddings)):
        points.append(
            {
                "id": i,
                "vector": embeddings[i],
                "payload": {
                    "text": chunks[i],   # Attach the chunk as payload
                    "source_url": files[i]  # Add the source URL
                }            
            }
        )
        # print(f"{i}: {files[i]}")

    # print('not actually upserting lol')
    # return


    try:
        Qclient.upsert(
            collection_name=collection_name,
            points=points
        )
        print("Embeddings upserted successfully")
    except Exception as e:
        print(f"Failed to upsert embeddings: {e}")
        raise

### Retrieve similar chunks from query (Function) ###

In [46]:
def retrieve_relevant_chunks(collection_name, query, top_k=5):
    query_embedding = get_embedding(query)
    
    search_result = Qclient.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k
    )

    contexts = [result.payload["text"] for result in search_result]
    urls = [result.payload.get("source_url") for result in search_result]
    
    return contexts, urls

    # return [result.payload["text"] for result in search_result]

### Generate Response from Query (Function) ###

In [47]:
import numpy as np

def generate_response(collection_name, query):
    context, urls = retrieve_relevant_chunks(collection_name, query)
    unique_urls = np.unique(urls)

    system_role = "You are a specialized assistant that only provides advice on dog-related veterinary care. If a user asks about any other animal or topic outside of dog health, politely decline to answer and remind them that you only provide information about dogs. If you don't know the answer to a question, let the user know that you're not sure and suggest that they consult a veterinarian for more information."

    # Combine retrieved chunks into a single string
    context_text = "\n".join(context)

    # Generate a response using GPT-4
    client = OpenAI()
    completion = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {"role": "system", "content": system_role},
            {"role": "user", "content": "context" + context_text},
            {"role": "user", "content": query}
        ]
    )
    return completion.choices[0].message, unique_urls

In [48]:
Qclient.get_collections()

CollectionsResponse(collections=[])

### Delete all entries in a collection (Function) ###
*BE CAREFUL WITH THIS*

In [53]:
def delete_collection(collection_name):
    confirmation = input("Type 'DELETE' to confirm deletion of entries in " + collection_name + ". This cannot be undone. Type anything else to abort")
    if confirmation == "DELETE" :
        try:
            Qclient.delete_collection(collection_name=collection_name)
            print(f"Collection '{collection_name}' deleted successfully")
            create_qdrant_collection(collection_name)
        except Exception as e:
            print(f"Failed to delete collection '{collection_name}': {e}")
            raise
    else:
        print("Deletion aborted")

### Markdown Print Function ###

In [54]:
def print_markdown(md_text):
    display(Markdown(md_text))

## Playground ##
Use this section to test functions and play around with the model

In [55]:
# Define the folder paths
scraping_demo_folder = 'scrapingDemo'
scraped_files_folder = os.path.join(scraping_demo_folder, 'ScrapedFiles_petMD_allergies')
                                    

#numbver of files in the folder
num_files = len([name for name in os.listdir(scraped_files_folder) if os.path.isfile(os.path.join(scraped_files_folder, name))])
current_file = 1

all_files = []
all_embeddings = []
all_chunks = []

for filename in os.listdir(scraped_files_folder):
    if current_file > 10:
        break
    file_path = os.path.join(scraped_files_folder, filename)
    print(f"Processing file {current_file} of {num_files}: {filename}")
    
    
    # # Chunk the file
    chunks = chunk_text_from_file(file_path)
    if not chunks:  # Skip if no chunks are generated
        print(f"No chunks generated for file: {filename}")
        continue
    
    # Get embeddings for the chunks
    embeddings = get_embeddings_for_chunks(chunks)
    
    # Append the embeddings and chunks to the lists
    all_embeddings.extend(embeddings)
    all_chunks.extend(chunks)
    
    filename_repeated = [filename] * len(chunks)
    all_files.extend(filename_repeated)

    current_file += 1


Processing file 1 of 22: Cytopoint®_for_Dogs_.txt


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [17]:

print("Upserting...")
collection_name = 'LLM_V2'  # Define your collection name

# # TODO 

# # upsert_embeddings(collection_name, all_embeddings, all_chunks)
upsert_embeddings(collection_name, all_embeddings, all_chunks, all_files)
# print(len(all_embeddings), len(all_chunks), len(all_files))

Upserting...
Failed to upsert embeddings: Unexpected Response: 404 (Not Found)
Raw response content:
b'{"status":{"error":"Not found: Collection `LLM_V2` doesn\'t exist!"},"time":0.000065542}'


UnexpectedResponse: Unexpected Response: 404 (Not Found)
Raw response content:
b'{"status":{"error":"Not found: Collection `LLM_V2` doesn\'t exist!"},"time":0.000065542}'

In [36]:
query = "Tell me abt cytopoint for dogs?"
relevant_chunks = retrieve_relevant_chunks("LLM_V2", query)
# print("Relevant chunks:" + str(relevant_chunks))

response, urls = generate_response("LLM_V2", query)
print_markdown(response.content)

print(f"Here are the urls I used: \n", [url for url in urls])



# print_markdown(generate_response("LLM_V1", query).content)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [56]:
delete_collection("LLM_V2")

Collection 'LLM_V2' deleted successfully
Collection 'LLM_V2' created successfully
