# Pronova LLM V1 #
## Use this notebook to do the following ##
- Create Qdrant collections
- Chunk text files into smaller chunks
- Create embeddings for data chunks and querys
- Delete qdrant vectors
- Run the current model on a query

In [1]:
# Load require librarys
import os
from qdrant_client import QdrantClient
from qdrant_client.http import models
from openai import OpenAI
from dotenv import load_dotenv
from IPython.display import Markdown, display

# Load environment variables from .env file
load_dotenv()

True

### Setup Qdrant connection ###

In [2]:
# Get the Qdrant API key from the environment variable
Qdrant_api_key = os.getenv('Qdrant_API_KEY')
if not Qdrant_api_key:
    raise ValueError("No Qdrant API key found in environment variables")
Qdrant_url = os.getenv('Qdrant_URL')
if not Qdrant_url:
    raise ValueError("No Qdrant URL found in environment variables")


# Initialize Qdrant client
try:
    Qclient = QdrantClient(
        url= Qdrant_url,
        api_key=Qdrant_api_key
    )
    print("Successfully connected to Qdrant")
except Exception as e:
    print(f"Failed to connect to Qdrant: {e}")
    raise

Successfully connected to Qdrant


### Setup OpenAI connection ###

In [3]:
# Get the OpenAI API key from the environment variable
OpenAI_api_key = os.getenv('OPENAI_API_KEY')
if not OpenAI_api_key:
    raise ValueError("No OpenAI API key found in environment variables")

OpenAI.api_key = OpenAI_api_key

### Creating a Qdrant Collection (function) ###

In [4]:
#Create collection
def create_qdrant_collection(collection_name):
    try:
        Qclient.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
        )
        print(f"Collection '{collection_name}' created successfully")
    except Exception as e:
        print(f"Failed to create collection '{collection_name}': {e}")
        raise

### Get an OpenAI embedding from a text segment (Function) ###

In [5]:
# Function to get the embedding of a text
def get_embedding(text):
    client = OpenAI()
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )
    return response.data[0].embedding

### Turn a file into chunks (Function) ###

In [6]:
# Function to read a text file and chunk its content
def chunk_text_from_file(file_path, chunk_size=400):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

### Get embedding from a list of chunks (Function) ###

In [7]:
# Function to get embeddings for a list of text chunks
def get_embeddings_for_chunks(chunks):
    embeddings = []
    for chunk in chunks:
        embedding = get_embedding(chunk)
        embeddings.append(embedding)
    return embeddings

### Upsert lists of documents to Qdrant (Function) ###
#### Mind the parameters ####

In [8]:
# function to upsert embeddings into Qdrant
def upsert_embeddings(collection_name, embeddings, chunks):
    points = []
    for i, embedding in enumerate(embeddings):
        points.append(
            {
                "id": i,
                "vector": embeddings[i],
                "payload": {"text": chunks[i]}
            }
        )
    try:
        Qclient.upsert(
            collection_name=collection_name,
            points=points
        )
        print("Embeddings upserted successfully")
    except Exception as e:
        print(f"Failed to upsert embeddings: {e}")
        raise

### Retrieve similar chunks from query (Function) ###

In [9]:
def retrieve_relevant_chunks(collection_name, query, top_k=5):
    query_embedding = get_embedding(query)
    
    search_result = Qclient.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k
    )

    return [result.payload["text"] for result in search_result]

### Generate Response from Query (Function) ###

In [10]:
def generate_response(collection_name, query):
    context = retrieve_relevant_chunks(collection_name, query)
    system_role = "You are a specialized assistant that only provides advice on dog-related veterinary care. If a user asks about any other animal or topic outside of dog health, politely decline to answer and remind them that you only provide information about dogs. If you don't know the answer to a question, let the user know that you're not sure and suggest that they consult a veterinarian for more information."

    # Combine retrieved chunks into a single string
    context_text = "\n".join(context)

    # Generate a response using GPT-4
    client = OpenAI()
    completion = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {"role": "system", "content": system_role},
            {"role": "user", "content": "context" + context_text},
            {"role": "user", "content": query}
        ]
    )
    return completion.choices[0].message

### Delete all entries in a collection (Function) ###
*BE CAREFUL WITH THIS*

In [11]:
def delete_collection(collection_name):
    confirmation = input("Type 'DELETE' to confirm deletion of entries in " + collection_name + ". This cannot be undone. Type anything else to abort")
    if confirmation == "DELETE" :
        try:
            Qclient.delete_collection(collection_name=collection_name)
            print(f"Collection '{collection_name}' deleted successfully")
            create_qdrant_collection(collection_name)
        except Exception as e:
            print(f"Failed to delete collection '{collection_name}': {e}")
            raise
    else:
        print("Deletion aborted")

### Markdown Print Function ###

In [12]:
def print_markdown(md_text):
    display(Markdown(md_text))

## Playground ##
Use this section to test functions and play around with the model

In [None]:
# Define the folder paths
scraping_demo_folder = 'scrapingDemo'
scraped_files_folder = os.path.join(scraping_demo_folder, 'ScrapedFiles_petMD')

#numbver of files in the folder
num_files = len([name for name in os.listdir(scraped_files_folder) if os.path.isfile(os.path.join(scraped_files_folder, name))])
current_file = 1

# Initialize lists to hold all embeddings and chunks
all_embeddings = []
all_chunks = []

# Iterate through each file in the Scraped Files folder
for filename in os.listdir(scraped_files_folder):
    file_path = os.path.join(scraped_files_folder, filename)
    print(f"Processing file {current_file} of {num_files}: {filename}")
    
    # Chunk the file
    chunks = chunk_text_from_file(file_path)
    
    # Get embeddings for the chunks
    embeddings = get_embeddings_for_chunks(chunks)
    
    # Append the embeddings and chunks to the lists
    all_embeddings.extend(embeddings)
    all_chunks.extend(chunks)
    current_file += 1


# Upsert the embeddings to the database
print("Upserting...")
collection_name = 'LLM_V1'  # Define your collection name
upsert_embeddings(collection_name, all_embeddings, all_chunks)

Processing file 1 of 726: Skin_Cancer_in_Dogs.txt
Processing file 2 of 726: Meningitis_Meningoencephalitis_Meningomyelitis_in_Dogs.txt
Processing file 3 of 726: Anaphylaxis_in_Dogs.txt
Processing file 4 of 726: How_to_Get_Rid_of_Tapeworms_in_Dogs.txt
Processing file 5 of 726: Vascular_Ring_Anomalies_in_Dogs.txt
Processing file 6 of 726: Nasal_Dermatoses_in_Dogs_(Dog_Nose_Skin_Issues).txt
Processing file 7 of 726: Dog_Pneumonia.txt
Processing file 8 of 726: Glucose_in_the_Urine_in_Dogs.txt
Processing file 9 of 726: Lupus_in_Dogs.txt
Processing file 10 of 726: Progressive_Retinal_Atrophy_(PRA)_In_Dogs.txt
Processing file 11 of 726: Calcium_Deposits_in_the_Urinary_Tract_in_Dogs.txt
Processing file 12 of 726: Bacterial_Infection_(Campylobacteriosis)_in_Dogs.txt
Processing file 13 of 726: Gallstones_in_Dogs.txt
Processing file 14 of 726: Dog_Dementia_Symptoms_Causes_Treatment_and_Life_Expectancy.txt
Processing file 15 of 726: Eyelid_Protrusion_(Cherry_Eye)_in_Dogs.txt


FileNotFoundError: [Errno 2] No such file or directory: 'scrapingDemo/ScrapedFiles/Eyelid_Protrusion_(Cherry_Eye)_in_Dogs.txt'

In [71]:
query = "What source is your information based on?"
relevant_chunks = retrieve_relevant_chunks("LLM_V1", query)
print("Relevant chunks:" + str(relevant_chunks))
print_markdown(generate_response("LLM_V1", query).content)

Relevant chunks:['rinarians.', 'nd insights from our veterinarians.', 'arians.', 'mage:\xa0iStock.com/Image Source\nWRITTEN BY\nVeterinarian\nDr. Ellen Malmanger is originally from Arkansas, but attended Iowa State University College of Veterinary Medicine for veterinary school....\nWas this article helpful?\nSign up for weekly pet health tips and insights from our veterinarians.', "ary Consult: Canine and Feline. Lippincott Williams & Wilkins; 2005\n“Veterinary Information Network®, Inc.” VIN.com, 29 June 2005, www.vin.com.\n\n\nWRITTEN BY\nVeterinarian\nDr. Lauren Jones graduated from the University of Pennsylvania School of Veterinary Medicine in 2010, after receiving her bachelor's degree...\nWas this article helpful?\nSign up for weekly pet health tips and insights from our vete"]


I provide information based on general knowledge and guidelines related to dog health and veterinary care. If you have a specific question about dog health, I would be happy to help! For detailed or case-specific inquiries, it's always best to consult a veterinarian directly.

In [16]:
delete_collection("LLM_V1")

Collection 'LLM_V1' deleted successfully
Collection 'LLM_V1' created successfully
