# Pronova LLM V1 #
## Use this notebook to do the following ##
- Create Qdrant collections
- Chunk text files into smaller chunks
- Create embeddings for data chunks and queries
- Delete Qdrant vectors
- Run the current model on a query

In [22]:
# Load required libraries
import os
from qdrant_client import QdrantClient
from qdrant_client.http import models
from openai import OpenAI
from dotenv import load_dotenv
from IPython.display import Markdown, display

# Load environment variables from .env file
load_dotenv()

True

### Setup Qdrant connection ###

In [23]:
# Read the Qdrant credentials from the environment; stop immediately if either one is absent
Qdrant_api_key = os.environ.get('Qdrant_API_KEY')
if not Qdrant_api_key:
    raise ValueError("No Qdrant API key found in environment variables")

Qdrant_url = os.environ.get('Qdrant_URL')
if not Qdrant_url:
    raise ValueError("No Qdrant URL found in environment variables")


# Build the shared Qdrant client used by the functions in this notebook.
# Any exception raised while constructing it is reported and then re-raised.
try:
    Qclient = QdrantClient(url=Qdrant_url, api_key=Qdrant_api_key)
    print("Successfully connected to Qdrant")
except Exception as exc:
    print(f"Failed to connect to Qdrant: {exc}")
    raise

Successfully connected to Qdrant


### Setup OpenAI connection ###

In [24]:
# Make sure an OpenAI API key is present in the environment before any OpenAI call is attempted
OpenAI_api_key = os.environ.get('OPENAI_API_KEY')
if OpenAI_api_key is None or OpenAI_api_key == "":
    raise ValueError("No OpenAI API key found in environment variables")

# Store the key as an attribute on the OpenAI class.
# NOTE(review): the functions in this notebook create clients with a bare `OpenAI()`;
# confirm whether that reads this attribute or takes OPENAI_API_KEY from the environment.
OpenAI.api_key = OpenAI_api_key

### Creating a Qdrant Collection (function) ###

In [25]:
# Create collection
def create_qdrant_collection(collection_name, vector_size=1536, distance=None):
    """Create a Qdrant collection through the shared ``Qclient``.

    Args:
        collection_name: Name of the collection to create.
        vector_size: Dimensionality of the stored vectors. Defaults to 1536,
            the value that was previously hard-coded (presumably the size of
            the vectors returned by the embedding model used in this notebook
            - confirm if the model changes).
        distance: A ``models.Distance`` member. Defaults to
            ``models.Distance.COSINE`` when omitted.

    Raises:
        Exception: Whatever ``Qclient.create_collection`` raises; the error is
            printed and then re-raised unchanged.
    """
    # Resolve the default lazily so the signature does not depend on `models`
    if distance is None:
        distance = models.Distance.COSINE

    try:
        Qclient.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=vector_size, distance=distance),
        )
        print(f"Collection '{collection_name}' created successfully")
    except Exception as e:
        print(f"Failed to create collection '{collection_name}': {e}")
        raise

### Get an OpenAI embedding from a text segment (Function) ###

In [26]:
# Function to get the embedding of a text
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The input passed as ``input`` to ``client.embeddings.create``.
        model: Name of the embedding model. Defaults to
            ``"text-embedding-ada-002"``, the value that was previously
            hard-coded. Chunks and queries must be embedded with the same
            model for similarity search to be meaningful.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    client = OpenAI()
    response = client.embeddings.create(
        model=model,
        input=text
    )
    return response.data[0].embedding

### Turn a file into chunks (Function) ###

In [27]:
# Function to read a text file and chunk its content
def chunk_text_from_file(file_path, chunk_size=400, overlap=0):
    """Read a UTF-8 text file and split its content into fixed-size chunks.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters shared between consecutive chunks.
            Must satisfy ``0 <= overlap < chunk_size``. The default of 0
            yields back-to-back, non-overlapping chunks.

    Returns:
        A list of strings in file order. The last chunk may be shorter than
        ``chunk_size``. An empty file yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    # A zero step would make range() fail and a negative one would silently
    # produce no chunks at all, so reject both up front with a clear message.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # Stop once the end of the text is covered so an overlapping run does
        # not emit a trailing chunk fully contained in the previous one.
        if start + chunk_size >= len(text):
            break
    return chunks

### Get embeddings for a list of chunks (Function) ###

In [28]:
# Function to get embeddings for a list of text chunks
def get_embeddings_for_chunks(chunks):
    """Return one embedding per chunk, in input order.

    Each chunk is passed individually to ``get_embedding``; an empty input
    produces an empty list without any call being made.
    """
    return [get_embedding(piece) for piece in chunks]

### Upsert lists of documents to Qdrant (Function) ###
#### Mind the parameters ####

In [None]:
# Function to upsert embeddings into Qdrant
def upsert_embeddings(collection_name, embeddings, chunks, files, start_id=0):
    """Upsert embeddings with their text and source into a Qdrant collection.

    The three input sequences are parallel: item ``i`` of each describes the
    same point.

    Args:
        collection_name: Target collection.
        embeddings: Sequence of vectors, one per point.
        chunks: Sequence of chunk texts, stored under the ``"text"`` payload key.
        files: Sequence of source identifiers, stored under the
            ``"source_url"`` payload key.
        start_id: ID given to the first point; later points count up from it.
            Defaults to 0. Pass a non-zero offset when adding to a collection
            that already holds points, otherwise points with the same IDs are
            upserted over.

    Raises:
        ValueError: If the three sequences differ in length.
        Exception: Whatever ``Qclient.upsert`` raises; the error is printed
            and then re-raised unchanged.
    """
    # Fail early instead of hitting an IndexError half-way through or
    # silently ignoring surplus chunks/files.
    if not (len(embeddings) == len(chunks) == len(files)):
        raise ValueError(
            "embeddings, chunks and files must have the same length "
            f"(got {len(embeddings)}, {len(chunks)}, {len(files)})"
        )

    points = [
        {
            "id": point_id,
            "vector": embedding,
            "payload": {
                "text": chunk,          # Attach the chunk as payload
                "source_url": source,   # Add the source URL
            },
        }
        for point_id, (embedding, chunk, source) in enumerate(
            zip(embeddings, chunks, files), start=start_id
        )
    ]

    try:
        Qclient.upsert(
            collection_name=collection_name,
            points=points
        )
        print("Embeddings upserted successfully")
    except Exception as e:
        print(f"Failed to upsert embeddings: {e}")
        raise

### Retrieve similar chunks from query (Function) ###

In [75]:
def retrieve_relevant_chunks(collection_name, query, top_k=5):
    """Find the stored chunks closest to ``query`` in a Qdrant collection.

    The query is embedded with ``get_embedding`` and used as the query vector
    for ``Qclient.search``, limited to ``top_k`` results.

    Returns:
        A ``(contexts, urls)`` tuple of parallel lists: the ``"text"`` payload
        of every hit, and its ``"source_url"`` payload (``None`` when a hit
        has no such key).
    """
    query_vector = get_embedding(query)

    hits = Qclient.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
    )

    contexts = []
    urls = []
    for hit in hits:
        contexts.append(hit.payload["text"])
        urls.append(hit.payload.get("source_url"))

    return contexts, urls

### Generate Response from Query (Function) ###

In [79]:
import numpy as np

def generate_response(collection_name, query):
    """Answer ``query`` with a chat completion grounded in retrieved chunks.

    Relevant chunks are fetched with ``retrieve_relevant_chunks``, joined with
    newlines and sent as a context message ahead of the user's query.

    Args:
        collection_name: Qdrant collection to search for context.
        query: The user's question.

    Returns:
        A ``(message, unique_urls)`` tuple: the message of the first
        completion choice, and a NumPy array of the distinct source URLs of
        the retrieved chunks (sorted by ``np.unique``).
    """
    context, urls = retrieve_relevant_chunks(collection_name, query)
    # Chunks without a source URL come back as None. np.unique sorts its
    # input and cannot order None against str, so drop them first.
    unique_urls = np.unique([url for url in urls if url is not None])

    system_role = "You are a specialized assistant that only provides advice on dog-related veterinary care. If a user asks about any other animal or topic outside of dog health, politely decline to answer and remind them that you only provide information about dogs. If you don't know the answer to a question, let the user know that you're not sure and suggest that they consult a veterinarian for more information."

    # Combine retrieved chunks into a single string
    context_text = "\n".join(context)

    # Generate a response with the chat completions API
    client = OpenAI()
    completion = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {"role": "system", "content": system_role},
            # Separate the label from the retrieved text; previously the word
            # "context" was glued directly onto the first chunk.
            {"role": "user", "content": "Context:\n" + context_text},
            {"role": "user", "content": query}
        ]
    )
    return completion.choices[0].message, unique_urls

### Delete all entries in a collection (Function) ###
*BE CAREFUL WITH THIS*

In [51]:
def delete_collection(collection_name):
    """Delete a Qdrant collection after an explicit typed confirmation.

    The user must type exactly ``DELETE`` at the prompt; any other answer
    aborts. On confirmation the collection is deleted and then created again
    through ``create_qdrant_collection``. Errors are printed and re-raised.
    """
    prompt = (
        "Type 'DELETE' to confirm deletion of entries in "
        + collection_name
        + ". This cannot be undone. Type anything else to abort"
    )
    answer = input(prompt)

    # Anything but the exact keyword cancels the operation
    if answer != "DELETE":
        print("Deletion aborted")
        return

    try:
        Qclient.delete_collection(collection_name=collection_name)
        print(f"Collection '{collection_name}' deleted successfully")
        create_qdrant_collection(collection_name)
    except Exception as exc:
        print(f"Failed to delete collection '{collection_name}': {exc}")
        raise

### Markdown Print Function ###

In [12]:
def print_markdown(md_text):
    """Display ``md_text`` as a rendered Markdown object in the notebook output."""
    rendered = Markdown(md_text)
    display(rendered)

## Playground ##
Use this section to test functions and play around with the model

In [None]:
# Define the folder paths
scraping_demo_folder = 'scrapingDemo'
scraped_files_folder = os.path.join(scraping_demo_folder, 'ScrapedFiles_petMD_allergies')

# List the regular files once, in sorted order, so that the progress counter
# and the loop agree on what is processed. Sub-directories are skipped: the
# previous loop iterated over every directory entry and would have tried to
# open them as text files.
scraped_filenames = sorted(
    name for name in os.listdir(scraped_files_folder)
    if os.path.isfile(os.path.join(scraped_files_folder, name))
)
num_files = len(scraped_filenames)

# Parallel lists: entry i of each describes the same chunk
all_files = []
all_embeddings = []
all_chunks = []

# enumerate keeps the counter correct even when a file is skipped below
for current_file, filename in enumerate(scraped_filenames, start=1):
    file_path = os.path.join(scraped_files_folder, filename)
    print(f"Processing file {current_file} of {num_files}: {filename}")

    # Chunk the file
    chunks = chunk_text_from_file(file_path)
    if not chunks:  # Skip if no chunks are generated
        print(f"No chunks generated for file: {filename}")
        continue

    # Get embeddings for the chunks
    embeddings = get_embeddings_for_chunks(chunks)

    # Append the embeddings, the chunks and one file name per chunk
    all_embeddings.extend(embeddings)
    all_chunks.extend(chunks)
    all_files.extend([filename] * len(chunks))


Processing file 1 of 22: 30_'Hypoallergenic'_Dogs_That_Don’t_Shed_a_Lot.txt
Processing file 2 of 22: 5_Signs_Your_Pet_Is_Having_an_Allergic_Reaction.txt
Processing file 3 of 22: 7_Dog_Allergy_Symptoms_to_Look_For.txt
Processing file 4 of 22: 8_Tips_for_Helping_House_Guests_with_Pet_Allergies.txt
Processing file 5 of 22: Allergies_in_Dogs_and_Puppies_Signs_Causes_and_Treatment.txt
Processing file 6 of 22: Can_a_Dog_Be_Allergic_to_Cats.txt
Processing file 7 of 22: Can_Dogs_Be_Allergic_to_Cats.txt
Processing file 8 of 22: Can_Dogs_Be_Allergic_to_Their_Beds.txt
Processing file 9 of 22: Can_Dogs_Have_Asthma.txt
Processing file 10 of 22: Can_I_Give_My_Dog_Benadryl®_And_if_So_How_Much.txt
Processing file 11 of 22: Chlorpheniramine_for_Cats_and_Dogs.txt
Processing file 12 of 22: Cytopoint®_for_Dogs_.txt
Processing file 13 of 22: Does_Your_Dog_Have_a_Flea_Allergy.txt
Processing file 14 of 22: Dog_Eye_Allergies_Symptoms_and_Treatment.txt
Processing file 15 of 22: Food_Allergies_and_Intolerances_

In [65]:

# all_files = []
# all_embeddings = []
# all_chunks = []
# current_file = 1
# for filename in os.listdir(scraped_files_folder):
#     file_path = os.path.join(scraped_files_folder, filename)
#     print(f"Processing file {current_file} of {num_files}: {filename}")
    
    
#     # # Chunk the file
#     chunks = chunk_text_from_file(file_path)
#     if not chunks:  # Skip if no chunks are generated
#         print(f"No chunks generated for file: {filename}")
#         continue
    
#     # Get embeddings for the chunks
#     embeddings = get_embeddings_for_chunks(chunks)
    
#     # Append the embeddings and chunks to the lists
#     all_embeddings.extend(embeddings)
#     all_chunks.extend(chunks)

#     # filename_repeated = [filename] * len(chunks)
    

#     filename_repeated = [filename] * len(chunks)
#     # print(len(embeddings), len(chunks), len(filename_repeated))

#     all_files.extend(filename_repeated)

#     current_file += 1


# print("Upserting...")
# collection_name = 'LLM_V2'  # Define your collection name

# # # TODO 

# # # upsert_embeddings(collection_name, all_embeddings, all_chunks)
# upsert_embeddings(collection_name, all_embeddings, all_chunks, all_files)
# # print(len(all_embeddings), len(all_chunks), len(all_files))

Processing file 1 of 22: 30_'Hypoallergenic'_Dogs_That_Don’t_Shed_a_Lot.txt
Processing file 2 of 22: 5_Signs_Your_Pet_Is_Having_an_Allergic_Reaction.txt
Processing file 3 of 22: 7_Dog_Allergy_Symptoms_to_Look_For.txt
Processing file 4 of 22: 8_Tips_for_Helping_House_Guests_with_Pet_Allergies.txt
Processing file 5 of 22: Allergies_in_Dogs_and_Puppies_Signs_Causes_and_Treatment.txt
Processing file 6 of 22: Can_a_Dog_Be_Allergic_to_Cats.txt
Processing file 7 of 22: Can_Dogs_Be_Allergic_to_Cats.txt
Processing file 8 of 22: Can_Dogs_Be_Allergic_to_Their_Beds.txt
Processing file 9 of 22: Can_Dogs_Have_Asthma.txt
Processing file 10 of 22: Can_I_Give_My_Dog_Benadryl®_And_if_So_How_Much.txt
Processing file 11 of 22: Chlorpheniramine_for_Cats_and_Dogs.txt
Processing file 12 of 22: Cytopoint®_for_Dogs_.txt
Processing file 13 of 22: Does_Your_Dog_Have_a_Flea_Allergy.txt
Processing file 14 of 22: Dog_Eye_Allergies_Symptoms_and_Treatment.txt
Processing file 15 of 22: Food_Allergies_and_Intolerances_

In [70]:

# Name of the Qdrant collection that receives the vectors
collection_name = 'LLM_V2'

print("Upserting...")

# Send every embedding together with its chunk text and source file name
upsert_embeddings(collection_name, all_embeddings, all_chunks, all_files)

Upserting...
not actually upserting lol
Embeddings upserted successfully


In [33]:
# Print whatever Qclient.get_collections() returns, as a quick check of the collections visible to this client
print(Qclient.get_collections())

collections=[CollectionDescription(name='collectionOne')]


In [None]:
query = "Tell me abt cytopoint for dogs?"

# generate_response retrieves the relevant chunks itself, so no separate
# retrieve_relevant_chunks call is made here (it would repeat the embedding
# request and the search without its result being used).
response, urls = generate_response("LLM_V2", query)
print_markdown(response.content)

print("Here are the urls I used: \n", list(urls))

Cytopoint® is an injectable medication specifically designed for dogs to alleviate itching associated with atopic dermatitis, which is commonly caused by environmental allergies. The active ingredient in Cytopoint® is lokivetmab, a dog-specific monoclonal antibody that targets and neutralizes interleukin-31 (IL-31), a protein that plays a significant role in the itch response.

Here are some key points about Cytopoint®:

- **Administration**: It is administered as an injection under your dog’s skin by a veterinarian.
- **Duration of Effect**: After the injection, you can expect a gradual decrease in your dog's itchiness over the course of 4 to 8 weeks.
- **Follow-Up**: It is essential to have follow-up consultations with your veterinarian to assess the effectiveness of the treatment and determine the appropriate frequency for future injections.
- **Combination with Other Treatments**: Cytopoint® is often used alongside other allergy management treatments to help improve the overall health of your dog’s skin.
- **Not a Cure**: While it significantly reduces itching, Cytopoint® does not cure the underlying allergic condition; it is a symptomatic treatment aimed at providing relief.

If you have more specific questions about its use or effects on your dog, it's best to consult with your veterinarian.

here are the urls I used: 
 ['Cytopoint®_for_Dogs_.txt']


In [52]:
# Asks for a typed 'DELETE' confirmation; on confirmation the "LLM_V2" collection is deleted and then created again
delete_collection("LLM_V2")

Collection 'LLM_V2' deleted successfully
Collection 'LLM_V2' created successfully
