# Pronova LLM V1 #
## Use this notebook to do the following ##
- Create Qdrant collections
- Chunk text files into smaller chunks
- Create embeddings for data chunks and queries
- Delete Qdrant vectors
- Run the current model on a query

In [None]:
# Load required libraries
import os
from qdrant_client import QdrantClient
from qdrant_client.http import models
from openai import OpenAI
from dotenv import load_dotenv
from IPython.display import Markdown, display

# Load environment variables from .env file
load_dotenv()

### Setup Qdrant connection ###

In [None]:
# Read the Qdrant credentials from the environment; both values are mandatory.
Qdrant_api_key = os.getenv('Qdrant_API_KEY')
if not Qdrant_api_key:
    raise ValueError("No Qdrant API key found in environment variables")

Qdrant_url = os.getenv('Qdrant_URL')
if not Qdrant_url:
    raise ValueError("No Qdrant URL found in environment variables")

# Build the module-level Qdrant client that the functions in this notebook use.
try:
    Qclient = QdrantClient(url=Qdrant_url, api_key=Qdrant_api_key)
except Exception as e:
    # Report the failure in the notebook output, then let it propagate.
    print(f"Failed to connect to Qdrant: {e}")
    raise
else:
    print("Successfully connected to Qdrant")

### Setup OpenAI connection ###

In [None]:
# Read the OpenAI API key from the environment and stop early when it is absent or empty.
OpenAI_api_key = os.getenv('OPENAI_API_KEY')
if OpenAI_api_key is None or OpenAI_api_key == "":
    raise ValueError("No OpenAI API key found in environment variables")

# Store the key as an attribute on the OpenAI class.
# NOTE(review): the OpenAI() clients created elsewhere in this notebook are
# built without an explicit api_key - confirm whether they rely on this
# attribute or read OPENAI_API_KEY from the environment themselves.
OpenAI.api_key = OpenAI_api_key

### Creating a Qdrant Collection (function) ###

In [None]:
#Create collection
def create_qdrant_collection(collection_name, vector_size=1536):
    """Create a Qdrant collection configured for cosine-distance vectors.

    Args:
        collection_name: Name of the collection to create.
        vector_size: Dimensionality of the vectors the collection stores.
            Defaults to 1536, the value this notebook previously hard-coded.
            NOTE(review): presumably this must equal the length of the
            embeddings returned by ``get_embedding`` - confirm before
            switching embedding models.

    Raises:
        Exception: Any error raised while creating the collection is printed
            and then re-raised unchanged.
    """
    try:
        Qclient.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE,
            ),
        )
        print(f"Collection '{collection_name}' created successfully")
    except Exception as e:
        print(f"Failed to create collection '{collection_name}': {e}")
        raise

### Get an OpenAI embedding from a text segment (Function) ###

In [None]:
# Function to get the embedding of a text
def get_embedding(text):
    """Return the OpenAI embedding for ``text``.

    A new ``OpenAI`` client is created on every call and the
    ``text-embedding-ada-002`` model is requested.

    Args:
        text: Input handed to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    embeddings_api = OpenAI().embeddings
    result = embeddings_api.create(input=text, model="text-embedding-ada-002")
    first_entry = result.data[0]
    return first_entry.embedding

### Turn a file into chunks (Function) ###

In [None]:
# Function to read a text file and chunk its content
def chunk_text_from_file(file_path, chunk_size=400):
    """Read a UTF-8 text file and split its content into fixed-size chunks.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of consecutive, non-overlapping substrings of the file content.
        Every chunk holds ``chunk_size`` characters except possibly the last
        one; an empty file yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop the whole
    # file; a zero step fails with an unrelated-looking range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

### Get embedding from a list of chunks (Function) ###

In [None]:
# Function to get embeddings for a list of text chunks
def get_embeddings_for_chunks(chunks):
    """Embed every chunk, preserving input order.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        A list with one ``get_embedding`` result per chunk, in the same order.
    """
    # One embedding request is issued per chunk.
    return [get_embedding(piece) for piece in chunks]

### Get next Qdrant ID (function) ###

In [None]:
def get_qdrant_next_id(collection_name):
    """Return the number of points currently stored in a collection.

    ``upsert_embeddings`` uses this count as the offset for the IDs it assigns
    to newly inserted points.

    Args:
        collection_name: Name of the collection to inspect.

    Returns:
        The collection's ``points_count``, or 0 when that value is ``None``.

    Raises:
        Exception: Any error raised while fetching the collection info is
            printed and then re-raised unchanged.
    """
    # NOTE(review): the count only matches the largest existing ID while
    # points are numbered sequentially and never deleted individually -
    # confirm that holds for every collection this is used with.
    try:
        collection = Qclient.get_collection(collection_name=collection_name)
        points_count = collection.points_count
        return 0 if points_count is None else points_count
    except Exception as e:
        print(f"Failed to get next ID from collection '{collection_name}': {e}")
        raise

### Upsert lists of documents to Qdrant (Function) ###
#### Mind the parameters ####

In [None]:
# function to upsert embeddings into Qdrant
# def upsert_embeddings(collection_name, embeddings, chunks):
def upsert_embeddings(collection_name, embeddings, chunks, filename):
    """Upsert one point per embedding into a Qdrant collection.

    Each point carries its text chunk and the originating file name as
    payload. IDs continue after the value reported by
    ``get_qdrant_next_id``: the first new point gets that value plus one.

    Args:
        collection_name: Target collection.
        embeddings: Sequence of embedding vectors.
        chunks: Sequence of text chunks; ``chunks[i]`` is stored with
            ``embeddings[i]``.
        filename: Stored in every point's payload under ``source_file``.

    Raises:
        IndexError: If ``chunks`` is shorter than ``embeddings``.
        Exception: Any error raised by the upsert call is printed and then
            re-raised unchanged.
    """
    # Nothing to store (for example an empty source file produced no chunks):
    # skip both Qdrant calls instead of sending an empty point list.
    if len(embeddings) == 0:
        print("No embeddings to upsert")
        return

    first_id = get_qdrant_next_id(collection_name) + 1
    points = [
        {
            "id": first_id + offset,
            "vector": embedding,
            "payload": {
                "text": chunks[offset],   # Attach the chunk as payload
                "source_file": filename,  # Add source file
            },
        }
        for offset, embedding in enumerate(embeddings)
    ]

    try:
        Qclient.upsert(
            collection_name=collection_name,
            points=points
        )
        print("Embeddings upserted successfully")
    except Exception as e:
        print(f"Failed to upsert embeddings: {e}")
        raise

### Retrieve similar chunks from query (Function) ###

In [None]:
def retrieve_relevant_chunks(collection_name, query, top_k=10):
    """Search a collection for the chunks closest to ``query``.

    The query is embedded with ``get_embedding`` and passed to the Qdrant
    client's ``search`` method.

    Args:
        collection_name: Collection to search.
        query: Query text.
        top_k: Maximum number of hits requested from Qdrant.

    Returns:
        A ``(contexts, files)`` tuple of two parallel lists: the ``text``
        payload of every hit, and its ``source_file`` payload (``None`` when
        a hit has no such key).
    """
    hits = Qclient.search(
        collection_name=collection_name,
        query_vector=get_embedding(query),
        limit=top_k,
    )

    contexts = []
    files = []
    for hit in hits:
        payload = hit.payload
        contexts.append(payload["text"])  # required key
        files.append(payload.get("source_file"))  # optional key

    return contexts, files


### Generate Response from Query (Function) ###

In [None]:
from collections import Counter

def file_ratios(files):
    """Return the percentage share of each distinct entry in ``files``.

    Args:
        files: Sequence of hashable entries (here: source file names).

    Returns:
        A dict mapping every distinct entry to ``count * 100 / len(files)``,
        in order of first occurrence. An empty input gives an empty dict.
    """
    total = len(files)
    shares = {}
    for name, occurrences in Counter(files).items():
        shares[name] = occurrences * 100 / total
    return shares


# file_ratios(["a", "a", "b", "c"])
# {'a': 50.0, 'b': 25.0, 'c': 25.0}

In [None]:
import numpy as np

def generate_response(collection_name, query):
    """Answer ``query`` with the chat model, using retrieved chunks as context.

    Args:
        collection_name: Qdrant collection to retrieve context from.
        query: The user's question.

    Returns:
        A ``(message, file_rank)`` tuple: the message of the first completion
        choice, and the ``file_ratios`` result for the source files of the
        retrieved chunks.
    """
    retrieved_chunks, source_files = retrieve_relevant_chunks(collection_name, query)
    file_rank = file_ratios(source_files)

    system_role = "You are a specialized assistant that only provides advice on dog-related veterinary care. If a user asks about any other animal or topic outside of dog health, politely decline to answer and remind them that you only provide information about dogs."

    # If you don't know the answer to a question, let the user know that you're not sure and suggest that they consult a veterinarian for more information.

    # The retrieved chunks are newline-joined and sent as their own user
    # message, ahead of the actual question.
    conversation = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": "context" + "\n".join(retrieved_chunks)},
        {"role": "user", "content": query},
    ]

    # Ask the chat model for a completion.
    chat_api = OpenAI().chat.completions
    completion = chat_api.create(
        model="gpt-4o-mini-2024-07-18",
        messages=conversation,
    )
    answer = completion.choices[0].message
    return answer, file_rank

### Delete all entries in a collection (Function) ###
*BE CAREFUL WITH THIS*

In [None]:
def delete_collection(collection_name):
    """Delete a collection after interactive confirmation, then recreate it.

    The user must type ``DELETE`` at the prompt; any other input aborts
    without touching the collection. After a successful deletion the
    collection is created again via ``create_qdrant_collection``.

    Args:
        collection_name: Name of the collection to wipe.

    Raises:
        Exception: Errors from the deletion are printed and re-raised. Errors
            from the recreation are reported and raised by
            ``create_qdrant_collection`` itself.
    """
    confirmation = input("Type 'DELETE' to confirm deletion of entries in " + collection_name + ". This cannot be undone. Type anything else to abort")
    if confirmation != "DELETE":
        print("Deletion aborted")
        return

    try:
        Qclient.delete_collection(collection_name=collection_name)
    except Exception as e:
        print(f"Failed to delete collection '{collection_name}': {e}")
        raise

    print(f"Collection '{collection_name}' deleted successfully")
    # Recreate outside the try block so that a creation failure is not also
    # reported as a failed deletion.
    create_qdrant_collection(collection_name)

### Markdown Print Function ###

In [None]:
def print_markdown(md_text):
    """Show ``md_text`` as rendered Markdown in the notebook output."""
    rendered = Markdown(md_text)
    display(rendered)

### Process Files From Folder (function) ###

In [None]:
def process_files_from_folder(collection_name, file_path, limit=None):
    """Chunk, embed and upsert the files located directly inside a folder.

    Args:
        collection_name: Qdrant collection that receives the points.
        file_path: Folder to scan. Only regular files are processed;
            sub-directories are skipped.
        limit: Maximum number of files to process. ``None`` (the default)
            processes every file in the folder.
    """
    # Keep regular files only: a sub-directory cannot be opened as text and
    # must not count towards the limit either.
    filenames = [
        name
        for name in os.listdir(file_path)
        if os.path.isfile(os.path.join(file_path, name))
    ]
    if limit is None:
        limit = len(filenames)

    for current_file, filename in enumerate(filenames, start=1):
        if current_file > limit:
            break
        curr_file_path = os.path.join(file_path, filename)
        print(f"Processing file {current_file} of max {limit}: {filename}")

        # Chunk the file
        chunks = chunk_text_from_file(curr_file_path)
        # Get embeddings for the chunks
        embeddings = get_embeddings_for_chunks(chunks)
        # Upsert the embeddings into Qdrant
        upsert_embeddings(collection_name, embeddings, chunks, filename)


## Playground ##
Use this section to test functions and play around with the model

In [None]:
# Ingestion settings for this run.
collection_name = "LLM"
filepath = "scrapingDemo/ScrapedFiles_petMD_nutrition"
limit = 20

# Wipe and recreate the collection (asks for confirmation first), then ingest
# up to `limit` files from the folder. Ingestion runs even when the deletion
# prompt is declined.
delete_collection(collection_name)
process_files_from_folder(collection_name, filepath, limit)

In [None]:
# Ask the model a question against the "LLM" collection.
query = "Can I give my dog some raisins for dessert?"
response, file_rank = generate_response("LLM", query)

# Render the answer as Markdown.
print_markdown(response.content)

# List each source file with its percentage share of the retrieved chunks.
print("files used: \n")
for file, share in file_rank.items():
    print(f"{file}, {share} %")