In [None]:
import sys
import os
sys.path.append('../')

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http import models
from qdrant_client.http.models import PointStruct
from langchain_community.vectorstores import Qdrant
import openai
from openai import OpenAI
import pdfplumber

In [None]:
# Path of the PDF opened by the "Read PDF file" cell.
# Empty placeholder: set this before running the notebook.
PDF_FILE = ""
# Name of the Qdrant collection that is inspected and searched below.
# Empty placeholder: set this before running the notebook.
QDRANT_COLLECTION = ""
# OpenAI model name passed to the embeddings endpoint
OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"
# OpenAI model name passed to the chat-completions endpoint
OPENAI_GPT_MODEL = "gpt-3.5-turbo"

# Maximum number of results requested from a Qdrant vector search
MAX_NO_SEARCH_RESULTS_QDRANT = 5
# Maximum chunk length, in characters, used to split the PDF text
CHUNK_SIZE = 500
# Directory containing the *.secret files read below (relative path)
SECRETS_DIRECTORY = "../secrets"

# Getting URLs and API keys

In [None]:
# Paths of the files holding the credentials; each one is read with read_key().

# OpenAI API key file
openai_api_key_file = os.path.join(SECRETS_DIRECTORY, 'openai_api_key.secret')

# Qdrant URL file
qdrant_url_file = os.path.join(SECRETS_DIRECTORY, 'qdrant_url.secret')

# Qdrant API key file
qdrant_api_key_file = os.path.join(SECRETS_DIRECTORY, 'qdrant_api_key.secret') 


In [None]:
def read_key(file_path):
    """Return the stripped contents of ``file_path``, or ``None`` on failure.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents with leading/trailing whitespace removed, or
        ``None`` when the file is missing or cannot be read. In both failure
        cases a message describing the problem is printed.
    """
    try:
        with open(file_path, 'r') as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return None
    except Exception as exc:
        print(f"An error occurred while reading the file '{file_path}': {exc}")
        return None

    return contents.strip()


# Load the credentials from the secret files. read_key() returns None (after
# printing a message) when a file is missing or unreadable.

# OpenAI API key
OPENAI_API_KEY = read_key(openai_api_key_file)

# Qdrant URL
QDRANT_URL = read_key(qdrant_url_file)

# Qdrant API key
QDRANT_API_KEY = read_key(qdrant_api_key_file)

# Report only whether each value was loaded. Printing the values themselves
# would store the API keys in the notebook's saved output.
for _label, _value in (("QDRANT_URL", QDRANT_URL),
                       ("QDRANT_API_KEY", QDRANT_API_KEY),
                       ("OPENAI_API_KEY", OPENAI_API_KEY)):
    print(f"{_label}: {'loaded' if _value else 'MISSING'}")

# Showing Qdrant collection information

In [None]:
# Notebook-level Qdrant client.
# QdrantClient takes a bare host name through `host` and a full URL (with
# scheme) through `url`, so pick the parameter from the form of the value.
# NOTE(review): confirm which form qdrant_url.secret actually contains.
if QDRANT_URL and QDRANT_URL.startswith(("http://", "https://")):
    qdrant_client = QdrantClient(
        url=QDRANT_URL,
        api_key=QDRANT_API_KEY
    )
else:
    qdrant_client = QdrantClient(
        host=QDRANT_URL,
        api_key=QDRANT_API_KEY
    )

In [None]:
# qdrant_client.recreate_collection(
#     collection_name=QDRANT_COLLECTION,
#     vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
# )
# print("Create collection reponse:", qdrant_client)

In [None]:
# Fetch and show the description of the configured collection; this also
# exercises the Qdrant connection before any further work is done.
collection_info = qdrant_client.get_collection(collection_name=QDRANT_COLLECTION)
print("Collection info:", collection_info)

## Read PDF file

In [None]:
# Extract the text of every page of the PDF.
page_texts = []
with pdfplumber.open(PDF_FILE) as pdf:
    # Going through all pages
    for page in pdf.pages:
        # Guard against a page yielding no text (e.g. an image-only page), so
        # that a None result cannot break the concatenation below.
        page_texts.append(page.extract_text() or "")

# Join pages with a newline so the last word of one page is not fused with
# the first word of the next.
full_text = "\n".join(page_texts)

# Show a summary and a short preview instead of dumping the whole document.
print(f"Extracted {len(full_text)} characters from {len(page_texts)} pages")
print(full_text[:1000])

## Chunk PDF text

In [None]:
def split_into_chunks(text, chunk_size):
    """Split ``text`` into pieces of at most ``chunk_size`` characters.

    Each piece ends at the last period found within the next ``chunk_size``
    characters (the period stays with its sentence). When that window holds
    no period, the piece is cut at exactly ``chunk_size`` characters. No
    character of the input is dropped, apart from surrounding whitespace that
    is stripped from each piece.

    Args:
        text: The text to split.
        chunk_size: Maximum length of a piece, in characters (must be >= 1).

    Returns:
        A list of non-empty, whitespace-stripped pieces, in document order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    pieces = []
    remaining = text
    # Keep cutting while more than one chunk's worth of text is left
    while len(remaining) > chunk_size:
        # Find the last period within the first chunk_size characters
        last_period_index = remaining[:chunk_size].rfind('.')
        if last_period_index == -1:
            # No period in the window: take the whole window
            cut = chunk_size
        else:
            # Cut just after the period so the sentence keeps its terminator
            cut = last_period_index + 1
        pieces.append(remaining[:cut])
        # Move to the next chunk, resuming exactly where this one ended
        remaining = remaining[cut:]
    pieces.append(remaining)

    # Drop pieces that are empty or whitespace-only; they carry no content to embed
    return [piece.strip() for piece in pieces if piece.strip()]


chunks = split_into_chunks(full_text, CHUNK_SIZE)

# for chunk in chunks:
#     print(chunk)
#     print("---")

In [None]:
# Number of chunks produced from the PDF text
len(chunks)

## Create embeddings and index with Qdrant

In [None]:
# Build one Qdrant point per chunk: the vector is the chunk's OpenAI embedding
# and the payload keeps the chunk text.
# NOTE(review): this cell only builds `points`; nothing visible in this
# notebook uploads them to the collection - confirm where the upsert happens.

# Create the OpenAI client once instead of once per chunk
openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)

points = []
# Point ids start at 2 (first chunk -> id 2). PromptServices.get_response_2
# uses 2 as the lowest id when it fetches neighbouring chunks, so the
# numbering is kept as it was.
for point_id, chunk in enumerate(chunks, start=2):
    if not chunk.strip():
        # Nothing to embed; the id is left unused so later ids keep their positions
        continue

    embedding_response = openai_client.embeddings.create(
        input=chunk,
        model=OPENAI_EMBEDDING_MODEL
    )

    embedding = embedding_response.data[0].embedding
    points.append(PointStruct(id=point_id, vector=embedding, payload={"text": chunk}))

    # Short progress line instead of dumping the chunk and the raw response
    print(f"Embedded chunk {point_id} ({len(chunk)} characters)")
    

In [None]:
# Number of points built by the embedding loop
len(points)

# Class PromptServices (adapted from the PromptServices class in the FastAPI code)

In [None]:
class PromptServices:
    """
    Class handling various prompt services related operations.

    It embeds a query with OpenAI, looks up the closest stored texts in the
    Qdrant collection named by the notebook-level QDRANT_COLLECTION, and asks
    an OpenAI chat model to answer the query using those texts as context.

    Failures of the OpenAI or Qdrant calls are printed and then re-raised, so
    callers see the original error.
    """

    # Constructor
    def __init__(self,
                 openai_api_key,
                 qdrant_url,
                 qdrant_api_key,
                 openai_embedding_model,
                 openai_gpt_model):
        """
        Store the settings and create the OpenAI and Qdrant clients.

        Args:
            openai_api_key: API key for the OpenAI client.
            qdrant_url: Address of the Qdrant service (host name or full URL).
            qdrant_api_key: API key for the Qdrant client.
            openai_embedding_model: Model name used for embeddings.
            openai_gpt_model: Model name used for chat completions.
        """
        self._open_api_key = openai_api_key
        self._qdrant_url = qdrant_url
        self._qdrant_api_key = qdrant_api_key
        self._openai_embedding_model = openai_embedding_model
        self._openai_gpt_model = openai_gpt_model

        self._openai_client = openai.OpenAI(api_key=self._open_api_key)
        self._qdrant_client = self._connect_qdrant(self._qdrant_url,
                                                   self._qdrant_api_key)
        self._system_prompt = ("You are a knowledgeable assistant. "
                               "Please use the provided context to answer the question. "
                               "Please be as helpful and relevant as possible. "
                               "If you do not have the information, "
                               "please do not make up the answer.")

    #############################################
    # Create the Qdrant client from either a host name or a full URL
    @staticmethod
    def _connect_qdrant(qdrant_url, qdrant_api_key):
        """
        Build a QdrantClient, choosing `url` or `host` from the value's form.

        A value starting with http:// or https:// is passed as `url`;
        anything else (including None) is passed as `host`, as before.
        NOTE(review): confirm which form the configured address really has.
        """
        if qdrant_url and qdrant_url.startswith(("http://", "https://")):
            return QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
        return QdrantClient(host=qdrant_url, api_key=qdrant_api_key)

    #############################################
    # Get the embedding of the query using the provided embedding model
    def get_embedding(self, query):
        """
        Return the embedding vector of ``query``.

        Raises:
            Exception: Whatever the OpenAI client raised; it is printed first.
        """
        try:
            embedding_response = self._openai_client.embeddings.create(
                input=query,
                model=self._openai_embedding_model
            )
        except Exception as e:
            print("Exception :", str(e))
            # Re-raise: falling through would hit an unbound local and hide the real error
            raise

        return embedding_response.data[0].embedding

    #############################################
    # Search for closest texts in Qdrant vector database
    def get_context(self, embedding, limit=None):
        """
        Return the Qdrant search results closest to ``embedding``.

        Args:
            embedding: Query vector.
            limit: Maximum number of results; defaults to the notebook-level
                MAX_NO_SEARCH_RESULTS_QDRANT when omitted.

        Raises:
            Exception: Whatever the Qdrant client raised; it is printed first.
        """
        if limit is None:
            limit = MAX_NO_SEARCH_RESULTS_QDRANT

        try:
            search_results = self._qdrant_client.search(
                collection_name=QDRANT_COLLECTION,
                query_vector=embedding,
                limit=limit
            )
        except Exception as e:
            print("Exception :", str(e))
            raise

        return search_results

    #############################################
    # Search for the single closest text in Qdrant vector database
    def get_context_2(self, embedding):
        """Return only the single closest Qdrant search result for ``embedding``."""
        return self.get_context(embedding, limit=1)

    #############################################
    # Concatenate the stored texts of the given points
    @staticmethod
    def _build_context(results):
        """Join the 'text' payload of every result, one per line."""
        return "".join(result.payload['text'] + "\n" for result in results)

    #############################################
    # Ask the chat model to answer the query with the given context
    def _chat(self, query, context):
        """
        Send the system prompt, the context and the query to the chat model.

        Returns:
            The stripped text of the first completion choice.

        Raises:
            Exception: Whatever the OpenAI client raised; it is printed first.
        """
        try:
            chat_response = self._openai_client.chat.completions.create(
                model=self._openai_gpt_model,
                messages=[
                    {"role": "system", "content": self._system_prompt},
                    {"role": "assistant", "content": context},
                    {"role": "user", "content": query}
                ]
            )
        except Exception as e:
            print("Exception :", str(e))
            raise

        return chat_response.choices[0].message.content.strip()

    #############################################
    def get_response(self, query, search_results):
        """
        Answer ``query`` using the texts of ``search_results`` as context.

        Args:
            query: The user's question.
            search_results: Points whose payload holds a 'text' entry.

        Returns:
            The chat model's answer as a string.
        """
        return self._chat(query, self._build_context(search_results))

    #############################################
    def get_response_2(self, query, search_results):
        """
        Answer ``query`` using the best hit plus its neighbouring chunks.

        The id of the first search result is taken, and the points with ids
        from two below to two above it are fetched and used as context.
        Ids below 2 are not requested, because the indexing cell in this
        notebook numbers chunks starting at 2. With no search results the
        context is empty.

        Args:
            query: The user's question.
            search_results: Search hits; only the first one is used. Its id
                must be an integer.

        Returns:
            The chat model's answer as a string.
        """
        neighbours = []
        if search_results:
            qdrant_id = search_results[0].id
            print(qdrant_id)

            neighbour_ids = list(range(max(2, qdrant_id - 2), qdrant_id + 3))

            # Use this object's own client rather than a notebook-level global
            neighbours = self._qdrant_client.retrieve(
                collection_name=QDRANT_COLLECTION,
                ids=neighbour_ids
            )
            # Keep the context in document order regardless of retrieval order
            neighbours = sorted(neighbours, key=lambda point: point.id)

            print(neighbour_ids)

        return self._chat(query, self._build_context(neighbours))


# Making prompt_services global so we don't need to construct this object for every query.
# It is created after the class definition so the cell runs on a fresh kernel.
prompt_services = PromptServices(
    OPENAI_API_KEY,
    QDRANT_URL,
    QDRANT_API_KEY,
    OPENAI_EMBEDDING_MODEL,
    OPENAI_GPT_MODEL
)


# get_answer_from_llm() - From FastAPI code

In [None]:
def get_answer_from_llm(query):
    """Answer ``query`` using the closest stored texts as context.

    The query is embedded, the nearest texts are looked up, and the chat
    model is asked to answer with them, all through the notebook-level
    ``prompt_services`` object.

    Args:
        query: The user's question.

    Returns:
        The model's answer, or ``None`` when any step failed (the error is
        printed).
    """
    try:
        embedding = prompt_services.get_embedding(query)
        search_results = prompt_services.get_context(embedding)
        return prompt_services.get_response(query, search_results)
    except Exception as e:
        print("Exception :", str(e))
        # Return explicitly: there is no response to hand back after a failure
        return None

In [None]:
def get_answer_from_llm_2(query):
    """Answer ``query`` using the best-matching text and its neighbours.

    Same flow as the first variant, but it goes through the ``*_2`` methods
    of the notebook-level ``prompt_services`` object (``get_context_2`` and
    ``get_response_2``).

    Args:
        query: The user's question.

    Returns:
        The model's answer, or ``None`` when any step failed (the error is
        printed).
    """
    try:
        embedding = prompt_services.get_embedding(query)
        search_results = prompt_services.get_context_2(embedding)
        return prompt_services.get_response_2(query, search_results)
    except Exception as e:
        print("Exception :", str(e))
        # Return explicitly: there is no response to hand back after a failure
        return None