<a href="https://colab.research.google.com/github/jainakshay91/LLM_Docker/blob/ftr%2Fazure_openai/Azure_OpenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import pandas as pd
import numpy as np

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

from openai import AzureOpenAI

# Set up the Azure OpenAI client from the settings stored in key.txt


# Read the endpoint, API key and deployment names from ./key.txt.
# Relevant lines have the form "<name> = <value>"; the recognised names are
# "openai.api_base", "API key", "deployment_name_text" and
# "deployment_name_embed". Any other line is ignored.
with open("./key.txt", 'r') as file:
    for line in file:
        line = line.strip()

        # Split on the FIRST '=' only, so a value that itself contains '='
        # (for example a URL query string) is kept whole.
        _name, separator, value = line.partition('=')
        if not separator:
            # Blank line or a line without '=': there is no value to read.
            continue
        value = value.strip()

        if line.startswith("openai.api_base"):
            os.environ["OPENAI_API_BASE"] = value

        elif line.startswith("API key"):
            os.environ["OPENAI_API_KEY"] = value

        elif line.startswith("deployment_name_text"):
            deployment_text = value

        elif line.startswith("deployment_name_embed"):
            deployment_embed = value

# Shared Azure OpenAI client; the endpoint and key come from the
# OPENAI_API_BASE / OPENAI_API_KEY environment variables.
client = AzureOpenAI(
    api_version="2024-02-15-preview",
    azure_endpoint=os.environ.get("OPENAI_API_BASE"),
    api_key=os.environ.get("OPENAI_API_KEY"),
)

#################################################################
# Data Path: Dataset and Embeddings Database
#################################################################

# Folder whose entries are loaded as the source documents.
database_path = "./Dataset/"
# Folder holding the pickled text/embedding table ("db.pkl").
embed_path = "./Embed/"
# Deployment names read from key.txt; passed as the `model` argument of the
# chat-completion and embedding requests respectively.
completion_model_name = deployment_text
embedding_model_name = deployment_embed


In [None]:
def LLMsetter(message_text):
  """Send a chat-completion request and return the completion object.

  `message_text` is forwarded unchanged as the `messages` argument. The
  request targets the deployment named by `completion_model_name` and always
  uses the fixed sampling options listed below.
  """
  # Sampling options applied to every request.
  request_options = {
    "temperature": 0.2,
    "max_tokens": 800,
    "top_p": 0.95,
    "frequency_penalty": 0,
    "presence_penalty": 0,
    "stop": None,
  }

  return client.chat.completions.create(
    model=completion_model_name,  # model = "deployment_name"
    messages=message_text,
    **request_options,
  )

def EmbeddingGenerator(text):
  """Request an embedding for `text` and return the first result's vector.

  The request goes to the deployment named by `embedding_model_name` and asks
  for the 'float' encoding format.
  """
  response = client.embeddings.create(
    model=embedding_model_name,
    input=text,
    encoding_format='float',
  )
  first_result = response.data[0]
  return first_result.embedding

def TextSplittersetter():
  """Create the RecursiveCharacterTextSplitter used to chunk documents.

  Chunks are measured with `len`, capped at 5000 with an overlap of 200, and
  the separators below are treated as literal strings rather than regexes.
  """
  # Candidate split points, in the order they are handed to the splitter.
  split_points = [
    "\n\n",
    "\n",
    " ",
    ".",
    ",",
    "\u200B",  # Zero-width space
    "\uff0c",  # Fullwidth comma
    "\u3001",  # Ideographic comma
    "\uff0e",  # Fullwidth full stop
    "\u3002",  # Ideographic full stop
    "",
  ]

  return RecursiveCharacterTextSplitter(
    separators=split_points,
    chunk_size=5000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
  )

def data_vectorstore_generator(text_splitter):
  """Chunk every PDF under `database_path` and embed each chunk.

  Args:
    text_splitter: object exposing `split_documents(documents=...)`, such as
      the splitter returned by TextSplittersetter().

  Returns:
    A pandas DataFrame with one row per chunk. Column "Text" holds the chunk's
    `page_content`; column "Embeddings" holds what EmbeddingGenerator returns
    for that text.
  """
  chunk_texts = []

  # Sorted so the row order is reproducible; os.listdir gives no ordering
  # guarantee.
  for file_name in sorted(os.listdir(database_path)):
    # Skip notebook artefacts (any name containing "ipynb") and every entry
    # that is not a .pdf file, so stray files never reach the PDF loader.
    if "ipynb" in file_name or not file_name.lower().endswith(".pdf"):
      continue

    # Load the file and split it into chunks
    docs = PyPDFLoader(os.path.join(database_path, file_name)).load()
    chunks = text_splitter.split_documents(documents=docs)
    chunk_texts.extend(chunk.page_content for chunk in chunks)

  # Build the table once from the collected texts instead of growing it one
  # row at a time. dtype=object keeps the column textual even when empty.
  db = pd.DataFrame({"Text": pd.Series(chunk_texts, dtype=object)})
  db["Embeddings"] = db["Text"].apply(EmbeddingGenerator)

  return db

def cosine_similarity_compute(query_embed,data_embed):
  """Return the cosine similarity of two equal-length vectors.

  Args:
    query_embed: sequence of numbers (list or array).
    data_embed: sequence of numbers of the same length.

  Returns:
    dot(query_embed, data_embed) divided by the product of the two Euclidean
    norms, or 0.0 when either vector has zero norm. Without that guard the
    division is 0/0, which yields NaN and a RuntimeWarning.
  """
  query_vec = np.asarray(query_embed, dtype=float)
  data_vec = np.asarray(data_embed, dtype=float)

  denominator = np.linalg.norm(query_vec) * np.linalg.norm(data_vec)
  if denominator == 0:
    # A zero vector has no direction; treat it as "not similar".
    return 0.0

  return np.dot(query_vec, data_vec) / denominator

def context_generator(query, db, top_n):
  """Build the context string for `query` from the most similar chunks.

  Args:
    query: text to embed and compare against the stored chunks.
    db: DataFrame with "Text" and "Embeddings" columns. A "Similarities"
      column is written to it in place.
    top_n: number of highest-scoring rows to keep.

  Returns:
    The "Text" values of the selected rows, highest similarity first, joined
    with single spaces.
  """
  # Embed the query through the same helper that embeds the stored chunks,
  # so both sides are requested with identical options.
  query_embedding = EmbeddingGenerator(query)

  # Compute the cosine similarity of the query with every stored embedding
  db["Similarities"] = db.Embeddings.apply(
    lambda chunk_embedding: cosine_similarity_compute(chunk_embedding, query_embedding)
  )

  # Keep the top_n rows with the highest similarity
  search_result = db.sort_values("Similarities", ascending=False).head(top_n)

  # Join the selected texts to create the context
  return ' '.join(search_result.Text)

def main():
  """Answer one query typed by the user, using retrieved document context.

  Loads the cached chunk/embedding table from `embed_path` (building and
  caching it first when the cache file is absent), reads a query from standard
  input, selects the most similar chunks as context, sends the conversation
  to the chat model and prints the reply.
  """
  # Number of chunks included in the context
  top_n = 4

  # Reuse the pickled table when present to avoid recomputing embeddings.
  cache_file = embed_path + "db.pkl"
  if os.path.isfile(cache_file):
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that this program wrote itself.
    db = pd.read_pickle(cache_file)
  else:
    # Create the cache folder BEFORE computing embeddings, so the save below
    # cannot fail on a missing directory and discard the computed table.
    os.makedirs(os.path.dirname(cache_file) or ".", exist_ok=True)
    db = data_vectorstore_generator(TextSplittersetter())
    db.to_pickle(cache_file)
  print(db.shape)

  query = input("Insert your Query \n")

  context = context_generator(query, db, top_n)

  # System role, user query and retrieved context, in the order sent
  messages = [
    {"role":"system","content":"You are an AI assistant that helps people find information. Your responses are human like and yet technically correct as much as possible"},
    {"role":"user","content":query},
    {"role":"assistant", "content":f"Relevant Information for generating output: \n {context}"},
  ]

  response = LLMsetter(messages)

  # NOTE(review): the SDK appears to type message content as optional; fall
  # back to an empty string so printing cannot fail on None - confirm.
  answer = response.choices[0].message.content
  if answer is None:
    answer = ""

  # Print response
  print("Response: " + answer + "\n")
    

# Calling the main function


# Run the interactive pipeline only when executed as a script or notebook
# cell, not when this module is imported.
if __name__ == "__main__":
  main()



(757, 2)
