In [None]:
import os
#from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
#from langchain.chat_models import AzureChatOpenAI
from openai import AzureOpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


In [None]:
def ask_question(question):
    """Ask the Azure OpenAI chat deployment a single question and return its reply.

    Connection settings come from the environment variables
    AZURE_OPENAI_DEPLOYMENT_NAME, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY
    and AZURE_OPENAI_API_VERSION. A new client is constructed on every call.

    Args:
        question: Text sent as the user message.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    # The same deployment name configures the client and selects the model.
    deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

    chat_client = AzureOpenAI(
        azure_deployment=deployment,
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    )

    # Fixed system prompt followed by the caller's question.
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question},
    ]

    completion = chat_client.chat.completions.create(
        model=deployment,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Manual smoke test for ask_question: sends one fixed question to the chat
# deployment and prints the reply. The guard keeps this from running if the
# code is ever imported as a module instead of executed directly.
if __name__ == "__main__":
    # Fixed sample question; any text would do for a connectivity check.
    question = "Who is the Prime Minister of Russia?"
    # Performs a live request through ask_question (needs valid Azure settings).
    answer = ask_question(question)
    print("Answer:", answer)


In [None]:
# Chat-model connection settings, read once from the environment and kept as
# notebook-level names that later cells reference.
EndPoint = os.getenv("AZURE_OPENAI_ENDPOINT")
Vers = os.getenv("AZURE_OPENAI_API_VERSION")
azure_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

# Show the endpoint and API version (one per line) to confirm the .env file loaded.
print(EndPoint, Vers, sep="\n")

In [None]:
# Embedding settings, read from their own environment variables and kept as
# notebook-level names that later cells reference.
embeddings_endpoint = os.getenv("AZURE_OPENAI_EMBEDDINGS_ENDPOINT")
embeddings_key = os.getenv("AZURE_OPENAI_EMBEDDINGS_KEY")
embeddings_model = os.getenv("AZURE_OPENAI_EMBEDDINGS_MODEL")
print(embeddings_endpoint)
# Never echo the API key itself: cell output is saved inside the notebook file
# and is easily committed or shared. Report only whether a key was found.
print("AZURE_OPENAI_EMBEDDINGS_KEY:", "<set>" if embeddings_key else "<missing>")
print(embeddings_model)

In [None]:
from langchain_openai import OpenAIEmbeddings

# Notebook-level chat client for the Azure OpenAI resource; later cells pass it
# to helpers that call chat.completions / embeddings.
client = AzureOpenAI(
    azure_endpoint=EndPoint,
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=Vers,
)

# The embedding model is addressed by an Azure deployment name, so use the
# Azure-specific wrapper and give it the Azure endpoint and key explicitly,
# matching how the rest of this notebook builds its embeddings object.
# Imported here because the notebook-level import only appears in a later cell.
from langchain_openai import AzureOpenAIEmbeddings

# NOTE(review): assumes the embeddings resource accepts the same API version
# as the chat resource (Vers) — confirm against the deployment.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=embeddings_model,
    openai_api_key=embeddings_key,
    azure_endpoint=embeddings_endpoint,
    api_version=Vers,
    chunk_size=1000,
)

In [None]:
# One-off sanity check: send a single user message through the notebook-level
# client and show the reply.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": "Who is Prime Minister of India?"}],
    model="gpt-35-turbo-16k",  # name of the deployed chat model
)

# Only the text of the first choice is displayed.
print(response.choices[0].message.content)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import Chroma

In [None]:
#import openai
import langchain_openai as openai
import chromadb


def create_vector_store(pdf_dir="./Temp_PDF", persist_directory="./chroma_db"):
    """Build a persisted Chroma vector store from the PDFs under ``pdf_dir``.

    The PDFs are loaded recursively, split into 250-character chunks with a
    50-character overlap, embedded with the Azure OpenAI deployment named by
    the notebook-level ``embeddings_model``, and stored in ``persist_directory``.
    The notebook-level ``EndPoint`` and ``Vers`` supply the endpoint and API
    version; the key is read from AZURE_OPENAI_API_KEY.

    Args:
        pdf_dir: Directory searched recursively for ``*.pdf`` files.
        persist_directory: Directory handed to Chroma for on-disk storage.

    Returns:
        The Chroma vector store, or ``None`` if anything failed or no text
        chunks could be produced (a message is printed in both cases).
    """
    try:
        # Load PDFs from directory
        loader = DirectoryLoader(
            pdf_dir,
            glob="**/*.pdf",
            loader_cls=PyPDFLoader
        )
        documents = loader.load()

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=250,
            chunk_overlap=50,
            length_function=len
        )
        chunks = text_splitter.split_documents(documents)

        # Fail with a clear message instead of letting the store creation
        # choke on an empty document list.
        if not chunks:
            print(f"Error creating vector store: no PDF text found under {pdf_dir}")
            return None

        # Initialize Azure OpenAI embeddings
        embeddings = AzureOpenAIEmbeddings(
            azure_deployment=embeddings_model,
            openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            azure_endpoint=EndPoint,
            chunk_size=1000,
            api_version=Vers
        )

        # No separate chromadb client is opened here: the store is given
        # persist_directory and manages its own connection to that path.
        # NOTE(review): every call embeds and adds all chunks again; re-running
        # against an existing persist_directory presumably accumulates
        # duplicates — confirm and clear the directory if that matters.
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=persist_directory
        )

        # Explicit persistence is only offered by some Chroma wrappers; call it
        # when available and otherwise rely on the store's own persistence.
        persist = getattr(vectordb, "persist", None)
        if callable(persist):
            persist()
        return vectordb

    except Exception as e:
        print(f"Error creating vector store: {e}")
        return None

In [None]:
def search_documents(query, vectordb, k=5):
    """Look up the passages in ``vectordb`` that best match ``query``.

    Args:
        query: Search text.
        vectordb: Object exposing ``similarity_search_with_relevance_scores``.
        k: Number of matches requested from the store.

    Returns:
        A list of ``(page_content, score)`` tuples in the order the store
        returned them, or ``None`` if the lookup raised (the error is printed).
    """
    try:
        scored_hits = vectordb.similarity_search_with_relevance_scores(query, k=k)

        # Keep only the text of each hit, paired with its relevance score.
        pairs = []
        for hit, relevance in scored_hits:
            pairs.append((hit.page_content, relevance))
        return pairs

    except Exception as err:
        print(f"Error searching documents: {err}")
        return None


In [None]:

def generate_response(client, query, context):
    """Answer ``query`` from ``context`` using the configured chat deployment.

    The deployment name is taken from the notebook-level ``azure_deployment``.
    The request uses temperature 0.9 and a 1000-token completion limit.

    Args:
        client: Object exposing ``chat.completions.create``.
        query: The user's question.
        context: Retrieved text the model is told to rely on.

    Returns:
        The text of the first completion choice, or ``None`` if the request
        failed (the error is printed).
    """
    try:
        # Second line keeps its leading spaces so the prompt text is unchanged.
        grounding_rules = (
            "You are a helpful assistant. Use the provided context to answer questions.\n"
            "        If you cannot find the answer in the context, say so clearly. "
            "Only use information from the provided context."
        )
        user_turn = f"Context: {context}\n\nQuestion: {query}"

        completion = client.chat.completions.create(
            model=azure_deployment,
            messages=[
                {"role": "system", "content": grounding_rules},
                {"role": "user", "content": user_turn},
            ],
            temperature=0.9,
            max_tokens=1000,
        )
        top_choice = completion.choices[0]
        return top_choice.message.content

    except Exception as err:
        print(f"Error generating response: {err}")
        return None

In [None]:
def get_embeddings(client, input_text):
    """Embed ``input_text`` with the notebook-level ``embeddings_model`` deployment.

    Args:
        client: Object exposing ``embeddings.create``.
        input_text: Text to embed; must be a non-empty ``str``.

    Returns:
        The ``embedding`` of the first item in the response, or ``None`` when
        the input is invalid, the response carries no data, or the request
        raised (the error is printed in every case).
    """
    try:
        # Reject empty values and anything that is not a string before calling out.
        is_usable_text = input_text and isinstance(input_text, str)
        if not is_usable_text:
            raise ValueError("Input text must be a non-empty string")

        result = client.embeddings.create(model=embeddings_model, input=input_text)

        if not result.data:
            raise ValueError("No embedding data received")

        first_item = result.data[0]
        return first_item.embedding

    except Exception as err:
        print(f"Error generating embeddings: {err}")
        return None

In [None]:

from typing import List
def process_pdf_directory(directory_path: str) -> List[dict]:
    """Extract the text of every PDF found under ``directory_path``.

    The directory is searched recursively and files are handled in sorted
    path order. Each file is passed to ``extract_text_from_pdf``; a falsy
    result is reported as a failure and the file is left out.

    Args:
        directory_path: Directory to scan for ``*.pdf`` files.

    Returns:
        One ``{'source': <path>, 'content': <text>}`` dict per successfully
        processed PDF. An empty list is returned when the directory does not
        exist, holds no PDFs, or an unexpected error occurs (which is printed).
    """
    # Local import: pathlib is not imported at notebook level, and without it
    # the Path lookup below fails and the function can only ever return [].
    from pathlib import Path

    try:
        pdf_contents = []
        directory = Path(directory_path)

        if not directory.is_dir():
            print(f"Error processing PDF directory: {directory_path} is not a directory")
            return []

        # Sorted so results come back in the same order on every run.
        for pdf_file in sorted(directory.glob('**/*.pdf')):
            text = extract_text_from_pdf(str(pdf_file))
            if text:
                pdf_contents.append({
                    'source': str(pdf_file),
                    'content': text
                })
                print(f"Successfully processed {pdf_file}")
            else:
                print(f"Failed to process {pdf_file}")

        return pdf_contents
    except Exception as e:
        print(f"Error processing PDF directory: {e}")
        return []

In [None]:
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
import os

def _load_and_split_pdfs(pdf_dir):
    """Load every PDF under ``pdf_dir`` and return 1000-character chunks (200 overlap), or [] if nothing loaded."""
    print("\nLoading PDFs...")
    loader = DirectoryLoader(
        pdf_dir,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    print(f"Loaded {len(documents)} documents")

    if not documents:
        print("No documents were loaded! Stopping.")
        return []

    print("\nSplitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks")

    if not chunks:
        # Documents without extractable text leave nothing to embed.
        print("No text chunks were produced! Stopping.")
        return []

    print("\nSample chunk content:")
    print(chunks[0].page_content[:200] + "...")
    return chunks


def _run_query_loop(vectordb, top_k, score_threshold):
    """Prompt for questions until 'quit'/EOF, answering each from passages scoring above ``score_threshold``."""
    while True:
        try:
            query = input("\nEnter your question (or 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input or Ctrl-C ends the session instead of dumping a traceback.
            print()
            break

        if not query:
            # Nothing to search for; ask again rather than embedding an empty string.
            continue
        if query.lower() == 'quit':
            break

        results = vectordb.similarity_search_with_relevance_scores(query, k=top_k)
        if not results:
            print("No relevant documents found.")
            continue

        print(f"\nFound {len(results)} relevant documents")
        for i, (doc, score) in enumerate(results, 1):
            print(f"\nDocument {i} (score: {score:.3f}):")
            print(doc.page_content[:200] + "...")

        # Only use high-scoring results for context
        context = "\n".join(
            doc.page_content for doc, score in results if score > score_threshold
        )

        if not context.strip():
            print(f"\nNo sufficiently relevant context found (score > {score_threshold})")
            continue

        # `client` is the notebook-level Azure OpenAI chat client.
        response = generate_response(client, query, context)
        if response:
            print("\nResponse:", response)
        else:
            print("Failed to generate response.")


def main(pdf_dir="./Temp_PDF", score_threshold=0.7, top_k=5):
    """Index the PDFs under ``pdf_dir`` into Chroma and answer questions about them.

    Steps: check the directory, load and chunk the PDFs, embed the chunks with
    the Azure OpenAI embeddings deployment, store them in Chroma, run one test
    query, then start an interactive question loop. Embedding settings come
    from AZURE_OPENAI_EMBEDDINGS_ENDPOINT / _KEY / _MODEL; the store location
    comes from VECTOR_DB_BASE_DIR. Answers are produced by the notebook-level
    ``generate_response`` and ``client``.

    Args:
        pdf_dir: Directory searched recursively for ``*.pdf`` files.
        score_threshold: Retrieved passages must score above this value to be
            used as context for an answer.
        top_k: Number of passages retrieved per question.

    Returns:
        None. Progress and errors are printed; any unexpected exception is
        caught and its traceback printed.
    """
    try:
        # Environment variables
        embeddings_endpoint = os.getenv("AZURE_OPENAI_EMBEDDINGS_ENDPOINT")
        embeddings_key = os.getenv("AZURE_OPENAI_EMBEDDINGS_KEY")
        embeddings_model = os.getenv("AZURE_OPENAI_EMBEDDINGS_MODEL")
        # Without a directory the store cannot be persisted, so fall back to a
        # local folder when VECTOR_DB_BASE_DIR is unset or empty.
        vector_db_dir = os.getenv("VECTOR_DB_BASE_DIR") or "./chroma_db"

        # Debug: Print PDF directory contents
        print(f"\nChecking contents of {pdf_dir}...")
        if not os.path.exists(pdf_dir):
            print(f"Directory {pdf_dir} does not exist!")
            return
        pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith('.pdf')]
        print(f"Found PDF files: {pdf_files}")

        chunks = _load_and_split_pdfs(pdf_dir)
        if not chunks:
            return

        # Initialize embeddings
        print("\nInitializing embeddings...")
        embeddings = AzureOpenAIEmbeddings(
            azure_deployment=embeddings_model,
            openai_api_key=embeddings_key,
            azure_endpoint=embeddings_endpoint,
            api_version="2023-05-15",
            chunk_size=1000
        )

        # Create and persist vector store
        # NOTE(review): every run adds all chunks again; re-running against an
        # existing directory presumably accumulates duplicates — confirm.
        print("\nCreating vector store...")
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=vector_db_dir
        )

        # Explicit persistence is only offered by some Chroma wrappers; call it
        # when available and otherwise rely on the store's own persistence.
        print("Persisting vector store...")
        persist = getattr(vectordb, "persist", None)
        if callable(persist):
            persist()

        # Verify vector store
        print("\nVerifying vector store...")
        collection = vectordb._collection
        count = collection.count()
        print(f"Number of vectors in store: {count}")

        if count == 0:
            print("Warning: Vector store is empty!")
            return

        # Test query to verify embeddings
        print("\nTesting vector store with a sample query...")
        test_query = "What is this document about?"
        results = vectordb.similarity_search_with_relevance_scores(test_query, k=1)

        if results:
            doc, score = results[0]
            print(f"Found result with score: {score}")
            print("Sample content:", doc.page_content[:200] + "...")
        else:
            print("No results found in test query!")

        print("\nVector store initialization complete!")

        # Main query loop
        _run_query_loop(vectordb, top_k, score_threshold)

    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        import traceback
        print(traceback.format_exc())

# Entry point: runs the full pipeline when this code is executed directly
# (the guard skips it if the code is imported as a module).
if __name__ == "__main__":
    # Load the .env file first so main() sees the settings via os.getenv.
    load_dotenv()
    main()