In [1]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import os
from dotenv import load_dotenv
# Load variables from the local ".env" file into the process environment.
load_dotenv(".env")

# Form Recognizer connection settings used by extract_text below.
# Each lookup raises KeyError if the variable is not set.
endpoint = os.environ["FORM_RECOGNIZER_END_POINT"]
key = os.environ["FORM_RECOGNIZER_KEY"]


def extract_text(file_path: str):
    """Analyze a file with the "prebuilt-read" model and return the recognized text.

    A new DocumentAnalysisClient is built on every call from the module-level
    ``endpoint`` and ``key`` values.

    Args:
        file_path: Path of the file to analyze; it is opened in binary mode.

    Returns:
        The ``content`` attribute of the analysis result.
    """
    credential = AzureKeyCredential(key)
    reader = DocumentAnalysisClient(endpoint=endpoint, credential=credential)

    with open(file_path, "rb") as document:
        operation = reader.begin_analyze_document("prebuilt-read", document)

    # The file is already closed here; result() blocks until the analysis finishes.
    return operation.result().content
    


In [19]:
import os
from azure.ai.inference import EmbeddingsClient
from azure.core.credentials import AzureKeyCredential

def get_embeddings(text):
    """Return the embedding of ``text`` as a flat list of floats.

    The text is sent as a one-item batch to the "text-embedding-3-large" model
    served at https://models.inference.ai.azure.com.

    The access token is read from the ``GITHUB_TOKEN`` environment variable
    (for example via the ".env" file loaded at the top of the notebook).
    It used to be hard-coded in this cell, so the old token must be treated
    as leaked and revoked.

    Args:
        text: The string to embed.

    Returns:
        The embedding values of every item in the response, concatenated into
        one list. With a single input this is just that input's vector.

    Raises:
        KeyError: If ``GITHUB_TOKEN`` is not set.
    """
    # Read the secret first so a missing token fails fast, before any client
    # is constructed or any request is made.
    token = os.environ["GITHUB_TOKEN"]
    endpoint = "https://models.inference.ai.azure.com"
    model_name = "text-embedding-3-large"

    client = EmbeddingsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(token)
    )

    response = client.embed(
        input=[text],
        model=model_name
    )

    # Flatten the per-item embeddings into one list.
    return [value for item in response.data for value in item.embedding]

In [2]:
import requests
def generate_embeddings(
    text: str,
    url: str = "http://localhost:6000/generate_embeddings",
    timeout: float = 60.0,
):
    """Request an embedding for ``text`` from the embedding HTTP service.

    The text is POSTed as form data under the "text" field.

    Args:
        text: The string to embed.
        url: Endpoint of the embedding service. Defaults to the local service
            this notebook was written against.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout, ``requests`` waits indefinitely if the service hangs.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer within ``timeout``.
    """
    response = requests.post(url, data={"text": text}, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [3]:
from typing import List, Dict
import textwrap
import chromadb
from chromadb import HttpClient
# Connect to the Chroma server and get the "RAG" collection, creating it if needed.
# NOTE(review): a later cell rebinds chroma_client / collection_client to a
# different server and collection, so which one is active depends on run order.
chroma_client = HttpClient("http://localhost:6060")
collection_client = chroma_client.get_or_create_collection("RAG")
def add_document_to_collection(file_path: str, chunks: List[str], embeddings: List[List[float]]):
    """Add a document's chunks and their embeddings to ``collection_client``.

    Each chunk is stored under the id "<file_path>_<chunk index>".

    Args:
        file_path: Path of the source document; used as the id prefix.
        chunks: Text chunks of the document.
        embeddings: One embedding per chunk, in the same order as ``chunks``.

    Returns:
        A confirmation message containing ``file_path``.
    """
    chunk_ids = []
    for index, _ in enumerate(chunks):
        chunk_ids.append(f"{file_path}_{index}")

    collection_client.add(ids=chunk_ids, documents=chunks, embeddings=embeddings)
    return f"Document added to collection: {file_path}"


In [4]:
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split ``text`` into word-wrapped chunks with a character overlap.

    The text is wrapped with ``textwrap.wrap`` (words are never broken), then
    every chunk after the first is prefixed with the last ``overlap``
    characters of the previous wrapped chunk. The prefix is concatenated
    directly, with no separator.

    Args:
        text: The text to split.
        chunk_size: Maximum width of a wrapped chunk, before the overlap
            prefix is added.
        overlap: Number of trailing characters of the previous chunk to
            prepend. ``0`` disables the overlap.

    Returns:
        The list of chunks; empty when ``text`` is empty or whitespace-only.

    Raises:
        ValueError: If ``overlap`` is negative.
    """
    if overlap < 0:
        raise ValueError("overlap must be non-negative")

    chunks = textwrap.wrap(text, chunk_size, break_long_words=False)

    # Empty input yields no chunks. overlap == 0 must be handled separately
    # because previous[-0:] would be the whole previous chunk, not an empty string.
    if not chunks or overlap == 0:
        return chunks

    return chunks[:1] + [
        previous[-overlap:] + current
        for previous, current in zip(chunks, chunks[1:])
    ]

In [5]:
def process_document(file_path: str):
    """Extract, chunk, embed and index one document.

    The text from ``extract_text`` is split with ``chunk_text``, each chunk is
    embedded with ``generate_embeddings``, and everything is handed to
    ``add_document_to_collection``.

    NOTE(review): a later cell redefines ``add_document_to_collection`` with a
    four-argument signature; this function only works with the three-argument
    version, so it depends on cell execution order.

    Args:
        file_path: Path of the document to index.

    Returns:
        Whatever ``add_document_to_collection`` returns.
    """
    chunks = chunk_text(extract_text(file_path))
    embeddings = list(map(generate_embeddings, chunks))
    return add_document_to_collection(file_path, chunks, embeddings)

In [6]:
import os

# Folder holding the documents to index
folder_path = r'C:\Users\laksh\Downloads\git\dropbox-drive\samples-models\pan'

# Keep only regular files directly inside the folder (sub-directories are skipped)
document_paths = list(
    filter(
        os.path.isfile,
        (os.path.join(folder_path, name) for name in os.listdir(folder_path)),
    )
)

# Index every document that was found
for document_path in document_paths:
    process_document(document_path)

In [1]:
! pip show openai

Name: openai
Version: 1.43.0
Summary: The official Python library for the openai API
Home-page: https://github.com/openai/openai-python
Author: 
Author-email: OpenAI <support@openai.com>
License: 
Location: C:\Users\laksh\anaconda3\envs\myenv\Lib\site-packages
Requires: anyio, distro, httpx, jiter, pydantic, sniffio, tqdm, typing-extensions
Required-by: langchain-openai


In [7]:
from openai.lib.azure import AzureOpenAI
from dotenv import load_dotenv
import os
import json
# Load the ".env" file so the Azure OpenAI settings below are available.
load_dotenv(".env")

# Each lookup raises KeyError if the variable is not set.
api_base = os.environ["API_BASE"]
api_key = os.environ["API_KEY"]
deployment_name = os.environ["DEPLOYMENT_NAME"]
api_version = os.environ["API_VERSION"]

# Shared chat client; the deployment name is embedded in the base URL.
client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    base_url=f"{api_base}/openai/deployments/{deployment_name}"
)

In [27]:
# Extra answering rules meant to be appended to the system prompt used by
# get_chat_completion_openai. The indentation inside the string is part of the text.
prompt_model = '''\nnote: 1-always return the answer based on the context provided
                2-please provide the answer in english language
                3-if the answer is not known in the context please provide the answer as "I don't know the answer to this question based on the context provided"
            '''
    
def get_chat_completion_openai(prompt: str, context: str) -> str:
    """Answer ``prompt`` from ``context`` using the configured chat deployment.

    The system message is the fixed assistant instruction followed by the
    module-level ``prompt_model`` rules. The user message carries the context
    and the question.

    Args:
        prompt: The user's question.
        context: Retrieved text the answer should be based on.

    Returns:
        The message content of the first completion choice.
    """
    # The rules must be interpolated explicitly. Previously the message was a
    # plain string, so the model received the literal text "{prompt_model}".
    system_message = (
        "You are a helpful assistant implementing a RAG model who answers in ENGLISH language only. "
        "Answer questions based on the provided context."
        f"{prompt_model}"
    )

    response = client.chat.completions.create(
        model=deployment_name,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {prompt}"},
        ],
    )
    return response.choices[0].message.content

In [35]:
def rag_model(prompt: str, n_results: int = 10) -> Dict[str, str]:
    """Answer ``prompt`` with retrieval-augmented generation.

    The prompt is embedded with ``get_embeddings``, the nearest documents are
    fetched from ``collection_client``, joined with newlines into one context
    string, and passed to ``get_chat_completion_openai``.

    NOTE(review): the query is embedded with ``get_embeddings`` while
    ``process_document`` indexes with ``generate_embeddings``; confirm both
    produce vectors from the same model.

    Args:
        prompt: The user's question.
        n_results: Number of documents to retrieve.

    Returns:
        A dict with the keys "question", "answer" and "context".
    """
    query_vector = get_embeddings(prompt)

    hits = collection_client.query(
        query_embeddings=[query_vector],
        n_results=n_results,
    )

    # A single query was sent, so only the first list of documents is used.
    retrieved_documents = hits['documents'][0]
    context = "\n".join(retrieved_documents)

    answer = get_chat_completion_openai(prompt, context)

    return {"question": prompt, "answer": answer, "context": context}

In [39]:
# Sample question against the indexed documents; the bare name on the last
# line makes the notebook display the returned dict.
prompt="aditi reddy obireddy address"
rag=rag_model(prompt)
rag

{'question': 'aditi reddy obireddy address',
 'answer': 'The provided context does not include the address for Aditi Reddy Obireddy. The context only contains her Permanent Account Number (PAN) details. If you need more information, please provide additional context or documents.',
 'context': "आयकर विभाग INCOME TAX DEPARTMENT RAJESH KUMAR REDDY M MALLIKARJUNA REDDY MANDADI 26/01/1985 Permanent Account Number AJNPR8136N Signature 04 Majesh भारत सरकार GOVT. OF INDIA 90026000\nआयकर विभाग INCOME TAX DEPARTMENT RAJESH KUMAR REDDY M MALLIKARJUNA REDDY MANDADI 26/01/1985 Permanent Account Number AJNPR8136N Signature 04 Majesh भारत सरकार GOVT. OF INDIA 90026000\nआयकर विभाग INCOME TAX DEPARTMENT अन्यमत अधर्म स्थायी लेखा संख्या कार्ड Permanent Account Number Card BDAPA2611K भारत सरकार GOVT. OF INDIA नाम / Name ADITI REDDY OBIREDDY पिता का नाम / Father's Name MANOHAR REDDY OBIREDDY जन्म की तारीख Date of Birth 26/07/1991 Adili हस्ताक्षर / Signature 14122019 Adili\nआयकर विभाग INCOME TAX DEPARTMENT

In [14]:
def get_image_details(images, prompt: str):
    """Ask the chat deployment a question about one or more images.

    The user message consists of the text ``prompt`` followed by one
    "image_url" part per image, each encoded as a PNG data URL.

    Args:
        images: Iterable of base64-encoded image strings.
        prompt: The question or instruction about the images.

    Returns:
        The message content of the first completion choice. The system prompt
        asks the model to answer in JSON without code fences.
    """
    system_prompt = '''You are a helpful assistant in document analysis.and answer in json format ex:{
            document_type: "invoice",
            }, never include extra '```json\n' and '```' in the response'''

    # The text part comes first, followed by the images in their given order.
    user_parts = [{"type": "text", "text": prompt}]
    for image in images:
        user_parts.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{image}"},
            }
        )

    response = client.chat.completions.create(
        model=deployment_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_parts},
        ],
        max_tokens=500,
    )
    return response.choices[0].message.content

In [15]:
import os
import fitz
import base64

def convert_pdf_to_images(pdf_path, output_folder):
    """Render the first page of a PDF as PNG and return it base64-encoded.

    Only page 0 is rendered. The PNG is written to a temporary file named
    "<pdf file name>.png" inside ``output_folder``, read back, and removed.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Existing folder in which the temporary PNG is created.

    Returns:
        A list with a single item: the base64 text of the rendered page.
    """
    image_name = f"{os.path.basename(pdf_path)}.png"
    image_path = os.path.join(output_folder, image_name)

    with fitz.open(pdf_path) as doc:
        doc[0].get_pixmap().save(image_path)

    try:
        # Close the handle before deleting; an open handle can block
        # os.remove on Windows.
        with open(image_path, "rb") as image_file:
            encoded_page = base64.b64encode(image_file.read()).decode('utf-8')
    finally:
        # Remove the temporary PNG even if reading or encoding failed.
        os.remove(image_path)

    return [encoded_page]

In [16]:
# Render the first page of a sample PDF and ask the model for its document type.
pdf_path = "C:/Users/laksh/Downloads/cover-letter.pdf"
# The temporary PNG is created in the current working directory.
images = convert_pdf_to_images(pdf_path, output_folder=".")
prompt = "get the dopcument type from the image"   
image_details = get_image_details(images, prompt)
# Bare name on the last line displays the model's raw string answer.
image_details

'{\n  "document_type": "cover_letter"\n}'

In [7]:
import chromadb
from chromadb import HttpClient
# Rebinds chroma_client / collection_client (first set in an earlier cell) to a
# different server address and to the "SIH" collection, creating it if needed.
chroma_client = HttpClient("http://localhost:8000")
collection_client = chroma_client.get_or_create_collection("SIH")
def add_document_to_collection(file_path,extracted_text,embeddings,document_type):
    """Store one whole document, its embedding and its type in ``collection_client``.

    The document is stored under ``file_path`` as its id, with the metadata
    ``{"document_type": document_type}``.

    NOTE(review): this shares its name with an earlier three-argument
    definition in the notebook and replaces it once this cell runs.

    Args:
        file_path: Path of the source document; used as the record id.
        extracted_text: Full text of the document.
        embeddings: Embedding vector of ``extracted_text``.
        document_type: Label stored in the record's metadata.

    Returns:
        A confirmation message containing ``file_path``.
    """
    flattened_metadata = {
        "document_type": document_type
    }
    collection_client.add(
        ids=[file_path],
        documents=[extracted_text],
        embeddings=[embeddings],
        metadatas=[flattened_metadata]
    )
    # f-string so the actual path is reported instead of the literal "{file_path}".
    return f"Document added to collection {file_path}"


In [4]:
! pip install azure-ai-inference

Collecting azure-ai-inference
  Downloading azure_ai_inference-1.0.0b4-py3-none-any.whl.metadata (27 kB)
Downloading azure_ai_inference-1.0.0b4-py3-none-any.whl (85 kB)
   ---------------------------------------- 0.0/85.0 kB ? eta -:--:--
   ---------------------------------------- 85.0/85.0 kB 2.4 MB/s eta 0:00:00
Installing collected packages: azure-ai-inference
Successfully installed azure-ai-inference-1.0.0b4



[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import os
from azure.ai.inference import EmbeddingsClient
from azure.core.credentials import AzureKeyCredential

def get_embeddings(text):
    """Return the embedding of ``text`` as a flat list of floats.

    The text is sent as a one-item batch to the "text-embedding-3-large" model
    served at https://models.inference.ai.azure.com.

    The access token is read from the ``GITHUB_TOKEN`` environment variable
    (for example via the ".env" file loaded at the top of the notebook).
    It used to be hard-coded in this cell, so the old token must be treated
    as leaked and revoked.

    Args:
        text: The string to embed.

    Returns:
        The embedding values of every item in the response, concatenated into
        one list. With a single input this is just that input's vector.

    Raises:
        KeyError: If ``GITHUB_TOKEN`` is not set.
    """
    # Read the secret first so a missing token fails fast, before any client
    # is constructed or any request is made.
    token = os.environ["GITHUB_TOKEN"]
    endpoint = "https://models.inference.ai.azure.com"
    model_name = "text-embedding-3-large"

    client = EmbeddingsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(token)
    )

    response = client.embed(
        input=[text],
        model=model_name
    )

    # Flatten the per-item embeddings into one list.
    return [value for item in response.data for value in item.embedding]

In [14]:
# Example usage: embed a short paragraph and print the resulting vector.
# print() writes every value of the vector to the cell output.
text = '''I am writing to express my strong interest in the Software Engineer position at JP Morgan Chase
& Co. As a passionate and skilled full stack developer, I am excited about the opportunity to
contribute my technical expertise and innovative mindset to your esteemed organization.'''
embeds = get_embeddings(text)
print(embeds)

[-0.039535776, 0.0025048573, -0.014748496, -0.0055000405, 0.004141966, 0.009593619, -0.012812997, 0.038142215, -0.03558736, 0.011432342, 0.044206776, 0.0050484245, 0.007974252, 0.0059516565, 0.014658173, 0.00580972, -0.0059226244, 0.000863716, -0.039303515, -0.031458296, 0.04541969, -0.015845278, 0.0036129297, 0.036232524, 0.0051032635, -0.001433075, -0.020038858, 0.014116233, -0.0244776, -0.038555123, 0.0139097795, 0.0055322987, 0.018916268, 0.039742228, 0.011019436, -0.00444842, 0.029961511, -0.034167994, -0.013354937, -0.0003054458, -0.00032157495, 0.021600159, 0.004009707, 0.017264644, 0.047380995, 0.019496918, 0.04020675, -0.012496866, -0.00036633335, 0.0031500233, -0.0014556559, 0.051200375, 0.043380965, 0.011271051, -0.022013064, 0.0037355113, 0.023884047, -0.023716304, -0.01717432, 0.014993658, 0.01950982, 0.015071078, 0.0025322768, -0.013587196, -0.014916238, -0.0060548834, 0.0365164, -0.0008290384, -0.02009047, 0.028335692, -0.029058278, 0.01006459, 0.019535627, 0.033935733, 

In [15]:
# Extract the text of the sample PDF; this rebinds `text` from the previous cell.
file_path = "C:/Users/laksh/Downloads/cover-letter.pdf"
text = extract_text(file_path)
# Bare name displays the full extracted text in the cell output.
text

"Lakshman kumar reddy, 9-48, yarramvaripalem,parchoor,523169, lakshmanbhavanam009@gmail.com 8712131582 04-08-2024\nHiring Manager JP Morgan Chase & Co.\nDear Hiring Manager,\nI am writing to express my strong interest in the Software Engineer position at JP Morgan Chase & Co. As a passionate and skilled full stack developer, I am excited about the opportunity to contribute my technical expertise and innovative mindset to your esteemed organization.\nWith a robust skill set encompassing multiple programming languages, frameworks, and tools, I believe I am well-equipped to make significant contributions to your team. My proficiency in languages such as C, Python, and JavaScript, coupled with my experience in modern frameworks like React JS, React Native, and Next JS, allows me to develop efficient and scalable solutions across the entire stack.\nSome key highlights of my technical background include:\n1. Full Stack Development: Proficient in both front-end and back-end technologies, allo

In [16]:
# Embed the whole extracted document as a single vector (no chunking here).
embeds= get_embeddings(text)
# Bare name displays the entire vector in the cell output.
embeds

[-0.029917678,
 -0.0061419075,
 -0.0050605857,
 0.011571806,
 -0.026231201,
 0.013707832,
 -0.008424328,
 0.033431143,
 -0.024887037,
 0.011358869,
 0.041043647,
 -0.018711857,
 -0.002129372,
 -0.0012302115,
 0.010094554,
 0.017793566,
 -0.008138194,
 -0.019363977,
 -0.045541946,
 -0.024487779,
 0.031275153,
 0.0053733373,
 0.0024920306,
 0.036305793,
 0.009435779,
 -0.0012401928,
 0.007013619,
 0.019843085,
 -0.012104149,
 -0.041256584,
 0.04045807,
 0.0062650116,
 0.012150729,
 0.028267413,
 0.0025485922,
 -0.010487157,
 0.014878987,
 -0.003124188,
 0.015983598,
 -0.010287529,
 -0.0187784,
 0.011871249,
 -0.009795112,
 0.0019047898,
 0.012223926,
 0.029385334,
 0.0038827767,
 -0.013541475,
 -0.0044583725,
 -0.0028879608,
 -0.006740793,
 0.04570165,
 0.016236462,
 -0.00084259914,
 -0.013395081,
 -0.004751161,
 0.05829156,
 0.019390594,
 -0.019190965,
 0.0065444917,
 -0.005945606,
 -0.023968743,
 -0.0054432075,
 -0.012343704,
 -0.0038095796,
 0.012902664,
 0.04570165,
 0.026204584,
 -0

In [17]:
# Uses the four-argument add_document_to_collection; the document type is passed by hand.
add_document_to_collection(file_path,text,embeds,"cover-letter")

'Document added to collection {file_path}'

In [18]:
# Query the collection with the document's own embedding, asking for up to 10 matches.
collection_client.query(
    query_embeddings=[embeds],
    n_results=10
)

{'ids': [['C:/Users/laksh/Downloads/cover-letter.pdf', 'sample', 'sample2']],
 'distances': [[0.0, 0.4322189635676672, 0.5310076150434437]],
 'embeddings': None,
 'metadatas': [[{'document_type': 'cover-letter'}, None, None]],
 'documents': [["Lakshman kumar reddy, 9-48, yarramvaripalem,parchoor,523169, lakshmanbhavanam009@gmail.com 8712131582 04-08-2024\nHiring Manager JP Morgan Chase & Co.\nDear Hiring Manager,\nI am writing to express my strong interest in the Software Engineer position at JP Morgan Chase & Co. As a passionate and skilled full stack developer, I am excited about the opportunity to contribute my technical expertise and innovative mindset to your esteemed organization.\nWith a robust skill set encompassing multiple programming languages, frameworks, and tools, I believe I am well-equipped to make significant contributions to your team. My proficiency in languages such as C, Python, and JavaScript, coupled with my experience in modern frameworks like React JS, React Na