# Step 1: Install Required Libraries

In [1]:
! pip install pymupdf pillow easyocr




# Step 2: Convert PDF Pages to Images

In [2]:
import fitz  # PyMuPDF
from PIL import Image

def pdf_to_images(pdf_path, zoom=1.0):
    """Render every page of a PDF to an RGB PIL image.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.
        zoom: Scale factor applied to both axes when rasterizing each page.
            The default ``1.0`` is an identity matrix, i.e. the same output
            as calling ``page.get_pixmap()`` with no arguments. Larger values
            render more pixels per page, which generally helps OCR on small
            text at the cost of memory and time.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per page, in page order.

    Raises:
        ValueError: If ``zoom`` is not a positive number.
    """
    if zoom <= 0:
        raise ValueError(f"zoom must be positive, got {zoom!r}")

    scale = fitz.Matrix(zoom, zoom)
    images = []
    with fitz.open(pdf_path) as pdf:
        for page_num in range(pdf.page_count):
            page = pdf.load_page(page_num)
            # Request 3-channel RGB without alpha explicitly so the raw sample
            # buffer always matches the "RGB" mode handed to PIL below.
            pix = page.get_pixmap(matrix=scale, colorspace=fitz.csRGB, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images

# Example usage: render the sample PDF and keep the page images in `images`.
# NOTE(review): this is an absolute, machine-specific Windows path; point it at
# your own copy of the document before running the notebook elsewhere.
pdf_path = "D:/Genai_project/Retrieval Augmented Generation/rag_final/RAG_task/data_files/TDP-Document.pdf"
images = pdf_to_images(pdf_path)


# Step 3: Apply OCR Using EasyOCR

In [4]:
import easyocr
import numpy as np

# Initialize EasyOCR reader (specify languages as needed).
# It is created once at module level; easyocr_extract() below reads this global.
reader = easyocr.Reader(['en'])  # Add other languages in the list if needed

def easyocr_extract(images):
    """Run OCR on each image and return the recognized text as one string.

    Every image is converted to a NumPy array and passed to the module-level
    ``reader`` with ``detail=0`` (text-only output). The text fragments of one
    image are joined with single spaces and terminated by a newline, so the
    result holds one line per input image. An empty ``images`` gives ``""``.
    """
    page_lines = []
    for page_image in images:
        fragments = reader.readtext(np.array(page_image), detail=0)
        page_lines.append(" ".join(fragments) + "\n")
    return "".join(page_lines)

# Extract text from the page images produced in Step 2.
# The print below writes the complete OCR text of every page into the cell output.
extracted_text = easyocr_extract(images)
print("Extracted Text:", extracted_text)


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Extracted Text: Structured TDP for Data Science and Generative Al Made by: Khushboo Gaur (TDP-AVML Intern}
WEEK-I: Introduction and Setup: Data Science Foundations Overview of Data Science and Its Applications Data Science interdisciplinary field that combines statistical methods_ computer sciencc and domain expertise analyze andinterpret complex data. The primary goalisto uncovei patterns; generate insights, and drive decision-making processes Applications Data Science: Predictive Analytics: historical data Torecast future trends and behaviors. Customer Insights: Analyzes customer interactions and behaviors refine product offerings and optimize marketing strategies. Fraud Detection: Detects fraudulent activities by identifying anomalies transaction data. Healthcare Analytics: Enhances paticnt carc analyzing medical records and data treatment decisions: Basics of Data Analysis with Pandas Basic Operations: Loading Data: Import datasets into Pandas DataFrames for analysis using function

# Step 4: Save Extracted Text to a File

In [5]:
# Disabled: uncomment the lines below to write the OCR result to a text file
# in the current working directory.
# output_file = "extracted_text.txt"
# with open(output_file, "w") as file:
#     file.write(extracted_text)
# print(f"Extracted text saved to {output_file}")


## Apply RAG Pipeline:

In [8]:
import os
import json
import logging
from dotenv import load_dotenv
from typing import List, Optional
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from openai import AzureOpenAI
from config import Config
import re
from search_utilities import create_search_index, upload_documents
import easyocr
import numpy as np

# Setup logging.
# NOTE: INFO on the root logger also surfaces INFO records from libraries; the
# recorded output of this cell includes the Azure SDK's HTTP request/response dumps.
logging.basicConfig(level=logging.INFO)

# Load environment variables from .env file
load_dotenv()

# Get the data folder path and chunk settings from the .env file
data_folder_path = os.getenv('DATA_FOLDER_PATH')
# RecursiveChunker builds its splitter with from_tiktoken_encoder("cl100k_base"),
# so both values below are measured in tokens, not characters.
chunk_size = int(os.getenv('CHUNK_SIZE', 1000))  # Default chunk size is 1000 tokens
chunk_overlap = int(os.getenv('CHUNK_OVERLAP', 200))  # Default overlap is 200 tokens

# Project configuration object; config.APPROACH selects the 'cloud' or local code paths below.
config = Config()

# Initialize Azure OpenAI client if in cloud approach
if config.APPROACH == 'cloud':
    # SECURITY: never hardcode the API key in the notebook. Both the endpoint
    # and the key are read from the environment (populated from .env by
    # load_dotenv() above): set OPENAI_API_BASE and OPENAI_API_KEY there.
    # Any key that has ever been committed with this notebook must be treated
    # as compromised and rotated.
    client = AzureOpenAI(azure_endpoint=os.getenv("OPENAI_API_BASE"),
                         api_key=os.getenv("OPENAI_API_KEY"),
                         api_version='2024-02-15-preview')
else:
    # For on-premises, initialize the sentence transformer model.
    # The model name can be overridden with EMBEDDING_MODEL_NAME_ON_PREM.
    model_name = os.getenv('EMBEDDING_MODEL_NAME_ON_PREM', 'sentence-transformers/all-mpnet-base-v2')
    model = SentenceTransformer(model_name)

# Initialize EasyOCR reader
# NOTE(review): the Step 3 cell already creates an identical reader; this line
# constructs a second one and rebinds the same global name.
reader = easyocr.Reader(['en'])  # Add other languages in the list if needed

# Function to extract text from images using EasyOCR
def easyocr_extract(images):
    """Return the OCR text of ``images`` as one string, one line per image.

    NOTE(review): this repeats the definition from the Step 3 cell and
    replaces it when this cell runs.

    Each image is turned into a NumPy array and read by the module-level
    ``reader`` with ``detail=0`` (text-only output); the fragments found in an
    image are space-joined and followed by a newline.
    """
    def _ocr_line(picture):
        # OCR a single image and format its fragments as one newline-terminated line.
        words = reader.readtext(np.array(picture), detail=0)
        return " ".join(words) + "\n"

    return "".join(_ocr_line(picture) for picture in images)

# Class for recursive text chunking
class RecursiveChunker:
    """Split documents with a tiktoken-based RecursiveCharacterTextSplitter.

    ``chunk_size`` and ``chunk_overlap`` are stored on the instance and handed
    to the splitter each time ``chunk_documents`` is called.
    """

    def __init__(self, chunk_size: int, chunk_overlap: int):
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap

    def chunk_documents(self, docs: List[Document]) -> List[Document]:
        """Return the chunks produced from ``docs``.

        An empty list is returned, with a log entry, when ``docs`` is empty
        or when building the splitter raises ``AttributeError``.
        """
        if docs:
            splitter_options = {
                "encoding_name": "cl100k_base",
                "chunk_size": self.chunk_size,
                "chunk_overlap": self.chunk_overlap,
                "separators": ["\n\n", "\n", " ", ""],
            }
            try:
                # Guards against a splitter class without from_tiktoken_encoder.
                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                    **splitter_options
                )
            except AttributeError as err:
                logging.error(f"Error initializing text splitter: {err}")
                return []

            # Splitting itself is not guarded: errors raised here propagate.
            return text_splitter.split_documents(docs)

        logging.warning("No documents provided for chunking.")
        return []

# Function to generate embeddings
def generate_embeddings(text_list, model="embedding"):
    """Embed every text in ``text_list`` with the configured backend.

    Cloud (``config.APPROACH == 'cloud'``): one ``client.embeddings.create``
    request per text, with ``model`` passed as the ``model`` argument.
    Otherwise: a single batched ``encode`` call on the local encoder.

    Args:
        text_list: The texts to embed.
        model: In cloud mode, the value passed as ``model=`` to the embeddings
            API. In local mode, either an object exposing ``encode`` or a
            string; a string (including the default) means "use the
            module-level ``model``" created at start-up.

    Returns:
        Cloud: a list with exactly one entry per input text, in input order.
        An entry is the embedding, or ``None`` when that request failed (the
        error is logged), so index ``i`` always corresponds to
        ``text_list[i]``.
        Local: the encoder output converted with ``.tolist()``, or ``[]`` when
        encoding failed (the error is logged).
    """
    if config.APPROACH == 'cloud':
        logging.info('Generating embeddings using cloud model...')
        embeddings = []
        for text in text_list:
            try:
                response = client.embeddings.create(
                    input=[text],
                    model=model  # Use the model parameter as "embedding"
                )
                embeddings.append(response.data[0].embedding)
            except Exception as e:
                logging.error(f"Error generating embedding for text: {e}")
                # Keep a placeholder so later entries do not shift onto the
                # wrong text when the caller zips texts with embeddings.
                embeddings.append(None)
        return embeddings

    logging.info('Generating embeddings using local model...')
    # The `model` parameter shadows the module-level SentenceTransformer that
    # is also named `model`. Calling .encode on the parameter would invoke
    # str.encode on the default "embedding" and always fail, so resolve the
    # real encoder explicitly.
    encoder = globals().get("model") if isinstance(model, str) else model
    try:
        if encoder is None:
            raise RuntimeError("no local embedding model has been initialized")
        return encoder.encode(text_list).tolist()  # Ensure embeddings are list
    except Exception as e:
        logging.error(f"Error generating embeddings using local model: {e}")
        return []

# Function to handle extracted text, chunking, and embeddings
def process_extracted_text(extracted_text: str):
    """Chunk the extracted text, embed every chunk and index the results.

    Steps: call ``create_search_index()`` (done regardless of
    ``config.APPROACH``), wrap the text in a single ``Document``, split it with
    ``RecursiveChunker(chunk_size, chunk_overlap)``, embed the chunk texts with
    ``generate_embeddings`` and push each valid chunk to the backend chosen by
    ``config.APPROACH``.

    Args:
        extracted_text: The full text to index.

    Returns:
        The list of document dicts (``id``, ``title``, ``content``,
        ``contentVector``) that passed validation. ``[]`` when no chunks were
        produced or when the number of embeddings does not match the number of
        chunks.
    """
    # Create the search index at the beginning
    create_search_index()

    chunker = RecursiveChunker(chunk_size, chunk_overlap)
    processed_documents = []  # Documents that were validated and indexed

    logging.info("Processing extracted text...")

    # Convert the text into a Document object and chunk it
    documents = [Document(page_content=extracted_text)]
    chunks = chunker.chunk_documents(documents)

    logging.info(f"Number of chunks created: {len(chunks)}")

    if not chunks:
        logging.warning("No chunks created from the extracted text.")
        return []

    # Extract text from chunks
    chunk_texts = [chunk.page_content for chunk in chunks]

    # Generate embeddings for each chunk
    embeddings = generate_embeddings(chunk_texts)
    logging.info(f"Generated embeddings for {len(embeddings)} chunks.")

    # zip() below would silently pair chunks with the wrong vectors if some
    # embeddings are missing, so refuse to index misaligned data.
    if len(embeddings) != len(chunk_texts):
        logging.error(
            f"Embedding count ({len(embeddings)}) does not match chunk count "
            f"({len(chunk_texts)}); nothing was indexed."
        )
        return []

    # Common prefix for the per-chunk document IDs (change as needed)
    base_document_id = "extracted_text_document"

    # Process each chunk and create a document structure
    for chunk_index, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)):
        # Validate embeddings
        if not (isinstance(embedding, list) and all(isinstance(x, float) for x in embedding)):
            logging.error("Embedding is not a list of floats for chunk.")
            continue

        # Every chunk gets its own ID. With one shared ID each upload would
        # presumably replace the previous chunk (assumes "id" is the index
        # key field - TODO confirm against create_search_index).
        document_id = f"{base_document_id}_{chunk_index}"

        # Create a document dictionary
        document = {
            "id": document_id,
            "title": "Extracted Text Document",
            "content": chunk_text,
            "contentVector": embedding
        }

        # Append processed document to the list
        processed_documents.append(document)

        # Upload or index the document based on the approach
        if config.APPROACH == 'cloud':
            upload_documents([document])
            logging.info("Uploaded document to Azure Search.")
        else:
            # NOTE(review): index_on_prem_faiss is neither defined nor imported
            # in the visible part of this notebook - confirm where it comes from.
            index_on_prem_faiss([embedding], [document_id])
            logging.info("Uploaded embedding to FAISS.")

    logging.info("Processing completed for the extracted text.")

    return processed_documents  # Return the processed documents list


# Main execution
if __name__ == "__main__":
    # Render the source PDF page by page (uses pdf_to_images from the Step 2 cell)
    pdf_path = "D:/Genai_project/Retrieval Augmented Generation/rag_final/RAG_task/data_files/TDP-Document.pdf"
    images = pdf_to_images(pdf_path)

    # OCR the rendered pages and echo the text so it can be checked by eye
    extracted_text = easyocr_extract(images)
    print("Extracted Text:", extracted_text)

    # Chunk, embed and index the OCR text
    processed_data = process_extracted_text(extracted_text)

    # An empty result means nothing was chunked or indexed
    if not processed_data:
        logging.error("No data was processed. Check input files or processing logic.")
    else:
        logging.info("Chunks and embeddings are created.")


2024-10-28 16:20:16,314 - INFO - Use pytorch device_name: cpu
2024-10-28 16:20:16,320 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2024-10-28 16:23:56,145 - INFO - Request URL: 'https://gptkb-jcgzeo3krxxra.search.windows.net/indexes('rag-1')?api-version=REDACTED'
Request method: 'PUT'
Request headers:
    'Content-Type': 'application/json'
    'Content-Length': '1004'
    'api-key': 'REDACTED'
    'Prefer': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': 'edb38389-951a-11ef-b592-c0b883fb6494'
    'User-Agent': 'azsdk-python-search-documents/11.5.1 Python/3.10.14 (Windows-10-10.0.22631-SP0)'
A body is sent with the request


Extracted Text: Structured TDP for Data Science and Generative Al Made by: Khushboo Gaur (TDP-AVML Intern}
WEEK-I: Introduction and Setup: Data Science Foundations Overview of Data Science and Its Applications Data Science interdisciplinary field that combines statistical methods_ computer sciencc and domain expertise analyze andinterpret complex data. The primary goalisto uncovei patterns; generate insights, and drive decision-making processes Applications Data Science: Predictive Analytics: historical data Torecast future trends and behaviors. Customer Insights: Analyzes customer interactions and behaviors refine product offerings and optimize marketing strategies. Fraud Detection: Detects fraudulent activities by identifying anomalies transaction data. Healthcare Analytics: Enhances paticnt carc analyzing medical records and data treatment decisions: Basics of Data Analysis with Pandas Basic Operations: Loading Data: Import datasets into Pandas DataFrames for analysis using function

2024-10-28 16:23:57,296 - INFO - Response status: 200
Response headers:
    'Transfer-Encoding': 'chunked'
    'Content-Type': 'application/json; odata.metadata=minimal; odata.streaming=true; charset=utf-8'
    'Content-Encoding': 'REDACTED'
    'Vary': 'REDACTED'
    'Server': 'Microsoft-IIS/10.0'
    'Strict-Transport-Security': 'REDACTED'
    'Preference-Applied': 'REDACTED'
    'OData-Version': 'REDACTED'
    'request-id': 'edb38389-951a-11ef-b592-c0b883fb6494'
    'elapsed-time': 'REDACTED'
    'Date': 'Mon, 28 Oct 2024 10:53:58 GMT'
2024-10-28 16:23:57,313 - INFO - Index created or updated. Result: {'additional_properties': {}, 'name': 'rag-1', 'fields': [<azure.search.documents.indexes.models._index.SearchField object at 0x00000207A75B59C0>, <azure.search.documents.indexes.models._index.SearchField object at 0x00000207A75B4E50>, <azure.search.documents.indexes.models._index.SearchField object at 0x00000207A75B4DC0>, <azure.search.documents.indexes.models._index.SearchField objec

Index: {'additional_properties': {}, 'name': 'rag-1', 'fields': [<azure.search.documents.indexes.models._index.SearchField object at 0x00000207A75B5E40>, <azure.search.documents.indexes.models._index.SearchField object at 0x00000207A75B4310>, <azure.search.documents.indexes.models._index.SearchField object at 0x00000207A75B41F0>, <azure.search.documents.indexes.models._index.SearchField object at 0x00000207A75B4130>], 'scoring_profiles': [], 'default_scoring_profile': None, 'cors_options': None, 'suggesters': [], 'analyzers': None, 'tokenizers': None, 'token_filters': [], 'char_filters': [], 'encryption_key': None, 'similarity': <azure.search.documents.indexes._generated.models._models_py3.BM25SimilarityAlgorithm object at 0x00000207A75B7FD0>, 'semantic_search': <azure.search.documents.indexes._generated.models._models_py3.SemanticSearch object at 0x00000207A75B6C20>, 'vector_search': <azure.search.documents.indexes._generated.models._models_py3.VectorSearch object at 0x00000207A75B4B2