In [3]:
import os
import json
import logging
from dotenv import load_dotenv
from typing import List
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2
import docx
from sentence_transformers import SentenceTransformer
from openai import AzureOpenAI
from config import Config  # Ensure config.py is accessible in the notebook's directory
import re
from PIL import Image
import pytesseract
import io
from search_utilities import create_search_index, upload_documents  # Ensure search_utilities.py is accessible

# Setup logging
logging.basicConfig(level=logging.INFO)

# Load environment variables from .env file so the os.getenv() calls below can see them
load_dotenv()

# Data folder path and chunking parameters come from the environment (.env file).
# DATA_FOLDER_PATH has no default, so data_folder_path is None when it is not set.
data_folder_path = os.getenv('DATA_FOLDER_PATH')
chunk_size = int(os.getenv('CHUNK_SIZE', 1000))  # Default chunk size is 1000 characters
chunk_overlap = int(os.getenv('CHUNK_OVERLAP', 200))  # Default overlap is 200 characters

config = Config()  # Ensure Config class is defined in config.py

# Initialize Azure OpenAI client if in cloud approach
if config.APPROACH == 'cloud':
    # Never paste the API key into the notebook: read it from the environment instead.
    # AZURE_OPENAI_API_KEY is the variable the OpenAI SDK itself looks for; OPENAI_API_KEY
    # is accepted as a fallback to mirror the OPENAI_API_BASE naming used for the endpoint
    # (NOTE(review): confirm which variable name the project's .env actually defines).
    # If neither is set, api_key is None and AzureOpenAI raises at construction time,
    # which surfaces the misconfiguration here rather than as a 401 on the first request.
    client = AzureOpenAI(azure_endpoint=os.getenv("OPENAI_API_BASE"),
                         api_key=os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY"),
                         api_version='2024-02-15-preview')
else:
    # For on-premises, initialize the sentence transformer model
    model_name = os.getenv('EMBEDDING_MODEL_NAME_ON_PREM', 'sentence-transformers/all-mpnet-base-v2')
    model = SentenceTransformer(model_name)


2024-10-04 12:34:02,560 - INFO - Use pytorch device_name: cpu
2024-10-04 12:34:02,561 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


In [4]:
# Read PDF Files:

def _ocr_page_images(page, page_num, file_path):
    """Return the OCR text of every image embedded in ``page``, one string per image.

    Failures are logged and skipped so that one bad image (or a page whose images
    cannot be decoded at all) never discards text already extracted from the document.
    """
    try:
        # Materialise inside the try: accessing page.images can itself raise when
        # an embedded image cannot be decoded.
        images = list(page.images)
    except Exception as e:
        logging.error(f"Error during OCR on page {page_num} of {file_path}: {e}")
        return []

    ocr_texts = []
    for image in images:
        try:
            # PyPDF2 image entries expose their bytes as the `.data` attribute and are
            # not subscriptable; mapping-style access is kept only as a fallback.
            image_bytes = image.data if hasattr(image, 'data') else image['data']
            pil_image = Image.open(io.BytesIO(image_bytes))
            ocr_text = pytesseract.image_to_string(pil_image)
            logging.info(f"OCR text extracted from image: {ocr_text}")
            ocr_texts.append(ocr_text)
        except Exception as e:
            logging.error(f"Error during OCR on page {page_num} of {file_path}: {e}")
    return ocr_texts


def read_pdf(file_path):
    """Extract the text of a PDF, using OCR for pages that have no extractable text.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order with no separator added.
        For a page where ``extract_text()`` yields nothing, the OCR text of that
        page's embedded images is used instead. Returns ``""`` if the file cannot
        be opened or parsed; the error is logged rather than raised.
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            parts = []
            # page_num is zero-based, matching the numbers used in the log messages.
            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                if page_text:
                    parts.append(page_text)
                    continue

                logging.warning(f"No text found on page {page_num} of {file_path}. Trying OCR.")
                parts.extend(_ocr_page_images(page, page_num, file_path))
            return "".join(parts)
    except Exception as e:
        # Deliberate best-effort: callers receive an empty string instead of an exception.
        logging.error(f"Error reading PDF file {file_path}: {e}")
        return ""


In [5]:
# Resolve the sample report inside the configured data folder (DATA_FOLDER_PATH from .env)
# rather than depending only on one machine's absolute path. The original location is kept
# as a fallback for when the variable is not set.
# NOTE(review): assumes DATA_FOLDER_PATH points at the folder holding ESGreport.pdf - confirm.
default_data_dir = r'D:\Genai_project\Retrieval Augmented Generation\rag_final\RAG_task\data_files'
pdf_path = os.path.join(data_folder_path or default_data_dir, 'ESGreport.pdf')
pdf_text = read_pdf(pdf_path)
print(pdf_text)



ag_final\RAG_task\data_files\ESGreport.pdf: [Errno 22] Invalid argument: 'D:\\Genai_project\\Retrieval Augmented Generation\rag_final\\RAG_task\\data_files\\ESGreport.pdf'



