# Document Processing
Uses the Azure OpenAI service to extract key entities from a PDF document without using the Azure AI Document Intelligence service.

### 1. Install required libraries

In [None]:
# install Python PDF library
%pip install PyPDF2

### 2. Import helper libraries and load credentials from .env file

In [None]:
# Standard library: used below to read settings from environment variables
import os
# Client class used to talk to the Azure OpenAI service
from openai import AzureOpenAI
# Helper that loads the credentials stored in the local .env file
from dotenv import load_dotenv
# Load the .env values now so later cells can read them with os.getenv()
load_dotenv()

### 3. Create AOAI client

In [4]:
# Build the Azure OpenAI client from the endpoint and API key held in the environment
client = AzureOpenAI(
    api_version="2023-05-15",
    azure_endpoint=os.environ.get("OPENAI_API_ENDPOINT"),
    api_key=os.environ.get("OPENAI_API_KEY"),
)

### 4. Setup PDF information
This step downloads a sample PDF document from blob storage and saves it locally.

In [None]:
import PyPDF2
import openai
import os
from urllib.request import urlopen
import urllib.request
import shutil
#from azure.storage.blob import ContainerClient, BlobServiceClient, BlockBlobService

# Model deployment name and the SAS URL of the sample PDF, both read from the environment
my_ai_model = os.getenv("GPT4_MODEL_NAME")
pdf_file_url = os.getenv("BLOB_SAS_URL")

# Fail early with a clear message instead of an obscure error from inside urlopen()
if not pdf_file_url:
    raise ValueError("BLOB_SAS_URL is not set; add it to your .env file")

local_file_name = "sample-ukho-doc-process-using-aoai.pdf"

# Download the file from the URL once and save it locally under `local_file_name`.
# The SAS URL carries an access token and the response body is raw PDF bytes, so
# neither is printed: both would end up stored in the notebook's saved output.
with urllib.request.urlopen(pdf_file_url) as response, open(local_file_name, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)

# Confirm the download without exposing the URL or the file contents
print(f"Saved {local_file_name} ({os.path.getsize(local_file_name)} bytes)")

### 5. Read the PDF document

In [6]:
# Read the downloaded PDF in binary mode and collect the extracted text of each page
with open(local_file_name, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Extraction happens while the file is still open
    processed_text_list = [page.extract_text() for page in pdf_reader.pages]

# Merge the per-page text into a single newline-separated string
combined_text = "\n".join(processed_text_list)

### 6. Format the messages to send to the GPT model

In [7]:
# Chat payload: a system prompt that sets the assistant's role, followed by the
# user turn carrying the summarization request and the text extracted from the PDF.
messages = [
    dict(
        role="system",
        content=(
            "You are a Assistant, a backend processor.\n"
            "- User input is messy raw text extracted from a PDF page by PyPDF2.\n"
            "- Answer with polite and positive sense.\n"
        ),
    ),
    dict(
        role="user",
        content=f"Summarize the content:{combined_text}",
    ),
]

### 7. Invoke GPT Model

In [None]:
# Send the conversation to the chat model; `model` takes the deployment name
response = client.chat.completions.create(messages=messages, model=my_ai_model)

# Show the assistant's reply from the first returned choice
summary_text = response.choices[0].message.content
print(summary_text)

### 8. Print the token usage

In [None]:
# Show the token usage reported for the completion request above
token_usage = response.usage
print(token_usage)