# Notebook showcasing use of Google Cloud GenAI (PaLM API) + Document AI to extract information from documents

**Author:** Jasmeet Bhatia



#### **Objective:** In this notebook we will use the Vertex AI PaLM Text model to extract entities from a scanned PDF of a property deed

### Set up and import dependencies

In [None]:
#install dependencies
!pip install google-cloud-aiplatform --upgrade
#!apt-get install poppler-utils
#!pip install google-cloud-core
!pip install google-cloud-documentai
#!pip install google-cloud-storage
#!pip install simplejson
#!pip install pdf2image

In [None]:
!pip install --upgrade -r requirements.txt

In [None]:
#import libraries
#from google.api_core.client_options import ClientOptions
from google.cloud import documentai
import vertexai
from vertexai.preview.language_models import TextGenerationModel
import pandas as pd

#from PIL import Image, ImageDraw
#import os

#from IPython.display import display, Image
#from pdf2image import convert_from_path, convert_from_bytes

### Authentication 

In [None]:
#Use if running notebook locally
#! gcloud auth login

In [None]:
##Run only if using Google Colab Notebooks
#from google.colab import auth as google_auth
#google_auth.authenticate_user()

In [None]:
### Define path to the pdf file

## Display and review the PDF File

In [None]:
# Relative path of the document that is previewed and then sent to OCR in the cells below
file_path='./sample_data/34_Deed.pdf'

In [None]:

from IPython.display import IFrame
# Embed the file in an 800x700 frame; as the last expression of the cell it is rendered as the cell output
IFrame(file_path, width=800, height=700)

### Use GCP Document AI to OCR the PDF

In [None]:
# Define function to OCR the PDF using Document AI
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: str = None,
) -> str:
    """Run a local file through a Document AI processor and return its text.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Location of the processor (e.g. "us").
        processor_id: ID of the Document AI processor to call.
        file_path: Path of the local file to read and send for processing.
        mime_type: MIME type of the file (e.g. "application/pdf", "image/tiff").
        field_mask: Optional comma-separated list of Document fields the
            service should return. None (the default) leaves the mask unset.
            A mask that excludes "text" will make this function return an
            empty string.

    Returns:
        The `text` field of the processed document.
    """
    # NOTE(review): the client is created with the default endpoint; processors
    # outside "us" presumably need a regional api_endpoint - confirm before
    # using another location.
    client = documentai.DocumentProcessorServiceClient()

    # Fully-qualified resource name of the processor
    name = client.processor_path(project_id, location, processor_id)

    # Read the whole file into memory as raw bytes
    with open(file_path, "rb") as source_file:
        file_content = source_file.read()

    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)

    # Forward field_mask to the request; previously the parameter was accepted
    # but silently ignored.
    request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        field_mask=field_mask,
    )

    result = client.process_document(request=request)

    # Only the recognized text is returned to the caller
    return result.document.text



### Use this for PDF Files

In [None]:
#For PDF Docs
# OCR the PDF at file_path with the Document AI processor (arguments listed in signature order)
ocr_output = process_document_sample(
    project_id="398507275014",
    location="us",
    processor_id="2fb6b1be15c7f2d",
    file_path=file_path,
    mime_type="application/pdf",
    field_mask=None,
)

### Use this for TIFF files

In [None]:
##For TIFF docs uncomment the below section and run
#ocr_output = process_document_sample(
#  project_id="398507275014",
#  location="us",
#  processor_id="2fb6b1be15c7f2d",
#    mime_type = 'image/tiff',
#    field_mask = None,
#  file_path="./genai_demo_data/demo_data.tiff"
#)

In [None]:
#Print up to the first 32,000 characters of the OCR output
print(ocr_output[:32000])

### Run the OCR results above through the Vertex AI GenAI/PaLM Model to extract entities

In [None]:
# Define the function to process OCR output through Vertex AI GenAI Model


def predict_large_language_model_sample(
    project_id: str,
    model_name: str,
    temperature: float,
    max_decode_steps: int,
    top_p: float,
    top_k: int,
    content: str,
    location: str = "us-central1",
    tuned_model_name: str = "",
):
    """Send a prompt to a Vertex AI text generation model and return the reply.

    Args:
        project_id: Project passed to vertexai.init.
        model_name: Name of the pretrained model to load.
        temperature: Sampling temperature passed to the model.
        max_decode_steps: Passed to the model as max_output_tokens.
        top_p: top_p sampling value passed to the model.
        top_k: top_k sampling value passed to the model.
        content: Prompt text sent to the model.
        location: Location passed to vertexai.init.
        tuned_model_name: When non-empty, the tuned model with this name is
            used instead of the pretrained one.

    Returns:
        The text of the model response (also printed).
    """
    vertexai.init(project=project_id, location=location)

    llm = TextGenerationModel.from_pretrained(model_name)
    if tuned_model_name:
        # Swap the base model for the requested tuned variant
        llm = llm.get_tuned_model(tuned_model_name)

    generation_params = dict(
        temperature=temperature,
        max_output_tokens=max_decode_steps,
        top_k=top_k,
        top_p=top_p,
    )
    prediction = llm.predict(content, **generation_params)

    print(f"Response from Model: {prediction.text}")
    return prediction.text


### Enter the prompt to be used for entity extraction from the document

In [None]:
# Instruction text that lists the fields the model is asked to return as a table.
# It is written to follow the document text ("text above"), so it is meant to be appended after the OCR output.
prompt_suffix = '''Give me following information extracted form text above in a Table format:
- Name of Seller 1
- Seller 1 Type
-  If seller is an LLC, then name of the officer of the LLC
- Name of Seller 2
- Name of buyer 1
- Name of buyer 2
- Name of buyer 3
- Type of ownership
- Name of Title Company
- Address of the property
- Tract Number
- Water Right Details
- Title Order number
- Document transfer tax'''

#- List of parcels
#- Property details

### Combine the OCR output and the prompt/question above to create the full input text to be fed to the model

In [None]:
# Full model input: the OCR text followed by the extraction instructions
ocr_text = ocr_output+prompt_suffix
# Preview the first 20K characters (the slice previously started at 5000 and so skipped the beginning).
# Only this notebook preview is truncated; ocr_text itself is left complete.
# Model can handle 8K tokens = ~32K characters.
print(ocr_text[:20000])

### Feed the input prompt to the LLM

In [None]:
# Process the full Input Text through the GenAI Model
# Keyword arguments bind each value to its parameter explicitly; the previous
# positional call labelled 0.8 as "Top K" and 40 as "Top P", which is the reverse
# of the parameters they are actually passed to (top_p=0.8, top_k=40).
llm_output1 = predict_large_language_model_sample(
    project_id="jsb-alto",        # GCP project
    model_name="text-bison@001",  # LLM model
    temperature=0.2,
    max_decode_steps=256,         # Max output tokens
    top_p=0.8,
    top_k=40,
    content=ocr_text,
    location="us-central1",
)

In [None]:
# The second instruction gets its own name so that the extraction prompt_suffix
# defined earlier is not overwritten; re-running the earlier cells after this one
# therefore still builds the original extraction prompt.
table_prompt_suffix = ''' Convert the above information into table format with 
Columns - (Seller_1, Seller_1_Type, Seller_1_Officer, Seller_2, Buyer_1, Buyer_2, Buyer_3, Type_of_Ownership,Title_Company,Title_order_number,Document_transfer_tax)
For Blank fields put N/A'''

# Second-pass prompt: the first model answer followed by the table-formatting instruction
prompt2 = llm_output1+table_prompt_suffix
print(prompt2[:20000]) #Limiting the preview to 20K characters in the notebook. Model can handle 8K Tokens = ~32K Characters

In [None]:
# Second pass: ask the model to reformat its first answer as a table.
# Keyword arguments bind each value to its parameter explicitly; the previous
# positional call labelled 0.8 as "Top K" and 40 as "Top P", which is the reverse
# of the parameters they are actually passed to (top_p=0.8, top_k=40).
llm_output2 = predict_large_language_model_sample(
    project_id="jsb-alto",        # GCP project
    model_name="text-bison@001",  # LLM model
    temperature=0.2,
    max_decode_steps=256,         # Max output tokens
    top_p=0.8,
    top_k=40,
    content=prompt2,
    location="us-central1",
)



### Print the response

In [None]:
#Print the answer received from the LLM.
#In this deed document use case, the answer should be the extracted fields in the requested table format
print(llm_output2)

### Convert to pandas DataFrame

In [None]:
import io

# Parse the pipe-delimited table text returned by the LLM
output = pd.read_csv(io.StringIO(llm_output2), sep='|')
# Leading/trailing pipes on each line produce columns that are entirely empty; drop them
output = output.dropna(axis=1, how='all')
#output = output.dropna(axis=0, how='all')
# Remove the padding spaces from the column names
output.columns = output.columns.str.replace(' ', '')

# A markdown-style table carries a header separator line (e.g. |---|:---:|).
# read_csv keeps it as a data row, which would otherwise be loaded downstream as a
# record, so drop every row whose cells consist only of dashes/colons.
if not output.empty:
    is_separator_row = (
        output.astype(str)
        .apply(lambda column: column.str.fullmatch(r'\s*:?-+:?\s*'))
        .all(axis=1)
    )
    output = output.loc[~is_separator_row].reset_index(drop=True)

output

## Push Table to BQ

In [None]:
import datetime

from google.cloud import bigquery
import pandas
import pytz

# Construct a BigQuery client object.
client = bigquery.Client()

# TODO(developer): Set table_id to the ID of the table to create.
table_id = "jsb-alto.entity_extract.deed_extract6"

# The parsed LLM table is the frame that gets loaded
dataframe = output

# Partial schema: these columns are explicitly typed as STRING. All columns of the
# frame are still written; the schema only assists the data type definitions.
string_columns = [
    "Seller_1",
    "Seller_1_Type",
    "Seller_1_Officer",
    "Seller_2",
    "Buyer_1",
    "Buyer_2",
]

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField(column_name, bigquery.enums.SqlTypeNames.STRING)
        for column_name in string_columns
    ],
    # BigQuery appends loaded rows to an existing table by default; WRITE_TRUNCATE
    # replaces the table with the loaded data instead.
    write_disposition="WRITE_TRUNCATE",
)

# Start the load job (API request) and block until it completes
job = client.load_table_from_dataframe(dataframe, table_id, job_config=job_config)
job.result()

# Fetch the table metadata (API request) to report what was loaded
table = client.get_table(table_id)
summary = "Loaded {} rows and {} columns to {}".format(
    table.num_rows, len(table.schema), table_id
)
print(summary)

In [None]:
%%bigquery
-- Show every row of the table loaded above
SELECT *
FROM `jsb-alto.entity_extract.deed_extract6`