# In [None]:
from google.cloud import vision
from google.cloud import storage
import os
import json

# Credentials: the Google client libraries locate the service-account key
# file through this environment variable (it holds a file path, not the key
# itself), so it must be set before any client is constructed.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'path'

# Local file that receives the extracted text
output_file_path = r'path'

# Cloud Storage location of the source PDF: object name and its bucket
pdf_file_path = 'pdfpath'
bucket_name = 'bucketname'

# API clients shared by the code below
vision_client = vision.ImageAnnotatorClient()
storage_client = storage.Client()

def _first_page_number(blob_name):
    """Return the first page number encoded in an OCR result file name.

    Vision usually names result shards ``output-<first>-to-<last>.json``.
    Returns ``None`` when the name does not follow that pattern.
    """
    file_name = blob_name.rsplit('/', 1)[-1]
    head, separator, _ = file_name.rpartition('-to-')
    first_page = head.rpartition('-')[2]
    if separator and first_page.isdecimal():
        return int(first_page)
    return None


def _page_order_key(blob):
    """Sort key that orders result blobs by document page, not alphabetically."""
    first_page = _first_page_number(blob.name)
    # Names without a page range sort after the numbered ones, by name.
    return (first_page is None, first_page or 0, blob.name)


def extract_text_from_pdf():
    """OCR the configured PDF with Cloud Vision and save its text locally.

    Reads the module-level ``bucket_name``, ``pdf_file_path`` and
    ``output_file_path`` settings and uses the module-level ``vision_client``
    and ``storage_client``.

    Steps:
      1. Start an asynchronous DOCUMENT_TEXT_DETECTION job whose JSON results
         are written under ``gs://<bucket_name>/output/``.
      2. Wait up to 1800 seconds for the job to finish.
      3. Read every ``.json`` object under ``output/`` in page order and
         collect the ``fullTextAnnotation`` text of each page response.
      4. Write the combined text to ``output_file_path`` (UTF-8).
      5. Delete the JSON result objects from the bucket.

    Returns:
        None. The text is written to ``output_file_path``.

    Raises:
        Whatever the Vision/Storage clients raise (including a timeout from
        ``operation.result``) and ``OSError`` if the local file cannot be
        written. Result objects are left in the bucket when the local write
        fails, so the OCR output is not lost.
    """
    output_prefix = 'output/'
    gcs_source_uri = f'gs://{bucket_name}/{pdf_file_path}'
    gcs_output_uri = f'gs://{bucket_name}/{output_prefix}'

    mime_type = 'application/pdf'
    input_config = vision.InputConfig(
        gcs_source=vision.GcsSource(uri=gcs_source_uri),
        mime_type=mime_type
    )

    # JSON output
    output_config = vision.OutputConfig(
        gcs_destination=vision.GcsDestination(uri=gcs_output_uri)
    )

    async_request = vision.AsyncAnnotateFileRequest(
        features=[vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)],
        input_config=input_config,
        output_config=output_config
    )

    operation = vision_client.async_batch_annotate_files(requests=[async_request])

    print('Processing PDF for OCR')
    operation.result(timeout=1800)

    print(f'OCR completed, results saved to: {gcs_output_uri}')

    # Collect the JSON result shards. list_blobs yields names in
    # lexicographic order ("output-101-..." before "output-21-..."), so sort
    # by the first page number to keep the text in document order.
    # NOTE(review): every .json object under output/ is consumed, including
    # leftovers from an earlier, interrupted run -- confirm that is intended.
    bucket = storage_client.bucket(bucket_name)
    result_blobs = sorted(
        (blob for blob in bucket.list_blobs(prefix=output_prefix)
         if blob.name.endswith('.json')),
        key=_page_order_key,
    )

    text_chunks = []
    for blob in result_blobs:
        response = json.loads(blob.download_as_bytes())
        for page_response in response.get('responses', []):
            if 'fullTextAnnotation' in page_response:
                page_text = page_response['fullTextAnnotation'].get('text', '')
                text_chunks.append(page_text + '\n')

    # save the extracted text to a .txt file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(''.join(text_chunks))

    # Remove the JSON files only after the text is safely on disk, so a
    # failed local write does not discard the OCR results.
    for blob in result_blobs:
        blob.delete()
        print(f'Deleted {blob.name}')

    print(f'Text extraction complete. Results saved in {output_file_path}')

# Run the full OCR pipeline when this code is executed: start the Vision job,
# collect the JSON results from the bucket and write the text file.
extract_text_from_pdf()