In [None]:
!pip install openai==0.28

In [43]:
# import packages
import io
import os
import time

import boto3
import openai

In [14]:
# textract for both pdfs and image files (jpg/png)

def top_file(bucket_name, file_extension=".pdf"):
    """Return the key of the most recently modified object with a given extension.

    Args:
        bucket_name: Name of the S3 bucket to search.
        file_extension: Suffix matched against object keys, compared
            case-insensitively. Defaults to ".pdf".

    Returns:
        The key of the matching object with the latest ``LastModified``
        value, or None when no object in the bucket matches.
    """
    s3 = boto3.client('s3')
    suffix = file_extension.lower()

    # A single list_objects_v2 call returns at most 1000 keys, so walk every
    # page; otherwise the newest file can be missed in larger buckets.
    paginator = s3.get_paginator('list_objects_v2')

    newest = None
    for page in paginator.paginate(Bucket=bucket_name):
        # 'Contents' is absent from a page when there is nothing to list.
        for obj in page.get('Contents', []):
            if not obj['Key'].lower().endswith(suffix):
                continue
            # Strict '>' keeps the first object seen on a timestamp tie.
            if newest is None or obj['LastModified'] > newest['LastModified']:
                newest = obj

    return newest['Key'] if newest is not None else None

def start_text(bucket_name, document_key):
    """Start an asynchronous Textract text-detection job for an S3 object.

    Args:
        bucket_name: S3 bucket that holds the document.
        document_key: Key of the document; the text after its last '.' is
            used as the file type.

    Returns:
        The ``JobId`` reported by Textract for the submitted job.

    Raises:
        ValueError: If the file type is not pdf, jpg, jpeg or png.
    """
    client = boto3.client('textract')

    # Lower-case whatever follows the final '.' to decide if the file is supported.
    suffix = document_key.rpartition('.')[2].lower()
    if suffix not in ('pdf', 'jpg', 'jpeg', 'png'):
        raise ValueError(f"Unsupported file type: {suffix}")

    # PDFs and images are submitted through the same call.
    s3_location = {'Bucket': bucket_name, 'Name': document_key}
    job = client.start_document_text_detection(
        DocumentLocation={'S3Object': s3_location}
    )

    # The JobId is what later calls use to fetch the result.
    return job['JobId']

def get_text(job_id, poll_interval=2.0):
    """Wait for a Textract text-detection job and return its full result.

    Args:
        job_id: JobId returned when the detection job was started.
        poll_interval: Seconds to sleep between status checks while the job
            is still IN_PROGRESS. Defaults to 2.0.

    Returns:
        The first result response, with ``Blocks`` extended by the blocks of
        every following result page and ``NextToken`` removed.

    Raises:
        RuntimeError: If Textract reports the job status as FAILED.
    """
    textract = boto3.client('textract')

    response = textract.get_document_text_detection(JobId=job_id)
    while response['JobStatus'] == 'IN_PROGRESS':
        # Pause between polls instead of calling the API in a tight loop.
        time.sleep(poll_interval)
        response = textract.get_document_text_detection(JobId=job_id)

    if response['JobStatus'] == 'FAILED':
        reason = response.get('StatusMessage', 'no status message')
        raise RuntimeError(f"Textract job {job_id} failed: {reason}")

    # Results are paginated; follow NextToken so later pages are not dropped.
    blocks = list(response.get('Blocks', []))
    next_token = response.get('NextToken')
    while next_token:
        page = textract.get_document_text_detection(
            JobId=job_id, NextToken=next_token
        )
        blocks.extend(page.get('Blocks', []))
        next_token = page.get('NextToken')

    response['Blocks'] = blocks
    # Every page has been consumed, so the token is no longer meaningful.
    response.pop('NextToken', None)
    return response

def extract_text(response):
    """Join the text of every LINE block in a Textract response.

    Args:
        response: Mapping with a 'Blocks' list; each block has a 'BlockType'
            and, for LINE blocks, a 'Text' entry.

    Returns:
        One string with each LINE block's text followed by a newline, in the
        order the blocks appear. Empty when there are no LINE blocks.
    """
    # Only LINE blocks contribute; other block types are skipped.
    line_texts = (
        block['Text'] + "\n"
        for block in response['Blocks']
        if block['BlockType'] == 'LINE'
    )
    return "".join(line_texts)


In [41]:
# pass the textract string to GPT

def api_key_s3(bucket, key_file):
    """Read an S3 object and return its contents as stripped UTF-8 text.

    Args:
        bucket: Name of the S3 bucket holding the key file.
        key_file: Object key of the file to read.

    Returns:
        The object's body decoded as UTF-8 with surrounding whitespace removed.
    """
    client = boto3.client('s3')
    body = client.get_object(Bucket=bucket, Key=key_file)['Body']
    raw = body.read()
    return raw.decode('utf-8').strip()

def chatgpt_to_s3(api_key_bucket, api_key_file, extracted_text, Output_Bucket, file):
    """Summarize extracted notes with the chat model and upload the result to S3.

    Args:
        api_key_bucket: S3 bucket that holds the OpenAI API key file.
        api_key_file: Object key of the API key file.
        extracted_text: Text to be summarized.
        Output_Bucket: S3 bucket that receives the summary.
        file: Name of the source document; used to build the output key by
            replacing every '.' with '_' and appending '_summary.txt'.

    Returns:
        None. The summary is uploaded and its location is printed.
    """
    openai.api_key = api_key_s3(api_key_bucket, api_key_file)

    prompt = f"Summarize these notes in a Study Guide:\n{extracted_text}."

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{'role': 'user', 'content': prompt}]
    )
    output = response['choices'][0]['message']['content'].strip()

    output_filename = file.replace('.', '_') + '_summary.txt'
    output_bytes = io.BytesIO(output.encode())

    # Create the client here: this function previously used an `s3` name it
    # never defined, which only worked when a stray global was left in the kernel.
    s3 = boto3.client('s3')
    s3.upload_fileobj(output_bytes, Output_Bucket, output_filename)

    print(f"The Study Guide is Located in {Output_Bucket}/{output_filename}")

In [44]:
# initialize main function

# Locations of the OpenAI API key file and of the generated summaries.
api_key_bucket = 'gpt-api-bucket'
api_key_file = 'GPTKey2.txt'
Output_Bucket = 'gpt-summaries'


if __name__ == "__main__":
    bucket_name = 'textract-assets'

    # Get the most recent file from the bucket
    file = top_file(bucket_name)

    # Test the value returned by top_file; `most_recent_file` exists only as
    # a local inside that function, so checking it here fails on a fresh kernel.
    if file:
        print(f"Most recent file in the bucket: {file}")

        job_id = start_text(bucket_name, file)
        print(f"Text detection job submitted. JobId: {job_id}")

        # Retrieve the results
        response = get_text(job_id)

        # Extract text from the document
        detected_text = extract_text(response)

        # pass to GPT
        chatgpt_to_s3(api_key_bucket, api_key_file, detected_text, Output_Bucket, file)

    else:
        # top_file also returns None when the bucket has objects but none
        # with the requested extension.
        print("The S3 bucket is empty.")
        



Most recent file in the bucket: Tuberous Sclerosis Complex Notes.pdf
Text detection job submitted. JobId: 7d36a8ad50cbaf27e79bd4677ff96ab0bf474c17639b0462d74cec9064f56587
The Study Guide is Located in gpt-summaries/Tuberous Sclerosis Complex Notes_pdf_summary.txt
