In [1]:
import json
import os

from helper import pdf_to_text, preprocess, create_text_chunks, read_custom_message, call_openai_api, count_tokens
from secret_key import OPENAI_API_KEY

### Define the path to the PDF you wish to summarize, load the custom message, and preprocess the text into chunks for the GPT-4 prompts

In [None]:
# Source document -- replace the placeholder with the path to your own PDF.
file_path = "path/to/your/pdf"

# Helper pipeline: pdf_to_text -> preprocess -> create_text_chunks.
# (The helpers live in helper.py; each stage keeps its own name so the
# intermediate result can be inspected from another cell.)
text = pdf_to_text(file_path)
preprocessed_text = preprocess(text)
text_chunks = create_text_chunks(preprocessed_text)

# Custom message that is passed to the API together with every chunk.
custom_message_file = 'custom_message.txt'
custom_message = read_custom_message(custom_message_file)

# Run state for the summarization loop: collected results, the request
# budget, and the running counter checked against that budget.
summaries = list()
max_api_calls, api_calls = 50, 0

### Main script: loops through the text chunks, summarizing each one (with the preset custom message), and saves the results to `processed/summaries.json`.

In [3]:
# Summarize every chunk with the custom message, then persist the results.
#
# The run state is reset here so the cell is idempotent: re-running it starts
# from an empty result list and a zeroed counter instead of appending duplicate
# entries or stopping immediately on a counter left over from a previous run.
summaries = []
api_calls = 0

for chunk in text_chunks:
    # Hard cap on the number of requests made in a single run.
    if api_calls >= max_api_calls:
        break

    summary = call_openai_api(chunk, custom_message)
    # Count every request, not only those that produced a summary; otherwise
    # a run of empty results would never be bounded by max_api_calls.
    api_calls += 1

    # A falsy result (e.g. None or "") is skipped rather than stored.
    if not summary:
        continue

    summaries.append({
        "original_chunk": chunk,
        "summary": summary,
        "token_count": count_tokens(summary),
    })

os.makedirs("processed", exist_ok=True)

# Explicit encoding so the file does not depend on the platform default.
with open("processed/summaries.json", "w", encoding="utf-8") as f:
    json.dump(summaries, f, indent=2)

### Format the summaries from the .json file into a more human-readable .txt file

In [4]:
# Load the records written by the summarization step.
with open('processed/summaries.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract the summary of each record. A separate name is used so that the
# notebook-level `summaries` list of record dicts is not silently replaced by
# a list of plain values, which would break a re-run of the summarization cell.
summary_texts = [entry['summary'] for entry in data]

# Write one summary per paragraph, separated by a blank line.
# UTF-8 is set explicitly because the text is written verbatim and may contain
# non-ASCII characters that the platform default encoding cannot represent.
with open('processed/summaries_formatted.txt', 'w', encoding='utf-8') as file:
    for summary in summary_texts:
        if isinstance(summary, str):
            # Write the text as-is. Round-tripping it through json.dumps would
            # leave wrapping quotes, \" and \uXXXX escapes in the output and
            # would corrupt a literal backslash followed by "n".
            summary_str = summary
        else:
            # Non-string values are pretty-printed as JSON so they stay legible.
            summary_str = json.dumps(summary, indent=4, ensure_ascii=False)
        file.write(summary_str)
        file.write('\n\n')