In [None]:
import os
import openai
import pathlib
import wave
import chunked_tts
import base64

In [None]:
# Run configuration: everything a reader might want to tune lives in this cell.
#
# Input PDF. Set the PAPER_PDF environment variable to point the notebook at a
# different paper without editing this cell; the original local path remains the
# fallback so existing runs behave exactly as before.
FILEPATH = pathlib.Path(
    os.environ.get(
        'PAPER_PDF',
        '/Users/Itai/Downloads/s41586-025-09442-9_reference.pdf',
    )
).expanduser()
PPMT = 12  # price per million tokens for audio generation, in dollars; feeds the cost estimate.
PRICE_CAP = 0.5  # price cap for the audio generation, in dollars; synthesis only runs below it.
OUTPUT_FILE = 'saved_paper.wav'  # destination of the generated audio
TEXT_MODEL = "gpt-4.1-mini-2025-04-14"  # chat model used to extract the narratable text from the PDF
TTS_MODEL = "gpt-4o-mini-tts"
client = openai.OpenAI()
speech_file_path = pathlib.Path(OUTPUT_FILE)

In [None]:
# Ask the text model to extract the narratable text from the PDF.
#
# Adjacent string literals are joined by the parser. Every fragment ends with a
# space so the sentences do not run together in the prompt sent to the model
# (the previous `+` concatenation produced "headings.Do not" and
# "bibliography,reporting summary").
prompt = (
    "Print out all of the text in the paper that a narrator would read aloud. "
    "Include the title and section headings. "
    "Do not include citation numbers. "
    "Do not include methods, acknowledgements, bibliography, "
    "reporting summary, or competing interests."
)

# Read the raw PDF bytes and base64-encode them so the file can be embedded
# in the request as a data URL.
data = FILEPATH.read_bytes()
base64_string = base64.b64encode(data).decode("utf-8")

# Single user message with two content parts: the PDF itself, then the instruction.
parsed_response = client.chat.completions.create(
    model=TEXT_MODEL,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "file",
                    "file": {
                        "filename": FILEPATH.name,
                        "file_data": f"data:application/pdf;base64,{base64_string}",
                    }
                },
                {
                    "type": "text",
                    "text": prompt,
                }
            ],
        },
    ],
)


In [None]:
## Review the extracted text
# Print the model's reply so it can be checked by eye before the later cells
# estimate the cost and generate audio from it.
review_text = parsed_response.choices[0].message.content
print(review_text)

In [None]:
# Estimate the cost of the TTS (may be very inaccurate).
# The extracted text is tokenized with the gpt-4o-mini encoding and the token
# count is priced at PPMT dollars per million tokens.

import tiktoken

tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")
extracted_text = parsed_response.choices[0].message.content
encoded = tokenizer.encode(extracted_text)
token_count = len(encoded)
print(token_count)

cost_estimate = PPMT * token_count / 1_000_000
print(f'estimated cost = {cost_estimate}$')

In [None]:
# Generate the audio, but only when the estimated cost is below the price cap.
if cost_estimate < PRICE_CAP:
    chunks = chunked_tts.chunk_text_by_lines(parsed_response.choices[0].message.content)
    print(f"Splitting input into {len(chunks)} chunks…")

    # Collect the per-chunk audio in a list and join once at the end: repeated
    # `bytes +=` re-copies the whole accumulated buffer on every iteration.
    pcm_parts = []
    for idx, chunk in enumerate(chunks, 1):
        print(f"→ Processing chunk {idx}/{len(chunks)} (chars={len(chunk)})")
        pcm_parts.append(chunked_tts.tts_chunk(client, chunk))
    pcm_all = b"".join(pcm_parts)

    chunked_tts.wave_file(OUTPUT_FILE, pcm_all)
    # NOTE(review): the duration below assumes 24000 samples/s at 2 bytes per
    # sample, single channel — confirm against what chunked_tts.tts_chunk returns.
    print(f"Wrote {OUTPUT_FILE} (size={len(pcm_all)/24000/2:.1f} s)")
else:
    # Make the skip explicit instead of finishing the cell silently.
    print(
        f"Skipping audio generation: estimated cost {cost_estimate}$ "
        f"is not below the price cap of {PRICE_CAP}$"
    )