In [None]:
## google
from google import genai
from google.genai import types
import httpx
import re
from dotenv import load_dotenv
import os
from notion_client import Client
from utils import write_to_notion, extract_section

# Load API keys and Notion settings from a local .env file into the environment.
load_dotenv()

# Gemini setup
# genai.Client() is constructed without arguments, so it takes its API key
# from the environment populated by load_dotenv() above.
gclient = genai.Client()

# Instruction sent to the model together with the PDF bytes. The document is
# attached inline (no URL is passed to the model), so the prompt refers to the
# attached PDF. The four section names requested here are what the processing
# cell relies on when it pulls the 'Title' section out of the reply.
prompt = """You are a concise document summariser. Read the attached PDF document and
            return a clear, structured summary with key points, important details, and conclusions.
            Present your output in four sections: 'Title', 'Abstract', 'Summary' and 'Extended Summary'.

            In 'Title', extract the document title.
            In 'Abstract', extract the document abstract.
            Into 'Summary', include only the most critical information in brief bullet points.
            In 'Extended Summary', provide a more detailed explanation with relevant context.
            Format your response using markdown with appropriate headings and bullet points.
            """

# Notion setup
# Read both settings with os.environ[...] so a missing variable raises a
# KeyError here, once, instead of surfacing later as a per-document failure.
NOTION_TOKEN = os.environ["NOTION_TOKEN"]
DATABASE_ID = os.environ["NOTION_DATABASE_ID"]
notion = Client(auth=NOTION_TOKEN)

In [2]:
# PDF documents to summarise: add one direct PDF URL per entry.
urls = [
    "https://arxiv.org/pdf/2402.15332",
    "https://arxiv.org/pdf/2104.13478",
]

In [None]:
# Summarise each PDF with Gemini and file the result as a Notion page.
# A failure on one document is reported and the loop moves on to the next.
for i, url in enumerate(urls):
    try:
        print(f"Processing document {i+1}/{len(urls)}: {url}")

        # Download the PDF bytes. httpx neither follows redirects nor raises
        # on 4xx/5xx by default, so opt in to both; otherwise an error or
        # redirect body would be handed to the model as if it were a PDF.
        pdf_response = httpx.get(url, follow_redirects=True)
        pdf_response.raise_for_status()
        doc_data = pdf_response.content

        # Summarise the document: the PDF is attached inline, followed by the
        # instruction prompt.
        response = gclient.models.generate_content(
            model="gemini-2.5-flash",
            contents=[
                types.Part.from_bytes(
                    data=doc_data,
                    mime_type='application/pdf',
                ),
                prompt,
            ],
        )

        # Guard against a reply with no text so the failure is reported with
        # a clear message instead of an opaque error further down.
        summary_md = response.text
        if not summary_md:
            raise ValueError("Gemini returned no text for this document")

        # Extract title from the response
        # NOTE(review): the recorded output of this cell shows
        # "Extracted Title: Title" for every document, so extract_section may
        # be returning the heading rather than the section body; verify the
        # helper in utils.
        title = extract_section(summary_md, 'Title')
        print(f"Extracted Title: {title}")

        # Write to Notion
        result = write_to_notion(title, url, summary_md, NOTION_TOKEN, DATABASE_ID)
        print(f"Page created successfully: {result['id']}")

    except httpx.HTTPError as e:
        # Network failures and non-2xx responses from the download step.
        print(f"❌ Error fetching PDF from {url}: {e}")
    except Exception as e:
        # Deliberate catch-all so one bad document does not abort the batch;
        # the exception type is printed to keep the failure diagnosable.
        print(f"❌ Error processing {url}: {type(e).__name__}: {e}")

print("\n✅ Processing complete!")


Processing document 1/2: https://arxiv.org/pdf/2402.15332
Extracted Title: Title
Page created successfully: 2e2be5eb-a41a-81ed-9c51-cd07002dc8e6
Processing document 2/2: https://arxiv.org/pdf/2104.13478
Extracted Title: Title
Page created successfully: 2e2be5eb-a41a-81e3-a14a-c44cc11b2cee

✅ Processing complete!
