In [2]:
import os
from google import genai
from google.genai import types
import time
from dotenv import load_dotenv

load_dotenv()
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))

In [35]:
# --- CONFIGURATION ---
SOURCE_DIR = './files/original'   # Where your PDFs are
PROCESSED_DIR = './files/processed' # Where MD files go
os.makedirs(PROCESSED_DIR, exist_ok=True)
STORE_NAME_ID = "hdj-store-v1"

In [43]:
# Create the File Search store with an optional display name
file_search_store = client.file_search_stores.create(config={'display_name': STORE_NAME_ID})
print(file_search_store)

name='fileSearchStores/hdjstorev1-r62gyruhwsj5' display_name='hdj-store-v1' create_time=datetime.datetime(2025, 11, 19, 21, 52, 33, 89352, tzinfo=TzInfo(0)) update_time=datetime.datetime(2025, 11, 19, 21, 52, 33, 89352, tzinfo=TzInfo(0)) active_documents_count=None pending_documents_count=None failed_documents_count=None size_bytes=None


In [None]:
# Get File Search store
my_file_search_store_name = ""
file_search_store = client.file_search_stores.get(name=my_file_search_store_name)
print(file_search_store)

In [38]:
# Convert files from .pdf to .md
import pymupdf4llm

md_files = []

for filename in os.listdir(SOURCE_DIR):
    if not filename.lower().endswith(".pdf"): continue
    
    file_path = os.path.join(SOURCE_DIR, filename)
    print(f"Processing: {filename}")
    
    # Get the document as a list of pages (page_chunks=True)
    # This is the KEY step for robust citation
    md_pages = pymupdf4llm.to_markdown(file_path, page_chunks=True)
    
    full_doc_content = []
    
    # Iterate through every page and inject a clear header
    for page_num, page_data in enumerate(md_pages, start=1):
        # We inject a header that the LLM cannot miss
        page_text = f"""## DOCUMENT_ID: {filename} | PAGE_NUMBER: {page_num}\n---\n{page_data['text']}\n---"""
        full_doc_content.append(page_text)
    
    # Join all pages back into one markdown file
    final_md = "\n".join(full_doc_content)
    
    # Save the processed MD file
    save_path = os.path.join(PROCESSED_DIR, f"{filename.split('.')[0]}.md")
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(final_md)

print("✅ Processing complete.")

Processing: test-doc-2.pdf
Processing: test-doc-1.pdf
✅ Processing complete.


In [44]:
# Upload and import a file into the File Search store, supply a file name which will be visible in citations
for file in os.listdir('./files/processed'):
    operation = client.file_search_stores.upload_to_file_search_store(
    file=f"./files/processed/{file}",
    file_search_store_name=file_search_store.name,
    config={
        'display_name' : file.split('.')[0],
    }
)

In [45]:
# List all file search stores
for file_search_store in client.file_search_stores.list():
    print(file_search_store)


name='fileSearchStores/hdjstorev1-r62gyruhwsj5' display_name='hdj-store-v1' create_time=datetime.datetime(2025, 11, 19, 21, 52, 33, 89352, tzinfo=TzInfo(0)) update_time=datetime.datetime(2025, 11, 19, 21, 52, 33, 89352, tzinfo=TzInfo(0)) active_documents_count=1 pending_documents_count=1 failed_documents_count=None size_bytes=183342


In [46]:
# List files in a specific file search store
files_in_store = client.file_search_stores.documents.list(parent=file_search_store.name)
for file in files_in_store:
    print(file)


name='fileSearchStores/hdjstorev1-r62gyruhwsj5/documents/testdoc1-1b9bqytksciu' display_name='test-doc-1' state=<DocumentState.STATE_ACTIVE: 'STATE_ACTIVE'> size_bytes=54346 mime_type='text/markdown' create_time=datetime.datetime(2025, 11, 19, 21, 52, 36, 549137, tzinfo=TzInfo(0)) custom_metadata=None update_time=datetime.datetime(2025, 11, 19, 21, 52, 37, 454669, tzinfo=TzInfo(0))
name='fileSearchStores/hdjstorev1-r62gyruhwsj5/documents/testdoc2-cqvw2e1fnccz' display_name='test-doc-2' state=<DocumentState.STATE_ACTIVE: 'STATE_ACTIVE'> size_bytes=128996 mime_type='text/markdown' create_time=datetime.datetime(2025, 11, 19, 21, 52, 38, 437656, tzinfo=TzInfo(0)) custom_metadata=None update_time=datetime.datetime(2025, 11, 19, 21, 52, 40, 40235, tzinfo=TzInfo(0))


In [42]:
# Delete all file search stores
for file_search_store in client.file_search_stores.list():
    client.file_search_stores.delete(name=file_search_store.name, config={'force': True})

In [None]:
json_schema = {
  "type": "object",
  "properties": {
    "analysis_results": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "relevance_score": {"type": "integer"},
          "hdj_themes": {"type": "array", "items": {"type": "string"}},
          "excerpt_summary": {"type": "string"},
          "source_excerpt_text": {"type": "string"},
          # NEW FIELD: Page Number
          "source_page_number": {
              "type": "integer", 
              "description": "The exact page number found in the section header (e.g., from 'PAGE_NUMBER: 5')."
          },
          "source_filename": {"type": "string"}
        },
        "required": ["relevance_score", "hdj_themes", "excerpt_summary", "source_excerpt_text", "source_page_number", "source_filename"]
      }
    }
  },
  "required": ["analysis_results"]
}

contents = """
**Role:** Specialized Policy Analyst.

**Task:** Search the uploaded files for mentions of Health Data Justice (Fairness, Transparency, Accountability, Inclusion).

**STRICT GROUNDING RULES:**
1. **ONLY** use the content found in the "Context" provided by the File Search tool.
2. **NEVER** use your internal training knowledge to answer.
3. **VERIFY FILENAMES:** You must extract the `source_filename` *strictly* from the `__CTX: Doc=...` tag in the text.
4. **IF THE TAG IS MISSING:** Do not use that text chunk.
5. **IF NO RELEVANT SECTIONS ARE FOUND:** Return an empty list `[]` for `analysis_results`. **Do not invent a response.**
6. **PROHIBITION:** Do not use the filename "Health Data Justice.pdf" unless it literally exists in the store.

**Output:** Return the results strictly according to the provided JSON Schema.
"""

In [50]:
# Ask a question about the file
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=contents,
    config=types.GenerateContentConfig(
        response_schema=json_schema,
        response_mime_type="application/json",
        tools=[
            types.Tool(
                file_search=types.FileSearch(
                    file_search_store_names=[file_search_store.name]
                )
            )
        ],
    )
)

print(response.text)

Here is the JSON requested:
```json
{
  "analysis_results": [
    {
      "relevance_score": 9,
      "hdj_themes": [
        "Accountability",
        "Fairness"
      ],
      "excerpt_summary": "The General Data Protection Regulation (GDPR) mandates data protection by design and by default, requiring organizations to integrate safeguards into their processing activities to protect data subjects' rights and freedoms, emphasizing accountability and fairness in data handling.",
      "source_excerpt_text": "Die DSGVO verpflichtet zudem zu Datenschutz durch Technikgestaltung und durch datenschutzfreundliche Voreinstellungen (Art. 25 Abs. 1 und 2 DSGVO). Damit sind Verantwortliche verpflichtet, bereits in der Planungsphase von Datenverarbeitungen technische und organisatorische Maßnahmen so zu gestalten, dass sie die Rechte und Freiheiten der betroffenen Personen schützen. Dieses technische Datenschutzrecht ist somit integraler Bestandteil einer verantwortungsvollen Datenverarbeitungspra

In [49]:
# Run this specific check
debug_prompt = """
SEARCH QUERY: "Health Data Justice fairness transparency"

INSTRUCTIONS:
1. Search the provided files.
2. List the **exact filenames** of the documents you found.
3. If you found no documents, strictly say "NO DOCUMENTS FOUND".
4. Do NOT answer the question using your own knowledge.
"""

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=debug_prompt,
    config=types.GenerateContentConfig(
        tools=[
            types.Tool(
                file_search=types.FileSearch(
                    file_search_store_names=[file_search_store.name]
                )
            )
        ]
    )
)

print(response.text)

The exact filenames of the documents found are:
*   test-doc-2.pdf
*   test-doc-1.pdf
