In [2]:
from litellm import completion
from base64 import b64encode

import fitz

In [None]:
def encode_image(image_path):
    """Read the file at image_path and return its bytes as a base64 string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    # b64encode yields bytes; decode so the result can be embedded in text
    encoded = b64encode(raw_bytes)
    return encoded.decode("utf-8")

def build_reviewer_message(reviewer_prompt, file_content):
    """Build a single user chat message from a reviewer prompt and file content.

    Args:
        reviewer_prompt: dict with 'before' and 'after' strings that are
            placed around the file text.
        file_content: dict with a 'text' string and, optionally, an
            'image_paths' iterable of image file paths.

    Returns:
        A dict {"role": "user", "content": [...]} whose content starts with
        one text part, followed by one "image_url" part (base64 data URL)
        per image path.
    """
    import mimetypes

    prompt_text = reviewer_prompt['before'] + file_content['text'] + reviewer_prompt['after']
    content = [{"type": "text", "text": prompt_text}]

    # 'image_paths' is optional: text-only content dicts (such as the one
    # extract_text returns) have no such key, and None/empty means no images.
    for path in file_content.get('image_paths') or ():
        # Label the data URL with the type implied by the file extension;
        # keep the previous "image/jpeg" label when it cannot be determined.
        mime_type, _ = mimetypes.guess_type(str(path))
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{encode_image(path)}",
                },
            }
        )
    return {"role": "user", "content": content}

def chunk_text(text, max_tokens=2000, token_ratio=0.75):
    """
    Chunk text into segments that roughly match the token limit.
    Assumes 1 token ≈ token_ratio words (default ~0.75 for GPT-4), so a
    budget of max_tokens tokens corresponds to max_tokens * token_ratio words.

    Args:
        text: The text to split; words are separated on any whitespace.
        max_tokens: Approximate token budget per chunk (must be positive).
        token_ratio: Estimated words per token (must be positive).

    Returns:
        A list of chunks, each a single-space-joined run of at most
        max(1, int(max_tokens * token_ratio)) words. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If max_tokens or token_ratio is not positive.
    """
    if max_tokens <= 0 or token_ratio <= 0:
        raise ValueError("max_tokens and token_ratio must be positive")

    # words = tokens * (words per token). Never go below one word per chunk,
    # otherwise range() would be given a zero step.
    max_words = max(1, int(max_tokens * token_ratio))
    words = text.split()

    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

In [None]:
def extract_text(file_path):
    """Extract the text of a PDF, skipping pages with no text.

    Args:
        file_path: Location of the PDF; passed unchanged to fitz.open.

    Returns:
        A file_content dict with a single field 'text': the text of every
        non-blank page, joined with newlines.
    """
    # The context manager closes the document even if a page fails to
    # extract, so the file handle is never leaked.
    with fitz.open(file_path) as doc:
        page_texts = (page.get_text() for page in doc)
        # Skip empty pages
        all_text = [text for text in page_texts if text.strip()]

    return {'text': "\n".join(all_text)}


    

In [None]:


def test(pdf_file_name, model, reviewer_prompt, extract_content_message):
    """Extract content from a PDF, send it to a model for review, return the reply text.

    Args:
        pdf_file_name: Passed unchanged to extract_content.
        model: Model identifier forwarded to litellm's completion().
        reviewer_prompt: dict with 'before' and 'after' strings, as expected
            by build_reviewer_message.
        extract_content_message: Currently unused; kept so existing callers
            keep working.

    Returns:
        The text content of the first choice in the model response.
    """
    # NOTE(review): extract_content is not defined in the visible part of this
    # file; presumably it returns a file_content dict — confirm.
    content = extract_content(pdf_file_name)
    message = build_reviewer_message(reviewer_prompt, content)

    # completion() takes a list of chat messages, while
    # build_reviewer_message returns a single message dict, so wrap it.
    response = completion(model=model, messages=[message])
    return response["choices"][0]["message"]["content"]
