# In [2]:
import requests
import fitz  # PyMuPDF
from io import BytesIO
import tiktoken  # OpenAI's tokenizer

def count_tokens(text, encoding_name="cl100k_base"):
    """Count the tokens in a text string using one of OpenAI's tiktoken encodings.

    Args:
        text: The string to tokenize.
        encoding_name: Name of the tiktoken encoding to use. Defaults to
            "cl100k_base" (the GPT-4 / GPT-3.5 tokenizer).
            NOTE(review): some models in the rate table (e.g. the gpt-4o
            family) appear to use a different encoding ("o200k_base"), so
            counts from the default are estimates for them -- confirm.

    Returns:
        int: The number of tokens the encoding produces for ``text``.
    """
    enc = tiktoken.get_encoding(encoding_name)
    # By default tiktoken raises ValueError when the input contains the literal
    # text of a special token (e.g. "<|endoftext|>"). Arbitrary document text is
    # being counted here, so encode such sequences as ordinary text instead.
    return len(enc.encode(text, disallowed_special=()))

# Price of each model in US dollars per one million (1,000,000) tokens.
api_rates = {
    "gpt-4.5-preview": 75,
    "gpt-4o": 2.5,
    "gpt-4o-audio-preview": 2.5,
    "gpt-4o-realtime-preview": 5,
    "gpt-4o-mini": 0.15,
    "gpt-4o-mini-audio-preview": 0.15,
    "gpt-4o-mini-realtime-preview": 0.6,
    "o1": 15,
    "o3-mini": 1.1,
    "o1-mini": 1.1,
}

# `genai` must be one of the keys of the api_rates dictionary
def token_cost(genai, token_count):
    """Return the cost of 'token_count' tokens for the model 'genai'.

    The rate is looked up in api_rates (dollars per one million tokens) and
    the result is rounded to four decimal places. An unknown model name
    raises KeyError from the dictionary lookup.
    """
    rate_per_million = api_rates[genai]
    millions_of_tokens = token_count / 1000000
    return round(millions_of_tokens * rate_per_million, 4)

# Reads a PDF from a local path or a URL -- a URL needs to start with 'http'
def extract_full_pdf_text(source):
    """Fetch a PDF from a URL or local file and return its raw text.

    Args:
        source: Either a URL (anything starting with "http", compared
            case-insensitively) or a path to a local PDF file.

    Returns:
        str: The text of every page joined with newlines, with leading and
        trailing whitespace stripped.

    Raises:
        ValueError: If the URL responds with a status code other than 200.
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
    """
    if source.lower().startswith("http"):
        # It's a URL, download the PDF. The User-Agent header is actually sent
        # (the original built it but never passed it), and a timeout keeps an
        # unresponsive server from hanging the call forever.
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(source, headers=headers, timeout=30)

        if response.status_code != 200:
            raise ValueError(f"Failed to fetch PDF from URL: {source}")

        # Open the downloaded bytes in memory; filetype tells PyMuPDF how to
        # interpret the stream since there is no file name to infer it from.
        document = fitz.open(stream=response.content, filetype="pdf")
    else:
        # It's a local file
        document = fitz.open(source)

    # Extract full text from all pages; always release the document, even if
    # text extraction fails part-way through.
    try:
        full_text = "\n".join(page.get_text("text") for page in document)
    finally:
        document.close()

    return full_text.strip()
def get_est_genai_cost(source, ai):
    """Estimate the token count and API cost of a PDF or plain-text document.

    A 'source' ending in '.txt' (case-insensitive) is read as a local UTF-8
    text file; anything else is handed to extract_full_pdf_text, which
    accepts a local PDF path or a URL. 'ai' is the model name used to look
    up the rate.

    Returns a dict with the keys "Total Tokens" and "Estimated API Cost ($)".
    """
    is_plain_text = source.lower().endswith('.txt')

    if not is_plain_text:
        # PDF on disk or behind a URL
        document_text = extract_full_pdf_text(source)
    else:
        # Plain text file: read it as-is
        with open(source, 'r', encoding='utf-8') as handle:
            document_text = handle.read()

    n_tokens = count_tokens(document_text)
    estimated_cost = token_cost(ai, n_tokens)

    return {
        "Total Tokens": n_tokens,
        "Estimated API Cost ($)": estimated_cost,
    }

# Example usage: estimate the "gpt-4o-mini" cost of the same document in two
# forms and print both results.
# 1) The PDF itself, downloaded from its URL (the source starts with "http").
pdf_url = "https://www.finra.org/sites/default/files/2025-03/Regulatory-Notice-25-03.pdf"
token_info = get_est_genai_cost(pdf_url, "gpt-4o-mini")
print("PDF Cost", token_info)
# 2) A plain-text version; the ".txt" suffix makes get_est_genai_cost read it
#    as a local UTF-8 file, so "pdf_text.txt" must already exist in the
#    working directory.
text_version_path = "pdf_text.txt"
token_info = get_est_genai_cost(text_version_path, "gpt-4o-mini")
print("Extracted Text Cost", token_info)

# Output:
# PDF Cost {'Total Tokens': 2007, 'Estimated API Cost ($)': 0.0003}
# Extracted Text Cost {'Total Tokens': 1689, 'Estimated API Cost ($)': 0.0003}
