In [None]:
import requests
from bs4 import BeautifulSoup
import PyPDF2

from dotenv import load_dotenv
import os

# Locate a .env file and load its entries so the lookups below can see them.
load_dotenv()

# --- Service endpoint and credentials -------------------------------------
API_URL_BASE = os.environ.get("API_URL")
API_KEY = os.environ.get("API_KEY")

# --- Input sources ----------------------------------------------------------
PROMPT_FILE = os.environ.get("PROMPT_FILE")
RAW_TEXT_FILE = os.environ.get("RAW_TEXT_FILE")
WEBPAGE_URL = os.environ.get("WEBPAGE_URL")
PDF_FILE = os.environ.get("PDF_FILE")

# --- Output and generation settings -----------------------------------------
OUTPUT_FILE = os.environ.get("OUTPUT_FILE")
# Falls back to 0.0 when TEMPERATURE is not set in the environment.
TEMPERATURE = float(os.environ.get("TEMPERATURE", 0.0))

# The environment only holds the base URL; the key is appended as a query
# parameter to form the endpoint that requests are sent to.
API_URL = f"{API_URL_BASE}?key={API_KEY}"


def read_file(path: str) -> str:
    """Read a UTF-8 plain-text file and return its entire contents.

    Args:
        path: Location of the text file to read.

    Returns:
        The full file content as one string.

    Raises:
        SystemExit: With exit code 1, after printing an error message, when
            the file does not exist.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        print(f"Erro: {path} não encontrado.")
        # Raise SystemExit directly instead of calling ``exit``: that helper
        # is injected by the ``site`` module for interactive sessions and is
        # not meant to be relied on inside programs.
        raise SystemExit(1) from None


def fetch_webpage_text(url: str, timeout: float = 30.0) -> str:
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed,
        text nodes joined by ``" \\n"``, and leading/trailing whitespace
        stripped.

    Raises:
        SystemExit: With exit code 1, after printing an error message, when
            the request fails or the server answers with an error status.
    """
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Erro ao buscar página {url}: {e}")
        # ``exit`` is an interactive-shell helper; raise SystemExit instead.
        raise SystemExit(1) from None

    soup = BeautifulSoup(resp.text, "html.parser")
    # Drop script/style elements so their source does not end up in the text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup.get_text(separator=" \n").strip()


def extract_pdf_text(path: str) -> str:
    """Extract the text of every page of a PDF file.

    Uses the ``PdfReader`` / ``pages`` / ``extract_text`` API. The older
    ``PdfFileReader`` / ``numPages`` / ``getPage`` / ``extractText`` names
    were removed in PyPDF2 3.x, where calling them raises an error.
    NOTE(review): confirm the installed PyPDF2 version provides ``PdfReader``.

    Args:
        path: Location of the PDF file to read.

    Returns:
        The text of all pages, in page order, joined by newlines.

    Raises:
        SystemExit: With exit code 1, after printing an error message, when
            the file does not exist or cannot be parsed.
    """
    try:
        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # Extraction must happen while the file is still open. A page
            # that yields no text contributes an empty string.
            pages = [page.extract_text() or "" for page in reader.pages]
    except FileNotFoundError:
        print(f"Erro: {path} não encontrado.")
        # ``exit`` is an interactive-shell helper; raise SystemExit instead.
        raise SystemExit(1) from None
    except Exception as e:
        # Top-level boundary: any parsing failure is reported and is fatal.
        print(f"Erro ao ler PDF {path}: {e}")
        raise SystemExit(1) from None
    return "\n".join(pages)


def generate_content(
    prompt_text: str, temperature: float, timeout: float = 60.0
) -> dict:
    """POST a prompt to the configured API endpoint and return the JSON reply.

    The request body carries the prompt under ``contents[0].parts[0].text``
    and the sampling temperature under ``generationConfig.temperature``.
    NOTE(review): this looks like the Google Generative Language
    ``generateContent`` schema; confirm against the configured endpoint.

    Args:
        prompt_text: Full prompt to send.
        temperature: Sampling temperature forwarded in ``generationConfig``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection failures and timeouts.
    """
    headers = {"Content-Type": "application/json"}
    body = {
        "contents": [{"parts": [{"text": prompt_text}]}],
        "generationConfig": {"temperature": temperature},
    }
    resp = requests.post(API_URL, headers=headers, json=body, timeout=timeout)
    resp.raise_for_status()
    return resp.json()


if __name__ == "__main__":
    # Script entry point: gather the three input sources, build one prompt,
    # send it to the API, then save and print the generated text.

    # Read the sources (each helper terminates the script on failure).
    prompt_with_examples = read_file(PROMPT_FILE)
    raw_text = read_file(RAW_TEXT_FILE)
    webpage_text = fetch_webpage_text(WEBPAGE_URL)
    pdf_text = extract_pdf_text(PDF_FILE)

    # Assemble the final prompt.
    final_prompt = (
        f"{prompt_with_examples}\n\n"
        f"Raw Text to Anonymize:\n{raw_text}\n\n"
        f"Webpage Content to Anonymize (from {WEBPAGE_URL}):\n{webpage_text}\n\n"
        f"PDF Content to Anonymize (from {PDF_FILE}):\n{pdf_text}"
    )

    # Call the API. Network and HTTP failures end the script with a message
    # and exit code 1, like the other helpers, instead of a raw traceback.
    try:
        output = generate_content(final_prompt, TEMPERATURE)
    except requests.RequestException as e:
        # Do not print str(e): it can include the request URL, which carries
        # the API key as a query parameter.
        status = getattr(e.response, "status_code", None)
        print(f"Erro na chamada à API: {type(e).__name__} (status={status})")
        raise SystemExit(1) from None

    # Pull the generated text out of the response. Only the lookup sits in
    # the try block so unrelated errors are not reported as a format problem.
    try:
        response_text = output["candidates"][0]["content"]["parts"][0]["text"]
    except (KeyError, IndexError, TypeError):
        print("Erro: Formato de resposta inesperado da API.")
        print(output)
        # Signal failure to the caller instead of exiting with status 0.
        raise SystemExit(1) from None

    # Save and echo the response.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.write(response_text)
    print(response_text)


ModuleNotFoundError: No module named 'bs4'