In [1]:
# Agentic researcher that examines today's DOGV (Diari Oficial de la Generalitat Valenciana)
# looking for projects and opportunities.
# Uses gpt-4o-mini

In [2]:
# imports
import os
import requests
import io
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
from datetime import date
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams

In [3]:
# Load the OpenAI API key from a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace test runs before the prefix test: a key with a leading space
# or tab would otherwise be reported as having the wrong prefix, hiding the
# real problem.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [4]:
def get_today_dogv_pdf(
    lang: str = "es",
    *,
    index_start: int = 10200,
    index_stop: int = 10300,
    timeout: float = 10,
) -> str | None:
    """
    Return the URL of the DOGV summary PDF for today's date, if available.

    Candidate URLs are probed sequentially with HTTP HEAD requests, one per
    index number in ``range(index_start, index_stop)``, inside today's folder.
    The first URL answering with status 200 is returned.

    Example URL (``lang="va"``):
      https://dogv.gva.es/datos/2025/09/29/pdf/sumario_2025_10204_va.pdf

    Args:
        lang: Language suffix used in the file name (e.g. "es" or "va").
        index_start: First bulletin index number to probe (inclusive).
        index_stop: Bulletin index number at which probing stops (exclusive).
            NOTE(review): the default window 10200-10299 is a fixed guess and
            will need adjusting as bulletin numbers move on.
        timeout: Seconds to wait for each HEAD request before giving up.

    Returns:
        The URL of the first PDF found, or None when no candidate exists.

    Raises:
        requests.RequestException: If a probe fails at the network level
            (connection error, timeout, ...).
    """
    today = date.today()
    year, month, day = today.year, f"{today.month:02d}", f"{today.day:02d}"
    folder = f"https://dogv.gva.es/datos/{year}/{month}/{day}/pdf"

    # One session for all probes so the connection to the host is reused.
    with requests.Session() as session:
        for index in range(index_start, index_stop):
            url = f"{folder}/sumario_{year}_{index}_{lang}.pdf"
            # A timeout keeps an unresponsive server from hanging the notebook.
            response = session.head(url, timeout=timeout)
            if response.status_code == 200:
                return url

    print("No hay boletín publicado hoy.")
    return None

In [5]:
# Read a PDF from a URL (no local save) and return plain text

# HTTP headers sent with the PDF download request in pdf_url_to_text().
# NOTE(review): a desktop-browser User-Agent is used, presumably because the
# server treats the default python-requests agent differently — confirm.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

def pdf_url_to_text(url: str, *, timeout: int = 30, max_bytes: int = 25_000_000) -> str:
    """Fetch a PDF from `url` into memory and extract plain text without writing to disk.

    The response body is streamed in chunks so that `max_bytes` is enforced
    while downloading, instead of after the whole file is already in memory.

    Args:
        url: Address of the PDF document.
        timeout: Seconds passed to `requests.get` as the request timeout.
        max_bytes: Maximum number of body bytes accepted.

    Returns:
        The text extracted from the PDF by pdfminer.

    Raises:
        ValueError: If `url` is empty/None, the response does not look like a
            PDF, or the body exceeds `max_bytes`.
        requests.HTTPError: If the server answers with an error status.
    """
    if not url:
        raise ValueError("No PDF URL was provided")

    # stream=True defers the body download; the context manager releases the
    # connection even when one of the checks below raises.
    with requests.get(url, headers=HEADERS, timeout=timeout, stream=True) as resp:
        resp.raise_for_status()

        ctype = (resp.headers.get("Content-Type") or "").lower()
        if "pdf" not in ctype and not url.lower().endswith(".pdf"):
            raise ValueError(f"URL doesn't look like a PDF (Content-Type: {ctype or 'unknown'})")

        # Cheap early rejection when the server declares the size up front.
        declared = resp.headers.get("Content-Length") or ""
        if declared.isdigit() and int(declared) > max_bytes:
            raise ValueError(f"PDF too large: {int(declared):,} bytes > limit {max_bytes:,} bytes")

        # The declared size may be missing or wrong, so also count what arrives.
        buffer = io.BytesIO()
        for chunk in resp.iter_content(chunk_size=64 * 1024):
            buffer.write(chunk)
            if buffer.tell() > max_bytes:
                raise ValueError(f"PDF too large: {buffer.tell():,} bytes > limit {max_bytes:,} bytes")

    buffer.seek(0)
    laparams = LAParams()  # tweak if needed: line_margin, char_margin, etc.
    return extract_text(buffer, laparams=laparams)

In [6]:
# System prompt: tells the model what to look for in the bulletin text, how to
# score each item and which output shape to return.
# The example reference and the example URL below must carry the same number,
# otherwise the URL-construction rule is ambiguous for the model.
system_prompt = """
You are an expert AI analyst specialized in public administration, procurement, and funding in Spain.

Analyze raw text from official bulletins (DOGV, BOE, etc.) to find opportunities for a self-employed consultant in:

Experience: 
* Information technologies
* Artificial Intelligence
* Machine learning
* AI Agents
* Mobile Apps
* Software engineering

Products:
* CI Optimization - Selective testing
* CI Optimization - Actions refactoring
* CI Optimization - Custom Pull-Request reviewing agent
* Multi-agent system developing code on top of Git
* Conconi effort test app - Android
* Bluetooth connectivity library - Android

### Steps

1. **Detect type**
   Classify each section as:

* “Ayuda/Subvención”
* “Licitación/Contrato”
* “Programa o convocatoria”
* “Otros” (ignore)

2. **Extract info**
   For each relevant item include:

* Title
* Entity
* Summary / purpose
* Budget
* Deadline
* Reference
* URL
* Category
* Matched keywords

3. **Relevance score (0–100)**

* +50 AI/ML, data, software
* +30 ICT, consulting, innovation
* +10 possibly tech
* −50 unrelated (construction, cleaning, etc.)

4. **Output**
   Return JSON sorted by score:

```json
[
  {
    "category": "Ayuda/Subvención",
    "title": "...",
    "entity": "...",
    "summary": "...",
    "budget": "...",
    "deadline": "...",
    "reference": "...",
    "url": "...",
    "score": 85
  }
]
```

If none found → `[]`.
If the text mentions a reference such as "DOGV-C-2025-40687", construct the URL using:
`https://dogv.gva.es/va/resultat-dogv?signatura=2025/40687`
- Replace the last number (40687) with the reference’s numeric part.
- If date info is not available, leave the URL field null.

5. **Language**

* Answer in Spanish
* 1–2 sentence summaries
* No speculation
* Markdown format

6. **Goal**
   Show current AI/ML/IT consultant opportunities clearly and briefly.
"""

In [7]:
# Locate today's bulletin, download it and build the user prompt from its text.
todays_dogv_url = get_today_dogv_pdf()
if todays_dogv_url is None:
    # get_today_dogv_pdf() returns None when no bulletin is found; stop here
    # with a clear message instead of handing None to the downloader.
    raise RuntimeError("No DOGV bulletin was found for today; nothing to analyze.")

dogv_text = pdf_url_to_text(todays_dogv_url)
user_prompt = "\nStructure the following information:\n\n" + dogv_text

In [8]:
#print(user_prompt)

In [9]:
# See how this function creates exactly the format above

def messages_for():
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

In [10]:
# Try this out: inspect the messages that will be sent to the model
#messages_for()

In [11]:
# Create the OpenAI client instance used by call_api().
# NOTE(review): no key is passed explicitly, so the client presumably picks up
# OPENAI_API_KEY from the environment loaded earlier — confirm.
openai = OpenAI()

## Time to bring it together - the API for OpenAI is very simple!

In [12]:
# And now: call the OpenAI API. You will get very familiar with this!
def call_api(model: str = "gpt-4o-mini"):
    """Send the prepared messages to the chat completions API.

    Args:
        model: Name of the chat model to query. Defaults to "gpt-4o-mini",
            the model this notebook was written for.

    Returns:
        The content of the first choice's message in the response.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(),
    )
    return response.choices[0].message.content

In [13]:
def display_research():
    """Query the model through call_api() and render its reply as Markdown."""
    display(Markdown(call_api()))

In [14]:
# Run the pipeline: call the model and render its answer as the cell output.
display_research()

```json
[
  {
    "category": "Licitación/Contrato",
    "title": "Convocatoria de plazas de técnico o técnica informáticos",
    "entity": "Ayuntamiento de Castellón de la Plana",
    "summary": "Anuncio en extracto de las bases de la convocatoria de técnico o técnica informáticos.",
    "budget": null,
    "deadline": null,
    "reference": "DOGV-C-2025-46071",
    "url": "https://dogv.gva.es/va/resultat-dogv?signatura=2025/46071",
    "score": 40
  },
  {
    "category": "Programa o convocatoria",
    "title": "Oferta pública de empleo para la contratación indefinida",
    "entity": "Conselleria de Sanidad",
    "summary": "Anuncio de la apertura de un plazo de presentación de solicitudes para la selección de una plaza mediante contrato indefinido, dentro de la oferta pública de empleo 2025.",
    "budget": null,
    "deadline": null,
    "reference": "DOGV-C-2025-45925",
    "url": "https://dogv.gva.es/va/resultat-dogv?signatura=2025/45925",
    "score": 10
  },
  {
    "category": "Ayuda/Subvención",
    "title": "Subvenciones para el fomento del valenciano y el multilingüismo en el ámbito musical",
    "entity": "Conselleria de Educación, Cultura, Universidades y Empleo",
    "summary": "Se conceden subvenciones para el fomento del valenciano y el multilingüismo en el ámbito musical para el ejercicio presupuestario de 2025.",
    "budget": null,
    "deadline": null,
    "reference": "DOGV-C-2025-45857",
    "url": "https://dogv.gva.es/va/resultat-dogv?signatura=2025/45857",
    "score": 0
  }
]
```