<a href="https://colab.research.google.com/github/jospaquim/servir-jobs-scraper/blob/main/servir.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Playwright and its pytest plugin into the notebook's runtime.
!pip install pytest-playwright playwright
# Install the operating-system packages that Playwright's browsers depend on.
!playwright install-deps
# Download the Chromium build that the scraper below launches.
!playwright install chromium

Collecting pytest-playwright
  Downloading pytest_playwright-0.7.2-py3-none-any.whl.metadata (14 kB)
Collecting playwright
  Downloading playwright-1.58.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pytest-base-url<3.0.0,>=1.0.0 (from pytest-playwright)
  Downloading pytest_base_url-2.1.0-py3-none-any.whl.metadata (6.6 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.1-py3-none-any.whl.metadata (3.0 kB)
Downloading pytest_playwright-0.7.2-py3-none-any.whl (16 kB)
Downloading playwright-1.58.0-py3-none-manylinux1_x86_64.whl (46.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.1-py3-none-any.whl (15 kB)
Downloading pytest_base_url-2.1.0-py3-none-any.whl (5.3 kB)
Installing collected packages: pyee, pytest-base-url, playwright, pytest-playwright
Successfully installed playwright-1.58.0 pyee-13.0.1 pytest-base-url-2.1.0 pytest-playwright-0.7.2
Install

In [13]:
import asyncio
import nest_asyncio
import pandas as pd
import random
from playwright.async_api import async_playwright

nest_asyncio.apply()

async def get_servir_dataframe(limit_pages=3):
    """Scrape SERVIR job offers for Lima matching "SISTEMAS" into a DataFrame.

    Opens the public job-offers page in headless Chromium, applies the
    department filter (value '15') and the position keyword 'SISTEMAS', then
    walks the result pages and turns every ".cuadro-vacantes" card into one
    row.

    Args:
        limit_pages: Maximum number of result pages to read. Values below 1
            read nothing and produce an empty frame.

    Returns:
        pandas.DataFrame with one row per offer. Columns are "puesto",
        "entidad" and one column per label found in the card (lower-cased,
        spaces replaced by underscores, accents kept). Runs of whitespace in
        string cells are collapsed to a single space. If scraping fails midway
        the error is printed and the rows collected so far are returned.
    """
    # Local import: keeps the new name scoped to this function; the package is
    # the same one the notebook already imports async_playwright from.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    selector_sig = "button:has-text('Sig.')"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
        page = await context.new_page()

        url = "https://app.servir.gob.pe/DifusionOfertasExterno/faces/consultas/ofertas_laborales.xhtml"

        datos_totales = []

        try:
            await page.goto(url, wait_until="networkidle", timeout=60000)

            # Filters: department LIMA (15) and position keyword SISTEMAS.
            await page.select_option('select[id$="cboDep_input"]', '15')
            await page.fill('input[id$="txtPuesto"]', 'SISTEMAS')
            await page.click('button:has-text("Buscar")')

            # Fixed pause to let the search results start rendering.
            await page.wait_for_timeout(4000)

            for i in range(1, limit_pages + 1):
                print(f"--- Scraping Page {i} ---")
                # Make sure the offer cards exist before reading them.
                await page.wait_for_selector(".cuadro-vacantes", timeout=20000)

                # Grab every card on the current page.
                cuadros = await page.locator(".cuadro-vacantes").all()
                print(f"   -> Encontrados {len(cuadros)} registros en la página {i}.")

                for cuadro in cuadros:
                    item = {
                        "puesto": (await cuadro.locator(".titulo-vacante").inner_text()).strip(),
                        "entidad": (await cuadro.locator(".nombre-entidad").inner_text()).strip(),
                    }

                    # Dynamic label extraction (sub-titulo : detalle-sp).
                    detalles = await cuadro.locator(".row.box-mb").all()
                    for d in detalles:
                        sub = await d.locator(".sub-titulo").all_text_contents()
                        val = await d.locator(".detalle-sp").all_text_contents()
                        if sub and val:
                            # Normalise the column name: "Ubicación:" -> "ubicación".
                            # Accents are kept on purpose; the cleaning cell
                            # looks columns up by their accented names.
                            label = sub[0].replace(":", "").strip().lower().replace(" ", "_")
                            item[label] = val[0].strip()

                    datos_totales.append(item)

                # Navigation to the next page.
                if i >= limit_pages:
                    break

                # A missing button means there is nothing to paginate; checking
                # count() first avoids is_enabled() waiting for its timeout.
                if await page.locator(selector_sig).count() == 0:
                    break
                boton_sig = page.locator(selector_sig).first
                if not await boton_sig.is_enabled():
                    break

                texto_antes = await page.locator(".btn-paginator-cnt").first.inner_text()
                await boton_sig.click()

                # Wait until the page counter really changes. The previous text
                # is passed as an argument instead of being pasted into the JS
                # source, so quotes or newlines in it cannot break the script.
                try:
                    await page.wait_for_function(
                        "anterior => document.querySelector('.btn-paginator-cnt').innerText !== anterior",
                        arg=texto_antes,
                        timeout=15000,
                    )
                except PlaywrightTimeoutError:
                    print("El servidor no respondió el cambio de página a tiempo o no hay más páginas.")
                    # The page did not advance: stop here rather than scraping
                    # the same page again and duplicating its rows.
                    break

                # Random pause between pages.
                await page.wait_for_timeout(random.randint(1000, 2500))

        except Exception as exc:
            # Best effort: report the failure and fall through so the rows
            # gathered before the error are still returned.
            print(f"Error durante el scraping; se devuelven {len(datos_totales)} registros parciales: {exc!r}")
        finally:
            await browser.close()

    df = pd.DataFrame(datos_totales)

    # Collapse line breaks and repeated whitespace in every string cell.
    # DataFrame.map replaces the deprecated applymap on pandas >= 2.1.
    limpiar = df.map if hasattr(df, "map") else df.applymap
    return limpiar(lambda x: " ".join(x.split()) if isinstance(x, str) else x)


In [14]:

def desinfectar_dataframe(df):
    """Return a cleaned copy of a scraped job-offers DataFrame.

    Steps:
      * column labels are whitespace-collapsed, stripped of ":", lower-cased
        and get "_" instead of spaces;
      * every string cell has runs of whitespace collapsed to one space;
      * if a "remuneración" column exists, a float column "remuneracion_num"
        is added (e.g. "S/. 8,364.19" -> 8364.19);
      * the publication start/end date columns, when present, are parsed as
        day/month/year datetimes (unparsable values become NaT).

    Args:
        df: pandas.DataFrame to clean. It is not modified.

    Returns:
        A new cleaned DataFrame, or ``df`` itself when it is empty.
    """
    if df.empty:
        return df

    # Work on a copy so the caller's frame keeps its original column labels.
    df = df.copy()
    df.columns = [
        " ".join(str(col).split()).replace(":", "").strip().lower().replace(" ", "_")
        for col in df.columns
    ]

    # DataFrame.map only exists on pandas >= 2.1; fall back to applymap before.
    transformar = df.map if hasattr(df, "map") else df.applymap
    df = transformar(lambda x: " ".join(x.split()) if isinstance(x, str) else x)

    if 'remuneración' in df.columns:
        # Turn "S/. 8,364.19" into 8364.19 (float). Assumes "," is the
        # thousands separator and "." the decimal point, as in that example.
        texto = df['remuneración'].astype(str)
        numero = (
            texto.str.extract(r'(\d[\d,]*(?:\.\d+)?)', expand=False)
            .str.replace(',', '', regex=False)
        )
        # Text without a number becomes NaN instead of raising.
        valores = pd.to_numeric(numero, errors='coerce').astype(float)
        # Cells holding nothing but currency symbols/whitespace count as 0.
        sin_simbolos = texto.str.replace(r'[S/\.,\s]', '', regex=True)
        df['remuneracion_num'] = valores.mask(sin_simbolos == '', 0.0)

    # Date clean-up: parse the publication window columns.
    for f in ['fecha_inicio_de_publicación', 'fecha_fin_de_publicación']:
        if f in df.columns:
            df[f] = pd.to_datetime(df[f], format='%d/%m/%Y', errors='coerce')

    return df

In [15]:
# Scrape the first two result pages, clean the raw table, and render it.
df_final = desinfectar_dataframe(
    asyncio.run(get_servir_dataframe(limit_pages=2))
)
display(df_final)