In [25]:
## EJERCICIO 2 
## Bumeran — Web Scraping
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install requests beautifulsoup4 pandas lxml
!{sys.executable} -m pip install selenium webdriver-manager




In [26]:
import os, re, time, json, datetime as dt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Carpeta de salida (tú usas "output")
OUT_DIR = "../output"
os.makedirs(OUT_DIR, exist_ok=True)

UAS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome Safari",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:117.0) Gecko/20100101 Firefox/117.0",
]
def ua(i=0): return {"User-Agent": UAS[i % len(UAS)]}

def make_session():
    s = requests.Session()
    r = Retry(total=3, backoff_factor=0.6, status_forcelist=[429,500,502,503,504])
    s.mount("http://", HTTPAdapter(max_retries=r))
    s.mount("https://", HTTPAdapter(max_retries=r))
    return s

def log(msg): 
    print(f"[{dt.datetime.now().strftime('%H:%M:%S')}] {msg}")


In [27]:
def get_listing_urls_requests(base_url_pattern: str, max_pages: int = 3, pause: float = 1.0):
    sess = make_session()
    urls = []
    for page in range(1, max_pages+1):
        url = base_url_pattern.format(page=page) if "{page}" in base_url_pattern \
              else (base_url_pattern if page == 1 else re.sub(r"(\.html)?$", f"-pagina-{page}.html", base_url_pattern))
        resp = sess.get(url, headers=ua(page-1), timeout=25)
        log(f"GET {resp.status_code} {url} (len={len(resp.text)})")
        if resp.status_code != 200 or len(resp.text) < 1000:
            if page == 1: break
            else: continue
        soup = BeautifulSoup(resp.text, "lxml")

        # Tarjetas (probamos varios selectores)
        cards = soup.select("[data-test-id='job-item']") or soup.select(".job-card") or soup.select("article")

        # además, plan B: escanear todos los <a>
        anchors = soup.select("a[href]")
        found = 0
        for a in anchors:
            href = a.get("href") or ""
            full = href if href.startswith("http") else urljoin("https://www.bumeran.com.pe", href)
            if urlparse(full).netloc.endswith("bumeran.com.pe") and re.search(r"(oferta-de-trabajo|empleo|empleos)", full):
                urls.append(full); found += 1
        log(f"Enlaces válidos encontrados: {found} · Acumulado: {len(urls)}")
        time.sleep(pause)
    return list(dict.fromkeys(urls))


In [28]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def get_listing_urls_selenium(base_url_pattern: str, max_pages: int = 2, pause: float = 1.2, headless=False):
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--start-maximized")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=opts)

    urls = []
    try:
        for page in range(1, max_pages+1):
            url = base_url_pattern.format(page=page) if "{page}" in base_url_pattern \
                  else (base_url_pattern if page == 1 else re.sub(r"(\.html)?$", f"-pagina-{page}.html", base_url_pattern))
            log(f"Selenium GET {url}")
            driver.get(url)

            # Intentar cerrar cookie banner
            clicked = False
            for sel in [
                "button#onetrust-accept-btn-handler",
                "button[aria-label*='Aceptar']",
                "button[aria-label*='Aceptar todo']",
                "button[aria-label*='Aceptar todas']",
            ]:
                try:
                    el = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.CSS_SELECTOR, sel)))
                    el.click(); clicked = True; break
                except Exception:
                    pass
            if not clicked:
                for txt in ["Aceptar", "Acepto", "Aceptar todo", "Aceptar y cerrar", "Entendido"]:
                    try:
                        btn = driver.find_element(By.XPATH, f"//button[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZÁÉÍÓÚÜ', 'abcdefghijklmnopqrstuvwxyzáéíóúü'), '{txt.lower()}')]")
                        btn.click(); break
                    except Exception:
                        pass

            # esperar algo de contenido y scrollear para lazy-load
            try:
                WebDriverWait(driver, 8).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-test-id='job-item'], .job-card, article"))
                )
            except Exception:
                log("No se detectaron tarjetas enseguida. Sigo igual.")
            prev_h = 0
            for _ in range(8):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1.0)
                h = driver.execute_script("return document.body.scrollHeight")
                if h == prev_h: break
                prev_h = h

            soup = BeautifulSoup(driver.page_source, "lxml")
            found = 0
            for a in soup.select("a[href]"):
                href = a.get("href") or ""
                full = href if href.startswith("http") else urljoin("https://www.bumeran.com.pe", href)
                if "bumeran.com.pe" in full and re.search(r"(oferta-de-trabajo|empleo|empleos)", full):
                    urls.append(full); found += 1
            log(f"[Sel] Enlaces válidos: {found} · Acum: {len(urls)}")
            time.sleep(pause)
    finally:
        driver.quit()

    return list(dict.fromkeys(urls))


In [29]:
def get_job_urls_auto(base_url, max_pages=2, pause=1.2):
    urls = get_listing_urls_requests(base_url, max_pages=max_pages, pause=pause)
    if not urls:
        log("➡️ Pasando a Selenium (ventana visible) …")
        urls = get_listing_urls_selenium(base_url, max_pages=max_pages, pause=pause, headless=False)
    log(f"Total URLs capturadas: {len(urls)}")
    return urls


In [30]:
BASE_URL = "https://www.bumeran.com.pe/en-lima/empleos-area-tecnologia-sistemas-y-telecomunicaciones-subarea-programacion-full-time-publicacion-menor-a-15-dias.html"

MAX_PAGES = 2     # sube a 3–4 si ves más resultados (sin abusar)
PAUSE = 1.2

job_urls = get_job_urls_auto(BASE_URL, max_pages=MAX_PAGES, pause=PAUSE)
log(f"Primeras URLs: {job_urls[:5]}")


[19:31:30] GET 200 https://www.bumeran.com.pe/en-lima/empleos-area-tecnologia-sistemas-y-telecomunicaciones-subarea-programacion-full-time-publicacion-menor-a-15-dias.html (len=63597)
[19:31:30] Enlaces válidos encontrados: 0 · Acumulado: 0
[19:31:31] GET 200 https://www.bumeran.com.pe/en-lima/empleos-area-tecnologia-sistemas-y-telecomunicaciones-subarea-programacion-full-time-publicacion-menor-a-15-dias-pagina-2.html-pagina-2.html (len=63597)
[19:31:31] Enlaces válidos encontrados: 0 · Acumulado: 0
[19:31:32] ➡️ Pasando a Selenium (ventana visible) …
[19:31:35] Selenium GET https://www.bumeran.com.pe/en-lima/empleos-area-tecnologia-sistemas-y-telecomunicaciones-subarea-programacion-full-time-publicacion-menor-a-15-dias.html
[19:31:58] No se detectaron tarjetas enseguida. Sigo igual.
[19:32:00] [Sel] Enlaces válidos: 60 · Acum: 60
[19:32:01] Selenium GET https://www.bumeran.com.pe/en-lima/empleos-area-tecnologia-sistemas-y-telecomunicaciones-subarea-programacion-full-time-publicacion-m

In [31]:
def parse_jsonld_job(soup: BeautifulSoup):
    data = {"title": None, "description": None, "district": None, "employmentType": None, "company": None}
    for sc in soup.find_all("script", type="application/ld+json"):
        try: obj = json.loads(sc.string or "")
        except Exception: continue
        items = obj if isinstance(obj, list) else [obj]
        for it in items:
            if not isinstance(it, dict): continue
            t = it.get("@type")
            if isinstance(t, list): t = t[0]
            if t == "JobPosting" or any(k in it for k in ["title","description","hiringOrganization"]):
                data["title"] = data["title"] or it.get("title") or it.get("name")
                data["description"] = data["description"] or it.get("description")
                emp = it.get("employmentType"); 
                if isinstance(emp, list): emp = ", ".join(emp)
                data["employmentType"] = data["employmentType"] or emp
                org = it.get("hiringOrganization")
                if isinstance(org, dict): data["company"] = data["company"] or org.get("name")
                loc = it.get("jobLocation")
                if isinstance(loc, list) and loc: loc = loc[0]
                if isinstance(loc, dict):
                    addr = loc.get("address")
                    if isinstance(addr, dict): data["district"] = data["district"] or addr.get("addressLocality")
    return data

def extract_job_detail_requests(job_url: str) -> dict:
    sess = make_session()
    r = sess.get(job_url, headers=ua(0), timeout=25)
    if r.status_code != 200:
        return {"url": job_url, "Titulo del puesto": None, "Descripcion": None, "Distrito": None, "Modo de trabajo": None, "empresa": None}
    soup = BeautifulSoup(r.text, "lxml")
    jld = parse_jsonld_job(soup)
    title = jld.get("title")
    desc  = jld.get("description")
    company = jld.get("company")
    district = jld.get("district")
    mode = jld.get("employmentType")

    if not title:
        t = soup.select_one("[data-test-id='job-title']") or soup.find("h1")
        title = t.get_text(strip=True) if t else None
    if not company:
        c = soup.select_one("[data-test-id='job-company']") or soup.find(class_=re.compile("company", re.I))
        company = c.get_text(strip=True) if c else None
    if not desc:
        dc = soup.find(id=re.compile("descripcion|description", re.I)) or soup.find("section", string=re.compile("Descripción", re.I)) or soup.find("section")
        if dc:
            raw = dc.get_text("\n", strip=True)
            parts = re.split(r"(?i)beneficios", raw); desc = parts[0].strip() if parts else raw
    if not district:
        chip = soup.find(string=re.compile("Lima", re.I))
        if chip and getattr(chip, "parent", None): district = chip.parent.get_text(strip=True)
    if not mode:
        for key in ["remoto","híbrido","hibrido","presencial","full-time","tiempo completo"]:
            m = soup.find(string=re.compile(key, re.I))
            if m: mode = m.strip(); break

    return {"url": job_url, "Titulo del puesto": title, "Descripcion": desc,
            "Distrito": district, "Modo de trabajo": mode, "empresa": company}

# Fallback Selenium para detalle (por si alguna página también es JS-heavy)
from selenium.webdriver.chrome.service import Service
def extract_job_detail_selenium(job_url: str) -> dict:
    opts = Options(); opts.add_argument("--no-sandbox"); opts.add_argument("--disable-gpu"); opts.add_argument("--headless=new")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=opts)
    try:
        driver.get(job_url)
        WebDriverWait(driver, 8).until(EC.presence_of_element_located((By.CSS_SELECTOR, "h1, [data-test-id='job-title']")))
        soup = BeautifulSoup(driver.page_source, "lxml")
        jld = parse_jsonld_job(soup)
        title = jld.get("title") or (soup.select_one("[data-test-id='job-title']") or soup.find("h1"))
        title = title if isinstance(title, str) else (title.get_text(strip=True) if title else None)
        company = jld.get("company") or (soup.select_one("[data-test-id='job-company']") or soup.find(class_=re.compile("company", re.I)))
        company = company if isinstance(company, str) else (company.get_text(strip=True) if company else None)
        desc = jld.get("description")
        if not desc:
            dc = soup.find(id=re.compile("descripcion|description", re.I)) or soup.find("section", string=re.compile("Descripción", re.I)) or soup.find("section")
            if dc:
                raw = dc.get_text("\n", strip=True)
                parts = re.split(r"(?i)beneficios", raw); desc = parts[0].strip() if parts else raw
        district = jld.get("district")
        if not district:
            chip = soup.find(string=re.compile("Lima", re.I))
            if chip and getattr(chip, "parent", None): district = chip.parent.get_text(strip=True)
        mode = jld.get("employmentType")
        if not mode:
            for key in ["remoto","híbrido","hibrido","presencial","full-time","tiempo completo"]:
                m = soup.find(string=re.compile(key, re.I))
                if m: mode = m.strip(); break
        return {"url": job_url, "Titulo del puesto": title, "Descripcion": desc,
                "Distrito": district, "Modo de trabajo": mode, "empresa": company}
    finally:
        driver.quit()

def extract_job_detail_auto(job_url: str) -> dict:
    d = extract_job_detail_requests(job_url)
    if not any([d["Titulo del puesto"], d["Descripcion"], d["Distrito"], d["Modo de trabajo"]]):
        return extract_job_detail_selenium(job_url)
    return d


In [32]:
rows = []
for i, u in enumerate(job_urls, start=1):
    log(f"[{i}/{len(job_urls)}] Detalle -> {u}")
    rows.append(extract_job_detail_auto(u))
    time.sleep(0.6)

expected = ["url","Titulo del puesto","Descripcion","Distrito","Modo de trabajo","empresa"]
df_jobs = pd.DataFrame(rows, columns=expected)
log(f"Filas extraídas: {len(df_jobs)}")

if not df_jobs.empty:
    df_jobs["Titulo del puesto"] = df_jobs["Titulo del puesto"].astype("string").str.strip()
    df_jobs["Descripcion"] = df_jobs["Descripcion"].astype("string").str.replace(r"\s+\n","\n", regex=True).str.strip()
    display(df_jobs.head())
else:
    log("⛔ df_jobs está vacío. Revisa que BASE_URL tenga resultados y, si es necesario, sube MAX_PAGES.")


[19:32:29] [1/61] Detalle -> https://www.bumeran.com.pe/empleos-seniority-junior.html?landing-jovenes-profesionales=true
[19:32:35] [2/61] Detalle -> https://www.bumeran.com.pe/empleos-seniority-gerencia-alta-gerencia-direccion.html?landing-puestos-ejecutivos=true
[19:32:42] [3/61] Detalle -> https://www.bumeran.com.pe/empleos.html
[19:32:49] [4/61] Detalle -> https://www.bumeran.com.pe/empleos-area-tecnologia-sistemas-y-telecomunicaciones.html
[19:32:55] [5/61] Detalle -> https://www.bumeran.com.pe/empleos-area-tecnologia-sistemas-y-telecomunicaciones-subarea-programacion.html
[19:33:01] [6/61] Detalle -> https://www.bumeran.com.pe/en-lima/empleos-area-tecnologia-sistemas-y-telecomunicaciones-subarea-programacion.html
[19:33:08] [7/61] Detalle -> https://www.bumeran.com.pe/en-lima/empleos-area-tecnologia-sistemas-y-telecomunicaciones-subarea-programacion-full-time-publicacion-menor-a-15-dias.html?relevantes=true
[19:33:14] [8/61] Detalle -> https://www.bumeran.com.pe/en-lima/empleos-a

Unnamed: 0,url,Titulo del puesto,Descripcion,Distrito,Modo de trabajo,empresa
0,https://www.bumeran.com.pe/empleos-seniority-j...,"Sorry, you have been blocked",,,,
1,https://www.bumeran.com.pe/empleos-seniority-g...,"Sorry, you have been blocked",,,,
2,https://www.bumeran.com.pe/empleos.html,"Sorry, you have been blocked",,,,
3,https://www.bumeran.com.pe/empleos-area-tecnol...,"Sorry, you have been blocked",,,,
4,https://www.bumeran.com.pe/empleos-area-tecnol...,"Sorry, you have been blocked",,,,


In [33]:
if df_jobs.empty:
    print("⛔ No hay datos que guardar.")
else:
    keep = ["Titulo del puesto","Descripcion","Distrito","Modo de trabajo","url"]
    for c in keep:
        if c not in df_jobs.columns: df_jobs[c] = None
    ts = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(OUT_DIR, f"bumeran_{ts}.csv")
    df_jobs[keep].to_csv(out_path, index=False, encoding="utf-8-sig", sep=";")
    print("✅ Guardado:", os.path.abspath(out_path))
    display(df_jobs[keep].head())


✅ Guardado: C:\Users\User\Documents\PROFE_CHRISTIAN\web_scrapping\output\bumeran_20250924_193906.csv


Unnamed: 0,Titulo del puesto,Descripcion,Distrito,Modo de trabajo,url
0,"Sorry, you have been blocked",,,,https://www.bumeran.com.pe/empleos-seniority-j...
1,"Sorry, you have been blocked",,,,https://www.bumeran.com.pe/empleos-seniority-g...
2,"Sorry, you have been blocked",,,,https://www.bumeran.com.pe/empleos.html
3,"Sorry, you have been blocked",,,,https://www.bumeran.com.pe/empleos-area-tecnol...
4,"Sorry, you have been blocked",,,,https://www.bumeran.com.pe/empleos-area-tecnol...


In [None]:
## Extract Job Posting Links (Extraer enlaces de ofertas de empleo)