In [None]:
import pandas as pd
import time
import random
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from proxy_auth_plugin import create_proxy_auth_extension

# Input, output and checkpoint files
INPUT_FILE = "a_scrapear.csv"
OUTPUT_FILE = "scrapeado_final.csv"
CHECKPOINT_FILE = "checkpoint.txt"

# Fail early if the input dataset is missing
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"❌ No se encontró el archivo {INPUT_FILE}")

# Load the main dataset
df = pd.read_csv(INPUT_FILE)

# Cap the number of URLs to process
n_urls = 167271
df = df.head(n_urls)

# Add the result column if the input does not already have it
if 'description_full' not in df.columns:
    df['description_full'] = None

# Resume from the checkpoint if one exists
start_index = 0
if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, "r") as f:
        checkpoint_text = f.read().strip()
    # The checkpoint is rewritten in place with mode "w", so an interrupted
    # run can leave it empty. Report that (and any other non-index content)
    # explicitly instead of failing with a bare int() parsing error or, for a
    # negative value, with a KeyError later in the scraping loop.
    try:
        start_index = int(checkpoint_text)
        if start_index < 0:
            raise ValueError(checkpoint_text)
    except ValueError:
        raise ValueError(
            f"❌ Checkpoint inválido en {CHECKPOINT_FILE}: {checkpoint_text!r}"
        ) from None
    print(f"⏸️ Reanudando scraping desde índice {start_index}")
else:
    print(f"🔁 Iniciando scraping desde el índice {start_index}")

# If an output file already exists, load it and keep its earlier results.
# The assignment aligns on the row labels produced by read_csv.
if os.path.exists(OUTPUT_FILE):
    df_prev = pd.read_csv(OUTPUT_FILE)
    df.loc[:len(df_prev)-1, 'description_full'] = df_prev['description_full']
    print(f"📂 Archivo existente {OUTPUT_FILE} cargado con {len(df_prev)} filas ya scrapeadas.")

# Proxy pool (replace with real proxies). Each row is
# (host, port, username, password); empty credentials are kept as "".
_PROXY_FIELDS = ("host", "port", "username", "password")
_PROXY_ROWS = (
    ("194.67.37.90", "3128", "", ""),
    ("residential.zenrows.com", "8001", "user1", "pass1"),
    ("proxy.soax.com", "10000", "user2", "pass2"),
    ("geo.rotating.proxy.shifter.io", "35000", "user3", "pass3"),
    ("pr.oxylabs.io", "7777", "user4", "pass4"),
    ("51.159.115.233", "3128", "", ""),
    ("51.15.147.172", "3128", "", ""),
)
PROXIES = [dict(zip(_PROXY_FIELDS, row)) for row in _PROXY_ROWS]

# User-agent pool.
# NOTE(review): the trailing "..." looks like elided text; confirm these are
# the full strings that should be sent to the site.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)...",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...",
    "Mozilla/5.0 (X11; Linux x86_64)...",
]

# Driver factory
def init_driver(proxy, user_agent):
    """Start a headless Chrome session that uses ``proxy`` and ``user_agent``.

    Args:
        proxy: Mapping with ``host``, ``port``, ``username`` and ``password``
            keys; all four are forwarded to ``create_proxy_auth_extension``.
        user_agent: Value passed to Chrome through the ``user-agent`` switch.

    Returns:
        A ``(driver, plugin_path)`` tuple. ``plugin_path`` is the value
        returned by ``create_proxy_auth_extension``; the caller deletes that
        file after quitting the driver.

    Raises:
        Any exception raised while registering the extension or starting
        Chrome. The generated extension file is removed before re-raising.
    """
    options = Options()
    # NOTE(review): Chrome's legacy headless mode and incognito windows are
    # known not to load extensions by default - confirm the proxy extension
    # actually takes effect with these two switches.
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--incognito")
    options.add_argument(f"user-agent={user_agent}")

    # Content setting value 2 blocks the resource type (images, CSS, fonts).
    prefs = {
        "profile.managed_default_content_settings.images": 2,
        "profile.managed_default_content_settings.stylesheets": 2,
        "profile.managed_default_content_settings.fonts": 2
    }
    options.add_experimental_option("prefs", prefs)

    plugin_path = create_proxy_auth_extension(
        proxy_host=proxy["host"],
        proxy_port=proxy["port"],
        proxy_username=proxy["username"],
        proxy_password=proxy["password"]
    )
    try:
        options.add_extension(plugin_path)
        driver = webdriver.Chrome(options=options)
    except BaseException:
        # On failure the caller never receives plugin_path, so the extension
        # file would be orphaned unless it is removed here.
        if plugin_path and os.path.exists(plugin_path):
            os.remove(plugin_path)
        raise
    return driver, plugin_path

# Scraping function
def scrape_job_description_selenium(driver, url):
    """Load ``url`` and return the job description found on the page.

    The function waits up to 2.5 seconds for the description section to be
    present, clicks the "Show Full Description" link when it exists, and then
    reads the section.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the job advert.

    Returns:
        The section's ``innerHTML`` with every ``<br>`` replaced by a newline
        (other markup is left in place). If that is blank, the newline-joined
        text of the section's ``<p>``/``<li>`` elements is used instead.
        ``None`` when nothing usable is found or any error occurs; errors are
        printed, never raised.
    """
    # Imported here so the block carries its own dependency on these names.
    from selenium.common.exceptions import (
        ElementClickInterceptedException,
        ElementNotInteractableException,
    )

    body_selector = 'section.adp-body.mx-4.text-sm.md\\:mx-0.md\\:text-base'
    show_more_selector = (
        "a.font-bold.bg-white.inline-block.px-8.pt-4.text-adzuna-green-500.rounded-t-lg.hover\\:underline"
    )

    try:
        driver.get(url)
        WebDriverWait(driver, 2.5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, body_selector))
        )

        try:
            show_more_button = driver.find_element(By.CSS_SELECTOR, show_more_selector)
        except NoSuchElementException:
            # The link is optional; carry on with whatever is displayed.
            print("⚠️ Botón 'Show Full Description' no encontrado.")
        else:
            try:
                show_more_button.click()
            except (ElementClickInterceptedException, ElementNotInteractableException):
                # A covered or off-screen link should not discard the whole
                # page: dispatch the click through JavaScript instead.
                driver.execute_script("arguments[0].click();", show_more_button)
            print("🔘 'Show Full Description' presionado.")
            time.sleep(random.uniform(0.2, 1))

        # Look the section up again after the click rather than reusing the
        # element found by the wait.
        section = driver.find_element(By.CSS_SELECTOR, body_selector)
        driver.execute_script("arguments[0].scrollIntoView();", section)
        time.sleep(random.uniform(0.5, 1.5))

        # Treat a missing attribute like an empty one so the fallback below
        # runs instead of an AttributeError.
        description_html = section.get_attribute('innerHTML') or ""
        description_text = description_html.replace('<br>', '\n')

        if not description_text.strip():
            # Fallback: rebuild the text from paragraph and list elements.
            elements = section.find_elements(By.CSS_SELECTOR, "p, li")
            if not elements:
                print(f"⚠️ No se encontraron <p> o <li> en {url}")
                return None
            description_text = "\n".join([el.text.strip() for el in elements if el.text.strip()])

        if not description_text.strip():
            print(f"⚠️ Descripción vacía en {url}")
            return None

        return description_text

    except (TimeoutException, NoSuchElementException, ValueError) as e:
        print(f"⚠️ Error en {url}: {e}")
        return None
    except Exception as e:
        # Best-effort scraper: one bad page must not stop the whole run.
        print(f"❌ Error inesperado: {e}")
        return None

# Control state
driver = None
plugin_path = None
restart_interval = 50   # URLs handled by one browser session before rotating
save_interval = 500     # URLs processed between partial saves
counter = 0
next_index = start_index  # first row that still has to be processed


def _dispose_driver(active_driver, extension_path):
    """Quit ``active_driver`` and delete its proxy extension file, if any."""
    if active_driver:
        active_driver.quit()
    if extension_path and os.path.exists(extension_path):
        os.remove(extension_path)


# Main scraping loop. The try/finally guarantees that the browser is closed
# and that progress is written to disk even when the run is interrupted
# (Ctrl-C, proxy failure, unexpected error).
try:
    for idx in range(start_index, len(df)):
        url = df.at[idx, "redirect_url"]
        if not pd.notnull(url):
            # Nothing to scrape for this row; it counts as done.
            next_index = idx + 1
            continue

        if counter % restart_interval == 0:
            _dispose_driver(driver, plugin_path)
            # Forget the old session first so a failing init_driver does not
            # leave stale handles behind for the final cleanup.
            driver, plugin_path = None, None
            proxy = random.choice(PROXIES)
            user_agent = random.choice(USER_AGENTS)
            driver, plugin_path = init_driver(proxy, user_agent)
            print(f"🔁 Driver reiniciado con proxy {proxy['host']}")

        description = scrape_job_description_selenium(driver, url)
        df.at[idx, 'description_full'] = description
        print(f"[{idx}] URL procesada: {'✅' if description else '❌'}")

        counter += 1
        next_index = idx + 1

        # Partial save
        if counter % save_interval == 0:
            df.to_csv(OUTPUT_FILE, index=False)
            with open(CHECKPOINT_FILE, "w") as f:
                f.write(str(idx + 1))
            print(f"💾 Guardado parcial en índice {idx}")

        time.sleep(random.uniform(0.2, 1))

    # Normal completion: every row has been visited.
    next_index = len(df)
finally:
    # Final cleanup and save; runs on success and on failure alike.
    try:
        _dispose_driver(driver, plugin_path)
    finally:
        df.to_csv(OUTPUT_FILE, index=False)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(str(next_index))

print("✅ Scraping finalizado y guardado.")