In [None]:
import csv
import os
import time

import pandas as pd
import requests
from dotenv import load_dotenv

# Load the API credentials from the local keys.env file into the process environment.
load_dotenv(dotenv_path='keys.env')

APP_ID = os.environ.get("APP_ID")
APP_KEY = os.environ.get("APP_KEY")

# Fail fast when credentials are missing: requests omits query parameters whose
# value is None, so every search below would otherwise be sent unauthenticated
# and fail once per location with an HTTP error instead of a clear message.
if not APP_ID or not APP_KEY:
    raise RuntimeError(
        "APP_ID / APP_KEY are not set; define them in keys.env or in the environment."
    )

# General scraping parameters

# -- Request shape --
COUNTRY = 'us'            # country segment of the search URL
RESULTS_PER_PAGE = 50     # sent as results_per_page on every request
DELAY = 1                 # seconds slept between consecutive page requests

# -- Loop limits --
MAX_PAGES = 1_000         # highest page number requested per location
MAX_EMPTY_PAGES = 10      # consecutive pages without new jobs before moving to the next location
SAVE_INTERVAL = 10_000    # number of buffered jobs that triggers a partial save

# Names of the 50 U.S. states, in alphabetical order (one initial letter per row,
# longer groups wrapped); each is used in turn as the search location.
locations = [
    "Alabama", "Alaska", "Arizona", "Arkansas",
    "California", "Colorado", "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky",
    "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Montana",
    "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina", "South Dakota",
    "Tennessee", "Texas",
    "Utah",
    "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming",
]

# Output CSV for collected jobs, plus the set of job IDs already stored in it.
filename = "it_jobs.csv"
processed_job_ids = set()

# Resume support: when the CSV already exists, preload every stored "id" value
# (csv.DictReader yields them as strings) so those jobs are skipped later.
if os.path.exists(filename):
    with open(filename, mode='r', encoding='utf-8') as existing_csv:
        processed_job_ids.update(
            record["id"] for record in csv.DictReader(existing_csv)
        )

def guardar_empleos(jobs, path=None):
    """Append job rows to a CSV file, writing the header only when the file is empty.

    Args:
        jobs: List of dicts, one per job. The keys of the first dict define the
            CSV columns and their order. An empty list is a no-op and no file
            is created or touched.
        path: Destination CSV file. Defaults to the module-level ``filename``,
            so existing ``guardar_empleos(jobs)`` calls behave as before.
    """
    if not jobs:
        return
    target = filename if path is None else path
    with open(target, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(jobs[0]))
        # The file is opened for appending, so a position of 0 means it is
        # new or empty and still needs its header row.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(jobs)

def _job_to_row(job, job_id):
    """Flatten one job payload from the API into the flat dict stored in the CSV.

    Nested ``location`` / ``company`` / ``category`` objects (and the location's
    ``area`` list) that are missing or null are treated as empty, so a single
    incomplete posting does not raise and abort the rest of the location.
    """
    loc = job.get("location") or {}
    comp = job.get("company") or {}
    cat = job.get("category") or {}

    return {
        "id": job_id,
        "title": job.get("title"),
        "description": job.get("description"),
        "location_area": ", ".join(loc.get("area") or []),
        "company_display_name": comp.get("display_name"),
        "category_tag": cat.get("tag"),
        "contract_time": job.get("contract_time"),
        "salary_min": job.get("salary_min"),
        "salary_max": job.get("salary_max"),
        "salary_is_predicted": job.get("salary_is_predicted"),
        "created": job.get("created"),
        "latitude": job.get("latitude"),
        "longitude": job.get("longitude"),
        "redirect_url": job.get("redirect_url"),
    }


# Walk every location page by page, buffering unseen jobs and flushing them to
# the CSV every SAVE_INTERVAL rows and once more when the location is finished.
for location in locations:
    print(f"\n🌍 Buscando trabajos en {location}...")
    current_page = 1
    paginas_sin_resultados = 0  # consecutive pages that yielded no new job
    local_jobs = []             # rows collected for this location, not yet saved

    while current_page <= MAX_PAGES:
        print(f"📄 Página {current_page} - {location} - Acumulado local: {len(local_jobs)}")
        url = f"https://api.adzuna.com/v1/api/jobs/{COUNTRY}/search/{current_page}"
        params = {
            'app_id': APP_ID,
            'app_key': APP_KEY,
            'category': 'it-jobs',
            'results_per_page': RESULTS_PER_PAGE,
            'where': location,
            'content-type': 'application/json'
        }

        try:
            response = requests.get(url, params=params, timeout=10)
            if response.status_code != 200:
                print(f"❌ Error {response.status_code} en página {current_page} ({location})")
                break

            data = response.json()
            results = data.get("results", [])

            nuevos_trabajos = 0
            for job in results:
                raw_id = job.get("id")
                # IDs reloaded from an existing CSV are strings; normalise the
                # API value to str so already-saved jobs are recognised
                # whatever JSON type the API uses for "id".
                job_id = raw_id if raw_id is None else str(raw_id)
                if job_id in processed_job_ids:
                    continue
                processed_job_ids.add(job_id)
                nuevos_trabajos += 1

                local_jobs.append(_job_to_row(job, job_id))

            # Track the streak of pages that contributed nothing new.
            if nuevos_trabajos == 0:
                paginas_sin_resultados += 1
            else:
                paginas_sin_resultados = 0

            if paginas_sin_resultados >= MAX_EMPTY_PAGES:
                print(f"⚠️ {MAX_EMPTY_PAGES} páginas sin trabajos nuevos. Pasando a la siguiente ubicación.")
                break

            # Flush the buffer periodically so a long run does not hold everything in memory.
            if len(local_jobs) >= SAVE_INTERVAL:
                guardar_empleos(local_jobs)
                print(f"💾 Guardado parcial de {len(local_jobs)} trabajos en {location}")
                local_jobs.clear()

            current_page += 1
            time.sleep(DELAY)

        except Exception as e:
            # Any failure (network, JSON decoding, unexpected payload) ends this
            # location; the rows buffered so far are still saved just below.
            print(f"❌ Error en página {current_page} ({location}): {e}")
            break

    if local_jobs:
        guardar_empleos(local_jobs)
        print(f"✅ Guardado final de {len(local_jobs)} trabajos para {location}")


In [None]:
# Load the CSV produced by the collection step; the raw frame keeps its own
# name so re-running the steps below always starts from the same input.
df_raw = pd.read_csv('it_jobs.csv')

# Row count before any processing, used for the summary message.
original_count = len(df_raw)

# Drop rows whose redirect_url is missing or blank. The column is cast to str
# before using the .str accessor so the filter also works when pandas did not
# infer a string column (e.g. every URL is missing); missing values are
# excluded by the notna() mask regardless of their string form.
url_column = df_raw['redirect_url']
has_url = url_column.notna() & (url_column.astype(str).str.strip() != "")
df = df_raw[has_url]

# Drop duplicates on the ('id', 'redirect_url') pair, keeping the last occurrence.
df_clean = df.drop_duplicates(subset=['id', 'redirect_url'], keep='last')

# Report how many rows were removed in total (no URL + duplicates).
removed = original_count - len(df_clean)
print(f"🧹 Se eliminaron {removed} filas (duplicadas o sin URL).")

# Save the cleaned DataFrame.
df_clean.to_csv('it_jobs_clean.csv', index=False)
print("✅ Archivo limpio guardado como 'it_jobs_clean.csv'")