In [None]:
# Dependencias principales
!pip install -q pdfplumber groq

In [None]:
from google.colab import files
from pathlib import Path

# Ask the user for the ZIP archive that contains the CVs.
uploaded = files.upload()
# Fail with a clear message instead of a bare StopIteration when nothing was
# uploaded (for example, the dialog was cancelled).
if not uploaded:
    raise RuntimeError("No se subió ningún archivo. Suba un ZIP con los CVs en PDF.")

# Only the first uploaded file is used as the archive.
ZIP_NAME = next(iter(uploaded))
# Folder that receives the extracted archive contents.
DATA_ROOT = Path("data_cvs")
DATA_ROOT.mkdir(exist_ok=True)
ZIP_NAME

In [None]:
from zipfile import ZipFile

# Unpack the uploaded archive into the working data directory.
with ZipFile(ZIP_NAME) as zf:
    zf.extractall(DATA_ROOT)

# Keep real PDF documents only:
#  - compare the extension case-insensitively so "CV.PDF" is not missed;
#  - skip macOS archive metadata ("__MACOSX/" folders and "._*" resource
#    forks), which carry a .pdf name but are not parseable PDF files.
pdf_files = sorted(
    path
    for path in DATA_ROOT.rglob("*")
    if path.is_file()
    and path.suffix.lower() == ".pdf"
    and "__MACOSX" not in path.parts
    and not path.name.startswith("._")
)
print(f"Se encontraron {len(pdf_files)} PDFs:")
for p in pdf_files:
    print(" -", p)

In [None]:
import pdfplumber
from pathlib import Path

# Directory where the plain-text versions of the CVs are written.
TXT_ROOT = Path("cvs_txt")
# exist_ok=True keeps the cell re-runnable when the folder already exists.
TXT_ROOT.mkdir(exist_ok=True)

def pdf_to_text(pdf_path: Path, txt_path: Path) -> None:
    """Extract the text of every page of a PDF and save it as UTF-8 text.

    Args:
        pdf_path: PDF file to read with pdfplumber.
        txt_path: Destination text file; it is created or overwritten.

    Pages for which no text is extracted contribute an empty string, and the
    page texts are joined with a newline between them.
    """
    page_chunks = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            extracted = page.extract_text()
            page_chunks.append(extracted if extracted else "")
    txt_path.write_text("\n".join(page_chunks), encoding="utf-8")

# The PDF list is gathered recursively, so files in different folders may share
# a stem. Give each output a unique name instead of silently overwriting an
# earlier conversion: the first keeps "<stem>.txt", later ones get "_2", "_3"...
used_stems = set()
for pdf_path in pdf_files:
    stem = pdf_path.stem
    serial = 2
    while stem.casefold() in used_stems:
        stem = f"{pdf_path.stem}_{serial}"
        serial += 1
    used_stems.add(stem.casefold())
    txt_path = TXT_ROOT / (stem + ".txt")
    pdf_to_text(pdf_path, txt_path)

print("Conversión completada. Archivos TXT generados en:", TXT_ROOT)
for p in sorted(TXT_ROOT.glob("*.txt")):
    print(" -", p)

In [None]:
from groq import Groq
import os, json, re, logging
from datetime import datetime

# In Colab you can set your API key here (do not push it to GitHub):
# os.environ["GROQ_API_KEY"] = "gsk_..."

# Fail fast when the key is missing or blank, before any request is attempted.
if not os.environ.get("GROQ_API_KEY", "").strip():
    raise RuntimeError("Defina la variable de entorno GROQ_API_KEY antes de continuar.")

client = Groq(api_key=os.environ["GROQ_API_KEY"])
# basicConfig does nothing when the root logger already has handlers, which is
# commonly the case inside notebook runtimes; force=True replaces them so the
# INFO-level progress messages of the pipeline are actually shown.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    force=True,
)

def extract_email(text: str):
    """Return the first e-mail address found in ``text``, or None if absent."""
    match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    if match is None:
        return None
    return match.group(0)

def extract_phone(text: str):
    """Return the first phone-like number found in ``text``, or None.

    The pattern accepts an optional country code (optional ``+`` and 1-3
    digits), a 2-3 digit prefix optionally wrapped in parentheses, then a
    3-digit and a 4-digit group; groups may be separated by ``-``, ``.`` or
    whitespace.
    """
    m = re.search(r"(\+?\d{1,3})?[-.\s]?\(?\d{2,3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", text)
    if not m:
        return None
    number = m.group(0)
    # When no country code is present, the optional separator that follows it
    # can still consume the character just before the number (e.g. the space in
    # "Tel: 300 ..."). Drop that single leading separator from the result.
    if number[0] in "-." or number[0].isspace():
        number = number[1:]
    return number

def extract_years_experience(text: str):
    """Return the first "<number> years" figure found in ``text`` as an int.

    Matching is case-insensitive and accepts Spanish or English, singular or
    plural ("1 año", "5 años", "1 year", "5 years"), with an optional ``+``
    after the number ("5+ years"). Returns None when nothing matches.
    """
    # The former "años de experiencia" alternative could never be selected
    # because the shorter "años" branch always matched first, so it is dropped.
    m = re.search(r"(\d+)(?:\s*\+)?\s+(?:años?|years?)\b", text, re.IGNORECASE)
    return int(m.group(1)) if m else None

def _neutral_llm_result() -> dict:
    """Build a fresh neutral record used whenever the LLM output is unusable."""
    return {"nombre": None, "formacion_ia": None, "score_cv": 0.0, "confianza": 0.0}


def _parse_llm_json(content: str):
    """Return the JSON object contained in ``content``, or None if there is none.

    Tries, in order: the whole text, the body of a Markdown code fence, and the
    span from the first ``{`` to the last ``}``. Only a JSON *object* (dict) is
    accepted, because callers read the result with ``.get``.
    """
    candidates = [content]
    fenced = re.search(r"```(?:json)?\s*(.*?)```", content, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1))
    first, last = content.find("{"), content.rfind("}")
    if 0 <= first < last:
        candidates.append(content[first:last + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def extract_llm_data(text: str) -> dict:
    """Ask the LLM for the structured fields of a CV.

    Args:
        text: Plain text of the CV; it is embedded verbatim in the prompt.

    Returns:
        The dict parsed from the model's JSON answer (requested keys:
        ``nombre``, ``formacion_ia``, ``score_cv``, ``confianza``). If the
        request fails, or the answer contains no JSON object, a neutral record
        with null fields and zero scores is returned instead. This function
        never raises for LLM or parsing errors; they are logged.
    """
    system_prompt = (
        "Eres un modelo experto en análisis de hojas de vida. "
        "Extrae solo datos respaldados por el texto. "
        "Si un dato no está claro, devuelve null. "
        "Responde siempre en JSON válido, sin comentarios adicionales."
    )

    user_prompt = f"""Analiza el siguiente CV y devuelve un JSON con esta estructura EXACTA:

{{
  "nombre": string | null,
  "formacion_ia": "Sí" | "No" | null,
  "score_cv": number,
  "confianza": number
}}

CV:
{text}
"""

    try:
        response = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.1,
        )
        # The message content may be None; treat that as an empty answer so it
        # is reported as a parsing problem rather than a failed call.
        content = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Best-effort by design: one failed request must not abort the batch.
        logging.error(f"Fallo en llamada al LLM: {e}")
        return _neutral_llm_result()

    data = _parse_llm_json(content)
    if data is None:
        logging.warning("Respuesta no era JSON perfecto. Se aplica fallback neutro.")
        return _neutral_llm_result()
    return data

In [None]:
def process_cv(text: str, filename: str) -> dict:
    """Build the structured record for one CV.

    Combines the regex-based extractors (e-mail, phone, years of experience)
    with the fields returned by the LLM, plus detection metrics and run
    metadata.

    Args:
        text: Plain text of the CV.
        filename: Name of the source file, stored in the metadata.

    Returns:
        A dict with the candidate fields, a ``metricas`` dict of 0/1 detection
        flags and a ``metadata`` dict. When the e-mail or the name is missing,
        ``score_cv`` is forced to 0.0 and ``confianza`` is capped at 0.3.
    """
    # Local import: the file-level import only brings in the `datetime` class.
    from datetime import timezone

    email = extract_email(text)
    phone = extract_phone(text)
    years = extract_years_experience(text)
    llm_data = extract_llm_data(text)

    metricas = {
        "email_detectado": 1 if email else 0,
        "telefono_detectado": 1 if phone else 0,
        # Compare against None so an explicit "0 años" still counts as detected.
        "experiencia_detectada": 1 if years is not None else 0,
    }

    result = {
        "nombre": llm_data.get("nombre"),
        "email": email,
        "telefono": phone,
        "anios_experiencia": years,
        "formacion_ia": llm_data.get("formacion_ia"),
        "score_cv": llm_data.get("score_cv"),
        "confianza": llm_data.get("confianza"),
        "metricas": metricas,
        "metadata": {
            "modelo_llm": "llama-3.1-8b-instant",
            "version_pipeline": "1.1",
            # Same naive-UTC ISO string as datetime.utcnow().isoformat(), without
            # relying on the deprecated utcnow().
            "fecha_ejecucion": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
            "archivo_origen": filename,
        },
    }

    if result["email"] is None or result["nombre"] is None:
        result["score_cv"] = 0.0
        confianza = result.get("confianza")
        # The value comes from the LLM and may not be numeric (e.g. "alta");
        # treat anything non-numeric as 0.0 instead of failing inside min().
        if not isinstance(confianza, (int, float)):
            confianza = 0.0
        result["confianza"] = min(confianza or 0.0, 0.3)

    return result

In [None]:
from pathlib import Path

# Batch stage: run every converted CV through the pipeline and export the
# collected records as a single JSON file.
INPUT_DIR = TXT_ROOT
OUTPUT_JSON = Path("resultados_cvs.json")

results = []

for txt_path in sorted(INPUT_DIR.glob("*.txt")):
    logging.info(f"Procesando archivo: {txt_path.name}")
    # Undecodable bytes are dropped instead of aborting the batch.
    text = txt_path.read_text(encoding="utf-8", errors="ignore")
    cv_result = process_cv(text, txt_path.name)
    results.append(cv_result)

# ensure_ascii=False keeps accented characters readable in the output file.
with OUTPUT_JSON.open("w", encoding="utf-8") as out_file:
    json.dump(results, out_file, indent=4, ensure_ascii=False)

print(f"Se procesaron {len(results)} CVs")
print(f"Archivo JSON generado en: {OUTPUT_JSON.resolve()}")

In [None]:
import json

# Reload the exported file to confirm it parses as JSON and preview it.
# NOTE: the records hold personal data taken from the CVs (name, e-mail,
# phone); clear this cell's output before sharing the notebook.
with open("resultados_cvs.json", encoding="utf-8") as fh:
    data = json.load(fh)

print(f"Registros en resultados_cvs.json: {len(data)}")
data[:2]