In [None]:
import os
import json
import fitz  # PyMuPDF
import re
import concurrent.futures
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pprint import pprint

In [None]:
# === 1. Extract text from PDFs ===
def extract_text_by_pages(folder_path):
    """Read every PDF directly inside ``folder_path`` and return its text per page.

    Args:
        folder_path: Directory whose entries are scanned (non-recursively).

    Returns:
        dict mapping each PDF file name to a list holding one extracted-text
        string per page, in page order. Files are visited in sorted name
        order so repeated runs build the dict in the same order.
    """
    texts = {}
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for file in sorted(os.listdir(folder_path)):
        # Case-insensitive extension check so "paper.PDF" is not silently skipped.
        if not file.lower().endswith(".pdf"):
            continue
        path = os.path.join(folder_path, file)
        with fitz.open(path) as doc:
            texts[file] = [page.get_text() for page in doc]
    return texts

In [None]:
# === 2. Relevance Filter ===
def is_relevant_page(text):
    """Return True when ``text`` mentions any of the topic keywords.

    Ordinary keywords (English and Spanish) are matched as case-insensitive
    substrings. Acronyms are matched case-sensitively as whole words (with an
    optional plural "s"): comparing the uppercase keyword against lowercased
    text could never match, and a lowercase substring test would also fire on
    unrelated words such as "spam".

    Args:
        text: Page or chunk text to inspect.

    Returns:
        bool: True if at least one keyword or acronym is present.
    """
    phrases = (
        "robot arm", "manipulator", "structure", "gripper", "end-effector", "assistance", "assistive",
        "domestic tasks", "elderly care", "inflatable",
        "brazo robótico", "estructura", "manipulador", "pinza", "efector final", "asistencia", "inflable",
    )
    acronyms = ("PAM",)

    lowered = text.lower()
    if any(phrase in lowered for phrase in phrases):
        return True
    # Acronyms are searched in the original (non-lowercased) text.
    return any(re.search(rf"\b{re.escape(acronym)}s?\b", text) for acronym in acronyms)

In [None]:
# === 3. Clean noisy content from chunks ===
def clean_chunk_text(text):
    """Remove inline noise from a text chunk and return the result.

    Four substitutions run in a fixed order, each deleting its matches:
      1. ``$...$`` spans (may cross line breaks),
      2. ``[...]`` spans on a single line,
      3. parenthesised spans starting with "fig" (any case) on a single line,
      4. ``{...}`` spans on a single line.

    Args:
        text: Chunk text to clean.

    Returns:
        str: The text with every matched span removed.
    """
    # (pattern, flags) pairs, applied strictly in this order.
    noise_patterns = (
        (r"\$[^$]*\$", 0),
        (r"\[.*?\]", 0),
        (r"\(fig\.?.*?\)", re.IGNORECASE),
        (r"\{.*?\}", 0),
    )
    cleaned = text
    for pattern, flags in noise_patterns:
        cleaned = re.sub(pattern, "", cleaned, flags=flags)
    return cleaned

In [None]:
# === 4. Robust JSON parser ===
def clean_json_response(text):
    """Extract the first JSON object embedded in an LLM response.

    Every "{" in ``text`` is tried, left to right, as the start of a JSON
    object; the first one that decodes is returned. Unlike slicing from the
    first "{" to the last "}", this tolerates extra prose (even prose that
    contains braces) before or after the object.

    Args:
        text: Raw model output. Non-string values are treated as unparsable.

    Returns:
        dict | None: The decoded object, or None when no JSON object is found.
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one value starting at `start` and ignores the rest.
            parsed, _ = decoder.raw_decode(text, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            start = text.find("{", start + 1)
        else:
            return parsed
    return None

In [None]:
# === 5. Prompt Templates (EN & ES) ===
# English extraction prompt. Doubled braces are literal braces in the rendered
# prompt (the JSON skeleton); the single-braced {doc} is the placeholder that
# receives the document text.
template_en = """
You are an expert in assistive robotics. Analyze the following text and extract:

1. Types of robotic arm structures — including Cartesian, SCARA, Articulated, Delta, Dual-arm, inflatable arms, PAM-based arms, soft robotic arms. Include inferred structures if not named explicitly.
2. Types of grippers — such as parallel 2-finger, adaptive 3-finger, soft, vacuum, suction, magnetic. Include functional descriptions.
3. For each one, list: Pros, Cons, Applications in domestic assistive robotics.

Return a valid JSON object:
{{
  "arms": [...],
  "grippers": [...]
}}

Text:
{doc}
"""

# Spanish extraction prompt. Doubled braces are literal braces in the rendered
# prompt (the JSON skeleton); the single-braced {doc} is the placeholder that
# receives the document text.
template_es = """
Eres un experto en robótica asistencial. Analiza el siguiente texto y extrae:

1. Tipos de estructuras de brazo robótico — incluyendo cartesiano, SCARA, articulado, delta, brazo dual, brazos inflables, brazos PAM, brazos blandos. Incluye estructuras inferidas si no están nombradas explícitamente.
2. Tipos de grippers — como pinza de 2 dedos, de 3 dedos adaptativo, suaves, por vacío, de succión, magnéticos. Incluye descripciones funcionales.
3. Para cada uno, lista: Ventajas, Desventajas, Aplicaciones en asistencia doméstica.

Devuelve un objeto JSON válido:
{{
  "arms": [...],
  "grippers": [...]
}}

Texto:
{doc}
"""

In [None]:
# === 6. Folder Setup and Model Config ===
# Folder holding the source PDFs. The default is the author's local path; set
# the PDF_FOLDER environment variable to run the notebook on another machine.
pdf_folder = os.environ.get(
    "PDF_FOLDER",
    "/mnt/c/Users/laura/OneDrive - unimilitar.edu.co/Documentos/Proyecto de grado/AI_agent/Artículos",
)
# JSON file the aggregated extraction results are written to.
cache_path = "results_cache.json"
# The model tag must already be available to the Ollama server; the run
# recorded in this notebook failed with "model ... not found (status code: 404)".
# Override the tag with the OLLAMA_MODEL environment variable if needed.
llm = OllamaLLM(model=os.environ.get("OLLAMA_MODEL", "llama2:7b-chat"), temperature=0.2)

In [8]:
# === 7. Load PDFs and filter relevant pages ===
# Per-page text of every PDF found in the configured folder.
docs_by_page = extract_text_by_pages(pdf_folder)

# For each PDF that produced at least one page, keep only the pages that pass
# the keyword filter and concatenate them into one space-separated string.
joined_docs = {
    name: " ".join(filter(is_relevant_page, pages))
    for name, pages in docs_by_page.items()
    if pages
}

# Diagnostic check: how much relevant text each document contributes.
for name, relevant_text in joined_docs.items():
    print(f"📘 {name}: {len(relevant_text)} characters of relevant content")

📘 1-s2.0-S0921889025000703-main.pdf: 47242 characters of relevant content
📘 1-s2.0-S221478532205307X-main.pdf: 26574 characters of relevant content
📘 1-s2.0-S2405896321018991-main.pdf: 20150 characters of relevant content
📘 131467.pdf: 14387 characters of relevant content
📘 1cd20f074aaf0287a4bbcaceebb585fd3e71.pdf: 26829 characters of relevant content
📘 2020-J-RAL-OmniGripper.pdf: 40393 characters of relevant content
📘 2109.10892v3.pdf: 41754 characters of relevant content
📘 2401.10702v1.pdf: 43263 characters of relevant content
📘 3711936.pdf: 132119 characters of relevant content
📘 actuators-14-00124.pdf: 53294 characters of relevant content
📘 annurev-control-060117-105003.pdf: 83610 characters of relevant content
📘 Article SOFMER_CharlotteLEGOFF.pdf: 27503 characters of relevant content
📘 ASSISTIVE ROBOTIC ARM.pdf: 25209 characters of relevant content
📘 cooper2020.pdf: 30590 characters of relevant content
📘 Design_and_Development_of_a_Prototype_Robotic_Grip.pdf: 11418 characters of r

In [9]:
# === 8. Run analysis in parallel ===
# Per-document extraction results, keyed by PDF file name.
results = dict()
# Module-level splitter; process_document builds its own local splitter.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
print(f"Procesando {len(joined_docs)} documentos en paralelo...\n")

def _detect_language(doc_name, full_text):
    """Guess whether a document is Spanish ("es") or English ("en").

    The file name is checked for an explicit language tag as a whole token
    (so "Design" or "Grippers" no longer count as "es"); otherwise common
    function words in the text are counted. Ties and empty text yield "en".
    """
    spanish_name_tags = {"es", "spa", "esp", "spanish", "espanol", "español"}
    spanish_words = {"el", "la", "los", "las", "del", "una", "que", "para", "con", "por", "es", "y"}
    english_words = {"the", "of", "and", "to", "is", "for", "with", "that", "this", "are", "in", "an"}

    # Split the name on anything that is not a letter (digits, "_", "-", ".", spaces).
    name_tokens = set(re.split(r"[\W\d_]+", doc_name.lower()))
    if name_tokens & spanish_name_tags:
        return "es"

    # A prefix of the text is enough to decide and keeps this cheap.
    words = re.findall(r"[^\W\d_]+", full_text[:20000].lower())
    spanish_hits = sum(word in spanish_words for word in words)
    english_hits = sum(word in english_words for word in words)
    return "es" if spanish_hits > english_hits else "en"


def process_document(doc_name, full_text):
    """Run the LLM extraction over one document and collect arms and grippers.

    The text is split into 500-character chunks (50 overlap), chunks failing
    the keyword filter are dropped, and the rest are sent to the model in
    groups of three. Each response is parsed as JSON and its "arms" and
    "grippers" lists are accumulated. A failure on one group is printed and
    does not stop the remaining groups.

    Args:
        doc_name: PDF file name; used for logging and as a language hint.
        full_text: Concatenated relevant text of the document.

    Returns:
        tuple: ``(doc_name, {"arms": [...], "grippers": [...]})``.
    """
    # Each call builds its own client instead of sharing one across threads.
    llm = OllamaLLM(model="llama2:7b-chat", temperature=0.2)

    # Pick the prompt language from the file name tag or the text itself.
    lang = _detect_language(doc_name, full_text)
    prompt_template = PromptTemplate(input_variables=["doc"], template=template_es if lang == "es" else template_en)
    chain = prompt_template | llm

    # Split into chunks, keep the relevant ones, and group them three at a time.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    raw_chunks = splitter.split_text(full_text)
    relevant_chunks = [c for c in raw_chunks if is_relevant_page(c)]
    grouped_chunks = ["\n\n".join(relevant_chunks[i:i+3]) for i in range(0, len(relevant_chunks), 3)]

    arms, grippers = [], []
    print(f"Procesando: {doc_name} ({len(grouped_chunks)} grupos de chunks)")

    for i, group in enumerate(grouped_chunks):
        # Replace Unicode line/paragraph separators with plain spaces.
        cleaned = clean_chunk_text(group).replace("\u2028", " ").replace("\u2029", " ")
        try:
            response = chain.invoke({"doc": cleaned})
            print(f"\nRespuesta de {doc_name}, grupo {i+1}:\n{response[:800]}")
            parsed = clean_json_response(response)
            if isinstance(parsed, dict):
                # Only merge real lists: extending with a string would add
                # single characters, and extending with None would raise.
                for key, bucket in (("arms", arms), ("grippers", grippers)):
                    items = parsed.get(key)
                    if isinstance(items, list):
                        bucket.extend(items)
        except Exception as e:
            print(f"Error en {doc_name}, grupo {i+1}: {e}")

    return doc_name, {"arms": arms, "grippers": grippers}



with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future to the document it was submitted for, so a failure can
    # be reported against the right file.
    futures = {
        executor.submit(process_document, name, text): name
        for name, text in joined_docs.items()
    }
    for future in concurrent.futures.as_completed(futures):
        try:
            doc_name, result = future.result()
        except Exception as e:
            # One failing document must not discard the results already collected.
            print(f"Error en {futures[future]}: {e}")
            continue
        results[doc_name] = result

# Persist the aggregated results as UTF-8 JSON (non-ASCII characters kept as-is).
with open(cache_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print("Análisis finalizado y resultados guardados.\n")

Procesando 37 documentos en paralelo...

Procesando: 1-s2.0-S2405896321018991-main.pdf (5 grupos de chunks)
Procesando: 1-s2.0-S221478532205307X-main.pdf (8 grupos de chunks)
Procesando: 1-s2.0-S0921889025000703-main.pdf (13 grupos de chunks)
Procesando: 131467.pdf (3 grupos de chunks)
Procesando: Article SOFMER_CharlotteLEGOFF.pdf (8 grupos de chunks)
Procesando: 1cd20f074aaf0287a4bbcaceebb585fd3e71.pdf (17 grupos de chunks)
Procesando: 2401.10702v1.pdf (23 grupos de chunks)
Procesando: annurev-control-060117-105003.pdf (23 grupos de chunks)
Procesando: Design_and_Development_of_a_Prototype_Robotic_Grip.pdf (6 grupos de chunks)
Procesando: actuators-14-00124.pdf (21 grupos de chunks)
Error en 131467.pdf, grupo 1: model 'llama2:7b-chat' not found (status code: 404)
Error en 1-s2.0-S2405896321018991-main.pdf, grupo 1: model 'llama2:7b-chat' not found (status code: 404)
Error en 1cd20f074aaf0287a4bbcaceebb585fd3e71.pdf, grupo 1: model 'llama2:7b-chat' not found (status code: 404)
Error e

In [10]:
# === 9. Print all results clearly ===
# One section per document: pretty-print the extracted data, or a warning
# line when both lists are empty.
print("📊 RESULTADOS COMPLETOS:")
for name, data in results.items():
    print(f"\n🗂️ Documento: {name}")
    if not (data["arms"] or data["grippers"]):
        print("⚠️ No se extrajeron brazos ni grippers.")
        continue
    pprint(data)

📊 RESULTADOS COMPLETOS:

🗂️ Documento: 131467.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: 1-s2.0-S2405896321018991-main.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: electronics-10-00793-v2.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: cooper2020.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: Article SOFMER_CharlotteLEGOFF.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: gushi2020.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: 1-s2.0-S221478532205307X-main.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: Design_and_Development_of_a_Prototype_Robotic_Grip.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: e37ije1673.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: icce24.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: 1-s2.0-S0921889025000703-main.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ Documento: ASSISTIVE ROBOTIC ARM.pdf
⚠️ No se extrajeron brazos ni grippers.

🗂️ 