In [18]:
import sys
sys.path.append('../') # Add the parent directory to the import path so the project packages below can be imported
from config.paths import SRC_DIR
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv; FIRECRAWL_API_KEY is read from
# the environment further down with os.getenv.
load_dotenv()

from utils.scraping_utils import ScrapingUtils

In [19]:
# Page to scrape and the brand it belongs to. Edit these before running the
# cells below against a different site.
url_to_extract_urls = "https://www.honda.mx/motos/crf300f"
brand_to_scrape = "Honda"

In [20]:
# Create the scraping helper. The API key comes from the FIRECRAWL_API_KEY
# environment variable; os.getenv returns None when it is not set.
scraping_utils = ScrapingUtils(api_key=os.getenv("FIRECRAWL_API_KEY"))

In [21]:
# Scrape the target page; the result is later handed to extract_urls_from_markdown.
images_scraped = scraping_utils.get_images_from_website(url_to_extract_urls)

In [22]:
# Show the raw scrape result for inspection.
images_scraped



In [23]:
import re

def extract_urls_from_markdown(markdown_text_or_doc):
    """
    Extract every URL written in Markdown link or image syntax.

    The input may be a Markdown string or a document-like object (for example
    a Firecrawl Document) exposing ``markdown`` and/or ``json`` attributes.
    Recognised forms: ``[text](url)``, ``![alt](url)``,
    ``[![alt](url)](url2)`` and links with a double-quoted title such as
    ``[text](url "title")``.

    Args:
        markdown_text_or_doc: Markdown string, document-like object, or None.

    Returns:
        list: URLs in order of appearance (duplicates are kept). If the object
        has no usable markdown but its ``json`` payload is (or decodes to) a
        dict with an ``'imagenes'`` key, that value is returned unchanged.
        Returns ``[]`` when there is nothing to scan.
    """
    import json

    has_markdown = hasattr(markdown_text_or_doc, 'markdown')
    has_json = hasattr(markdown_text_or_doc, 'json')

    if has_markdown or has_json:
        # Markdown is preferred when present and non-empty.
        markdown_text = (markdown_text_or_doc.markdown or "") if has_markdown else ""

        # Fall back to the JSON payload when there is no markdown to scan.
        # Checking this only when the 'markdown' attribute was missing made the
        # fallback unreachable for objects that carry both attributes.
        if not markdown_text and has_json:
            json_data = markdown_text_or_doc.json
            if isinstance(json_data, str):
                try:
                    json_data = json.loads(json_data)
                except (ValueError, TypeError):
                    # Not valid JSON: treat it as "no extracted images".
                    json_data = None
            if isinstance(json_data, dict) and 'imagenes' in json_data:
                return json_data['imagenes']
    else:
        # Plain value: use its string form directly (falsy values become "").
        markdown_text = str(markdown_text_or_doc) if markdown_text_or_doc else ""

    # Nothing to scan.
    if not markdown_text:
        return []

    # A URL (absolute http/https or root-relative) inside parentheses,
    # optionally followed by a double-quoted Markdown title before the ")".
    url_pattern = r'\((https?://[^\s)]+|/[^)\s]+)(?:\s+"[^"]*")?\)'
    urls = re.findall(url_pattern, markdown_text)
    return urls

In [24]:
# Collect every URL found in the scraped result and display the list.
urls = extract_urls_from_markdown(images_scraped)
urls

['https://www.honda.mx/web/img/Honda-the-power-of-dreams-color.png',
 'https://www.honda.mx/',
 'https://www.honda.mx/autos',
 'https://www.honda.mx/acura',
 'https://www.honda.mx/motos',
 'https://www.honda.mx/productos-de-fuerza',
 'https://www.honda.mx/marinos',
 'https://www.honda.mx/racing',
 'https://www.honda.mx/acerca',
 'https://www.honda.mx/honda-60-aniversario',
 'https://www.honda.mx/',
 'https://www.honda.mx/web/img/Honda-the-power-of-dreams-color.png',
 'https://www.honda.mx/',
 'https://www.honda.mx/web/img/motorcycles/home/logo-motos-horizontal-01.svg',
 'https://www.honda.mx/motos/#home',
 'https://www.honda.mx/motos/#home',
 'https://www.honda.mx/motos/lineup',
 'https://www.honda.mx/motos/#maintenance',
 'https://www.honda.mx/mapa-distribuidores?seccion=motos',
 'https://www.honda.mx/web/img/motorcycles/home/logo-motos-horizontal-01.svg',
 'https://www.honda.mx/motos/#home',
 'https://www.honda.mx/motos/crf300f#',
 'https://www.honda.mx/web/img/motorcycles/models/off

In [17]:
# ! Make sure to change this to the path fragment(s) you want to search for.
text_list_to_search = ["/wp-content/uploads/vento"]

# Keep, in their original order, the URLs containing at least one fragment.
url_list = [
    candidate
    for candidate in urls
    if any(fragment in candidate for fragment in text_list_to_search)
]
url_list

['https://www.vento.com/wp-content/uploads/vento-logo.svg',
 'https://www.vento.com/wp-content/uploads/vento-logo.svg',
 'https://www.vento.com/wp-content/uploads/vento-falkon-250-logo-768x250.webp',
 'https://www.vento.com/wp-content/uploads/vento-falkon-250-logo-768x250.webp',
 'https://www.vento.com/wp-content/uploads/vento-falkon-220-01.jpg',
 'https://www.vento.com/wp-content/uploads/vento-falkon-220-01.jpg',
 'https://www.vento.com/wp-content/uploads/vento-falkon-220-spec-01.jpg',
 'https://www.vento.com/wp-content/uploads/vento-falkon-220-spec-02.jpg',
 'https://www.vento.com/wp-content/uploads/vento-falkon-220-spec-03.jpg',
 'https://www.vento.com/wp-content/uploads/vento-falkon-220-spec-04.jpg',
 'https://www.vento.com/wp-content/uploads/vento-falkon-220-spec-05.jpg',
 'https://www.vento.com/wp-content/uploads/vento-falkon-220-spec-06.jpg',
 'https://www.vento.com/wp-content/uploads/vento-falkon-220-spec-07.jpg',
 'https://www.vento.com/wp-content/uploads/vento-falkon-220-spec

In [4]:
# Scrape the same URL through get_data_from_website; the cells below read the
# result's .html and .markdown attributes.
data_scraped = scraping_utils.get_data_from_website(url_to_extract_urls)

In [5]:
from bs4 import BeautifulSoup

# Parse the scraped HTML; an absent (None) html payload is parsed as an empty
# document instead of raising.
soup = BeautifulSoup(data_scraped.html or "", "html.parser")

# Select the product title by its CSS classes. A CSS selector matches elements
# that carry all of these classes regardless of their order or of extra
# classes, whereas find(class_="a b c d") only matches the exact class string.
title_element = soup.select_one(
    ".product_title.entry-title.elementor-heading-title.elementor-size-default"
)

# None when the page has no element with those classes.
product_title = title_element.get_text(strip=True) if title_element else None
product_title

In [6]:
import re

def extract_image_and_pdf_links(markdown_text):
    """
    Extract the image and PDF links that appear in a Markdown string.

    Links are read from ``![alt](url)`` / ``[name](url)`` syntax, including
    links with a double-quoted title (``[name](url "title")``). A link is kept
    when it ends with a known extension, either as written or once its query
    string / fragment is removed (so ``photo.jpg?v=2`` is kept).

    Args:
        markdown_text: Markdown source to scan; None or "" yields ``[]``.

    Returns:
        list[str]: Matching links in order of appearance (duplicates kept).
    """
    # Nothing to scan: avoid handing None to re.findall.
    if not markdown_text:
        return []

    # Image and PDF extensions to look for (tuples so str.endswith accepts them).
    image_extensions = ('.webp', '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.svg')
    pdf_extensions = ('.pdf',)
    all_extensions = image_extensions + pdf_extensions

    # URL inside parentheses, optionally followed by a quoted Markdown title.
    pattern = r'\((https?://[^\s)]+|/[^)\s]+)(?:\s+"[^"]*")?\)'
    matches = re.findall(pattern, markdown_text)

    def has_wanted_extension(link):
        """Return True when the link, with or without query/fragment, ends in a known extension."""
        lowered = link.lower()
        without_query = lowered.split('?', 1)[0].split('#', 1)[0]
        return lowered.endswith(all_extensions) or without_query.endswith(all_extensions)

    return [link for link in matches if has_wanted_extension(link)]

# Image/PDF links found in the scraped page's markdown.
links = extract_image_and_pdf_links(data_scraped.markdown)

In [7]:
# Extensions used to tell images from PDFs (defined once, outside the loop).
image_extensions = (".webp", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".svg")
pdf_extensions = (".pdf",)

imagenes = []
pdfs = []
for item in links:
    lowered = item.lower()
    # Look at the link both without its query/fragment and as written, so
    # "photo.jpg?v=2" and "get?file=photo.jpg" are each recognised. Matching on
    # the ending (not on any substring) keeps a PDF whose URL merely contains
    # an image extension from being filed as an image.
    without_query = lowered.split("?", 1)[0].split("#", 1)[0]
    candidates = (without_query, lowered)

    if any(candidate.endswith(image_extensions) for candidate in candidates):
        # Skip 300x300 variants (thumbnail-sized renditions).
        if '300x300' not in item:
            imagenes.append(item)
    elif any(candidate.endswith(pdf_extensions) for candidate in candidates):
        # Case-insensitive, so ".PDF" links are no longer dropped.
        pdfs.append(item)

# Remove duplicates while preserving first-seen order (set() would shuffle it).
imagenes = list(dict.fromkeys(imagenes))
pdfs = list(dict.fromkeys(pdfs))

In [8]:
# Show the collected PDF links.
pdfs

[]

In [9]:
# Show the collected image links.
imagenes

['https://kovemotocolombia.com/wp-content/uploads/2024/03/MX2503.png',
 'https://kovemotocolombia.com/wp-content/uploads/2024/03/3.png',
 'http://kovemotocolombia.com/wp-content/uploads/2024/01/cropped-Icono.png',
 'http://kovemotocolombia.com/wp-content/uploads/2024/03/MX250.png',
 'https://kovemotocolombia.com/wp-content/uploads/2024/03/1.png',
 'https://kovemotocolombia.com/wp-content/uploads/2024/03/2.png',
 'https://kovemotocolombia.com/wp-content/uploads/2024/03/MX2503-600x382.png']

In [107]:
import os
import requests

def descargar_archivos(lista_urls, carpeta_destino, timeout=20):
    """
    Download each URL in ``lista_urls`` into ``carpeta_destino``.

    The file name is the last segment of the URL path (query string and
    fragment are ignored). Files that already exist are skipped. When the
    server answers 406, the request is retried once with browser-like headers.
    Data is written to a temporary ``.part`` file and renamed only after the
    download completes, so an interrupted download never leaves a truncated
    file that later runs would skip as "already exists".

    Errors for an individual URL are printed and do not stop the loop.

    Args:
        lista_urls: Iterable of URLs to download.
        carpeta_destino: Destination directory; created if missing.
        timeout: Timeout in seconds passed to each HTTP request (default 20).

    Returns:
        None
    """
    from urllib.parse import urlsplit

    os.makedirs(carpeta_destino, exist_ok=True)

    # Browser-like headers used to get past responses such as 406.
    # 'br' is intentionally not advertised: requests/urllib3 only decode Brotli
    # when an optional brotli package is installed, and an undecoded body would
    # be written to disk as a corrupt file.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
    }

    for url in lista_urls:
        try:
            partes = urlsplit(url)
            # Last path segment only: "a.png?v=2" must be saved as "a.png".
            nombre_archivo = partes.path.rsplit("/", 1)[-1]
            if not nombre_archivo:
                raise ValueError("la URL no contiene un nombre de archivo")
            ruta_completa = os.path.join(carpeta_destino, nombre_archivo)

            # Already downloaded: do not fetch it again.
            if os.path.exists(ruta_completa):
                print(f"Archivo ya existe: {nombre_archivo}")
                continue

            response = requests.get(url, stream=True, timeout=timeout)
            if response.status_code == 406:
                response.close()
                print(f"406 recibido para {url}, reintentando con headers de navegador...")
                # Use the URL's own origin as Referer instead of a fixed,
                # unrelated site.
                headers = dict(browser_headers, Referer=f"{partes.scheme}://{partes.netloc}/")
                response = requests.get(url, stream=True, timeout=timeout, headers=headers)

            ruta_temporal = ruta_completa + ".part"
            try:
                response.raise_for_status()
                with open(ruta_temporal, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip empty keep-alive chunks
                            f.write(chunk)
                # Publish the file only once it is complete.
                os.replace(ruta_temporal, ruta_completa)
            finally:
                # Release the streamed connection and drop any partial file.
                response.close()
                if os.path.exists(ruta_temporal):
                    os.remove(ruta_temporal)
            print(f"Descargado: {nombre_archivo}")
        except Exception as e:
            print(f"Error al descargar {url}: {e}")


In [108]:
import re

# Folder name for this product. product_title is None when the title element
# was not found, which would otherwise create a folder literally named "None";
# fall back to the configured brand in that case. Characters that act as path
# separators or are invalid in file names are replaced so the title cannot
# create nested or invalid paths.
nombre_carpeta = re.sub(r'[\\/:*?"<>|]+', "_", product_title or brand_to_scrape).strip()

carpeta_destino = f"{SRC_DIR}/data/scraped_data_downloaded/{nombre_carpeta}"
descargar_archivos(pdfs, carpeta_destino)
descargar_archivos(imagenes, carpeta_destino)

Descargado: politica-de-proteccion-de-datos-personales-v4.pdf
Descargado: BROCHURE-VESPA-GTS-2024.pdf
Descargado: icon-whatsapp-white.svg
Descargado: motorcycle.svg
