In [7]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
from datetime import datetime
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import os
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime
from azure.storage.blob import BlobServiceClient
# Pull variables from a local .env file into the process environment
load_dotenv()

# Connection string for Azure Blob Storage; None when the variable is not defined
AZURE_STORAGE_CONNECTION_STRING = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")

In [None]:
# Display only whether the connection string was loaded. Echoing the value itself
# would write the storage-account secret into the notebook's cell output.
bool(AZURE_STORAGE_CONNECTION_STRING)

In [9]:
def read_urls_from_file(filepath):
    """Read a list of URLs from a text file, one URL per line.

    Each line may be wrapped in single or double quotes and may end with a
    comma (for example when pasted from a Python list literal); those
    characters are removed. Lines that are blank, or that contain nothing but
    quotes, commas and whitespace, are skipped so no empty URL is returned.

    Args:
        filepath: Path of the text file to read.

    Returns:
        list[str]: The cleaned URLs, in file order.
    """
    urls = []
    with open(filepath, 'r') as file:
        for line in file:
            # Remove the trailing comma and every quote character, then trim once
            # more so whitespace that sat between the URL and the comma/quote
            # does not end up inside the URL.
            cleaned = line.strip().rstrip(',').replace("'", "").replace('"', '').strip()
            # Filter AFTER cleaning: a line such as "," is non-blank but carries no URL.
            if cleaned:
                urls.append(cleaned)
    return urls

def correct_json_format(json_text):
    """Insert the comma missing between a URL-valued pair and the key after it.

    Wherever the text contains ``"key": "http(s)://..."`` immediately followed
    by another quoted word, the span is rewritten as
    ``"key": "http(s)://...", "word"`` (whitespace around the key and the colon
    is normalised by the replacement). Text without that pattern is returned
    unchanged.

    Args:
        json_text: Raw JSON-like text.

    Returns:
        str: The text with every such occurrence rewritten.
    """
    missing_comma = re.compile(r'"\s*([^"]+?)\s*"\s*:\s*"(https?://[^"]+)"\s*"(\w+)"')
    return missing_comma.sub(r'"\1": "\2", "\3"', json_text)

def extract_categories(url):
    """Derive category and subcategory from the path of a URL.

    The first non-empty path segment is the category and the second one is the
    subcategory. Only the path is inspected, so the scheme, host, query string
    and fragment never end up in the result, and empty segments (for example
    from a trailing slash) are ignored.

    Args:
        url: URL of a page, e.g. ``https://www.dia.es/<category>/<subcategory>/...``.

    Returns:
        tuple[str, str]: ``(categoria, subcategoria)``. ``'Sin categoría'`` /
        ``'Sin subcategoría'`` are used when the corresponding segment is missing.
    """
    from urllib.parse import urlsplit

    # str.split always yields at least one (possibly empty) item, so empty
    # segments must be dropped for the fallbacks below to be reachable.
    segments = [segment for segment in urlsplit(url).path.split('/') if segment]

    categoria = segments[0] if len(segments) > 0 else 'Sin categoría'
    subcategoria = segments[1] if len(segments) > 1 else 'Sin subcategoría'

    return categoria, subcategoria

In [11]:
# Read the listing-page URLs to crawl from file
urls = read_urls_from_file('urls.txt')
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'}
base_url = 'https://www.dia.es'
# Seconds to wait for the server; without a timeout requests.get can block indefinitely
REQUEST_TIMEOUT = 30


def _fetch_html(page_url):
    """Return the body of ``page_url`` as text, or None when it cannot be fetched.

    Network errors and non-200 responses are reported and skipped so that one
    bad URL does not abort the whole scraping run.
    """
    try:
        response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(f"Request failed for URL: {page_url}")
        print(f"Error message: {e}")
        return None
    if response.status_code != 200:
        print(f"Unexpected status {response.status_code} for URL: {page_url}")
        return None
    return response.text


# Extract unique links
all_unique_links = set()
for url in urls:
    html_content = _fetch_html(url)
    if html_content is None:
        continue
    soup = BeautifulSoup(html_content, 'html.parser')
    # Second-to-last path segment of the listing URL; links must contain it to be kept
    category_segment = url.split('/')[-2]
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('/'):
            href = base_url + href  # Converts relative links to absolute
        # Keep same-site links of this category whose last segment is not a 'sort' variant
        if href.startswith(base_url) and category_segment in href and 'sort' not in href.split('/')[-1]:
            all_unique_links.add(href)

# Convert the set to a sorted list so the crawl order (and the CSV row order)
# is the same on every run instead of depending on set iteration order.
all_unique_links = sorted(all_unique_links)

productos = []
precios = []
categorias = []
subcategorias = []

# Scrape product details
for url in all_unique_links:
    html_content = _fetch_html(url)
    if html_content is None:
        continue
    soup = BeautifulSoup(html_content, 'html.parser')

    # Product data is read from the first JSON-LD <script> block of the page
    script = soup.find('script', type='application/ld+json')
    if not script:
        continue

    corrected_json_text = correct_json_format(script.text)
    try:
        data = json.loads(corrected_json_text)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from URL: {url}")
        print(f"Error message: {e}")
        continue

    # A JSON-LD payload can also be a list; only a single object is handled here
    if not isinstance(data, dict):
        print(f"Unexpected JSON-LD structure at URL: {url}")
        continue

    nombre_producto = data.get('name', 'Nombre no disponible')
    offers = data.get('offers', {})
    # 'offers' may hold a list of offers; use its first entry in that case
    if isinstance(offers, list):
        offers = offers[0] if offers else {}
    if isinstance(offers, dict):
        precio_producto = offers.get('price', 'Precio no disponible')
    else:
        precio_producto = 'Precio no disponible'

    categoria, subcategoria = extract_categories(url)

    productos.append(nombre_producto)
    precios.append(precio_producto)
    categorias.append(categoria)
    subcategorias.append(subcategoria)

# Create a DataFrame
df = pd.DataFrame({
    'Categoría': categorias,
    'Subcategoría': subcategorias,
    'Producto': productos,
    'Precio': precios
})

# Semicolon-separated, UTF-8 with BOM; the file name carries the run date
csv_filename = f'productos_precios_categorias_{datetime.now().strftime("%Y_%m_%d")}.csv'
df.to_csv(csv_filename, index=False, sep=';', encoding='utf-8-sig')

In [5]:
# Take a single timestamp so the 'Fecha' column and the file name always agree,
# even if the cell runs across midnight.
run_timestamp = datetime.now()

# Add a date column in Spanish format (dd/mm/yyyy)
current_date = run_timestamp.strftime("%d/%m/%Y")
df['Fecha'] = current_date

# Save the DataFrame to a CSV file
csv_filename = f'productos_precios_categorias_{run_timestamp.strftime("%Y_%m_%d")}.csv'
df.to_csv(csv_filename, index=False, sep=';', encoding='utf-8-sig')

# Azure Blob Storage details
connect_str = AZURE_STORAGE_CONNECTION_STRING  # Azure Blob Storage connection string
if not connect_str:
    # Fail with an explicit message instead of handing an empty value to the Azure SDK
    raise ValueError("AZURE_STORAGE_CONNECTION_STRING is not set; cannot upload to Azure Blob Storage.")
container_name = "scrapingdia"

# Create the BlobServiceClient object
blob_service_client = BlobServiceClient.from_connection_string(connect_str)

# Create a blob client; the blob is named after the local CSV file
blob_client = blob_service_client.get_blob_client(container=container_name, blob=csv_filename)

# Upload the created file, replacing any blob with the same name
with open(csv_filename, "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

print(f"File {csv_filename} uploaded to Azure Blob Storage successfully.")


File productos_precios_categorias_2024_05_22.csv uploaded to Azure Blob Storage successfully.
