# Explorando extracción de datos

In [None]:
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

In [4]:
data_dir = Path().resolve().parent / "data"
raw_dir = data_dir / "raw"

assert raw_dir.exists() & raw_dir.is_dir()

## Agricultura sonora

In [57]:
agricultira_dir = raw_dir / "agricultura"
agricultira_dir.mkdir(exist_ok=True)

In [5]:
url_agricultura = r"https://datos.sonora.gob.mx/dataset/Agricultura%20Sonora"

In [7]:
response = requests.get(url_agricultura)
response.raise_for_status()

In [10]:
response.text[:1000]; 

In [8]:
soup = BeautifulSoup(response.text, 'html.parser')

In [12]:
soup.prettify; 

In [50]:
atag = soup.find('a', class_='resource-url-analytics')
url = atag.get('href')
print('link:', url)
filename = Path(url).name
print('filename:', filename)

link: https://datos.sonora.gob.mx/dataset/3e17a7e8-c8d4-49c4-a426-4dec099cd0cd/resource/dd10301a-2a3c-47f5-887d-ef0dea49fa2e/download/agricultura-sonora-1999.xlsx
filename: agricultura-sonora-1999.xlsx


In [47]:
atags = soup.find_all("a", class_="resource-url-analytics")

download_urls = [a.get('href') for a in atags]

for du in download_urls: print(du)

https://datos.sonora.gob.mx/dataset/3e17a7e8-c8d4-49c4-a426-4dec099cd0cd/resource/dd10301a-2a3c-47f5-887d-ef0dea49fa2e/download/agricultura-sonora-1999.xlsx
https://datos.sonora.gob.mx/dataset/3e17a7e8-c8d4-49c4-a426-4dec099cd0cd/resource/a363751a-04d1-4a1b-b4a4-d1053dee6984/download/agricultura-sonora-2000.xlsx
https://datos.sonora.gob.mx/dataset/3e17a7e8-c8d4-49c4-a426-4dec099cd0cd/resource/e5657fa1-8a90-475a-a31b-24777a879c11/download/agricultura-sonora-2001.xlsx
https://datos.sonora.gob.mx/dataset/3e17a7e8-c8d4-49c4-a426-4dec099cd0cd/resource/b87fb27f-cd6c-4a2d-bb78-7e6f31f01974/download/agricultura-sonora-2002.xlsx
https://datos.sonora.gob.mx/dataset/3e17a7e8-c8d4-49c4-a426-4dec099cd0cd/resource/7977ea4c-2292-4131-beeb-a75094872ff2/download/agricultura-sonora-2003.xlsx
https://datos.sonora.gob.mx/dataset/3e17a7e8-c8d4-49c4-a426-4dec099cd0cd/resource/a97040f6-2e48-4347-8b7b-cd94f7353d44/download/agricultura-sonora-2004.xlsx
https://datos.sonora.gob.mx/dataset/3e17a7e8-c8d4-49c4-a42

In [43]:
r = requests.get(url, stream=True)

print('metadata:')
for k, v in r.headers.items(): print(k, v, sep=': ')

metadata:
Server: openresty
Date: Tue, 02 Dec 2025 03:35:09 GMT
Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
Content-Length: 39210
Connection: keep-alive
Last-Modified: Mon, 26 Feb 2024 18:38:24 GMT
Cache-Control: public, max-age=0, must-revalidate
Expires: Tue, 02 Dec 2025 15:35:09 GMT
ETag: "1708972704.6966813-39210-1176966738"
Strict-Transport-Security: max-age=63072000;includeSubDomains; preload
X-Served-By: datos.sonora.gob.mx


In [61]:
def download_file(url, dest_dir, chunk_size=5*1024*1024):
    """
    Descarga un archivo desde una URL mostrando una barra de progreso con tqdm.
    """
    try:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()

            total_size = int(r.headers.get('content-length', 0))

            filename = Path(url).name
            file_path = dest_dir / filename
            
            with open(file_path, 'wb') as f, tqdm(
                total=total_size if total_size > 0 else None,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
                desc=file_path.name
            ) as bar:
                
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        bar.update(len(chunk))
        
        return file_path

    except Exception as e:
        print(f'Error downloading file: {e}')

In [62]:
file_path = download_file(url, raw_dir)

agricultura-sonora-1999.xlsx: 100%|██████████| 38.3k/38.3k [00:00<00:00, 7.05MB/s]


In [55]:
def download_files_concurrently(urls, dest_dir, max_workers=5):
    """
    Descarga archivos de una lista de URLs de forma concurrente usando ThreadPoolExecutor.
    """
    downloaded_files = []

    dest_dir.mkdir(parents=True, exist_ok=True)
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(download_file, url, dest_dir)
            for url in urls
        ]
        
        for future in futures:
            result = future.result()  # Espera a que la tarea termine y obtiene el resultado
            if result:  # Si la descarga fue exitosa, agrega la ruta del archivo
                downloaded_files.append(result)

    return downloaded_files

In [63]:
files_path = download_files_concurrently(download_urls, agricultira_dir, 3)

agricultura-sonora-2001.xlsx: 100%|██████████| 39.8k/39.8k [00:00<00:00, 2.37MB/s]
agricultura-sonora-1999.xlsx: 100%|██████████| 38.3k/38.3k [00:00<00:00, 497kB/s]
agricultura-sonora-2000.xlsx: 100%|██████████| 40.0k/40.0k [00:00<00:00, 48.6MB/s]
agricultura-sonora-2002.xlsx: 100%|██████████| 39.2k/39.2k [00:00<00:00, 845kB/s]
agricultura-sonora-2003.xlsx: 100%|██████████| 91.9k/91.9k [00:00<00:00, 907kB/s]
agricultura-sonora-2004.xlsx: 100%|██████████| 85.2k/85.2k [00:00<00:00, 646kB/s]
agricultura-sonora-2005.xlsx: 100%|██████████| 88.2k/88.2k [00:00<00:00, 802kB/s]
agricultura-sonora-2006.xlsx: 100%|██████████| 89.4k/89.4k [00:00<00:00, 749kB/s]
agricultura-sonora-2007.xlsx: 100%|██████████| 89.9k/89.9k [00:00<00:00, 856kB/s]
agricultura-sonora-2008.xlsx: 100%|██████████| 88.5k/88.5k [00:00<00:00, 796kB/s]
agricultura-sonora-2009.xlsx: 100%|██████████| 89.7k/89.7k [00:00<00:00, 767kB/s]
agricultura-sonora-2010.xlsx: 100%|██████████| 98.8k/98.8k [00:00<00:00, 610kB/s]
agricultura-so

## Recursos hídricos Sonora

In [68]:
hidricos_dir = raw_dir / "hidricos"
hidricos_dir.mkdir(exist_ok=True)

In [69]:
url_hidricos = r"https://datos.sonora.gob.mx/dataset/Recursos%20H%C3%ADdricos"

In [76]:
def get_download_urls(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all("a", class_="resource-url-analytics")

    return [a.get('href') for a in atags]

In [78]:
hdownload_urls = get_download_urls(url_hidricos)

for hdu in hdownload_urls: print(hdu)

https://datos.sonora.gob.mx/dataset/ee8f639f-5e89-46ae-93b9-934b05fc6233/resource/f2333c5e-26ea-47c8-93ee-fda5e9106672/download/hidrico_sonora_1941-1949.xlsx
https://datos.sonora.gob.mx/dataset/ee8f639f-5e89-46ae-93b9-934b05fc6233/resource/8826c8a0-75f2-49b0-bf48-907b57c3495d/download/hidrico_sonora_1950-1959.xlsx

https://datos.sonora.gob.mx/dataset/ee8f639f-5e89-46ae-93b9-934b05fc6233/resource/c05fe66a-bdeb-4d53-ba93-beb315b98901/download/hidrico_sonora_1970-1979.xlsx
https://datos.sonora.gob.mx/dataset/ee8f639f-5e89-46ae-93b9-934b05fc6233/resource/ffe3c845-1fc7-4a6c-ab21-3a1db1740fdd/download/hidrico_sonora_1980-1989.xlsx
https://datos.sonora.gob.mx/dataset/ee8f639f-5e89-46ae-93b9-934b05fc6233/resource/fabf0415-7c55-4b01-948f-5cedfb58d4b3/download/hidrico_sonora_1990-1999.xlsx
https://datos.sonora.gob.mx/dataset/ee8f639f-5e89-46ae-93b9-934b05fc6233/resource/458d5daa-d0d1-452d-9524-5ef871f7cbe9/download/hidrico_sonora_2000-2009.xlsx
https://datos.sonora.gob.mx/dataset/ee8f639f-5e89-4

In [80]:
hfiles_path = download_files_concurrently(hdownload_urls, hidricos_dir)

Error downloading file: Invalid URL '': No scheme supplied. Perhaps you meant https://?


hidrico_sonora_1990-1999.xlsx:   0%|          | 0.00/656k [00:00<?, ?B/s]
hidrico_sonora_1941-1949.xlsx: 100%|██████████| 145k/145k [00:00<00:00, 451kB/s]


hidrico_sonora_1990-1999.xlsx: 100%|██████████| 656k/656k [00:01<00:00, 536kB/s]



[A[A[A
hidrico_sonora_1970-1979.xlsx: 100%|██████████| 645k/645k [00:01<00:00, 509kB/s]


hidrico_sonora_1950-1959.xlsx: 100%|██████████| 317k/317k [00:00<00:00, 526kB/s]
hidrico_sonora_1980-1989.xlsx: 100%|██████████| 647k/647k [00:01<00:00, 608kB/s]
hidrico_sonora_2020-actualidad2024.xlsx:   0%|          | 0.00/312k [00:00<?, ?B/s]


hidrico_sonora_2000-2009.xlsx: 100%|██████████| 654k/654k [00:01<00:00, 607kB/s]
diccionario_hidrica_sonora.csv: 100%|██████████| 443/443 [00:00<00:00, 1.30MB/s]
catalogo.xlsx: 100%|██████████| 14.8k/14.8k [00:00<00:00, 18.1MB/s]
hidrico_sonora_2020-actualidad2024.xlsx: 100%|██████████| 312k/312k [00:00<00:00, 650kB/s]
hidrico_sonora_2010-2019.xlsx: 100%|██████████| 628k/628k [00:01<00:00, 443kB/s]
