# **Data Extraction**

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pathlib import Path

## Scraping Datos Historicos

In [2]:
url = r"https://datos.profeco.gob.mx/datos_abiertos/qqp.php#"

In [3]:
r = requests.get(url)
r.status_code, r.content[:100]  # OK

(200,
 b'<!DOCTYPE html>\r\n<html lang="es">\r\n<head>\r\n    <meta http-equiv="Content-Type" content="text/html; c')

In [4]:
soup = BeautifulSoup(r.content, 'html.parser')

In [5]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="es">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <meta content="Datos abiertos de la Procuraduría Federal del Consumidor" name="description"/>
  <meta content="Procuraduría Federal del Consumidor (PROFECO)" name="author"/>
  <title>
   Datos abiertos | PROFECO
  </title>
  <link href="https://www.gob.mx/cms/uploads/image/file/488329/favicon.png" rel="icon"/>
  <link href="https://framework-gb.cdn.gob.mx/assets/styles/main.css" rel="stylesheet"/>
  <link href="https://cdn.datatables.net/v/dt/dt-1.11.0/r-2.2.9/datatables.min.css" rel="stylesheet" type="text/css">
  </link>
 </head>
 <body>
  <nav class="navbar navbar-inverse sub-navbar navbar-fixed-top">
   <div class="container">
    <div class="navbar-header">
     <button class="navbar-toggle collapsed" data-target="#subenlaces" data-toggle="collapse" type="button">
      <span class

In [6]:
soup.title, soup.title.string, soup.title.name

(<title>Datos abiertos | PROFECO</title>, 'Datos abiertos | PROFECO', 'title')

In [7]:
soup.a

<a class="navbar-brand" href="https://www.gob.mx/profeco">PROFECO</a>

In [8]:
soup.find_all('a')

[<a class="navbar-brand" href="https://www.gob.mx/profeco">PROFECO</a>,
 <a href="index.php"><i class="icon icon-home"></i></a>,
 <a href="file.php?t=c7ada121b4029dddd5a3463fa367c56d">Base de Datos Histórica Quién es Quién en los Precios 2025</a>,
 <a href="file.php?t=4af96007e16916edb17ef6f9279e7016">Base de Datos Histórica Quién es Quién en los Precios 2024</a>,
 <a href="file.php?t=f3270d227f2966e6138a3ed41a9bbfb7">Base de Datos Histórica Quién es Quién en los Precios 2023</a>,
 <a href="file.php?t=af88f42c5cb82c6c35dd962b1ae69051">Base de Datos Histórica Quién es Quién en los Precios 2022</a>,
 <a href="file.php?t=493b83b886f0266909d783fc8f776b11">Base de Datos Histórica Quién es Quién en los Precios 2021</a>,
 <a href="file.php?t=4df382eefa26f1f0d28d3a11aaf41add">Base de Datos Histórica Quién es Quién en los Precios 2020</a>,
 <a href="file.php?t=09939d92d2afcde64dbc06e057877e16">Base de Datos Histórica Quién es Quién en los Precios 2019</a>,
 <a href="file.php?t=01fafa951fb6c82e6

* Link para descargar (2025): https://datos.profeco.gob.mx/datos_abiertos/file.php?t=c7ada121b4029dddd5a3463fa367c56d

In [9]:
for link in soup.find_all('a'):
    href = link.get('href') 
    print(type(href), href, 'file' in href)

<class 'str'> https://www.gob.mx/profeco False
<class 'str'> index.php False
<class 'str'> file.php?t=c7ada121b4029dddd5a3463fa367c56d True
<class 'str'> file.php?t=4af96007e16916edb17ef6f9279e7016 True
<class 'str'> file.php?t=f3270d227f2966e6138a3ed41a9bbfb7 True
<class 'str'> file.php?t=af88f42c5cb82c6c35dd962b1ae69051 True
<class 'str'> file.php?t=493b83b886f0266909d783fc8f776b11 True
<class 'str'> file.php?t=4df382eefa26f1f0d28d3a11aaf41add True
<class 'str'> file.php?t=09939d92d2afcde64dbc06e057877e16 True
<class 'str'> file.php?t=01fafa951fb6c82e6e4bb491af8f1688 True
<class 'str'> file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70 True
<class 'str'> file.php?t=c388a30cb3f4b4c4fa29302618ef5557 True
<class 'str'> file.php?t=4ecfa981c01e742a5461bf543a7b4108 True


In [10]:
download_file_endpoints = [
    link.get('href')
    for link in soup.find_all('a')
    if link.get('href') and "file" in link.get('href')
]
download_file_endpoints

['file.php?t=c7ada121b4029dddd5a3463fa367c56d',
 'file.php?t=4af96007e16916edb17ef6f9279e7016',
 'file.php?t=f3270d227f2966e6138a3ed41a9bbfb7',
 'file.php?t=af88f42c5cb82c6c35dd962b1ae69051',
 'file.php?t=493b83b886f0266909d783fc8f776b11',
 'file.php?t=4df382eefa26f1f0d28d3a11aaf41add',
 'file.php?t=09939d92d2afcde64dbc06e057877e16',
 'file.php?t=01fafa951fb6c82e6e4bb491af8f1688',
 'file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70',
 'file.php?t=c388a30cb3f4b4c4fa29302618ef5557',
 'file.php?t=4ecfa981c01e742a5461bf543a7b4108']

In [11]:
URL_DOWNLOAD_ROOT = "https://datos.profeco.gob.mx/datos_abiertos/"

---

In [63]:
import re

In [64]:
links_dict = {}

In [72]:
for link in soup.find_all("a", href=True):
    text = link.get_text(strip=True)         
    href = link["href"]                      
    # print(text, href)  # OK
    match = re.search(r"\b(20\d{2})\b", text)  # busca un año tipo 2020–2099
    if match:
        year = match.group(1)
        links_dict[year] = URL_DOWNLOAD_ROOT + '/' + href
 

In [73]:
links_dict

{'2025': 'https://datos.profeco.gob.mx/datos_abiertos//file.php?t=c7ada121b4029dddd5a3463fa367c56d',
 '2024': 'https://datos.profeco.gob.mx/datos_abiertos//file.php?t=4af96007e16916edb17ef6f9279e7016',
 '2023': 'https://datos.profeco.gob.mx/datos_abiertos//file.php?t=f3270d227f2966e6138a3ed41a9bbfb7',
 '2022': 'https://datos.profeco.gob.mx/datos_abiertos//file.php?t=af88f42c5cb82c6c35dd962b1ae69051',
 '2021': 'https://datos.profeco.gob.mx/datos_abiertos//file.php?t=493b83b886f0266909d783fc8f776b11',
 '2020': 'https://datos.profeco.gob.mx/datos_abiertos//file.php?t=4df382eefa26f1f0d28d3a11aaf41add',
 '2019': 'https://datos.profeco.gob.mx/datos_abiertos//file.php?t=09939d92d2afcde64dbc06e057877e16',
 '2018': 'https://datos.profeco.gob.mx/datos_abiertos//file.php?t=01fafa951fb6c82e6e4bb491af8f1688',
 '2017': 'https://datos.profeco.gob.mx/datos_abiertos//file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70',
 '2016': 'https://datos.profeco.gob.mx/datos_abiertos//file.php?t=c388a30cb3f4b4c4fa2930261

---

In [12]:
full_download_links = [URL_DOWNLOAD_ROOT + path for path in download_file_endpoints]
full_download_links

['https://datos.profeco.gob.mx/datos_abiertos/file.php?t=c7ada121b4029dddd5a3463fa367c56d',
 'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=4af96007e16916edb17ef6f9279e7016',
 'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=f3270d227f2966e6138a3ed41a9bbfb7',
 'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=af88f42c5cb82c6c35dd962b1ae69051',
 'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=493b83b886f0266909d783fc8f776b11',
 'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=4df382eefa26f1f0d28d3a11aaf41add',
 'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=09939d92d2afcde64dbc06e057877e16',
 'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=01fafa951fb6c82e6e4bb491af8f1688',
 'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70',
 'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=c388a30cb3f4b4c4fa29302618ef5557',
 'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=4ecfa981c01e742a5461bf5

## Download full data

In [15]:
DATA_PATH = Path("../data/")
RAW_DATA_PATH = DATA_PATH.joinpath("raw/")
DATA_PATH, RAW_DATA_PATH

(WindowsPath('../data'), WindowsPath('../data/raw'))

In [16]:
DATA_PATH.mkdir(exist_ok=True)
RAW_DATA_PATH.mkdir(exist_ok=True)

In [18]:
url_2025 = full_download_links[0]
url_2025

'https://datos.profeco.gob.mx/datos_abiertos/file.php?t=c7ada121b4029dddd5a3463fa367c56d'

In [None]:
r = requests.get(url_2025)
r.status_code

200

* Los archivos son pesados: `+100mb`

In [27]:
response = requests.get(url_2025, stream=True)
print(response.status_code)
for header in response.headers: 
    print(header)

200
Cache-Control
Transfer-Encoding
Content-Type
Server
X-Powered-By
Content-Transfer-Encoding
Content-disposition
Strict-Transport-Security
X-Content-Type-Options
X-Frame-Options
X-XSS-Protection
Date


In [28]:
content_disposition = response.headers['content-disposition']
filename = content_disposition.split("filename=")[1]
content_disposition, filename

('inline; filename=QQP_2025.rar', 'QQP_2025.rar')

In [29]:
with open(RAW_DATA_PATH.joinpath(filename), mode='wb') as file: 
    for chunk in response.iter_content(chunk_size=10*1024): 
        file.write(chunk)

In [None]:
def download_file_by_chunks(url, path=Path("../data/raw/")):
    response = requests.get(url, stream=True) 

    # Get filename
    if "content-disposition" in response.headers:
        content_disposition = response.headers["content-disposition"]
        filename = content_disposition.split("filename=")[-1]
    else:
        filename = url.split("/")[-1]

    file_path = path / filename

    # download by chunks
    with open(file_path, mode='wb') as file:
        for chunk in response.iter_content(chunk_size=1024*1024):  # 1 MB
            if chunk:  # avoid empty chunks
                file.write(chunk)
    
    if file_path.is_file(): 
        print('Data downloaded at: ', file_path)
        
    return file_path

In [48]:
download_file_by_chunks(url_2025, RAW_DATA_PATH)

Data downloaded at:  ..\data\raw\QQP_2025.rar


WindowsPath('../data/raw/QQP_2025.rar')

### Download in parallel

In [49]:
from concurrent.futures import ThreadPoolExecutor

In [61]:
def download_files(downloader, urls): 
    with ThreadPoolExecutor(max_workers=4) as executor: 
        executor.map(downloader, urls)

In [62]:
download_files(download_file_by_chunks, full_download_links)

Data downloaded at:  ..\data\raw\QQP_2025.rar
Data downloaded at:  ..\data\raw\QQP_2022.rar
Data downloaded at:  ..\data\raw\QQP_2024.rar
Data downloaded at:  ..\data\raw\QQP_2023.rar
Data downloaded at:  ..\data\raw\QQP_2021.rar
Data downloaded at:  ..\data\raw\QQP_2020.rar
Data downloaded at:  ..\data\raw\QQP_2019.rar
Data downloaded at:  ..\data\raw\QQP_2018.rar
Data downloaded at:  ..\data\raw\QQP_2017.rar
Data downloaded at:  ..\data\raw\QQP_2016.rar
Data downloaded at:  ..\data\raw\QQP_2015.rar
