### 1.1 Descarga de datos

Notebook dedicado a la descarga de datos de internet de la página de Compranetinfo.hacienda.gob.mx.
Tener en cuenta que esta página puede no abrir desde determinados países, si eso ocurre será necesario instalar una VPN.

In [5]:
#Importa las librerías necesarias
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import tqdm
import io
import zipfile

### Descarga automática de los datos y decompresión del archivo comprimido.

In [6]:
# URL of the zip file
url = "https://compranetinfo.hacienda.gob.mx/dabiertos/contrataciones_arr.json.zip"

print("Starting the download...")

# Create a session object to manage retries
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

# Send a GET request to the URL with retries and streaming enabled
response = session.get(url, stream=True)

# Check if the request was successful
if response.status_code == 200:
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte
    progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    content = io.BytesIO()
    try:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            content.write(data)
    except requests.exceptions.RequestException as e:
        print(f"Download failed with error: {e}")
    finally:
        progress_bar.close()

    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong during the download")
    else:
        print("Download completed successfully.")

        print("Extracting the ZIP file...")
        # Open the zip file located at the URL
        zip_file = zipfile.ZipFile(content)
        
        # Extract the zip file to the current directory
        zip_file.extractall(".")
        print("Extraction completed successfully.")
        
        # Get the list of extracted file names
        extracted_files = zip_file.namelist()
        print(f"Extracted files: {extracted_files}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")


Starting the download...


  0%|          | 97.3k/3.05G [00:04<42:29:44, 19.9kiB/s]


KeyboardInterrupt: 

In [None]:
import ijson

filename = 'contratacionesabiertas_bulk.json'

# Processing the JSON file incrementally
with open(filename, 'r') as file:
    parser = ijson.parse(file)
    for prefix, event, value in parser:
        print('prefix={}, event={}, value={}'.format(prefix, event, value))
        # Process your JSON data incrementally here
        print("test")


## Unzip file

In [None]:
print("Extracting the ZIP file... \n...this may take a while... about 3 to 5 mins...")

# Open the zip file located at the path
zip_file_path = "contrataciones_arr.json-001.zip"
with ZipFile(zip_file_path, 'r') as zip_file:
    # Get a list of all archived file names from the zip
    all_files = zip_file.namelist()
    
    # Set up the progress bar
    with tqdm(total=len(all_files), desc="Extracting files", unit="files") as pbar:
        for file in all_files:
            # Extract each file to the current directory
            zip_file.extract(member=file, path=".")
            # Update the progress bar by one
            pbar.update(1)

print("Extraction completed successfully.")

# Get the list of extracted file names
extracted_files = zip_file.namelist()
print(f"Extracted files: {extracted_files}")


### Read the json file

In [None]:
import ijson

filename = 'contratacionesabiertas_bulk.json'

# Processing the JSON file incrementally
with open(filename, 'r') as file:
    parser = ijson.parse(file)
    for prefix, event, value in parser:
        print('prefix={}, event={}, value={}'.format(prefix, event, value))
        # Process your JSON data incrementally here
        print("test")

In [12]:
import json

filename = 'contratacionesabiertas_bulk.json'
pretty_data = json.dumps(filename, indent=4)
print(pretty_data)


"contratacionesabiertas_bulk.json"


In [13]:
import json

# Path to your large JSON file
file_path = 'contratacionesabiertas_bulk.json'

# Function to read and yield items from a JSON file one at a time
def read_json_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            yield json.loads(line)

# Create a generator
json_gen = read_json_file(file_path)

# Number of items to print
num_items_to_print = 5

# Iterate over the generator and print the first few items
for i, item in enumerate(json_gen):
    print(json.dumps(item, indent=2))  # Pretty print the JSON item
    if i + 1 == num_items_to_print:
        break
