In [1]:
import os
import PyPDF2
import re
import requests
import sys
import zipfile
import numpy as np
import pandas as pd 
import polars as pl

In [3]:
def extractAndDownloadLinks(pdfPath: str, downloadFolder: str):
    """Extracts links from Mexico's open data webpage PDF files and downloads them.

    Parameters
    ----------
    pdfPath (str): 
        The path to the PDF file.
    downloadFolder (str): 
        The folder to save downloaded files.

    Returns
    -------
    None
    """
    with open(pdfPath, 'rb') as pdf:
        reader = PyPDF2.PdfReader(pdf)

        for page in reader.pages:
            if "/Annots" in page:
                for annot in page["/Annots"]:
                    obj = annot.get_object()
                    if obj["/Subtype"] == "/Link":
                        linkDestination = obj['/A']['/URI']

                        # Extract filename from the link (might need adjustments)
                        filename = linkDestination.split('/')[-1]  

                        # Download the file                        
                        response = requests.get(linkDestination)                        
                        if response.status_code == 200:
                            with open(os.path.join(downloadFolder, filename), 'wb') as f:
                                f.write(response.content)
                            print(f"Downloaded: {linkDestination}")
                        else:
                            print(f"Failed to download: {linkDestination}")


extractAndDownloadLinks(covidPDFPath, downloadsPath)

Downloaded: https://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/2024/03/datos_abiertos_covid19_05.03.2024.zip
Downloaded: https://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/2024/03/datos_abiertos_covid19_12.03.2024.zip
Downloaded: https://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/2024/03/datos_abiertos_covid19_19.03.2024.zip
Downloaded: https://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/2024/03/datos_abiertos_covid19_26.03.2024.zip
Downloaded: https://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/2024/02/datos_abiertos_covid19_06.02.2024.zip
Downloaded: https://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/2024/02/datos_abiertos_covid19_13.02.2024.zip
Downloaded: https://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/2024/02/datos_abiertos_covid19_20.02.2024.zip
Downloaded: https://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historico

In [4]:
def renameFiles(zipFilesPath: str):
    """
    Renames zip files in the Mexican COVID19 dataset from the local date format, to a more universal YYYY/MM/DD format.

    Parameters
    ----------
    zipFilesPath (str): 
        The path to the folder containing all the zip files.
    
    Returns
    -------
    None
    """
    for filename in os.listdir(zipFilesPath):
        splitDate = filename.split("_")[-1]
        day, month, year, extension = splitDate.split(".")
        destination = zipFilesPath + f"\\datos_abiertos_covid19_{year}.{month}.{day}.zip"
        os.rename(zipFilesPath+"\\"+filename, destination)
        print(f"Renamed {filename} to {destination.split('\\')[-1]}")


renameFiles(downloadsPath) # Why? You might ask... Just to be able to merge them into less files later on...

Renamed datos_abiertos_covid19_02.01.2024.zip to datos_abiertos_covid19_2024.01.02.zip
Renamed datos_abiertos_covid19_02.04.2024.zip to datos_abiertos_covid19_2024.04.02.zip
Renamed datos_abiertos_covid19_05.03.2024.zip to datos_abiertos_covid19_2024.03.05.zip
Renamed datos_abiertos_covid19_06.02.2024.zip to datos_abiertos_covid19_2024.02.06.zip
Renamed datos_abiertos_covid19_09.01.2024.zip to datos_abiertos_covid19_2024.01.09.zip
Renamed datos_abiertos_covid19_09.04.2024.zip to datos_abiertos_covid19_2024.04.09.zip
Renamed datos_abiertos_covid19_12.03.2024.zip to datos_abiertos_covid19_2024.03.12.zip
Renamed datos_abiertos_covid19_13.02.2024.zip to datos_abiertos_covid19_2024.02.13.zip
Renamed datos_abiertos_covid19_16.01.2024.zip to datos_abiertos_covid19_2024.01.16.zip
Renamed datos_abiertos_covid19_19.03.2024.zip to datos_abiertos_covid19_2024.03.19.zip
Renamed datos_abiertos_covid19_20.02.2024.zip to datos_abiertos_covid19_2024.02.20.zip
Renamed datos_abiertos_covid19_23.01.2024.z

In [5]:
zipFiles = [file for file in os.listdir(downloadsPath) if file.endswith('.zip')]

# Initialize an empty DataFrame and variables for the current year and month
dataFrame = None
currentYear = None
currentMonth = None

for zipFile in zipFiles:
    print(f"Opening {zipFile}")

    # Extract the year and month from the zip file name
    match = re.search(r'datos_abiertos_covid19_(\d{4})\.(\d{2})\.\d{2}\.zip', zipFile)
    year = match.group(1)
    month = match.group(2)

    # Extract the zip file
    with zipfile.ZipFile(os.path.join(downloadsPath, zipFile), 'r') as zipReference:
        zipReference.extractall(downloadsPath)

    # Get the name of the extracted file
    extractedFile = zipReference.namelist()[0]

    # Read the extracted file into a DataFrame
    temporalDataFrame = pl.read_csv(os.path.join(downloadsPath, extractedFile), infer_schema_length=0)  

    if year == currentYear and month == currentMonth:
        # If the year and month match, concatenate the DataFrame
        dataFrame = pl.concat([dataFrame, temporalDataFrame], how='vertical_relaxed') if dataFrame is not None else temporalDataFrame
    else:
        # If the year and month do not match, save the DataFrame and start a new one
        if dataFrame is not None:
            dataFrame.write_csv(os.path.join(downloadsPath, f'COVID19MEXICO-{currentYear}-{currentMonth}.csv'))
            print(f"Successfully saved COVID19 data for {currentYear}/{currentMonth}")
            del dataFrame 
        dataFrame = temporalDataFrame
        currentYear = year
        currentMonth = month

    # Delete the temporary DataFrame 
    del temporalDataFrame

# Save the last DataFrame
if dataFrame is not None:
    dataFrame.write_csv(os.path.join(downloadsPath, f'COVID19MEXICO-{currentYear}-{currentMonth}.csv'))
    print(f"Successfully saved COVID19 data for {currentYear}/{currentMonth}")
    del dataFrame

Opening datos_abiertos_covid19_2024.01.02.zip
Opening datos_abiertos_covid19_2024.01.09.zip
Opening datos_abiertos_covid19_2024.01.16.zip
Opening datos_abiertos_covid19_2024.01.23.zip
Opening datos_abiertos_covid19_2024.01.30.zip
Opening datos_abiertos_covid19_2024.02.06.zip
Successfully saved COVID19 data for 2024/01
Opening datos_abiertos_covid19_2024.02.13.zip
Opening datos_abiertos_covid19_2024.02.20.zip
Opening datos_abiertos_covid19_2024.02.27.zip
Opening datos_abiertos_covid19_2024.03.05.zip
Successfully saved COVID19 data for 2024/02
Opening datos_abiertos_covid19_2024.03.12.zip
Opening datos_abiertos_covid19_2024.03.19.zip
Opening datos_abiertos_covid19_2024.03.26.zip
Opening datos_abiertos_covid19_2024.04.02.zip
Successfully saved COVID19 data for 2024/03
Opening datos_abiertos_covid19_2024.04.09.zip
Successfully saved COVID19 data for 2024/04
