For all datasets:
1. Extract dataset: 
    - HTTP download into csv file format
        - Resilience: Add retries after n-seconds if unsuccessful initially

2. Transform dataset
    - Drop all columns that are unnecessary
    - (Translate column names) - this would be hardcoded and has potential license implications (just like transforming actually)
    - Check for missing values:
        - if over a certain threshold: handle with:
            - mean/mode/deletion/regression/knn
    - Change time datatypes into datetime format
    - (Potentially) Aggregate datasets into monthly formats
        - Not sure if that is best for this step or better saved for a later step

3. Load dataset into /data/ folder

- In order to make it modular (and because we are working with a large number of datasets):
    - define functions for standard tasks
        - download
        - datetime transformation
        - missing value handling
        - (dropping columns)
        - saving dataset
- Throughout it all use logging (on console & also in a log file? use library?) 
- Focus on Error Handling
- Don't forget to update github issue with this content
- Add .sh
- Perform operations not in place

In [108]:
# Installs
#%pip install retry

# TODO: Necessary? Does this fw the .sh -> create a proper venv for this project?

In [134]:
# Imports

import pandas as pd
import logging
from retry import retry
import requests	
from enum import Enum, auto
import io
import copy
import zipfile
import os

In [3]:
# Configure the logging system
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

In [135]:
@retry(tries=3, delay=30, logger=logging.getLogger())
def extract_dataset(dataset_url: str, timeout: (int,int) = (None, None), is_zip: bool = False):
    """
    Download datasets via HTTP request.
    Retry three times, after waiting for 30s each, if unsuccessful.
    
    Parameters:
    dataset_url: (str): URL of a dataset in the csv-file format.
    timeout: (int, int): The timeout for the HTTP request in seconds. First tuple value is connection timeout, second tuple value is read timeout. Default behaviour is, that no time-out is applied
    is_zip: (bool): Flag indicating if the dataset is a zip file containing multiple CSV files.
    
    Returns:
    response: The decoded response content
    """

    logging.info(f"Attempting to fetch data from {dataset_url}") 
    response = requests.get(dataset_url, timeout=timeout) 
    response.raise_for_status()  # Raise an exception for HTTP errors 
    logging.info(f"Successfully fetched data from {dataset_url}") 
    
    # Pick out the csv dataset file which is not metadata (as identified by file-name).
    if is_zip:
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
            csv_files = [f for f in zip_file.namelist() if f.endswith('.csv') and 'metadata' not in f.lower()]
            if len(csv_files) == 1: # Ensure that there is only a singular csv dataset file
                with zip_file.open(csv_files[0]) as csv_file:
                    csv_data = csv_file.read().decode('utf-8')
                    return csv_data
            else:
                raise ValueError(f"Expected exactly one CSV file without 'metadata' in the name, found: {csv_files}")
    else:
        return response.content.decode('utf-8')
    return response

2024-11-12 15:50:04,658 - INFO - Attempting to fetch data from https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv
2024-11-12 15:50:04,662 - DEBUG - Starting new HTTPS connection (1): api.worldbank.org:443
2024-11-12 15:50:05,141 - DEBUG - https://api.worldbank.org:443 "GET /v2/en/indicator/SP.POP.TOTL?downloadformat=csv HTTP/11" 200 87056
2024-11-12 15:50:05,169 - INFO - Successfully fetched data from https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv


In [154]:
def extract_into_df(csv_data, separator=",", skiprows=0):
    """
    Load a csv dataset into a pandas dataframe for further transformation.
    
    Parameters:
    csv_data: Dataset in CSV format as provided by extract_dataset_function.
    separator: The CSV value separator for this file
    skiprows: The number of rows of metadata that are to be skipped at the beginning of the file
    
    Returns:
    pd.DataFrame: The resulting DataFrame
    """
    
    try: 
        logging.info(f"Attempting to load data into DataFrame") 
        #df = pd.read_csv(io.StringIO(csv_data.content.decode('utf-8')), sep=separator)
        df = pd.read_csv(io.StringIO(csv_data), sep=separator, skiprows=skiprows)
        logging.info(f"Successfully loaded data into DataFrame with {len(df)} rows") 
        return df 
    except requests.exceptions.RequestException as e: 
        logging.error(f"Failed to load data: {e}") 
        raise e 
    except Exception as e: 
        logging.error(f"Unexpected error: {e}") 
        raise e

In [73]:
def filter_drop_columns(df, white_list):
    """
    Drop columns from a DataFrame except those in the whitelist.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to operate on.
    white_list (list): List of columns to keep.
    
    Returns:
    pd.DataFrame: DataFrame with only the columns in the whitelist.
    """

    try:
        logging.info(f"Attempting to drop non-whitelisted columns")

        # Check if whitelist columns exist in DataFrame
        missing_columns = [col for col in white_list if col not in df.columns]
        if missing_columns:
            raise ValueError(
                f"The following columns in the whitelist are missing from the DataFrame: {missing_columns}")

        # Drop columns not in the whitelist
        columns_to_drop = [col for col in df.columns if col not in white_list]
        df_dropped = df.drop(columns=columns_to_drop)

        if len(list(df_dropped.columns)) < 15:
            logging.info(f"Successfully dropped columns. Remaining columns: {list(df_dropped.columns)}")
        else:
            logging.info(f"Successfully dropped columns. {len(list(df_dropped.columns))} columns remaining.")

        return df_dropped

    except ValueError as e:
        logging.error(f"Please make sure all columns in the white list are contained in the DataFrame: {e}")
        return df
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
        return df


In [7]:
class Strategy(Enum):
    """
    Each enumeration represents a strategy for handling missing values of pd.DataFrame
    """

    BFILL = auto()
    FFILL = auto()
    DROP_ROW = auto()
    LINEAR_INTERPOLATION = auto()
    MODE = auto()
    MEDIAN = auto()


def filter_handle_missing_values(df, column, threshold=0, strategy=Strategy.DROP_ROW):
    """
    Handle missing values in a specific column of a DataFrame based on a given strategy.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to operate on.
    column (str): The column to handle missing values for.
    threshold (float): Threshold of missing values (0-1) after which the strategy is applied.
    strategy (Strategy): Strategy for handling missing values.
    
    Returns:
    pd.DataFrame: DataFrame with missing values handled in the specified column.
    """

    try:
        temp_df = copy.deepcopy(df)  # Avoid making in place changes to the dataframe

        # Calculate the percentage of missing values in the column
        missing_ratio = temp_df[column].isnull().mean()

        if missing_ratio > threshold:
            logging.info(
                f"Column '{column}' has {missing_ratio * 100:.2f}% missing values, applying {strategy.name} strategy")

            if strategy == Strategy.BFILL:
                temp_df[column] = temp_df[column].fillna(method='bfill')
                logging.info(f"Applied back fill strategy to column '{column}'")

            elif strategy == Strategy.FFILL:
                temp_df[column] = temp_df[column].fillna(method='ffill')
                logging.info(f"Applied forward fill strategy to column '{column}'")

            elif strategy == Strategy.DROP_ROW:
                temp_df = temp_df.dropna(subset=[column])
                logging.info(f"Dropped rows with missing values in column '{column}'")

            elif strategy == Strategy.LINEAR_INTERPOLATION:
                temp_df[column] = temp_df[column].interpolate(method='linear')
                logging.info(f"Applied linear interpolation to column '{column}'")

            elif strategy == Strategy.MODE:
                mode_value = temp_df[column].mode()[0]
                temp_df[column] = temp_df[column].fillna(mode_value)
                logging.info(f"Applied mode imputation to column '{column}' with mode value {mode_value}")

            elif strategy == Strategy.MEDIAN:
                median_value = temp_df[column].median()
                temp_df[column] = temp_df[column].fillna(median_value)
                logging.info(f"Applied median imputation to column '{column}' with median value {median_value}")

            return temp_df
        else:
            logging.info(
                f"Column '{column}' has {missing_ratio * 100:.2f}% missing values, which is lower than the threshold of {threshold * 100:.2f}%. Not applying a strategy.")
            return df

    except Exception as e:
        logging.error(
            f"Unexpected error while handling missing values in column '{column}' with strategy '{strategy.name}': {e}")
        return df

In [201]:
def filter_rows_by_values(df, column_name, column_values):
    """
    Filter rows in a DataFrame based on column values and drop all rows where values do not match the given values.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to filter.
    column_name (str): The column name to check for the values.
    column_values: The value or list of values to filter rows by.
    
    Returns:
    pd.DataFrame: DataFrame with rows filtered by the given column values.
    """
    
    try:
        # Ensure the column exists in the DataFrame
        if column_name not in df.columns:
            logging.error(f"Column '{column_name}' does not exist in the DataFrame.")
            return df
        
        # If column_values is not a list, convert it to a list
        if not isinstance(column_values, list):
            column_values = [column_values]
        
        # Create a mask for the matching rows
        mask = df[column_name].isin(column_values)
        affected_rows = len(df) - mask.sum()  # Calculate the number of rows that do not match
        
        # Filter the DataFrame
        filtered_df = df[mask]
        
        logging.info(f"Column '{column_name}': Filtering rows where values are in {column_values}.")
        logging.info(f"Number of rows dropped: {affected_rows}")
        return filtered_df
    except Exception as e:
        logging.error(f"Unexpected error while filtering rows by '{column_name}' with value '{column_values}': {e}")
        return df

In [101]:
# TODO: I will have to see how well this works with the different datasets. Check how many NaT (not a time) values there are, and handle that case (possibly using above function)

# Function to check if a column name can be converted to a datetime object
def try_convert_to_datetime(col_name):
    try:
        return pd.to_datetime(col_name, dayfirst=True, errors='raise').date()
    except ValueError:
        return col_name


def filter_transform_to_datetime(df, column=None, do_columns=False):
    """
    Transform a column in various formats to a uniform datetime datatype.
    Alternatively transform column names to datetime datatype.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the date column.
    column (str): The column name containing date values in various formats.
    doColumns (bool): If true, all column names that represent a date will be transformed to uniform datetime objects
    
    Returns:
    pd.DataFrame: DataFrame with the date column transformed to datetime.
    """

    try:

        temp_df = copy.deepcopy(df)  # Avoid making in place changes to the dataframe
        if do_columns:

            # Transform columns that can be parsed as datetime objects
            new_columns = {col: try_convert_to_datetime(col) for col in temp_df.columns}
            temp_df.rename(columns=new_columns, inplace=True)
            logging.info(f"Successfully transformed {len(new_columns)} column names to datetime")
        elif column is not None:
            logging.info(f"Transforming column '{column}' to datetime")
            temp_df[column] = pd.to_datetime(temp_df[column], errors='coerce').dt.date
            logging.info(f"Successfully transformed column '{column}' to datetime")
        else:
            logging.error("Please provide a meaningful column name")
    except Exception as e:
        logging.error(f"Unexpected error while transforming column '{column}' to datetime: {e}")
        return df

    return temp_df

In [10]:
def load_df_to_csv(df, file_name, file_path='../data/', overwrite=False):
    """
    Save a DataFrame to a CSV file.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to save.
    file_path (str): The path where the CSV file will be saved. The default path is that to the local /data/ folder, as required by the project specifications.
    file_name (str): The name of the file to be stored, excluding the file ending, which is hardcoded as '.csv'
    overwrite (bool): Flag to allow overwriting of existing files.
    
    Returns:
    None
    """

    if not file_path:  # Check if file_path is an empty string 
        file_path = './'  # Default to current working directory 

    full_path = os.path.join(file_path, file_name + '.csv')

    # Check if the file already exists 
    if os.path.exists(full_path):
        if not overwrite:
            logging.error(
                f"File '{full_path}' already exists. Set overwrite-flag to True in order to perform this action")
            return
        else:
            logging.warning(f"File '{full_path}' is being overwritten as the overwrite-flag is set to True")

    try:
        df.to_csv(full_path, index=False)
        logging.info(f"DataFrame successfully saved to {full_path}")
    except PermissionError as e:
        logging.error(f"Permission error while trying to save the DataFrame to {full_path}: {e}")
    except FileNotFoundError as e:
        logging.error(f"File not found error while trying to save the DataFrame to {full_path}: {e}")
    except Exception as e:
        logging.error(f"Unexpected error while saving the DataFrame to {full_path}: {e}")

# Applying the ETL Pipeline to the datasets 

### Chile Covid Mortality Dataset

In [137]:
chile_url = "https://datos.gob.cl/dataset/8982a05a-91f7-422d-97bc-3eee08fde784/resource/8e5539b7-10b2-409b-ae5a-36dae4faf817/download/defunciones_covid19_2020_2024.csv"

# Extract the dataset into a data-frame
chile_data = extract_dataset(chile_url, timeout=(200, 200))
chile_df = extract_into_df(chile_data, separator=";")

2024-11-12 15:51:25,207 - INFO - Attempting to fetch data from https://datos.gob.cl/dataset/8982a05a-91f7-422d-97bc-3eee08fde784/resource/8e5539b7-10b2-409b-ae5a-36dae4faf817/download/defunciones_covid19_2020_2024.csv
2024-11-12 15:51:25,210 - DEBUG - Starting new HTTPS connection (1): datos.gob.cl:443
2024-11-12 15:51:26,947 - DEBUG - https://datos.gob.cl:443 "GET /dataset/8982a05a-91f7-422d-97bc-3eee08fde784/resource/8e5539b7-10b2-409b-ae5a-36dae4faf817/download/defunciones_covid19_2020_2024.csv HTTP/11" 200 16071695
2024-11-12 15:51:32,433 - INFO - Successfully fetched data from https://datos.gob.cl/dataset/8982a05a-91f7-422d-97bc-3eee08fde784/resource/8e5539b7-10b2-409b-ae5a-36dae4faf817/download/defunciones_covid19_2020_2024.csv
2024-11-12 15:51:32,448 - INFO - Attempting to load data into DataFrame
2024-11-12 15:51:32,724 - INFO - Successfully loaded data into DataFrame with 58289 rows


In [138]:
# Perform transformations

# Required fields for analysis are the death-date and the diagnosis (COVID-19)
chile_df = filter_drop_columns(chile_df, ["FECHA_DEF", "DIAG1"])

# Transform the date-fields into datetime objects
chile_df = filter_transform_to_datetime(chile_df, "FECHA_DEF")

# No missing values are imputed, as there are not enough missing values in the dataset
for column in chile_df.columns:
    chile_df = filter_handle_missing_values(chile_df, column=column, strategy=Strategy.DROP_ROW)

2024-11-12 15:51:35,198 - INFO - Attempting to drop non-whitelisted columns
2024-11-12 15:51:35,204 - INFO - Successfully dropped columns. Remaining columns: ['FECHA_DEF', 'DIAG1']
2024-11-12 15:51:35,209 - INFO - Transforming column 'FECHA_DEF' to datetime
2024-11-12 15:51:35,240 - INFO - Successfully transformed column 'FECHA_DEF' to datetime
2024-11-12 15:51:35,248 - INFO - Column 'FECHA_DEF' has 0.00% missing values, which is lower than the threshold of 0.00%. Not applying a strategy.
2024-11-12 15:51:35,254 - INFO - Column 'DIAG1' has 0.00% missing values, which is lower than the threshold of 0.00%. Not applying a strategy.


In [139]:
# Load the transformed dataframe back into a CSV-database file.
load_df_to_csv(chile_df, file_name='chile_covid_mortality', overwrite=False)

2024-11-12 15:51:38,324 - ERROR - File '../data/chile_covid_mortality.csv' already exists. Set overwrite-flag to True in order to perform this action


### USA Covid Mortality Dataset

In [140]:
usa_url = "https://data.cdc.gov/api/views/exs3-hbne/rows.csv?fourfour=exs3-hbne&cacheBust=1729520760&date=20241106&accessType=DOWNLOAD"

# Extract the dataset into a data-frame
data = extract_dataset(usa_url, timeout=(200, 200))
usa_df = extract_into_df(data)

2024-11-12 15:51:45,354 - INFO - Attempting to fetch data from https://data.cdc.gov/api/views/exs3-hbne/rows.csv?fourfour=exs3-hbne&cacheBust=1729520760&date=20241106&accessType=DOWNLOAD
2024-11-12 15:51:45,357 - DEBUG - Starting new HTTPS connection (1): data.cdc.gov:443
2024-11-12 15:51:46,691 - DEBUG - https://data.cdc.gov:443 "GET /api/views/exs3-hbne/rows.csv?fourfour=exs3-hbne&cacheBust=1729520760&date=20241106&accessType=DOWNLOAD HTTP/11" 200 None
2024-11-12 15:51:49,437 - INFO - Successfully fetched data from https://data.cdc.gov/api/views/exs3-hbne/rows.csv?fourfour=exs3-hbne&cacheBust=1729520760&date=20241106&accessType=DOWNLOAD
2024-11-12 15:51:49,442 - INFO - Attempting to load data into DataFrame
2024-11-12 15:51:49,617 - INFO - Successfully loaded data into DataFrame with 79002 rows


In [141]:
# Perform transformations

# This dataset has duplicate values, therefore drop all rows for the different regions in the US and keep only the total US rows.
usa_df = filter_rows_by_values(usa_df, "jurisdiction_residence", "United States")

# Keep only required fields for analysis
usa_df = filter_drop_columns(usa_df, ["data_period_start", "data_period_end", "group", "subgroup1", "covid_deaths", "crude_rate"]) 

# Transform the date-fields into datetime objects. This also works for the american M/D/Y date format.
usa_df = filter_transform_to_datetime(usa_df, "data_period_start") 
usa_df = filter_transform_to_datetime(usa_df, "data_period_end")

# Drop the rows for which there is no data about covid mortality
print(f"Before: \n{usa_df.isnull().sum()} \n")
for column in usa_df.columns:
    usa_df = filter_handle_missing_values(usa_df, column=column, strategy=Strategy.DROP_ROW)
print(f"After: \n{usa_df.isnull().sum()} \n")

2024-11-12 15:51:52,809 - INFO - Column 'jurisdiction_residence': Filtering rows where value is 'United States'.
2024-11-12 15:51:52,810 - INFO - Number of rows dropped: 71820
2024-11-12 15:51:52,812 - INFO - Attempting to drop non-whitelisted columns
2024-11-12 15:51:52,815 - INFO - Successfully dropped columns. Remaining columns: ['data_period_start', 'data_period_end', 'group', 'subgroup1', 'covid_deaths', 'crude_rate']
2024-11-12 15:51:52,816 - INFO - Transforming column 'data_period_start' to datetime
2024-11-12 15:51:52,824 - INFO - Successfully transformed column 'data_period_start' to datetime
2024-11-12 15:51:52,826 - INFO - Transforming column 'data_period_end' to datetime
2024-11-12 15:51:52,832 - INFO - Successfully transformed column 'data_period_end' to datetime
2024-11-12 15:51:52,842 - INFO - Column 'data_period_start' has 0.00% missing values, which is lower than the threshold of 0.00%. Not applying a strategy.
2024-11-12 15:51:52,845 - INFO - Column 'data_period_end' 

Before: 
data_period_start       0
data_period_end         0
group                   0
subgroup1               0
covid_deaths         1962
crude_rate           1962
dtype: int64 

After: 
data_period_start    0
data_period_end      0
group                0
subgroup1            0
covid_deaths         0
crude_rate           0
dtype: int64 



In [142]:
# Load the transformed dataframe back into a CSV-database file.
load_df_to_csv(usa_df, file_name='usa_covid_mortality', overwrite=False)

2024-11-12 15:52:00,274 - ERROR - File '../data/usa_covid_mortality.csv' already exists. Set overwrite-flag to True in order to perform this action


### Colombia Covid Mortality Dataset

In [152]:
colombia_url = "https://www.datos.gov.co/api/views/jp5m-e7yr/rows.csv?fourfour=jp5m-e7yr&cacheBust=1705599009&date=20241106&accessType=DOWNLOAD"

# Extract the dataset into a data-frame
data = extract_dataset(colombia_url, timeout=(200, 200))
colombia_df = extract_into_df(data)

2024-11-12 15:53:09,667 - INFO - Attempting to fetch data from https://www.datos.gov.co/api/views/jp5m-e7yr/rows.csv?fourfour=jp5m-e7yr&cacheBust=1705599009&date=20241106&accessType=DOWNLOAD
2024-11-12 15:53:09,671 - DEBUG - Starting new HTTPS connection (1): www.datos.gov.co:443
2024-11-12 15:53:10,741 - DEBUG - https://www.datos.gov.co:443 "GET /api/views/jp5m-e7yr/rows.csv?fourfour=jp5m-e7yr&cacheBust=1705599009&date=20241106&accessType=DOWNLOAD HTTP/11" 200 None
2024-11-12 15:53:14,641 - INFO - Successfully fetched data from https://www.datos.gov.co/api/views/jp5m-e7yr/rows.csv?fourfour=jp5m-e7yr&cacheBust=1705599009&date=20241106&accessType=DOWNLOAD
2024-11-12 15:53:14,674 - INFO - Attempting to load data into DataFrame


fecha reporte web,ID de caso,Fecha de notificación,Código DIVIPOLA departamento,Nombre departamento,


2024-11-12 15:53:15,163 - INFO - Successfully loaded data into DataFrame with 143125 rows


In [144]:
# Perform transformations

# Keep only required fields for analysis
colombia_df = filter_drop_columns(colombia_df, ["Fecha de muerte", "Recuperado"])

# Transform the date-fields into datetime objects.
colombia_df = filter_transform_to_datetime(colombia_df, "Fecha de muerte")

# No missing values are imputed, as there are not enough missing values in the dataset
for column in colombia_df.columns:
    colombia_df = filter_handle_missing_values(colombia_df, column=column, strategy=Strategy.MEDIAN)

2024-11-12 15:52:11,041 - INFO - Attempting to drop non-whitelisted columns
2024-11-12 15:52:11,052 - INFO - Successfully dropped columns. Remaining columns: ['Recuperado', 'Fecha de muerte']
2024-11-12 15:52:11,061 - INFO - Transforming column 'Fecha de muerte' to datetime
2024-11-12 15:52:11,109 - INFO - Successfully transformed column 'Fecha de muerte' to datetime
2024-11-12 15:52:11,124 - INFO - Column 'Recuperado' has 0.00% missing values, which is lower than the threshold of 0.00%. Not applying a strategy.
2024-11-12 15:52:11,140 - INFO - Column 'Fecha de muerte' has 0.00% missing values, which is lower than the threshold of 0.00%. Not applying a strategy.


In [145]:
# Load the transformed dataframe back into a CSV-database file.
load_df_to_csv(colombia_df, file_name='colombia_covid_mortality', overwrite=False)

2024-11-12 15:52:13,610 - ERROR - File '../data/colombia_covid_mortality.csv' already exists. Set overwrite-flag to True in order to perform this action


### Mexico Covid Mortality Dataset

In [206]:
mexico_url = "https://datos.covid-19.conacyt.mx/Downloads/Files/Casos_Diarios_Estado_Nacional_Defunciones_20230625.csv"

# Extract the dataset into a data-frame
data = extract_dataset(mexico_url, timeout=(200, 200))
mexico_df = extract_into_df(data)

2024-11-12 16:19:06,871 - INFO - Attempting to fetch data from https://datos.covid-19.conacyt.mx/Downloads/Files/Casos_Diarios_Estado_Nacional_Defunciones_20230625.csv
2024-11-12 16:19:06,875 - DEBUG - Starting new HTTPS connection (1): datos.covid-19.conacyt.mx:443
2024-11-12 16:19:08,212 - DEBUG - https://datos.covid-19.conacyt.mx:443 "GET /Downloads/Files/Casos_Diarios_Estado_Nacional_Defunciones_20230625.csv HTTP/11" 200 104290
2024-11-12 16:19:08,738 - INFO - Successfully fetched data from https://datos.covid-19.conacyt.mx/Downloads/Files/Casos_Diarios_Estado_Nacional_Defunciones_20230625.csv
2024-11-12 16:19:08,740 - INFO - Attempting to load data into DataFrame
2024-11-12 16:19:08,766 - INFO - Successfully loaded data into DataFrame with 33 rows


In [207]:
# Perform transformations

# Transform the date column names into datetime format
mexico_df = filter_transform_to_datetime(mexico_df, do_columns=True) 

# Keep the nombre column and all date columns as they will all be required for the analysis.
# For this dataset, having a whitelist is a bit unfortunate, however we work around this issue by generating all column names automatically.
start_date, end_date = '17-03-2020', '23-06-2023' # Define the date range 
date_range = pd.date_range(start=start_date, end=end_date).date.tolist() # Generate the date range 
date_range.append("nombre")

mexico_df = filter_drop_columns(mexico_df, white_list=date_range)

# Leave only the row containing national mortality in order to avoid having duplicate values.
mexico_df = filter_rows_by_values(mexico_df, "nombre", "Nacional")

# No missing values need to be imputed, as there are not enough missing values in the dataset
assert mexico_df.isnull().sum().sum() == 0

2024-11-12 16:19:11,044 - INFO - Successfully transformed 1197' column names to datetime
2024-11-12 16:19:11,049 - INFO - Attempting to drop non-whitelisted columns
2024-11-12 16:19:11,056 - INFO - Successfully dropped columns. 1195 columns remaining.
2024-11-12 16:19:11,059 - INFO - Column 'nombre': Filtering rows where values are in ['Nacional'].
2024-11-12 16:19:11,060 - INFO - Number of rows dropped: 32


In [208]:
# Load the transformed dataframe back into a CSV-database file.
load_df_to_csv(mexico_df, file_name='mexico_covid_mortality', overwrite=False)

2024-11-12 16:19:17,205 - ERROR - File '../data/mexico_covid_mortality.csv' already exists. Set overwrite-flag to True in order to perform this action


### World Population Dataset

In [176]:
world_pop_url = "https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv"

# Extract the dataset into a data-frame
data = extract_dataset(world_pop_url, timeout=(200, 200), is_zip=True)
world_pop_df = extract_into_df(data, skiprows=3)

2024-11-12 16:06:04,566 - INFO - Attempting to fetch data from https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv
2024-11-12 16:06:04,569 - DEBUG - Starting new HTTPS connection (1): api.worldbank.org:443
2024-11-12 16:06:05,292 - DEBUG - https://api.worldbank.org:443 "GET /v2/en/indicator/SP.POP.TOTL?downloadformat=csv HTTP/11" 200 87056
2024-11-12 16:06:05,295 - INFO - Successfully fetched data from https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv
2024-11-12 16:06:05,297 - INFO - Attempting to load data into DataFrame
2024-11-12 16:06:05,300 - INFO - Successfully loaded data into DataFrame with 266 rows


In [198]:
world_pop_df.isnull().sum()

Country Name    0
2020            1
2021            1
2022            1
2023            1
dtype: int64

In [199]:
world_pop_df

Unnamed: 0,Country Name,2020,2021,2022,2023
0,Aruba,106585.0,106537.0,106445.0,106277.0
1,Africa Eastern and Southern,685112979.0,702977106.0,720859132.0,739108306.0
2,Afghanistan,38972230.0,40099462.0,41128771.0,42239854.0
3,Africa Western and Central,466189102.0,478185907.0,490330870.0,502789511.0
4,Angola,33428486.0,34503774.0,35588987.0,36684202.0
...,...,...,...,...,...
261,Kosovo,1790133.0,1786038.0,1768086.0,1756374.0
262,"Yemen, Rep.",32284046.0,32981641.0,33696614.0,34449825.0
263,South Africa,58801927.0,59392255.0,59893885.0,60414495.0
264,Zambia,18927715.0,19473125.0,20017675.0,20569737.0


In [203]:
# Perform transformations

# Keep the data for the years 2020-2023 and the country name as an identifier 
white_list = [str(x) for x in range(2020, 2024)]
white_list.append("Country Name")
world_pop_df = filter_drop_columns(world_pop_df, white_list) 

# Select rows for the countries under analysis
countries = ["Chile", "United States", "Colombia", "Mexico"]
world_pop_df = filter_rows_by_values(world_pop_df, "Country Name", countries)
    

# No missing values are imputed, as there are not enough missing values in the dataset
assert world_pop_df.isnull().sum().sum() == 0

2024-11-12 16:18:18,676 - INFO - Attempting to drop non-whitelisted columns
2024-11-12 16:18:18,679 - INFO - Successfully dropped columns. Remaining columns: ['Country Name', '2020', '2021', '2022', '2023']
2024-11-12 16:18:18,687 - INFO - Column 'Country Name': Filtering rows where values are in ['Chile', 'United States', 'Colombia', 'Mexico'].
2024-11-12 16:18:18,688 - INFO - Number of rows dropped: 262


In [205]:
# Load the transformed dataframe back into a CSV-database file.
load_df_to_csv(colombia_df, file_name='world_population_total', overwrite=False)

2024-11-12 16:18:49,357 - INFO - DataFrame successfully saved to ../data/world_population_total.csv
