In [1]:
import pandas as pd
from pandas import Series, DataFrame
import country_converter as coco
import os

Data sources:

- Topic page: https://data.unicef.org/topic/child-health/immunization/
- Dataset: Immunization coverage by antigen
- File used: https://data.unicef.org/wp-content/uploads/2016/07/wuenic2021rev_web-update.xlsx

In [2]:
## Report the directory the notebook is running from
notebook_dir = os.getcwd()
print("Current working directory:", notebook_dir)

## Both data folders sit one level above the working directory
parent_of_notebook_dir = os.path.join(notebook_dir, '..')

## Folder holding the raw input files
raw_data_path = os.path.abspath(os.path.join(parent_of_notebook_dir, 'raw_data'))

## Folder where the cleaned csv is written
cleaned_data_path = os.path.abspath(os.path.join(parent_of_notebook_dir, 'cleaned_data'))

Current working directory: /Users/jennagreene/Documents/GitHub/HATCH_data/reading_files


In [3]:
# Keep the file name free of a leading separator: os.path.join would treat
# '/name.xlsx' as an absolute path and silently discard raw_data_path.
target_file = 'wuenic2021rev_web-update.xlsx'

# Build the path with os.path.join (as the output path is built) instead of
# concatenating strings with a hard-coded '/'.
target_filepath = os.path.join(raw_data_path, target_file)

In [4]:
# The workbook is opened here only to list its sheet names, so use a context
# manager to release the file handle as soon as the names have been read.
# NOTE(review): `file` stays bound after this cell but is closed; read sheets
# through pd.read_excel(target_filepath, ...) rather than through `file`.
with pd.ExcelFile(target_filepath) as file:
    unicef_sheets = file.sheet_names

In [5]:
def read_unicef(sheet, filepath=None):
    """Read one sheet of the UNICEF workbook and reshape it for export.

    Parameters
    ----------
    sheet : str or int
        Sheet to load; forwarded to ``pd.read_excel`` as ``sheet_name``.
    filepath : path-like, optional
        Workbook to read. When omitted, the notebook-level
        ``target_filepath`` is used, which keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        The sheet with ``country``/``vaccine`` renamed to
        ``Country Name``/``Technology Name``, every other non-identifier
        column divided by 100, the metadata columns ``Unit``, ``Metric``,
        ``Data Source``, ``Spatial Scale`` and ``Country Code`` appended,
        ``unicef_region`` and ``iso3`` removed, and indexed by ``ID``
        (``<Technology Name>_<Metric>_<Country Code>``).

    Raises
    ------
    KeyError
        If the sheet lacks any of the ``unicef_region``, ``iso3``,
        ``country`` or ``vaccine`` columns.
    """
    if filepath is None:
        filepath = target_filepath

    # Read the Excel sheet into a DataFrame
    raw = pd.read_excel(filepath, sheet_name=sheet)

    # Identifier columns are selected by name rather than by position, so the
    # conversion below does not depend on them being the first four columns.
    id_columns = ['unicef_region', 'iso3', 'country', 'vaccine']
    value_columns = [col for col in raw.columns if col not in id_columns]

    # Rename columns for clarity (returns a new frame; `raw` is left untouched)
    unicef = raw.rename(columns={'country': 'Country Name', 'vaccine': 'Technology Name'})

    # Convert the vaccination coverage percentages to fractions
    for col in value_columns:
        unicef[col] = unicef[col] / 100

    # Append 'Vaccine' to the technology name to indicate it's a vaccine
    unicef['Technology Name'] = unicef['Technology Name'] + ' Vaccine'

    # Add metadata columns
    # NOTE(review): the values above are stored as fractions while Unit says
    # '%' -- confirm which convention downstream consumers expect.
    unicef['Unit'] = '%'
    unicef['Metric'] = 'Share of Population'
    unicef['Data Source'] = 'UNICEF'
    unicef['Spatial Scale'] = 'National'

    # Convert ISO3 country codes to ISO2 country codes.
    # One CountryConverter is built and reused for every row; the module-level
    # coco.convert() helper would build a fresh converter on each call.
    converter = coco.CountryConverter()
    unicef['Country Code'] = [
        converter.convert(names=iso3_code, to='iso2') for iso3_code in unicef['iso3']
    ]

    # Drop columns that are not part of the cleaned output
    unicef = unicef.drop(columns=['unicef_region', 'iso3'])

    # Build the ID column and use it as the index
    unicef['ID'] = unicef['Technology Name'] + '_' + unicef['Metric'] + '_' + unicef['Country Code']
    return unicef.set_index('ID')

In [6]:
# Clean every sheet except the last one in the workbook.
# NOTE(review): presumably the final sheet is not a per-vaccine table -- confirm.
unicef_list = [read_unicef(sheet_name) for sheet_name in unicef_sheets[:-1]]

In [7]:
# Stack the per-sheet frames row-wise into a single table
un = pd.concat(unicef_list, axis=0)

In [8]:
# Create the destination folder if it is missing; to_csv does not create
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs(cleaned_data_path, exist_ok=True)

output_file = 'unicef.csv'
output_file_path = os.path.join(cleaned_data_path, output_file)

# Write the combined table (the ID index is written as the first column)
un.to_csv(output_file_path)
print("Data saved to:", output_file_path)

Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/cleaned_data/unicef.csv
