In [1]:
import pandas as pd
import glob
import bamboolib
import plotly.express as px
import plotly.graph_objs as go

## Read files

In [2]:
def read_file_who(folder_path,type_name,page,skip_row):
    # get a list of all xlsx files in the folder
    df_all = []
    for (path,name) in zip(folder_path,type_name):
        file_list = glob.glob(path + '/*.xlsx')
        df_list = []
        # loop through each file and read the specified sheet
        for file_name in file_list:
            excel_file = pd.ExcelFile(file_name)
            if page == None:
                sheet_names = excel_file.sheet_names
            else:
                sheet_names = page
            # read the file and the specified sheet
            for sheet in sheet_names:
                if skip_row == None:
                    df = pd.read_excel(file_name, sheet_name=sheet)
                else:
                    df = pd.read_excel(file_name, sheet_name=sheet,skiprows=skip_row,header = 0)
                df['Option'] = sheet
                df_list.append(df)
                output_df = pd.concat(df_list)
        output_df['Type'] = name
        df_all.append(output_df)
    
    all_data_df = pd.concat(df_all)
    return all_data_df 




In [3]:
folder_path = ['raw/government_expenditure_on_routine_immunization',
    'raw/government_expenditure_on_vaccine_used_in_routine_immunization',
    'raw/total_expenditure_on_routine_immunization',
    'raw/total_expenditure_on_vaccine_used_in_routine_immunization']
type_name = ['gov_immunization','gov_vaccine','total_immunization','total_vaccine']

expenditure_df = read_file_who(folder_path,type_name,None,None)

coverage_df = read_file_who(['raw\coverage'],['coverage'],['Data'],None)

population_df = read_file_who(['raw\population'],['population'],['Estimates'],16)

gdp_df = read_file_who(['raw\GDP'],['current GDP'],['Data'],3)

land_df = read_file_who(['raw\land'],['land'],['SYB65_145_202209_Land'],1)


## Data Preparation

In [4]:
#Standardize the country name
def replace_country_names(df, replacements):
    for original, replacement in replacements.items():
        df.loc[df['Country'] == original, 'Country'] = replacement
    return df

replacements = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Congo (The)': 'Congo',
    #'Democratic Republic of the Congo': 'Congo',
    'Czechia': 'Czech Republic',
    'Iran (Islamic Republic of)': 'Iran',
    'Lao People\'s Democratic Republic': 'Laos',
    'Lao People\'s Democratic Republic (the)': 'Laos',
    'North Macedonia': 'North Macedonia',
    'Republic of Macedonia': 'North Macedonia',
    'The former Yugoslav Republic of Macedonia': 'North Macedonia',
    'Netherlands': 'The Netherlands',
    'Saint Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Saint Vincent and The Grenadines': 'Saint Vincent and the Grenadines',
    'Sudan': 'Sudan',
    'South Sudan': 'South Sudan',
    'Syrian Arab Republic': 'Syria',
    'Syria': 'Syria',
    'Tanzania': 'Tanzania',
    'United Republic of Tanzania': 'Tanzania',
    'Turks and Caicos Islands': 'Turks and Caicos Islands',
    'Turks and Caicos Islands, The': 'Turks and Caicos Islands',
    'United Kingdom': 'United Kingdom',
    'England': 'United Kingdom',
    'Wales': 'United Kingdom',
    'Scotland': 'United Kingdom',
    'Northern Ireland': 'United Kingdom',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'Bolivia (Plurin. State of)':'Bolivia',
    'Côte d’Ivoire':"Côte d'Ivoire",
    'Dem. Rep. of the Congo':'Democratic Republic of the Congo',
    "Lao People's Dem. Rep.":'Laos',
    "Dem. People's Rep. Korea":'Democratic Peoples Republic of Korea',
    'State of Palestine' : 'West Bank and Gaza',
    'Russian Federation' :"Russia",
    'Türkiye':'Turkey',
    'United Rep. of Tanzania':'Tanzania',
    'Saint Vincent & Grenadines':'Saint Vincent and the Grenadines',
    'Venezuela (Boliv. Rep. of)' : 'Venezuela'
}



In [5]:
def get_missing_value_percentage(expenditure_detail_df,columns_to_group=['Country'], columns_to_filter=['total_immunization in USD', 'total_vaccine in USD'], threshold=0.3,more_than = True):
    # group the data by both 'Year' and 'Country'
    grouped = expenditure_detail_df.groupby(columns_to_group)
    
    # calculate the percentage of missing values for each column within each group
    nan_percentages = grouped.apply(lambda x: x.isna().mean())
    
    # filter the results to only show the specified columns
    filtered = nan_percentages[columns_to_filter]
    
    # filter the results to only show percentages above the threshold value
    if more_than == True:
        filtered = filtered[filtered > threshold].dropna()
    else:
        filtered = filtered[filtered < threshold].dropna()
    return filtered

### Land

In [6]:
land_df = land_df[land_df['Series'] == 'Land area (thousand hectares)']
# Get the latest year
latest_year = land_df['Year'].max()

# Create a new dataframe with the latest values for each country/region
country_area_df = land_df.loc[land_df['Year'] == latest_year, ["Unnamed: 1", 'Value']].copy()

# Rename the 'Region/Country/Area' column to 'Country'
country_area_df = country_area_df.rename(columns={"Unnamed: 1": 'Country','Value':'land_area'})
country_area_df = replace_country_names(country_area_df, replacements)

### Goverment expenditure on immunization

In [7]:
#expenditure_df = read_file_who(folder_path,type_name,None,None)
expenditure_df.columns = expenditure_df.columns.astype(str)
value_vars = expenditure_df.columns.drop(['ISO', 'Country', 'Region', 'Gavi / Income status', 'Option', 'Type'])
expenditure_melt_df = pd.melt(expenditure_df, id_vars=['ISO', 'Country', 'Region', 'Gavi / Income status', 'Option', 'Type'], 
                 value_vars=value_vars, 
                 var_name='Year', value_name='value')

expenditure_melt_df['Year'] = pd.to_datetime(expenditure_melt_df['Year'], format='%Y').dt.year
expenditure_df = expenditure_melt_df


In [8]:
gov_type = ['gov_immunization', 'gov_vaccine']
gov_list = ['in USD', 'in USD per capita', 'in USD per surviving infant', '% government expenditure']
start = 0
for j in gov_type:
    for i in gov_list:
        if start == 0:
            expenditure_detail_df = expenditure_df[(expenditure_df['Type'] == j) & (expenditure_df['Option'] == i)][['ISO', 'Country', 'Region', 'Gavi / Income status', 'Year', 'value']]
            expenditure_detail_df = expenditure_detail_df.rename(columns={'value': j + ' ' + i})
        else:
            df_2 = expenditure_df[(expenditure_df['Type'] == j) & (expenditure_df['Option'] == i)][['ISO', 'Country', 'Region', 'Gavi / Income status', 'Year', 'value']]
            df_2 = df_2.rename(columns={'value': j + ' ' + i})
            expenditure_detail_df = pd.merge(expenditure_detail_df, df_2, on=['ISO', 'Country', 'Region', 'Gavi / Income status', 'Year'], how='outer', suffixes=('', ''))
        start += 1 

total_type = ['total_immunization', 'total_vaccine']
total_list = ['in USD', 'in USD per capita', 'in USD per surviving infant']
for j in total_type:
    for i in total_list:
        df_2 = expenditure_df[(expenditure_df['Type'] == j) & (expenditure_df['Option'] == i)][['ISO', 'Country', 'Region', 'Gavi / Income status', 'Year', 'value']]
        df_2 = df_2.rename(columns={'value': j + ' ' + i})
        expenditure_detail_df = pd.merge(expenditure_detail_df, df_2, on=['ISO', 'Country', 'Region', 'Gavi / Income status', 'Year'], how='outer', suffixes=('', ''))

In [9]:
expenditure_detail_df = replace_country_names(expenditure_detail_df, replacements)
expenditure_detail_df['Year']= pd.to_datetime(expenditure_detail_df['Year'], format='%Y')

In [10]:
#expenditure_detail_filt_df = expenditure_detail_df[['Country','Year','Region','Gavi / Income status','total_immunization in USD per surviving infant','total_vaccine in USD per surviving infant']]
expenditure_detail_filt_df = expenditure_detail_df[['Country','Year','Region','Gavi / Income status','total_immunization in USD','total_vaccine in USD','total_immunization in USD per surviving infant','total_vaccine in USD per surviving infant']]

In [11]:
#expenditure_detail_filt_df = expenditure_detail_filt_df.rename(columns = {'total_immunization in USD per surviving infant':'total_immunization in USD', 'total_vaccine in USD per surviving infant':'total_vaccine in USD'})

### Immunization coverage

In [12]:
coverage_df = coverage_df.rename(columns={'NAME': 'Country','YEAR':'Year'})
coverage_df = coverage_df[coverage_df['COVERAGE_CATEGORY'].isin(['WUENIC'])]
coverage_df = coverage_df[~coverage_df['ANTIGEN'].isin(['YFV'])]
coverage_df = replace_country_names(coverage_df, replacements)
coverage_df['Year'] = pd.to_datetime(coverage_df['Year'], format='%Y')

In [13]:
# create a pivot table for each antigen
antigen_pivot = pd.pivot_table(coverage_df, index=['Country', 'Year'], 
                               columns='ANTIGEN',
                               values=['COVERAGE'], 
                               aggfunc='sum')

# flatten the column index
antigen_pivot.columns = [f'{col[1]}_{col[0]}' for col in antigen_pivot.columns]

# reset the index to make Country and Year columns
antigen_pivot = antigen_pivot.reset_index()

#rename
immunization_coverage_df = antigen_pivot

### World Population

In [14]:
#population_df = read_file_who(['raw\population'],['population'],['Estimates'],16)
population_df

cols = [i for i in range(100)]
total = population_df[cols].sum(axis=1)
cols = [i for i in range(1,6)]
total_below_5 = population_df[cols].sum(axis=1)
cols = [i for i in range(6,11)]
total_below_10 = population_df[cols].sum(axis=1)
cols = [i for i in range(10,16)]
total_below_15 = population_df[cols].sum(axis=1)
cols = [i for i in range(16,21)]
total_below_20 = population_df[cols].sum(axis=1)
cols = [i for i in range(21,26)]
total_below_25 = population_df[cols].sum(axis=1)
cols = [i for i in range(0,1)]
total_below_1 = population_df[cols].sum(axis=1)

# add the 'total' column to the DataFrame
population_df['total_population'] = total
population_df['total_population_1-5'] = total_below_5
population_df['total_population_6-10'] = total_below_10
population_df['total_population_11-15'] = total_below_15
population_df['total_population_16-20'] = total_below_20
population_df['total_population_21-25'] = total_below_25
population_df['total_population_0-1'] = total_below_1
population_df['total_population'] = population_df['total_population'] + population_df['100+']
population_df = population_df.rename(columns={'Region, subregion, country or area *': 'Country'})
population_df['Year'] = pd.to_datetime(population_df['Year'], format='%Y')
population_df = replace_country_names(population_df, replacements)
population_df.columns = [str(column) for column in population_df.columns]

In [15]:
list = ['total_population','total_population_0-1','total_population_1-5', 'total_population_6-10',
       'total_population_11-15', 'total_population_16-20']
for i in list:
    population_df[i] = pd.to_numeric(population_df[i], downcast='float', errors='coerce')

In [16]:
population_filt_df = population_df[['Country','Year','total_population','total_population_0-1','total_population_1-5','total_population_6-10',
       'total_population_11-15', 'total_population_16-20']]

### GDP

In [17]:
#gdp_df = read_file_who(['raw\GDP'],['current GDP'],['Data'],3)

gdp_df.columns = gdp_df.columns.astype(str)
value_vars = gdp_df.columns.drop(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', 'Option', 'Type'])
gdp_melt_df = pd.melt(gdp_df, id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', 'Option', 'Type'], 
                 value_vars=value_vars, 
                 var_name='Year', value_name='GDP')
gdp_df = gdp_melt_df.rename(columns={'Country Name':'Country'})
gdp_df['GDP_Million'] = gdp_df['GDP']/1000000
gdp_df['Year'] = pd.to_datetime(gdp_df['Year'], format='%Y')
gdp_df = replace_country_names(gdp_df, replacements)

#rename
gdp_filt_df = gdp_df

### Merge Data

In [95]:
merged_df = pd.merge(expenditure_detail_filt_df[['Country', 'Region', 'Gavi / Income status', 'Year',
       'total_vaccine in USD',  'total_immunization in USD','total_immunization in USD per surviving infant','total_vaccine in USD per surviving infant'
     ]].drop_duplicates(),immunization_coverage_df.drop_duplicates(), on = ['Country','Year'], how = 'left')
merged_df = pd.merge(merged_df.drop_duplicates(),gdp_filt_df[['Country','GDP_Million','Year']].drop_duplicates(), on = ['Country','Year'], how = 'left')
merged_df = pd.merge(merged_df.drop_duplicates(),population_filt_df[['Country','Year','total_population','total_population_0-1','total_population_1-5', 'total_population_6-10',
       'total_population_11-15', 'total_population_16-20']].drop_duplicates(),on = ['Country','Year'],how ='left')
merged_df = pd.merge(merged_df.drop_duplicates(),country_area_df[['Country','land_area']].drop_duplicates(),on = ['Country'],how ='left')

In [96]:
merged_df

                   Country Region              Gavi / Income status  \
0              Afghanistan   EMRO         Gavi low income countries   
1                   Angola   AFRO  non-Gavi middle income countries   
2                  Albania   EURO  non-Gavi middle income countries   
3                  Andorra   EURO             High income countries   
4     United Arab Emirates   EMRO             High income countries   
...                    ...    ...                               ...   
3115                 Samoa   WPRO  non-Gavi middle income countries   
3116                 Yemen   EMRO         Gavi low income countries   
3117          South Africa   AFRO  non-Gavi middle income countries   
3118                Zambia   AFRO  Gavi low-middle income countries   
3119              Zimbabwe   AFRO  Gavi low-middle income countries   

           Year  total_vaccine in USD  total_immunization in USD  \
0    2006-01-01                   NaN               4.000000e+07   
1    2006-0

### Clean data

##### Exclude the rich country (high income)

##### Create new column to calculate the cost of vaccine delivery

In [97]:
merged_df['Year'] = pd.to_datetime(merged_df['Year'], format='%Y')
merged_df['total_vaccine_delivery_cost in USD'] = merged_df['total_immunization in USD'] -merged_df['total_vaccine in USD']


In [98]:
#change the data into Million format
merged_df['Vaccine USD Mil'] = merged_df['total_vaccine in USD'] / 1000000
merged_df['Immunization USD Mil'] = merged_df['total_immunization in USD'] / 1000000
merged_df['Vaccine Delivery Cost USD Mil'] = merged_df['total_vaccine_delivery_cost in USD'] / 1000000

In [99]:
# group the dataframe by country
grouped_df = merged_df.groupby('Country')

# Define a function to create the previous year columns in million
def create_prev_year_cols_million(col):
    return [f"{col}_prev_{i}_year_million" for i in range(1, 6)]

# Create new columns for 'total_vaccine in USD', 'total_immunization in USD', and 'total_vaccine_delivery_cost in USD', 
# for the value for previous year up until last 5 years, in million
for col in ['total_vaccine in USD', 'total_immunization in USD', 'total_vaccine_delivery_cost in USD']:
    prev_year_cols = create_prev_year_cols_million(col)
    for i, prev_year_col in enumerate(prev_year_cols):
        merged_df[prev_year_col] = grouped_df[col].apply(lambda x: x.shift(i+1) / 1000000)
        #merged_df[prev_year_col] = grouped_df[col].apply(lambda x: x.shift(i+1) )
        # Calculate the percentage change between the current year and the previous year
        curr_col = f"{col}_prev_1_year_million"
        pct_col = f"{col}_prev_{i+1}_year_pct_change"
        if curr_col in merged_df.columns and prev_year_col in merged_df.columns:
            merged_df[pct_col] = (merged_df[curr_col] - merged_df[prev_year_col]) / merged_df[prev_year_col]
        else:
            merged_df[pct_col] = float('nan')


In [100]:
# group the dataframe by country
grouped_df = merged_df.groupby('Country')

# Define a function to create the previous year columns in million
def create_prev_year_cols_million(col):
    return [f"{col}_prev_{i}_year_million" for i in range(1, 6)]

# Create new columns for 'total_vaccine in USD', 'total_immunization in USD', and 'total_vaccine_delivery_cost in USD', 
# for the value for previous year up until last 5 years, in million
for col in ['total_vaccine in USD', 'total_immunization in USD', 'total_vaccine_delivery_cost in USD']:
    prev_year_cols = create_prev_year_cols_million(col)
    for i, prev_year_col in enumerate(prev_year_cols):
        merged_df[prev_year_col] = grouped_df[col].apply(lambda x: x.shift(i+1) / 1000000)
        #merged_df[prev_year_col] = grouped_df[col].apply(lambda x: x.shift(i+1) )
        # Calculate the percentage change between the current year and the previous year
        curr_col = f"{col}_prev_1_year_million"
        pct_col = f"{col}_prev_{i+1}_year_pct_change"
        if curr_col in merged_df.columns and prev_year_col in merged_df.columns:
            merged_df[pct_col] = (merged_df[curr_col] - merged_df[prev_year_col]) / merged_df[prev_year_col]
        else:
            merged_df[pct_col] = float('nan')


In [101]:
merged_df['total_vaccine in USD_prev_1_year_pct_change'] = (merged_df['Vaccine USD Mil'] - merged_df['total_vaccine in USD_prev_1_year_million']) / merged_df['total_vaccine in USD_prev_1_year_million']
merged_df['total_immunization in USD_prev_1_year_pct_change'] = (merged_df['Immunization USD Mil'] - merged_df['total_immunization in USD_prev_1_year_million']) / merged_df['total_immunization in USD_prev_1_year_million']
merged_df['total_vaccine_delivery_cost in USD_prev_1_year_pct_change'] = (merged_df['Vaccine Delivery Cost USD Mil'] - merged_df['total_vaccine_delivery_cost in USD_prev_1_year_million']) / merged_df['total_vaccine_delivery_cost in USD_prev_1_year_million']

In [102]:
merged_df

                   Country Region              Gavi / Income status  \
0              Afghanistan   EMRO         Gavi low income countries   
1                   Angola   AFRO  non-Gavi middle income countries   
2                  Albania   EURO  non-Gavi middle income countries   
3                  Andorra   EURO             High income countries   
4     United Arab Emirates   EMRO             High income countries   
...                    ...    ...                               ...   
3115                 Samoa   WPRO  non-Gavi middle income countries   
3116                 Yemen   EMRO         Gavi low income countries   
3117          South Africa   AFRO  non-Gavi middle income countries   
3118                Zambia   AFRO  Gavi low-middle income countries   
3119              Zimbabwe   AFRO  Gavi low-middle income countries   

           Year  total_vaccine in USD  total_immunization in USD  \
0    2006-01-01                   NaN               4.000000e+07   
1    2006-0

In [85]:

go.Figure(
    data=[go.Histogram(x=merged_df["Vaccine Delivery Cost USD Mil"], xbins={"start": -60.0, "end": 420.0, "size": 40.0})],
    layout=go.Layout(title="Histogram of Vaccine Delivery Cost USD Mil", yaxis={"title": "Count"}, bargap=0.05),
    )

In [86]:
# Step: Keep rows where Vaccine Delivery Cost USD Mil > 0
merged_df = merged_df.loc[merged_df['Vaccine Delivery Cost USD Mil'] > 0]

In [87]:
fig = px.histogram(merged_df.dropna(subset=['total_vaccine in USD_prev_1_year_pct_change']), x='Year', histfunc='avg', y='total_vaccine in USD_prev_1_year_pct_change')
fig

In [88]:
fig = px.histogram(merged_df.dropna(subset=['total_immunization in USD_prev_1_year_pct_change']), x='Year', histfunc='avg', y='total_immunization in USD_prev_1_year_pct_change')
fig

In [89]:
fig = px.histogram(merged_df.dropna(subset=['total_immunization in USD_prev_1_year_pct_change']), x='Year', histfunc='avg', y='total_immunization in USD_prev_1_year_pct_change')
fig

In [90]:

fig = px.histogram(merged_df, x='Year', histfunc='avg', y='total_vaccine_delivery_cost in USD_prev_1_year_pct_change')
fig

In [91]:
merged_df

           Country Region              Gavi / Income status       Year  \
196         Angola   AFRO  non-Gavi middle income countries 2007-01-01   
197        Albania   EURO  non-Gavi middle income countries 2007-01-01   
209   Burkina Faso   AFRO         Gavi low income countries 2007-01-01   
210     Bangladesh  SEARO  Gavi low-middle income countries 2007-01-01   
213        Bahamas   AMRO             High income countries 2007-01-01   
...            ...    ...                               ...        ...   
3092    Seychelles   AFRO             High income countries 2021-01-01   
3095          Togo   AFRO         Gavi low income countries 2021-01-01   
3102       Tunisia   EMRO  non-Gavi middle income countries 2021-01-01   
3118        Zambia   AFRO  Gavi low-middle income countries 2021-01-01   
3119      Zimbabwe   AFRO  Gavi low-middle income countries 2021-01-01   

      total_vaccine in USD  total_immunization in USD  \
196           9.944444e+06               1.500000e+07 

##### Immunization & vaccine expenditure percentage change is below 100% per year on average. However the Average vaccine delivery percentage change is over 100%. This shows us that there are data inaccuracy on the immunization and vaccine expenditure. We will exclude the data for vaccine delivery cost changes over 100%

In [78]:
# Step: Keep rows where total_vaccine_delivery_cost in USD_prev_1_year_pct_change >= 1
merged_df = merged_df.loc[merged_df['total_vaccine_delivery_cost in USD_prev_1_year_pct_change'] <= 1]

In [79]:
merged_df

           Country Region              Gavi / Income status       Year  \
196         Angola   AFRO  non-Gavi middle income countries 2007-01-01   
197        Albania   EURO  non-Gavi middle income countries 2007-01-01   
209   Burkina Faso   AFRO         Gavi low income countries 2007-01-01   
210     Bangladesh  SEARO  Gavi low-middle income countries 2007-01-01   
213        Bahamas   AMRO             High income countries 2007-01-01   
...            ...    ...                               ...        ...   
3092    Seychelles   AFRO             High income countries 2021-01-01   
3095          Togo   AFRO         Gavi low income countries 2021-01-01   
3102       Tunisia   EMRO  non-Gavi middle income countries 2021-01-01   
3118        Zambia   AFRO  Gavi low-middle income countries 2021-01-01   
3119      Zimbabwe   AFRO  Gavi low-middle income countries 2021-01-01   

      total_vaccine in USD  total_immunization in USD  \
196           9.944444e+06               1.500000e+07 

### Feature Engineering

##### Create a population difference data

In [31]:
# sort the dataframe by country and year
merged_df = merged_df.sort_values(['Country', 'Year'])

# create new columns for the population percentage difference
merged_df['Population_Diff_Percent_0-1'] = None
merged_df['Population_Diff_Percent_1-5'] = None
merged_df['Population_Diff_Percent_6-10'] = None
merged_df['Population_Diff_Percent_11-15'] = None
merged_df['Population_Diff_Percent_16-20'] = None
merged_df['Population_Diff_Percent'] = None

# iterate over each country group
for country, group_df in grouped_df:
    # calculate the population difference percentage for each row
    group_df['Population_Diff_Percent_0-1'] = group_df['total_population_0-1'].pct_change()
    group_df['Population_Diff_Percent_1-5'] = group_df['total_population_1-5'].pct_change()
    group_df['Population_Diff_Percent_6-10'] = group_df['total_population_6-10'].pct_change()
    group_df['Population_Diff_Percent_11-15'] = group_df['total_population_11-15'].pct_change()
    group_df['Population_Diff_Percent_16-20'] = group_df['total_population_16-20'].pct_change()
    group_df['Population_Diff_Percent'] = group_df['total_population'].pct_change()
    # update the merged_df with the new column values
    merged_df.update(group_df)

In [32]:
# calculate the average coverage for different vaccines
merged_df['Avg_Vaccine_Coverage'] = merged_df[['BCG_COVERAGE', 'DTPCV1_COVERAGE', 'DTPCV3_COVERAGE', 'HEPB3_COVERAGE', 
                                                'HEPB_BD_COVERAGE', 'HIB3_COVERAGE', 'MCV1_COVERAGE', 'MCV2_COVERAGE',
                                                'PCV3_COVERAGE', 'POL3_COVERAGE', 'RCV1_COVERAGE', 'ROTAC_COVERAGE']].mean(axis=1)




#### Trea 

In [33]:
#get the next year cost
# Create new columns for the next year's values of 'total_vaccine in USD', 'total_immunization in USD', and 'total_vaccine_delivery_cost in USD'
merged_df['Next Year Vaccine USD Mil'] = grouped_df['total_vaccine in USD'].shift(-1) / 1000000
merged_df['Next Year Immunization USD Mil'] = grouped_df['total_immunization in USD'].shift(-1) / 1000000
merged_df['Next Year Vaccine Delivery Cost USD Mil'] = grouped_df['total_vaccine_delivery_cost in USD'].shift(-1) / 1000000

In [34]:
#get the next year cost
# Create new columns for the next year's values of 'total_vaccine in USD', 'total_immunization in USD', and 'total_vaccine_delivery_cost in USD'
merged_df['Next Year Vaccine USD Mil'] = grouped_df['total_vaccine in USD'].shift(-1) 
merged_df['Next Year Immunization USD Mil'] = grouped_df['total_immunization in USD'].shift(-1) 
merged_df['Next Year Vaccine Delivery Cost USD Mil'] = grouped_df['total_vaccine_delivery_cost in USD'].shift(-1) 

In [35]:
# Define a function to create the previous year columns for GDP in million
def create_prev_year_gdp_cols_million():
    return [f"GDP_prev_{i}_year" for i in range(1, 6)]

# Create new columns for GDP, for the value for previous year up until last 5 years
prev_gdp_cols = create_prev_year_gdp_cols_million()
for i, prev_gdp_col in enumerate(prev_gdp_cols):
    merged_df[prev_gdp_col] = grouped_df['GDP_Million'].apply(lambda x: x.shift(i+1))

    # Calculate the percentage change between the current year and the previous year
    curr_col = "GDP_Million"
    pct_col = f"GDP_prev_{i+1}_year_pct_change"
    if curr_col in merged_df.columns and prev_gdp_col in merged_df.columns:
        merged_df[pct_col] = (merged_df[curr_col] - merged_df[prev_gdp_col]) / merged_df[prev_gdp_col]
    else:
        merged_df[pct_col] = float('nan')


In [36]:
# sort the dataframe by country and year
merged_df.sort_values(['Country', 'Year'], inplace=True)

# group the dataframe by country and calculate the percentage change in immunization USD between consecutive years
merged_df['current_Immunization_cost_pct_change_from_last_year'] = merged_df.groupby('Country')['Immunization USD Mil'].pct_change() * 100



In [37]:
import ppscore as pps

# Compute the PPS matrix for all pairs of columns in merged_df
pps_matrix = pps.matrix(merged_df)

# Find the columns that have a PPS score of at least 0.5 with respect to the target column
target_column = 'Immunization USD Mil'
high_pps_cols = set()
for col in merged_df.columns:
    pps_score = pps.score(merged_df, col, target_column)['ppscore']
    if (pps_score > 0.0) & (pps_score < 0.9):
        high_pps_cols.add(col)

# Print the high-PPS columns and their corresponding PPS scores
for col in high_pps_cols:
    pps_score = pps.score(merged_df, col, target_column)['ppscore']
    print(f'{col}: {pps_score}')
    
# Create a new DataFrame with only the high-PPS columns and the target column
#new_df = merged_df[high_pps_cols + [target_column]]


Next Year Immunization USD Mil: 0.577892897203423
total_vaccine in USD_prev_1_year_million: 0.4874875482571598
total_vaccine in USD_prev_4_year_million: 0.39060218843691163
total_immunization in USD_prev_3_year_million: 0.46250811712730977
total_population_0-1: 0.06443259564310377
total_vaccine_delivery_cost in USD: 0.006312388754726284
Country: 0.5910994939569542
total_immunization in USD_prev_5_year_million: 0.3153876700763317
Vaccine USD Mil: 0.6517618565402876
land_area: 0.5988807818572437
GDP_Million: 0.20329257240928422
Next Year Vaccine USD Mil: 0.47225407542578635
total_vaccine in USD_prev_3_year_million: 0.42586721656765925
total_population: 0.06563707099780713
GDP_prev_4_year: 0.1508824937428267
total_immunization in USD_prev_1_year_million: 0.5414474947259531
total_vaccine in USD: 0.6517618565402876
total_immunization in USD_prev_2_year_million: 0.4576625648585899
total_vaccine in USD_prev_2_year_million: 0.43808608810107175
GDP_prev_5_year: 0.14550833682791597
total_immuniz

In [38]:
high_pps_cols

{'Country',
 'GDP_Million',
 'GDP_prev_1_year',
 'GDP_prev_2_year',
 'GDP_prev_3_year',
 'GDP_prev_4_year',
 'GDP_prev_5_year',
 'Next Year Immunization USD Mil',
 'Next Year Vaccine USD Mil',
 'Vaccine Delivery Cost USD Mil',
 'Vaccine USD Mil',
 'land_area',
 'total_immunization in USD_prev_1_year_million',
 'total_immunization in USD_prev_2_year_million',
 'total_immunization in USD_prev_3_year_million',
 'total_immunization in USD_prev_4_year_million',
 'total_immunization in USD_prev_5_year_million',
 'total_population',
 'total_population_0-1',
 'total_population_1-5',
 'total_population_16-20',
 'total_vaccine in USD',
 'total_vaccine in USD_prev_1_year_million',
 'total_vaccine in USD_prev_2_year_million',
 'total_vaccine in USD_prev_3_year_million',
 'total_vaccine in USD_prev_4_year_million',
 'total_vaccine in USD_prev_5_year_million',
 'total_vaccine_delivery_cost in USD',
 'total_vaccine_delivery_cost in USD_prev_1_year_million'}

##### Extrapolate the data

In [39]:
# Specify columns to interpolate missing values
cols_to_interpolate = ['total_vaccine in USD', 'total_immunization in USD',
       'total_immunization in USD per surviving infant',
       'total_vaccine in USD per surviving infant', 'BCG_COVERAGE',
       'DTPCV1_COVERAGE', 'DTPCV3_COVERAGE', 'HEPB3_COVERAGE',
       'HEPB_BD_COVERAGE', 'HIB3_COVERAGE', 'IPV1_COVERAGE', 'MCV1_COVERAGE',
       'MCV2_COVERAGE', 'MCV2X2_COVERAGE', 'PCV3_COVERAGE', 'POL3_COVERAGE',
       'RCV1_COVERAGE', 'ROTAC_COVERAGE', 'GDP_Million', 'total_population','total_population_0-1',
       'total_population_1-5', 'total_population_6-10',
       'total_population_11-15', 'total_population_16-20', 'land_area',
       'total_vaccine_delivery_cost in USD', 'Vaccine USD Mil',
       'Immunization USD Mil', 'Vaccine Delivery Cost USD Mil',
       'total_vaccine in USD_prev_1_year_million',
       'total_vaccine in USD_prev_1_year_pct_change',
       'total_vaccine in USD_prev_2_year_million',
       'total_vaccine in USD_prev_2_year_pct_change',
       'total_vaccine in USD_prev_3_year_million',
       'total_vaccine in USD_prev_3_year_pct_change',
       'total_vaccine in USD_prev_4_year_million',
       'total_vaccine in USD_prev_4_year_pct_change',
       'total_vaccine in USD_prev_5_year_million',
       'total_vaccine in USD_prev_5_year_pct_change',
       'total_immunization in USD_prev_1_year_million',
       'total_immunization in USD_prev_1_year_pct_change',
       'total_immunization in USD_prev_2_year_million',
       'total_immunization in USD_prev_2_year_pct_change',
       'total_immunization in USD_prev_3_year_million',
       'total_immunization in USD_prev_3_year_pct_change',
       'total_immunization in USD_prev_4_year_million',
       'total_immunization in USD_prev_4_year_pct_change',
       'total_immunization in USD_prev_5_year_million',
       'total_immunization in USD_prev_5_year_pct_change',
       'total_vaccine_delivery_cost in USD_prev_1_year_million',
       'total_vaccine_delivery_cost in USD_prev_1_year_pct_change',
       'total_vaccine_delivery_cost in USD_prev_2_year_million',
       'total_vaccine_delivery_cost in USD_prev_2_year_pct_change',
       'total_vaccine_delivery_cost in USD_prev_3_year_million',
       'total_vaccine_delivery_cost in USD_prev_3_year_pct_change',
       'total_vaccine_delivery_cost in USD_prev_4_year_million',
       'total_vaccine_delivery_cost in USD_prev_4_year_pct_change',
       'total_vaccine_delivery_cost in USD_prev_5_year_million',
       'total_vaccine_delivery_cost in USD_prev_5_year_pct_change']
for j in cols_to_interpolate:
    # Group the data by country and year
    grouped = merged_df.groupby(['Country'])

    # Loop through each group
    for name, group in grouped:
        # Create a new dataframe with the year as the index
        ts = pd.DataFrame(group[j].values, index=group['Year'], columns=[j])
        # Convert the column to numeric dtype
        ts[j] = pd.to_numeric(ts[j], errors='coerce')
        # Interpolate missing values using time series method
        ts = ts.interpolate(method='time')
        # Update the merged_df with the interpolated values
        merged_df.loc[group.index,j] = ts[j].values

In [40]:
merged_df

          Country Region              Gavi / Income status       Year  \
780   Afghanistan   EMRO         Gavi low income countries 2010-01-01   
1170  Afghanistan   EMRO         Gavi low income countries 2012-01-01   
1560  Afghanistan   EMRO         Gavi low income countries 2014-01-01   
1950  Afghanistan   EMRO         Gavi low income countries 2016-01-01   
2145  Afghanistan   EMRO         Gavi low income countries 2017-01-01   
...           ...    ...                               ...        ...   
2144     Zimbabwe   AFRO  Gavi low-middle income countries 2016-01-01   
2534     Zimbabwe   AFRO  Gavi low-middle income countries 2018-01-01   
2729     Zimbabwe   AFRO  Gavi low-middle income countries 2019-01-01   
2924     Zimbabwe   AFRO  Gavi low-middle income countries 2020-01-01   
3119     Zimbabwe   AFRO  Gavi low-middle income countries 2021-01-01   

      total_vaccine in USD  total_immunization in USD  \
780            23204631.00               2.518116e+07   
1170     

In [41]:
merged_df.to_csv('immunization_prediction_data.csv')

### Data Selection

In [42]:
merged_df = merged_df[merged_df['Year'] >= '2006']

In [43]:
selected_column = ['Country',
 'GDP_Million',
 'GDP_prev_1_year',
 'GDP_prev_2_year',
 'GDP_prev_3_year',
 'land_area',
 'total_immunization in USD_prev_1_year_million',
 'total_immunization in USD_prev_2_year_million',
 'total_immunization in USD_prev_3_year_million',
 'total_population',
 'total_population_0-1',
 'total_population_1-5',
 'total_population_16-20',
 'total_vaccine in USD_prev_1_year_million',
 'total_vaccine in USD_prev_2_year_million',
 'total_vaccine in USD_prev_3_year_million',
 'total_vaccine_delivery_cost in USD_prev_1_year_million',

       
       'Vaccine USD Mil',
       'Immunization USD Mil'
       ]


selected_data_df = merged_df[selected_column]
#selected_data_df['Year'] = selected_data_df['Year'].dt.year.astype(float)
selected_data_df = selected_data_df.dropna(axis = 0)

In [44]:
selected_data_df

          Country   GDP_Million  GDP_prev_1_year  GDP_prev_2_year  \
1170  Afghanistan  20203.572960     18190.410821     15633.856787   
1560  Afghanistan  20550.582747     20564.485419     20203.572960   
1950  Afghanistan  18019.558182     19998.156214     20550.582747   
2145  Afghanistan  18896.352022     18019.558182     19998.156214   
2340  Afghanistan  18418.848300     18896.352022     18019.558182   
...           ...           ...              ...              ...   
2144     Zimbabwe  20548.678070     19963.120610     19495.519630   
2534     Zimbabwe  34156.069918     17584.890937     20548.678070   
2729     Zimbabwe  21832.234926     34156.069918     17584.890937   
2924     Zimbabwe  21509.698406     21832.234926     34156.069918   
3119     Zimbabwe  28371.238666     21509.698406     21832.234926   

      GDP_prev_3_year  land_area  \
1170     12154.835708    65223.0   
1560     18190.410821    65223.0   
1950     20564.485419    65223.0   
2145     20550.582747    65

### ML

In [49]:
# Import required libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

import pandas as pd
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

#merged_df = merged_df.dropna(axis=0)
# Split the data into training and testing sets
target_column = [ 'Vaccine USD Mil',
       'Immunization USD Mil']
X_col = [x for x in selected_column if x not in target_column]
X = selected_data_df[X_col]

y1 = selected_data_df[target_column[0]]
y2 = selected_data_df[target_column[1]]
#y3 = merged_df['total_vaccine_delivery_cost in USD']

# Label encode categorical columns
label_encoder = LabelEncoder()
X['Country'] = label_encoder.fit_transform(X['Country'])
#X['Region'] = label_encoder.fit_transform(X['Region'])
#X['Year'] = label_encoder.fit_transform(X['Year'])
#X['Gavi / Income status'] = label_encoder.fit_transform(X['Gavi / Income status'])

# One-hot encode categorical columns (alternative to label encoding)
# ct = ColumnTransformer([('encoder', OneHotEncoder(), [0, 1, 2])], remainder='passthrough')
# X = ct.fit_transform(X)



# Split data into training and testing sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42)

# Split data into training and testing sets
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42)

# Define the models and their hyperparameters
models = [
    {
        'name': 'linear regression',
        'model': LinearRegression(),
        'params': {}
    },
#     {
#         'name': 'random forest',
#         'model': RandomForestRegressor(),
#         'params': {
#             'model__n_estimators': [10, 50, 100, 200],
#             'model__max_depth': [None, 5, 10, 20]
#         }
#     },
#     {
#         'name': 'gradient boosting',
#         'model': GradientBoostingRegressor(),
#         'params': {
#             'model__n_estimators': [10, 50, 100, 200],
#             'model__max_depth': [3, 5, 10],
#             'model__learning_rate': [0.1, 0.01, 0.001]
#         }
#     },
    {
        'name': 'neural network',
        'model': MLPRegressor(activation='relu'),
        'params': {
            'model__hidden_layer_sizes': [(50,), (100,), (50, 50),(150, 150)],
            'model__learning_rate_init': [0.1, 0.01, 0.001],
            'model__max_iter': [500],
            'model__solver': ['adam','lbfgs'],
       
        }
    }
]



# Evaluate each model using cross-validation and select the best one for y1
best_model_y1 = None
best_score_y1 = None
for model in models:
    pipeline = Pipeline([('scaler', StandardScaler()), ('model', model['model'])])
    clf = GridSearchCV(pipeline, model['params'], cv=5, scoring='neg_mean_absolute_percentage_error')
    clf.fit(X1_train, y1_train)
    score = -clf.best_score_
    print(model['name'], f'mean absolute percentage error ({target_column[0]}):', score)
    if best_score_y1 is None or score < best_score_y1:
        best_score_y1 = score
        best_model_y1 = clf.best_estimator_

# Get the name and parameters of the best model for y1
best_model_name_y1 = best_model_y1.named_steps['model'].__class__.__name__
best_model_params_y1 = best_model_y1.named_steps['model'].get_params()
print(f'Best model ({target_column[0]}):', best_model_name_y1)
print(f'Best model parameters ({target_column[0]}):', best_model_params_y1)


# Evaluate each model using cross-validation and select the best one for y2
best_model_y2 = None
best_score_y2 = None
for model in models:
    pipeline = Pipeline([('scaler', StandardScaler()), ('model', model['model'])])
    clf = GridSearchCV(pipeline, model['params'], cv=5, scoring='neg_mean_absolute_percentage_error')
    clf.fit(X2_train, y2_train)
    score = -clf.best_score_
    print(model['name'], f'mean absolute percentage error ({target_column[1]}):', score)
    if best_score_y2 is None or score < best_score_y2:
        best_score_y2 = score
        best_model_y2 = clf.best_estimator_

# Get the name and parameters of the best model for y2
best_model_name_y2 = best_model_y2.named_steps['model'].__class__.__name__
best_model_params_y2 = best_model_y2.named_steps['model'].get_params()
print(f'Best model ({target_column[1]}):', best_model_name_y2)
print(f'Best model parameters ({target_column[1]}):', best_model_params_y2)


# # Evaluate each model using cross-validation and select the best one for y3
# best_model_y3 = None
# best_score_y3 = None
# for model in models:
#     pipeline = Pipeline([('scaler', StandardScaler()), ('model', model['model'])])
#     clf = GridSearchCV(pipeline, model['params'], cv=5, scoring='neg_mean_squared_error')
#     clf.fit(X, y3)
#     score = -clf.best_score_
#     print(model['name'], 'mean squared error (total_vaccine_delivery_cost in USD):', score)
#     if best_score_y3 is None or score < best_score_y3:
#         best_score_y3 = score
#         best_model_y3 = clf.best_estimator_

# # Get the name and parameters of the best model for y3
# best_model_name_y3 = best_model_y3.named_steps['model'].__class__.__name__
# best_model_params_y3 = best_model_y3.named_steps['model'].get_params()
# print('Best model (total_vaccine_delivery_cost in USD):', best_model_name_y3)
# print('Best model parameters (total_vaccine_delivery_cost in USD):', best_model_params_y3)


# Train the best models on the full dataset
best_model_y1.fit(X1_train, y1_train)
best_model_y2.fit(X2_train, y2_train)
#best_model_y3.fit(X, y3)

# Make predictions on the testing set and evaluate performance
y1_pred = best_model_y1.predict(X1_test)
print(f'Mean absolute percentage error ({target_column[0]}):', mean_absolute_percentage_error(y1_test, y1_pred))
print(f'R^2 score ({target_column[0]}):', r2_score(y1_test, y1_pred))

y2_pred = best_model_y2.predict(X2_test)
print(f'Mean absolute percentage error ({target_column[1]}):', mean_absolute_percentage_error(y2_test, y2_pred))
print(f'R^2 score ({target_column[1]}):', r2_score(y2_test, y2_pred))


# y2_pred = best_model_y2.predict(X2_test)
# print(f'Mean squared error ({target_column[1]}):', mean_squared_error(y2_test, y2_pred))
# print(f'R^2 score ({target_column[1]}):', r2_score(y2_test, y2_pred))

# y3_pred = best_model_y3.predict(X)
# print('Mean squared error (total_vaccine_delivery_cost in USD):', mean_squared_error(y3, y3_pred))
# print('R^2 score (total_vaccine_delivery_cost in USD):', r2_score(y3, y3_pred))



linear regression mean squared error (Vaccine USD Mil): 2.036564661251389
neural network mean squared error (Vaccine USD Mil): 0.6623461383734005
Best model (Vaccine USD Mil): MLPRegressor
Best model parameters (Vaccine USD Mil): {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (150, 150), 'learning_rate': 'constant', 'learning_rate_init': 0.01, 'max_fun': 15000, 'max_iter': 500, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'lbfgs', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
linear regression mean squared error (Immunization USD Mil): 1.8685765506429917
neural network mean squared error (Immunization USD Mil): 0.4400624326478139
Best model (Immunization USD Mil): MLPRegressor
Best model parameters (Immunization USD Mil): {'activation': 'relu', 'alpha':

##### Test the delivery cost

In [66]:
# Train the best models on the full dataset
best_model_y1.fit(X1_train, y1_train)
best_model_y2.fit(X2_train, y2_train)

# Make predictions on the testing set and evaluate performance
y1_pred = best_model_y1.predict(X1_test)
y2_pred = best_model_y2.predict(X2_test)

# Create a dataframe to show the prediction, actual value, and the MAPE for y1
y1_results = pd.DataFrame({'y1_actual': y1_test, 'y1_prediction': y1_pred})
y1_results['y1_mape'] = y1_results.apply(lambda x: mean_absolute_percentage_error(x['y1_actual'], x['y1_prediction']), axis=1)
print('Results for y1 (Vaccine USD Mil):')
print(y1_results.head())

# Create a dataframe to show the prediction, actual value, and the MAPE for y2
y2_results = pd.DataFrame({'y2_actual': y2_test, 'y2_prediction': y2_pred})
y2_results['y2_mape'] = y2_results.apply(lambda x: mean_absolute_percentage_error(x['y2_actual'], x['y2_prediction']), axis=1)
print('Results for y2 (Immunization USD Mil):')
print(y2_results.head())


Results for y1 (Vaccine USD Mil):
      y1_actual  y1_prediction     y1_mape
2709   0.598164      -1.668359  378.913331
663   44.854592     -74.818628  266.802603
1689  58.469041      68.893721   17.829400
1474  29.839815      15.825783   46.964206
1726   1.247649       1.676708   34.389375
Results for y2 (Immunization USD Mil):
       y2_actual  y2_prediction    y2_mape
2709    3.228500       1.637936  49.266359
663    72.313362      36.964826  48.882441
1689  106.307348     115.032806   8.207765
1474   37.136162      18.482759  50.229754
1726    1.892629       2.710066  43.190552


In [67]:
y2_results

       y2_actual  y2_prediction    y2_mape
2709    3.228500       1.637936  49.266359
663    72.313362      36.964826  48.882441
1689  106.307348     115.032806   8.207765
1474   37.136162      18.482759  50.229754
1726    1.892629       2.710066  43.190552
...          ...            ...        ...
962    42.742304      45.480679   6.406711
814    30.765821      27.748847   9.806253
1874   23.863989      28.390441  18.967708
1899   14.527025      22.491148  54.822810
2059   30.895938      19.053826  38.329026

[118 rows x 3 columns]

In [62]:
####