In [1]:
import pandas as pd
import glob
import bamboolib
import plotly.express as px
import plotly.graph_objs as go
import numpy as np
import wbgapi as wb
import config
import pymysql
from joblib import Parallel, delayed
import ppscore as pps

## Read files

In [150]:
# MySQL connection details
host = config.HOST
user = 'read'
password = config.PASSWORD
database = config.DATABASE

# Create a connection to the MySQL database
connection = pymysql.connect(
    host=host,
    user=user,
    password=password,
    database=database
)

In [3]:
cursor = connection.cursor()
cursor.execute("SHOW TABLES")
tables = cursor.fetchall()

for table in tables:
    print(table[0])

CYA_UN_WPP_LEx
CYA_UN_WPP_Mortality_Rates
CYA_UN_WPP_Population
CYAntCat_eJRF_Coverage
CYI_WHO_eJRF_raw_expenditure
CYI_WHO_eJRF_raw_expenditure_Metadata
CY_GHED_Base
CY_GHED_Codebook
CY_GHED_Metadata
CY_GHED_NCU
CY_GHED_PPP
CY_GHED_USD
CY_IMF_WEO
CY_UN_WPP_LE
CY_UN_WPP_LEx
CY_UN_WPP_Misc
CY_UN_WPP_TFR
CY_WHO_eJRF_raw_expenditure
CY_eJRF_clean
C_UNtoISO3
RCIY_IMF_WEO
RCI_IMF_WEO_Metadata
RCY_IMF_WEO
T_MI4A
V_MI4A


In [4]:
# Execute a SQL query
sql = 'SELECT country_name AS "Country", country_code AS "Country Code", region_WHO, income, year AS "Year", gdp_usd AS "GDP_Million", gdp_usd2020 AS "GDP_Constant_Million" FROM CY_GHED_USD'
gdp_df = pd.read_sql_query(sql, connection)

sql = "SELECT * FROM CY_eJRF_clean"
expenditure_df = pd.read_sql_query(sql, connection)

sql = "SELECT * FROM CYAntCat_eJRF_Coverage"
coverage_df = pd.read_sql_query(sql, connection)


In [5]:
birth_rate_df = wb.data.DataFrame(['SP.DYN.CBRT.IN'],time=range(2000, 2030), labels=True)
total_pop_df = wb.data.DataFrame(['SP.POP.TOTL'],time=range(2000, 2030), labels=True)
population_0_14_df = wb.data.DataFrame(["SP.POP.0014.TO"],time=range(2000, 2030), labels=True)
population_15_64_df = wb.data.DataFrame(["SP.POP.1564.TO"],time=range(2000, 2030), labels=True)
population_65_Up_df = wb.data.DataFrame(["SP.POP.65UP.TO"],time=range(2000, 2030), labels=True)
land_df = wb.data.DataFrame(['AG.LND.TOTL.K2'],time=range(2000, 2030), labels=True)

## Data Preparation

In [6]:
def get_missing_value_percentage(expenditure_detail_df,columns_to_group=['Country'], columns_to_filter=['total_immunization in USD', 'total_vaccine in USD'], threshold=0.3,more_than = True):
    # group the data by both 'Year' and 'Country'
    grouped = expenditure_detail_df.groupby(columns_to_group)
    
    # calculate the percentage of missing values for each column within each group
    nan_percentages = grouped.apply(lambda x: x.isna().mean())
    
    # filter the results to only show the specified columns
    filtered = nan_percentages[columns_to_filter]
    
    # filter the results to only show percentages above the threshold value
    if more_than == True:
        filtered = filtered[filtered > threshold].dropna()
    else:
        filtered = filtered[filtered < threshold].dropna()
    return filtered

### Birth per year & population

In [7]:
population_65_Up_df =population_65_Up_df.reset_index()
population_65_Up_df = population_65_Up_df.rename(columns=lambda x: x.replace('YR', ''))
population_65_Up_df = population_65_Up_df.rename(columns={'economy':'Country Code'})
population_65_Up_df = population_65_Up_df.melt(id_vars=['Country Code', 'Country'], var_name='Year', value_name='total_population_65-Up')
population_65_Up_df['Year'] = pd.to_datetime(population_65_Up_df['Year'], format='%Y').dt.year

population_15_64_df =population_15_64_df.reset_index()
population_15_64_df = population_15_64_df.rename(columns=lambda x: x.replace('YR', ''))
population_15_64_df = population_15_64_df.rename(columns={'economy':'Country Code'})
population_15_64_df = population_15_64_df.melt(id_vars=['Country Code', 'Country'], var_name='Year', value_name='total_population_15-64')
population_15_64_df['Year'] = pd.to_datetime(population_15_64_df['Year'], format='%Y').dt.year

population_0_14_df = population_0_14_df.reset_index()
population_0_14_df = population_0_14_df.rename(columns=lambda x: x.replace('YR', ''))
population_0_14_df = population_0_14_df.rename(columns={'economy':'Country Code'})
population_0_14_df = population_0_14_df.melt(id_vars=['Country Code', 'Country'], var_name='Year', value_name='total_population_0-14')
population_0_14_df['Year'] = pd.to_datetime(population_0_14_df['Year'], format='%Y').dt.year

total_pop_df = total_pop_df.reset_index()
total_pop_df = total_pop_df.rename(columns=lambda x: x.replace('YR', ''))
total_pop_df = total_pop_df.rename(columns={'economy':'Country Code'})
total_pop_df = total_pop_df.melt(id_vars=['Country Code', 'Country'], var_name='Year', value_name='total_population')
total_pop_df['Year'] = pd.to_datetime(total_pop_df['Year'], format='%Y').dt.year

birth_rate_df = birth_rate_df.reset_index()
birth_rate_df = birth_rate_df.rename(columns=lambda x: x.replace('YR', ''))
birth_rate_df = birth_rate_df.rename(columns={'economy':'Country Code'})
birth_rate_df = birth_rate_df.melt(id_vars=['Country Code', 'Country'], var_name='Year', value_name='birthrate_crude_per_1000')
birth_rate_df['Year'] = pd.to_datetime(birth_rate_df['Year'], format='%Y').dt.year

birth_per_year_df = pd.merge(birth_rate_df,total_pop_df[['Country Code','Year','total_population']],on = ['Country Code','Year'] )
birth_per_year_df = pd.merge(birth_per_year_df,population_0_14_df[['Country Code','Year','total_population_0-14']],on = ['Country Code','Year'] )
birth_per_year_df = pd.merge(birth_per_year_df,population_15_64_df[['Country Code','Year','total_population_15-64']],on = ['Country Code','Year'] )
birth_per_year_df = pd.merge(birth_per_year_df,population_65_Up_df[['Country Code','Year','total_population_65-Up']],on = ['Country Code','Year'] )
birth_per_year_df['birth_per_year'] = (birth_per_year_df['birthrate_crude_per_1000']/1000)*birth_per_year_df['total_population']

### Land

In [8]:
land_df = wb.data.DataFrame(['AG.LND.TOTL.K2'],time=range(2000, 2030), labels=True)
land_df =land_df.reset_index()
land_df = land_df.rename(columns=lambda x: x.replace('YR', ''))
land_df = land_df.rename(columns={'economy':'Country Code'})
land_df = land_df.melt(id_vars=['Country Code', 'Country'], var_name='Year', value_name='land_area')
land_df['Year'] = pd.to_datetime(land_df['Year'], format='%Y').dt.year


# Group the data by Country Code and get the mode of land_area
mode_land_area = land_df.groupby('Country Code')['land_area'].apply(lambda x: x.mode()[0] if len(x.mode()) > 0 else x.unique()[0])

# Create a new DataFrame with 'Country Code' and 'land_area' columns
mode_land_area_df = pd.DataFrame({'Country Code': mode_land_area.index.astype(str), 'land_area': mode_land_area.values.astype(float)})

country_area_df = mode_land_area_df.copy()

### Goverment expenditure on immunization

In [9]:
expenditure_df.columns = expenditure_df.columns.astype(str)
expenditure_df = expenditure_df.rename(columns={'country_code':'Country Code',"year": 'Year','country_name':'Country','group_GAVI':'Gavi / Income status',
                                             'region_WHO':'Region','3-total-expenditure-routine-immunization':'total_immunization in USD',
                                             '4-government-expenditure-routine-immunization':'gov_immunization in USD',
                                             '6-total-expenditure-vaccines-used-routine-immunization':'total_vaccine in USD',
                                             '7-government-expenditure-vaccines-used-routine-immunization':'gov_vaccine in USD'})

In [10]:
expenditure_df['Year']= pd.to_datetime(expenditure_df['Year'], format='%Y').dt.year
expenditure_detail_filt_df = expenditure_df[['Country Code','Country','Year','Region','Gavi / Income status','total_immunization in USD','gov_immunization in USD','total_vaccine in USD','gov_vaccine in USD']]

### Immunization coverage

In [151]:
sql = "SELECT * FROM CYAntCat_eJRF_Coverage"
coverage_df = pd.read_sql_query(sql, connection)
coverage_df = coverage_df.rename(columns={"country_name":'Country','year':'Year','country_code':'Country Code'})

In [153]:
coverage_df = coverage_df[coverage_df['COVERAGE_CATEGORY'].isin(['ADMIN'])]
coverage_df = coverage_df[~coverage_df['ANTIGEN'].isin(['YFV'])]
coverage_df['Year'] = pd.to_datetime(coverage_df['Year'], format='%Y').dt.year

In [154]:
# create a pivot table for each antigen
doses_pivot = pd.pivot_table(coverage_df, index=['Country Code', 'Year'], 
                               columns='ANTIGEN',
                               values=['DOSES'], 
                               aggfunc='sum')

# flatten the column index
doses_pivot.columns = [f'{col[1]}_{col[0]}' for col in doses_pivot.columns]

# reset the index to make Country and Year columns
doses_pivot = doses_pivot.reset_index()

#rename
#immunization_coverage_df = antigen_pivot

In [155]:
# create a pivot table for each antigen
antigen_pivot = pd.pivot_table(coverage_df, index=['Country Code', 'Year'], 
                               columns='ANTIGEN',
                               values=['COVERAGE'], 
                               aggfunc='sum')

# flatten the column index
antigen_pivot.columns = [f'{col[1]}_{col[0]}' for col in antigen_pivot.columns]

# reset the index to make Country and Year columns
antigen_pivot = antigen_pivot.reset_index()

#rename


In [157]:
# create a pivot table for each antigen
target_pivot = pd.pivot_table(coverage_df, index=['Country Code', 'Year'], 
                               columns='ANTIGEN',
                               values=['TARGET_NUMBER'], 
                               aggfunc='sum')

# flatten the column index
target_pivot.columns = [f'{col[1]}_{col[0]}' for col in target_pivot.columns]

# reset the index to make Country and Year columns
target_pivot = target_pivot.reset_index()


In [158]:
immunization_coverage_df = pd.merge(doses_pivot,antigen_pivot, on =['Country Code', 'Year'], how = 'outer')
immunization_coverage_df = pd.merge(immunization_coverage_df,target_pivot, on =['Country Code', 'Year'], how = 'outer')

In [159]:
threshold = 0.2 # 90% threshold
num_rows = immunization_coverage_df.shape[0]
missing_value_threshold = num_rows * threshold
immunization_coverage_df = immunization_coverage_df.dropna(thresh=missing_value_threshold, axis=1)#.drop('level_0',axis = 1)

### GDP

In [161]:
gdp_df.columns = gdp_df.columns.astype(str)
gdp_df['Year'] = pd.to_datetime(gdp_df['Year'], format='%Y').dt.year

#rename
gdp_filt_df = gdp_df

### Merge Data

In [513]:
merged_df = pd.merge(expenditure_detail_filt_df.drop_duplicates(),immunization_coverage_df.drop_duplicates(), on = ['Country Code','Year'], how = 'left')

merged_df = pd.merge(merged_df.drop_duplicates(),gdp_filt_df[['Country Code', 'Year', 
       'GDP_Million', 'GDP_Constant_Million']].drop_duplicates(), on = ['Country Code','Year'], how = 'left')

merged_df = pd.merge(merged_df.drop_duplicates(),birth_per_year_df[['Country Code', 'Year', 'birthrate_crude_per_1000',
       'total_population', 'total_population_0-14', 'total_population_15-64',
       'total_population_65-Up', 'birth_per_year']].drop_duplicates(),on = ['Country Code','Year'],how ='left')

merged_df = pd.merge(merged_df.drop_duplicates(),country_area_df[['Country Code','land_area']].drop_duplicates(),on = ['Country Code'],how ='left')

gdp_def_df = gdp_filt_df[gdp_filt_df['Country Code'] == 'USA']
gdp_def_df['gdp deflator index'] = gdp_def_df['GDP_Constant_Million']/gdp_def_df['GDP_Million']

merged_df = pd.merge(merged_df.drop_duplicates(),gdp_def_df[['Year','gdp deflator index']].drop_duplicates(), on = 'Year', how = 'left')

### Clean data

##### Deflate the data using the USD deflator index

In [514]:
cols_to_adjust = [ 'total_immunization in USD', 'gov_immunization in USD',
       'total_vaccine in USD', 'gov_vaccine in USD']

for col in cols_to_adjust:
    merged_df[f'{col}'] = (merged_df[col] * merged_df['gdp deflator index']) 

##### Exclude the rich country (high income)

In [515]:
merged_df = merged_df[merged_df['Gavi / Income status'] != 'High income countries']

##### Drop few vaccine due to very high missing values

In [516]:
#coverage_list = ['BCG_COVERAGE', 'DTPCV1_COVERAGE', 'DTPCV3_COVERAGE', 'HEPB3_COVERAGE', 'HEPB_BD_COVERAGE', 'HIB3_COVERAGE', 'MCV1_COVERAGE', 'MCV2_COVERAGE', 'PCV3_COVERAGE', 'POL3_COVERAGE', 'RCV1_COVERAGE', 'ROTAC_COVERAGE']
#merged_df = merged_df.drop(['IPV1_COVERAGE'] , axis = 1)

##### Create new column to calculate the cost of vaccine delivery

In [517]:
merged_df['Year'] = pd.to_datetime(merged_df['Year'], format='%Y')
merged_df['total_immunization in USD'] = merged_df['total_immunization in USD'].replace(0, np.nan)
merged_df['total_vaccine in USD'] = merged_df['total_vaccine in USD'].replace(0, np.nan)
merged_df['total_vaccine_delivery_cost in USD'] = merged_df['total_immunization in USD'] -merged_df['total_vaccine in USD']


##### Change the data into Million format

In [518]:
#change the data into Million format
merged_df['Vaccine USD Mil'] = merged_df['total_vaccine in USD'] / 1000000
merged_df['Immunization USD Mil'] = merged_df['total_immunization in USD'] / 1000000
merged_df['Vaccine Delivery Cost USD Mil'] = merged_df['total_vaccine_delivery_cost in USD'] / 1000000

##### Compute the gov to total immunization ratio

In [519]:
merged_df['gov_to_total_immunization_ratio'] = merged_df['gov_immunization in USD']/merged_df['total_immunization in USD']
merged_df['gov_to_total_vaccine_ratio'] = merged_df['gov_vaccine in USD']/merged_df['total_vaccine in USD']

In [520]:
origin_merged_df = merged_df.copy()

##### Create new columns for previous year data and average previous 3 years data

In [521]:
# group the dataframe by country
grouped_df = merged_df.groupby('Country')

# Define a function to create the previous year columns in million
def create_prev_year_cols_million(col):
    return [f"{col}_prev_{i}_year_million" for i in range(1, 6)]

# Define a function to calculate the average of the previous 6 years' data considering only available years
def calculate_3_year_average(row, cols):
    return row[cols].apply(lambda x: np.nanmean(x), axis=1)

# Create new columns for 'total_vaccine in USD', 'total_immunization in USD', and 'total_vaccine_delivery_cost in USD', 
# for the value for previous year, in million
for col in ['total_vaccine in USD', 'total_immunization in USD', 'total_vaccine_delivery_cost in USD']:
    prev_year_cols = create_prev_year_cols_million(col)
    for i, prev_year_col in enumerate(prev_year_cols):
        merged_df[prev_year_col] = grouped_df[col].apply(lambda x: x.shift(periods=i+1) / 1000000)
        #print(i)
        #print(col)
        # Calculate the percentage change between the current year and the previous year
        curr_col = f"{col}_prev_1_year_million"
        pct_col = f"{col}_prev_{i+1}_year_pct_change"
        if curr_col in merged_df.columns and prev_year_col in merged_df.columns:
            merged_df[pct_col] = (merged_df[curr_col] - merged_df[prev_year_col]) / merged_df[prev_year_col]
        else:
            merged_df[pct_col] = float('nan')

    # Create a new column to store the average of the previous 6 years' data considering only available years
    avg_col = f"{col}_prev_5_year_avg_million"
    merged_df[avg_col] = calculate_3_year_average(merged_df, prev_year_cols)
    #print('mm')

In [522]:
merged_df['total_vaccine in USD_prev_1_year_pct_change'] = (merged_df['Vaccine USD Mil'] - merged_df['total_vaccine in USD_prev_1_year_million']) / merged_df['total_vaccine in USD_prev_1_year_million']
merged_df['total_immunization in USD_prev_1_year_pct_change'] = (merged_df['Immunization USD Mil'] - merged_df['total_immunization in USD_prev_1_year_million']) / merged_df['total_immunization in USD_prev_1_year_million']
merged_df['total_vaccine_delivery_cost in USD_prev_1_year_pct_change'] = (merged_df['Vaccine Delivery Cost USD Mil'] - merged_df['total_vaccine_delivery_cost in USD_prev_1_year_million']) / merged_df['total_vaccine_delivery_cost in USD_prev_1_year_million']

##### Visualize some of the data to see if there is unexpected data

In [523]:
go.Figure(
    data=[go.Histogram(x=merged_df["Vaccine Delivery Cost USD Mil"], xbins={"start": -60.0, "end": 420.0, "size": 40.0})],
    layout=go.Layout(title="Histogram of Vaccine Delivery Cost USD Mil", yaxis={"title": "Count"}, bargap=0.05),
    )

In [524]:
# Step: Keep rows where Vaccine Delivery Cost USD Mil > 0
#merged_df = merged_df.loc[merged_df['Vaccine Delivery Cost USD Mil'] > 0]

In [525]:
fig = px.histogram(merged_df.dropna(subset=['total_vaccine in USD_prev_1_year_pct_change']), x='Year', histfunc='avg', y='total_vaccine in USD_prev_1_year_pct_change')
fig

In [526]:
fig = px.histogram(merged_df.dropna(subset=['total_immunization in USD_prev_1_year_pct_change']), x='Year', histfunc='avg', y='total_immunization in USD_prev_1_year_pct_change')
fig

In [527]:
fig = px.histogram(merged_df.dropna(subset=['total_immunization in USD_prev_1_year_pct_change']), x='Year', histfunc='avg', y='total_immunization in USD_prev_1_year_pct_change')
fig

In [528]:

fig = px.histogram(merged_df, x='Year', histfunc='avg', y='total_vaccine_delivery_cost in USD_prev_1_year_pct_change')
fig

In [529]:
import pandas as pd; import numpy as np
# Step: Sort column(s) Country ascending (A-Z), Year ascending (A-Z)
merged_df = merged_df.sort_values(by=['Country', 'Year'], ascending=[True, True])

merged_df

     Country Code      Country       Year Region  \
0             AFG  Afghanistan 2006-01-01   EMRO   
1             AFG  Afghanistan 2007-01-01   EMRO   
2             AFG  Afghanistan 2008-01-01   EMRO   
3             AFG  Afghanistan 2009-01-01   EMRO   
4             AFG  Afghanistan 2010-01-01   EMRO   
...           ...          ...        ...    ...   
3115          ZWE     Zimbabwe 2017-01-01   AFRO   
3116          ZWE     Zimbabwe 2018-01-01   AFRO   
3117          ZWE     Zimbabwe 2019-01-01   AFRO   
3118          ZWE     Zimbabwe 2020-01-01   AFRO   
3119          ZWE     Zimbabwe 2021-01-01   AFRO   

                  Gavi / Income status  total_immunization in USD  \
0            Gavi low income countries               5.039581e+07   
1            Gavi low income countries               2.368317e+07   
2            Gavi low income countries               2.112499e+07   
3            Gavi low income countries               2.668447e+07   
4            Gavi low income c

##### Exclude the immunization data that 0 or NaN

In [530]:
merged_df = merged_df.loc[merged_df['total_immunization in USD'] > 0]

In [531]:
merged_df = merged_df.loc[merged_df['gov_immunization in USD'] > 0]

In [532]:
merged_df = merged_df.sort_values(by=['total_vaccine_delivery_cost in USD_prev_1_year_pct_change'], ascending=[False])

### Feature Engineering

##### Added the square root, power of 2 and 3, log, ratio, and diff from all coverage columns

In [533]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import PowerTransformer

# Find all columns that contain 'COVERAGE' in the name
coverage_columns = [col for col in merged_df.columns if 'COVERAGE' in col]

# Apply feature engineering techniques to each coverage column
for col in coverage_columns:
    # Square root transformation
    merged_df[f'{col}_sqrt'] = np.sqrt(merged_df[col].apply(pd.to_numeric))

    # Power transformation with power of 2
    merged_df[f'{col}_power_2'] = np.power(merged_df[col], 2)
    
    # Power transformation with power of 3
    merged_df[f'{col}_power_3'] = np.power(merged_df[col], 3)
    
    # Log transformation
    merged_df[f'{col}_log'] = np.log(merged_df[col])

    # Ratio
    merged_df[f'{col}_ratio'] = merged_df[col] / merged_df[col].sum()

    # Difference
    merged_df[f'{col}_diff'] = merged_df[col] - merged_df[col].mean()


##### Perform one-hot encoding on 'Gavi / Income status'

In [534]:
# Perform one-hot encoding on 'Gavi / Income status'
merged_df = pd.get_dummies(merged_df, columns=['Gavi / Income status'])

##### Create a new DataFrame to store the average ratios for gov to total immunization and vaccine for 3 years

In [535]:
# Create a new DataFrame to store the average ratios
avg_ratios_df = pd.DataFrame(columns=['Country', 'Year', 'Avg_3yr_gov_immunization_ratio'])

# Iterate over each country in the merged_df DataFrame
for country in merged_df['Country'].unique():
    # Select only the rows for the current country
    country_df = merged_df.loc[merged_df['Country'] == country]
    
    # Iterate over each year for the current country
    for i in range(len(country_df)):
        # If the current year is null, use the latest available 3 years to calculate the average
        if pd.isnull(country_df.iloc[i]['Year']):
            latest_years = country_df['Year'].dropna().tail(3)
            if len(latest_years) < 3:
                avg_ratio = country_df['gov_to_total_immunization_ratio'].dropna().mean()
            else:
                avg_ratio = country_df.loc[country_df['Year'].isin(latest_years)].\
                            iloc[-3:]['gov_to_total_immunization_ratio'].mean()
        # Otherwise, calculate the average of the current and two previous years
        else:
            if i < 2:
                avg_ratio = country_df.loc[country_df['Year'] <= country_df.iloc[i]['Year']]\
                            ['gov_to_total_immunization_ratio'].dropna().mean()
            else:
                avg_ratio = country_df.iloc[i-2:i+1]['gov_to_total_immunization_ratio'].mean()
        # Add the average ratio to the avg_ratios_df DataFrame
        avg_ratios_df = avg_ratios_df.append({'Country': country, 'Year': country_df.iloc[i]['Year'], 
                                              'Avg_3yr_gov_immunization_ratio': avg_ratio}, ignore_index=True)

# Merge the avg_ratios_df DataFrame with the merged_df DataFrame on 'Country' and 'Year'
merged_df = pd.merge(merged_df, avg_ratios_df, on=['Country', 'Year'], how='left')


In [536]:
# Create a new DataFrame to store the average ratios
avg_ratios_df = pd.DataFrame(columns=['Country', 'Year', 'Avg_3yr_gov_vaccine_ratio'])

# Iterate over each country in the merged_df DataFrame
for country in merged_df['Country'].unique():
    # Select only the rows for the current country
    country_df = merged_df.loc[merged_df['Country'] == country]
    
    # Iterate over each year for the current country
    for i in range(len(country_df)):
        # If the current year is null, use the latest available 3 years to calculate the average
        if pd.isnull(country_df.iloc[i]['Year']):
            latest_years = country_df['Year'].dropna().tail(3)
            if len(latest_years) < 3:
                avg_ratio = country_df['gov_to_total_vaccine_ratio'].dropna().mean()
            else:
                avg_ratio = country_df.loc[country_df['Year'].isin(latest_years)].\
                            iloc[-3:]['gov_to_total_vaccine_ratio'].mean()
        # Otherwise, calculate the average of the current and two previous years
        else:
            if i < 2:
                avg_ratio = country_df.loc[country_df['Year'] <= country_df.iloc[i]['Year']]\
                            ['gov_to_total_vaccine_ratio'].dropna().mean()
            else:
                avg_ratio = country_df.iloc[i-2:i+1]['gov_to_total_vaccine_ratio'].mean()
        # Add the average ratio to the avg_ratios_df DataFrame
        avg_ratios_df = avg_ratios_df.append({'Country': country, 'Year': country_df.iloc[i]['Year'], 
                                              'Avg_3yr_gov_vaccine_ratio': avg_ratio}, ignore_index=True)

# Merge the avg_ratios_df DataFrame with the merged_df DataFrame on 'Country' and 'Year'
merged_df = pd.merge(merged_df, avg_ratios_df, on=['Country', 'Year'], how='left')


In [537]:
# sort the dataframe by country and year
merged_df = merged_df.sort_values(['Country', 'Year'])

##### Density

In [539]:
merged_df['density'] = merged_df['total_population']/merged_df['land_area']

##### Create new Coverage columns. Created label column, calculate coverage times population and land area, coverage per unit gdp, coverage change, coverage change percentage and coverage trend.

In [540]:
coverage_list =  [col for col in merged_df.columns if 'COVERAGE' in col]
# convert all columns to float data type
cols_to_float = coverage_list

merged_df[cols_to_float] = merged_df[cols_to_float].astype(float)

# create the bins and labels for the coverage categories
bins = [0, 50, 75, 90, 100]
labels = ['Low', 'Medium', 'High', 'Very High']

# list of coverage columns to categorize
coverage_cols =coverage_list

# categorize each coverage column and create a new categorical column for each one
for col in coverage_cols:
    merged_df[col+'_CATEGORY'] = pd.cut(merged_df[col], bins=bins, labels=labels, include_lowest=True)


In [541]:
# Define the columns to be transformed
vaccine_cols = coverage_list

# Calculate the coverage per capita and coverage x the population
pop_list = ['birthrate_crude_per_1000',
       'total_population', 'total_population_0-14', 'total_population_15-64',
       'total_population_65-Up', 'birth_per_year', 'land_area','density']
for i in pop_list:
    for col in vaccine_cols:
        #merged_df[f'{col}_PER_{i}'] = merged_df[col] / merged_df[i]
        merged_df[f'{col}_TIMES_{i}'] = merged_df[col] * merged_df[i]
        #merged_df[f'{col}_PERCENTAGE'] = merged_df[col] / merged_df['total_population'] * 100

# Calculate the coverage per unit of GDP
for col in vaccine_cols:
    merged_df[f'{col}_PER_GDP'] = merged_df[col] / merged_df['GDP_Million']

# Create a new feature that captures the change in coverage over time, grouped by Country
for col in vaccine_cols:
    merged_df[f'{col}_CHANGE'] = merged_df.groupby('Country')[col].diff()

# Create a new feature that captures the change in coverage compared to the previous year as a percentage, grouped by Country
for col in vaccine_cols:
    merged_df[f'{col}_CHANGE_PERCENTAGE'] = merged_df.groupby('Country')[f'{col}_CHANGE'].pct_change() * 100

# Create a new feature that captures the overall trend in coverage over time, grouped by Country
for col in vaccine_cols:
    merged_df[f'{col}_TREND'] = merged_df.groupby('Country')[col].apply(lambda x: x.diff().fillna(0).rolling(window=3).sum()).fillna(0)


##### Created columns for previous year gdp and percentage change

In [542]:
# Define a function to create the previous year columns for GDP in million
def create_prev_year_gdp_cols_million():
    return [f"GDP_prev_{i}_year" for i in range(1, 3)]

# Create new columns for GDP, for the value for previous year up until last 5 years
prev_gdp_cols = create_prev_year_gdp_cols_million()
for i, prev_gdp_col in enumerate(prev_gdp_cols):
    merged_df[prev_gdp_col] = grouped_df['GDP_Million'].apply(lambda x: x.shift(i+1))

    # Calculate the percentage change between the current year and the previous year
    curr_col = "GDP_Million"
    pct_col = f"GDP_prev_{i+1}_year_pct_change"
    if curr_col in merged_df.columns and prev_gdp_col in merged_df.columns:
        merged_df[pct_col] = (merged_df[curr_col] - merged_df[prev_gdp_col]) / merged_df[prev_gdp_col]
    else:
        merged_df[pct_col] = float('nan')


In [543]:
# sort the dataframe by country and year
merged_df.sort_values(['Country', 'Year'], inplace=True)

# group the dataframe by country and calculate the percentage change in immunization USD between consecutive years
merged_df['current_Immunization_cost_pct_change_from_last_year'] = merged_df.groupby('Country')['Immunization USD Mil'].pct_change() * 100

##### Compute the predictive score for each column towards total immunization

In [546]:
import numpy as np
from joblib import Parallel, delayed

target_column = 'Immunization USD Mil'

# Define a function to compute PPS for a single column
def compute_pps(col):
    return pps.score(merged_df, col, target_column)['ppscore']

# Compute PPS for all columns in parallel
num_cores = 10  # change this to the number of cores you have available

# Split the columns into equal-sized chunks for better load balancing
column_chunks = np.array_split(merged_df.columns, num_cores)
pps_scores = Parallel(n_jobs=num_cores)(delayed(lambda x: [compute_pps(col) for col in x])(chunk) for chunk in column_chunks)

# Flatten the list of lists and create a dictionary mapping column names to PPS scores
pps_scores = [score for sublist in pps_scores for score in sublist]
pps_dict = dict(zip(merged_df.columns, pps_scores))

# Find the high-PPS columns
high_pps_cols = set(col for col, score in pps_dict.items() if 0.3 < score < 0.9)

for col in high_pps_cols:
    pps_score = pps_dict[col]
    print(f'{col}: {pps_score}')


total_vaccine in USD_prev_2_year_million: 0.4005435017373997
total_immunization in USD_prev_4_year_million: 0.36363549411765617
PCV1_COVERAGE_diff_TIMES_total_population_65-Up: 0.34853471771287503
PAB_COVERAGE_diff_TIMES_land_area: 0.4657140397721836
RCV1_COVERAGE_diff_TIMES_total_population_0-14: 0.32336199955685596
total_immunization in USD_prev_5_year_avg_million: 0.46917218695628793
VAD1_COVERAGE_diff_TIMES_land_area: 0.43386908560884574
total_vaccine in USD: 0.7066186667778147
Country Code: 0.5718428138119132
total_immunization in USD_prev_2_year_million: 0.4511508818730061
HEPB_BD_COVERAGE_diff_TIMES_land_area: 0.3826238524851442
total_vaccine in USD_prev_4_year_million: 0.3839179255088534
ROTAC_COVERAGE_diff_TIMES_land_area: 0.45715074518732024
RCV1_COVERAGE_diff_TIMES_land_area: 0.3906004816522147
ROTAC_COVERAGE_diff_TIMES_total_population_65-Up: 0.3035953122364783
gov_vaccine in USD: 0.47605500348488816
JAPENC_COVERAGE_diff_TIMES_land_area: 0.5941785322281505
total_vaccine in 

In [547]:
high_pps_cols

{'Country',
 'Country Code',
 'HEPB_BD_COVERAGE_diff_TIMES_land_area',
 'JAPENC_COVERAGE_diff_TIMES_land_area',
 'MCV2_COVERAGE_diff_TIMES_land_area',
 'PAB_COVERAGE_diff_TIMES_land_area',
 'PCV1_COVERAGE_diff_TIMES_land_area',
 'PCV1_COVERAGE_diff_TIMES_total_population_65-Up',
 'PCV2_COVERAGE_diff_TIMES_total_population_65-Up',
 'POL3_COVERAGE_sqrt_TIMES_land_area',
 'RCV1_COVERAGE_diff_TIMES_land_area',
 'RCV1_COVERAGE_diff_TIMES_total_population_0-14',
 'ROTA1_COVERAGE_diff_TIMES_land_area',
 'ROTAC_COVERAGE_diff_TIMES_land_area',
 'ROTAC_COVERAGE_diff_TIMES_total_population_65-Up',
 'VAD1_COVERAGE_diff_TIMES_land_area',
 'Vaccine USD Mil',
 'gov_immunization in USD',
 'gov_vaccine in USD',
 'land_area',
 'total_immunization in USD_prev_1_year_million',
 'total_immunization in USD_prev_2_year_million',
 'total_immunization in USD_prev_3_year_million',
 'total_immunization in USD_prev_4_year_million',
 'total_immunization in USD_prev_5_year_avg_million',
 'total_vaccine in USD',
 't

In [463]:
merged_df.to_csv('immunization_prediction_data.csv')

### Data Selection

#### Selected columns to be included in prediction except Year, Country, Immunization USD Mil (target column)

In [548]:
selected_column = ['Year','Country Code','Gavi / Income status_Gavi low income countries',
'Gavi / Income status_Gavi low-middle income countries',
'Gavi / Income status_non-Gavi middle income countries',
'density',
                   
#'Country',
# 'Country Code',
 'HEPB_BD_COVERAGE_diff_TIMES_land_area',
# 'JAPENC_COVERAGE_diff_TIMES_land_area',
 'PAB_COVERAGE_diff_TIMES_land_area',
 'PCV1_COVERAGE_diff_PER_GDP',
 'PCV1_COVERAGE_diff_TIMES_land_area',
 'PCV1_COVERAGE_diff_TIMES_total_population_65-Up',
 'PCV2_COVERAGE_diff_TIMES_total_population_65-Up',
 'POL3_COVERAGE_sqrt_TIMES_land_area',
 'RCV1_COVERAGE_diff_TIMES_land_area',
 'ROTA1_COVERAGE_diff_TIMES_land_area',
 'ROTAC_COVERAGE_diff_TIMES_land_area',
 'ROTAC_COVERAGE_diff_TIMES_total_population_65-Up',
 'VAD1_COVERAGE_diff_TIMES_land_area',
# 'gov_immunization in USD',
# 'gov_vaccine in USD',
 'land_area',
# 'total_immunization in USD_prev_1_year_million',
 'total_immunization in USD_prev_5_year_avg_million',
# 'total_immunization in USD_prev_3_year_million',
# 'total_vaccine in USD',
# 'total_vaccine in USD_prev_1_year_million',
# 'total_vaccine in USD_prev_2_year_million',
 'total_vaccine in USD_prev_5_year_avg_million',
# 'total_vaccine in USD_prev_3_year_million',
                   
                   
  'Vaccine USD Mil',              
 'Immunization USD Mil'
       ]

selected_data_df = merged_df[selected_column]


##### Extrapolate the data

In [549]:
x = selected_data_df.columns
cols_to_interpolate = []
for i in x:
    if i not in ['Country','Country Code','Region','Gavi / Income status','Year','Gavi / Income status_Gavi low income countries',
'Gavi / Income status_Gavi low-middle income countries',
'Gavi / Income status_non-Gavi middle income countries']:
        cols_to_interpolate.append(i)

In [550]:
# Specify columns to interpolate missing values
num_rounds = 1

for round_loop in range(num_rounds):
    for j in cols_to_interpolate:
        # Group the data by country and year
        grouped = selected_data_df.groupby(['Country Code', 'Year'])

        # Loop through each group
        for name, group in grouped:
            # Create a new dataframe with the year as the index
            ts = pd.DataFrame(group[j].values, index=group['Year'], columns=[j])
            # Convert the column to numeric dtype
            ts[j] = pd.to_numeric(ts[j], errors='coerce')
            # Interpolate missing values using time series method
            ts = ts.interpolate(method='time')
            # Update the merged_df with the interpolated values
            selected_data_df.loc[group.index,j] = ts[j].values
            

In [551]:
selected_data_df['Year'] = selected_data_df['Year'].dt.year.astype(float)

In [552]:
selected_data_df = selected_data_df.dropna(axis = 0)

In [553]:
check_2021_data = selected_data_df[selected_data_df['Year']==2020]
check_2021_data

        Year Country Code  Gavi / Income status_Gavi low income countries  \
1026  2020.0          ALB                                               0   
501   2020.0          AGO                                               0   
502   2020.0          AZE                                               0   
584   2020.0          BGD                                               0   
1053  2020.0          BLZ                                               0   
...      ...          ...                                             ...   
1408  2020.0          TUN                                               0   
222   2020.0          UZB                                               0   
1442  2020.0          VNM                                               0   
355   2020.0          ZMB                                               0   
783   2020.0          ZWE                                               0   

      Gavi / Income status_Gavi low-middle income countries  \
1026        

In [554]:
selected_data_df.to_csv('selected_data.csv')

### ML

In [560]:
# Import required libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

import pandas as pd
import numpy as np

#year to start predicting
year_pred = 2020

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


def mean_absolute_percentage_error_adjusted(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / np.minimum(y_true, y_pred))) * 100

# Define a function to fit and predict using the best model
def fit_and_predict(best_model, X_train, X_test, y_train):
    # Train the best model on the full dataset
    best_model.fit(X_train, y_train)
    
    # Make predictions on the testing set and evaluate performance
    y_pred = best_model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return y_pred, mape, r2

# Filter data by year
train_data = selected_data_df[selected_data_df['Year'] < year_pred]
test_data = selected_data_df[selected_data_df['Year'] >= year_pred]

# Split the data into training and testing sets
target_columns = ['Immunization USD Mil']
X_col = [x for x in selected_column if x not in target_columns]
X_train = train_data[X_col].drop(['Year', 'Country Code','Vaccine USD Mil'], axis=1)
X_test = test_data[X_col].drop(['Year', 'Country Code','Vaccine USD Mil'], axis=1)

# Label encode categorical columns
label_encoder = LabelEncoder()
# X_train['Country'] = label_encoder.fit_transform(X_train['Country'])
# X_test['Country'] = label_encoder.transform(X_test['Country'])

# One-hot encode categorical columns (alternative to label encoding)
# ct = ColumnTransformer([('encoder', OneHotEncoder(), [0, 1, 2])], remainder='passthrough')
# X_train = ct.fit_transform(X_train)
# X_test = ct.transform(X_test)

# Define the models and their hyperparameters
models = [
    {
        'name': 'linear regression',
        'model': LinearRegression(),
        'params': {}
    },
    {
        'name': 'random forest',
        'model': RandomForestRegressor(),
        'params': {
            'model__n_estimators': [10, 50, 100, 200],
            'model__max_depth': [None, 5, 10, 20]
        }
    },
#     {
#         'name': 'gradient boosting',
#         'model': GradientBoostingRegressor(),
#         'params': {
#             'model__n_estimators': [10, 50, 100, 200],
#             'model__max_depth': [3, 5, 10],
#             'model__learning_rate': [0.1, 0.01, 0.001]
#         }
#     },
#     {
#         'name': 'neural network',
#         'model': MLPRegressor(activation='relu'),
#         'params': {
#             'model__hidden_layer_sizes': [(50,), (100,), (50, 50)],
#             'model__learning_rate_init': [0.1, 0.01, 0.001],
#             'model__max_iter': [500],
#             'model__solver': ['adam'],
#         }
#     }
]

# Evaluate each model using cross-validation and select the best one for each target variable
best_models = {}
for target_column in target_columns:
    y_train = train_data[target_column]
    y_test = test_data[target_column]
    best_model = None
    best_score = None
    for model in models:
        pipeline = Pipeline([('scaler', StandardScaler()), ('model', model['model'])])
        clf = GridSearchCV(pipeline, model['params'], cv=10, scoring='neg_mean_absolute_percentage_error')
    
        clf.fit(X_train, y_train)
        score = -clf.best_score_
        print(model['name'], f'mean absolute percentage error ({target_column}):', score)
        if best_score is None or score < best_score:
            best_score = score
            best_model = clf.best_estimator_
    # Save the best model for the current target variable
    best_models[target_column] = best_model

#Get the name and parameters of the best model for each target variable
best_model_names = {target_column: best_models[target_column].named_steps['model'].__class__.__name__ for target_column in target_columns}
best_model_params = {target_column: best_models[target_column].named_steps['model'].get_params() for target_column in target_columns}
print('Best models:', best_model_names)
print('Best model parameters:', best_model_params)

for target_column in target_columns:
    best_model = best_models[target_column]
    y_pred, mape, r2 = fit_and_predict(best_model, X_train, X_test, train_data[target_column])

    # Append the predictions and error to the DataFrame
    selected_data_df[f'{target_column}_predicted'] = np.nan
    selected_data_df.loc[selected_data_df['Year'] >= year_pred, f'{target_column}_predicted'] = y_pred

print(f'mean absolute percentage error ({target_columns[0]}) {mean_absolute_percentage_error(y_test, y_pred)}%')
selected_data_df['mape_immunization'] = selected_data_df.apply(lambda x: mean_absolute_percentage_error(x['Immunization USD Mil'], x['Immunization USD Mil_predicted']), axis=1)
selected_data_df['mape_immunization_adjusted'] = selected_data_df.apply(lambda x: mean_absolute_percentage_error_adjusted(x['Immunization USD Mil'], x['Immunization USD Mil_predicted']), axis=1)


linear regression mean absolute percentage error (Immunization USD Mil): 3.051563412829554
random forest mean absolute percentage error (Immunization USD Mil): 0.5530598268371854
Best models: {'Immunization USD Mil': 'RandomForestRegressor'}
Best model parameters: {'Immunization USD Mil': {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}}
mean absolute percentage error (Immunization USD Mil) 56.17475304067825%


In [561]:
print(f'mean absolute percentage error ({target_columns[0]}) {mean_absolute_percentage_error(y_test, y_pred)}%')
# Calculate and print the R2 score
r2 = r2_score(y_test, y_pred)
print(f'R2 score: {r2}')

mean absolute percentage error (Immunization USD Mil) 56.17475304067825%
R2 score: 0.7508210630366774


In [562]:
# Filter data by year
train_data = selected_data_df[selected_data_df['Year'] < year_pred]
test_data = selected_data_df[selected_data_df['Year'] >= year_pred]

# Split the data into training and testing sets
target_columns = ['Vaccine USD Mil']
X_col = [x for x in selected_column if x not in target_columns]
X_train = train_data[X_col].drop(['Year', 'Country Code','Immunization USD Mil'], axis=1)
X_test = test_data[X_col].drop(['Year', 'Country Code','Immunization USD Mil'], axis=1)

# Label encode categorical columns
label_encoder = LabelEncoder()
# X_train['Country'] = label_encoder.fit_transform(X_train['Country'])
# X_test['Country'] = label_encoder.transform(X_test['Country'])

# One-hot encode categorical columns (alternative to label encoding)
# ct = ColumnTransformer([('encoder', OneHotEncoder(), [0, 1, 2])], remainder='passthrough')
# X_train = ct.fit_transform(X_train)
# X_test = ct.transform(X_test)

# Define the models and their hyperparameters
models = [
    {
        'name': 'linear regression',
        'model': LinearRegression(),
        'params': {}
    },
    {
        'name': 'random forest',
        'model': RandomForestRegressor(),
        'params': {
            'model__n_estimators': [10, 50, 100, 200],
            'model__max_depth': [None, 5, 10, 20]
        }
    },
#     {
#         'name': 'gradient boosting',
#         'model': GradientBoostingRegressor(),
#         'params': {
#             'model__n_estimators': [10, 50, 100, 200],
#             'model__max_depth': [3, 5, 10],
#             'model__learning_rate': [0.1, 0.01, 0.001]
#         }
#     },
#     {
#         'name': 'neural network',
#         'model': MLPRegressor(activation='relu'),
#         'params': {
#             'model__hidden_layer_sizes': [(50,), (100,), (50, 50)],
#             'model__learning_rate_init': [0.1, 0.01, 0.001],
#             'model__max_iter': [500],
#             'model__solver': ['adam'],
#         }
#     }
]

# Evaluate each model using cross-validation and select the best one for each target variable
best_models = {}
for target_column in target_columns:
    y_train = train_data[target_column]
    y_test = test_data[target_column]
    best_model = None
    best_score = None
    for model in models:
        pipeline = Pipeline([('scaler', StandardScaler()), ('model', model['model'])])
        clf = GridSearchCV(pipeline, model['params'], cv=10, scoring='neg_mean_absolute_percentage_error')
    
        clf.fit(X_train, y_train)
        score = -clf.best_score_
        print(model['name'], f'mean absolute percentage error ({target_column}):', score)
        if best_score is None or score < best_score:
            best_score = score
            best_model = clf.best_estimator_
    # Save the best model for the current target variable
    best_models[target_column] = best_model

#Get the name and parameters of the best model for each target variable
best_model_names = {target_column: best_models[target_column].named_steps['model'].__class__.__name__ for target_column in target_columns}
best_model_params = {target_column: best_models[target_column].named_steps['model'].get_params() for target_column in target_columns}
print('Best models:', best_model_names)
print('Best model parameters:', best_model_params)

for target_column in target_columns:
    best_model = best_models[target_column]
    y_pred, mape, r2 = fit_and_predict(best_model, X_train, X_test, train_data[target_column])

    # Append the predictions and error to the DataFrame
    selected_data_df[f'{target_column}_predicted'] = np.nan
    selected_data_df.loc[selected_data_df['Year'] >= year_pred, f'{target_column}_predicted'] = y_pred

print(f'mean absolute percentage error ({target_columns[0]}) {mean_absolute_percentage_error(y_test, y_pred)}%')
selected_data_df['mape_vaccine'] = selected_data_df.apply(lambda x: mean_absolute_percentage_error(x['Vaccine USD Mil'], x['Vaccine USD Mil_predicted']), axis=1)
selected_data_df['mape_vaccine_adjusted'] = selected_data_df.apply(lambda x: mean_absolute_percentage_error_adjusted(x['Vaccine USD Mil'], x['Vaccine USD Mil_predicted']), axis=1)


linear regression mean absolute percentage error (Vaccine USD Mil): 4.459411293919879
random forest mean absolute percentage error (Vaccine USD Mil): 0.5730409987524013
Best models: {'Vaccine USD Mil': 'RandomForestRegressor'}
Best model parameters: {'Vaccine USD Mil': {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 10, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}}
mean absolute percentage error (Vaccine USD Mil) 49.80621073344707%


In [563]:
print(f'mean absolute percentage error ({target_columns[0]}) {mean_absolute_percentage_error_adjusted(y_test, y_pred)}%')
# Calculate and print the R2 score
r2 = r2_score(y_test, y_pred)
print(f'R2 score: {r2}')

mean absolute percentage error (Vaccine USD Mil) 61.769545559793826%
R2 score: 0.8955825333095989


##### Reinflate the data and output

In [565]:
selected_data_df['prediction_immunization'] = ['underprediction' if (x['Immunization USD Mil_predicted'] - x['Immunization USD Mil']) < 0 else 'overprediction' for idx, x in selected_data_df.iterrows()]
selected_data_df['prediction_vaccine'] = ['underprediction' if (x['Vaccine USD Mil_predicted'] - x['Vaccine USD Mil']) < 0 else 'overprediction' for idx, x in selected_data_df.iterrows()]
predicted_data = selected_data_df[selected_data_df['Year'] >= year_pred]

cols_to_adjust = ['Vaccine Delivery Cost USD Mil','Immunization USD Mil_predicted','Vaccine USD Mil_predicted','Immunization USD Mil','Vaccine USD Mil','total_vaccine in USD', 'total_immunization in USD', 'gov_immunization in USD','gov_vaccine in USD']
origin_merged_df['Year']= pd.to_datetime(origin_merged_df['Year'], format='%Y').dt.strftime('%Y')
predicted_data['Year']= pd.to_datetime(predicted_data['Year'], format='%Y').dt.strftime('%Y')

    
output = pd.merge(predicted_data[['Country Code', 'Year','Immunization USD Mil_predicted','Vaccine USD Mil_predicted','mape_immunization','mape_immunization_adjusted','mape_vaccine','mape_vaccine_adjusted','prediction_immunization','prediction_vaccine']], origin_merged_df, on = ['Country Code','Year'], how = 'inner')
for col in cols_to_adjust:
    output[f'{col}'] = (output[col] / output['gdp deflator index'])
output.to_csv('cleaned_data_with_predictions.csv', index=False)