In [1]:
# import pandas for basic cleaning
import pandas as pd
import warnings
# very long warning message took up half the notebook
warnings.filterwarnings("ignore")

In [2]:
# read in csv file GHED = global health expenditures data
# The original file has a couple thousand columns and we only want a few, so
# ask the parser for just those instead of loading everything and discarding
# most of it afterwards.
ghed_columns = ['country', 'code', 'region',
                'income', 'year', 'che_gdp', 'hk_gdp',
                'che_pc_usd', 'che',
                'gdp_pc_usd', 'che_usd']
expenditures_df = pd.read_csv('data/GHED_data.csv', usecols=ghed_columns)

# usecols does not reorder columns to match the list, so re-select to pin the
# column order given above
expenditures_df = expenditures_df[ghed_columns]

# select the years that correspond with the mortality dataset (2010 onwards)
expenditures_df = expenditures_df.loc[expenditures_df['year'] >= 2010, :]


In [3]:
# Look at the dtypes and non-null counts of the columns. In the output captured
# below, che_pc_usd, che, gdp_pc_usd and che_usd come back as object rather than
# a numeric dtype, and hk_gdp is non-null in only 1281 of the 2324 rows.
expenditures_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2324 entries, 10 to 4243
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     2324 non-null   object 
 1   code        2324 non-null   object 
 2   region      2324 non-null   object 
 3   income      2324 non-null   object 
 4   year        2324 non-null   int64  
 5   che_gdp     2283 non-null   float64
 6   hk_gdp      1281 non-null   float64
 7   che_pc_usd  2282 non-null   object 
 8   che         2283 non-null   object 
 9   gdp_pc_usd  2281 non-null   object 
 10  che_usd     2281 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 217.9+ KB


In [4]:
# preview the first rows of the filtered dataframe (2010 onwards, selected columns)
expenditures_df.head()

Unnamed: 0,country,code,region,income,year,che_gdp,hk_gdp,che_pc_usd,che,gdp_pc_usd,che_usd
10,Algeria,DZA,AFR,Lower-middle,2010,5.1,,230,613629,4496,8249
11,Algeria,DZA,AFR,Lower-middle,2011,5.3,,288,768473,5473,10536
12,Algeria,DZA,AFR,Lower-middle,2012,6.0,,337,972659,5611,12545
13,Algeria,DZA,AFR,Lower-middle,2013,6.0,,333,1004828,5520,12660
14,Algeria,DZA,AFR,Lower-middle,2014,6.5,,361,1127993,5516,13999


#### Visuals that show countries that specifically target communicable diseases and one that does not

In [5]:
# Drop every row that has a missing value in any column, then relabel che_gdp
# and hk_gdp to show that they are percentages.
# NOTE(review): per the info() output, hk_gdp is non-null in only 1281 of 2324
# rows, so the dropna keeps at most 1281 rows -- confirm that is intended.
clean_expenditures_df = (
    expenditures_df
    .dropna()
    .rename(columns={'che_gdp': 'che_gdp(%)', 'hk_gdp': 'hk_gdp(%)'})
)

In [6]:
# preview the cleaned dataframe (percentage columns renamed, rows with missing
# values dropped)
clean_expenditures_df.head()

Unnamed: 0,country,code,region,income,year,che_gdp(%),hk_gdp(%),che_pc_usd,che,gdp_pc_usd,che_usd
15,Algeria,DZA,AFR,Lower-middle,2015,7.0,0.0,292,1163740,4197,11557
54,Benin,BEN,AFR,Lower-middle,2010,3.0,0.4,30,142008,1009,287
55,Benin,BEN,AFR,Lower-middle,2011,3.0,0.1,33,151029,1099,320
56,Benin,BEN,AFR,Lower-middle,2012,3.1,0.3,34,174951,1113,343
57,Benin,BEN,AFR,Lower-middle,2013,2.9,0.2,35,177366,1214,359


In [7]:
# export the cleaned dataframe to data/expenditures.csv; index=False leaves the
# row index out of the file
clean_expenditures_df.to_csv('data/expenditures.csv', index=False)

In [9]:
# Read the exported file back in and check that the round trip kept every row
# and column, instead of relying on a visual check of the preview alone.
df = pd.read_csv('data/expenditures.csv')
assert df.shape == clean_expenditures_df.shape, "exported CSV has a different shape"
assert list(df.columns) == list(clean_expenditures_df.columns), "exported CSV has different columns"
df.head()

Unnamed: 0,country,code,region,income,year,che_gdp(%),hk_gdp(%),che_pc_usd,che,gdp_pc_usd,che_usd
0,Algeria,DZA,AFR,Lower-middle,2015,7.0,0.0,292,1163740,4197,11557
1,Benin,BEN,AFR,Lower-middle,2010,3.0,0.4,30,142008,1009,287
2,Benin,BEN,AFR,Lower-middle,2011,3.0,0.1,33,151029,1099,320
3,Benin,BEN,AFR,Lower-middle,2012,3.1,0.3,34,174951,1113,343
4,Benin,BEN,AFR,Lower-middle,2013,2.9,0.2,35,177366,1214,359
