In [87]:
%matplotlib inline

import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

In [102]:
# DATA CLEANING
#
# Reads the raw WHO health-expenditure export, keeps the two indicators used
# in the analysis, converts the value columns to floats and writes the tidy
# table to "WHO_health_expenditure_ready_to_use.csv" in the working directory.

# Load the raw data (read as ISO-8859-1)
raw = Path("..", "Resources", "WHO_health_expenditure.csv")
raw_df = pd.read_csv(raw, encoding="ISO-8859-1")

# Indicators kept for further processing. They were chosen after inspecting
# raw_df["Indicators"].unique() and raw_df["Unnamed: 2"].unique().
indicators_of_interest = [
    "Current Health Expenditure (CHE) as % Gross Domestic Product (GDP)",
    "Current Health Expenditure (CHE) per Capita in US$",
]

# Isolate the rows of interest
abridged_df = raw_df.loc[raw_df["Indicators"].isin(indicators_of_interest), :]
# Fail loudly instead of exporting an empty file if the labels ever change
assert not abridged_df.empty, "No rows matched the selected indicators; check the indicator labels"

# Remove the un-used column "Unnamed: 2"
clean_1 = abridged_df.drop(columns=["Unnamed: 2"])

# Sort by country name. A stable sort keeps the rows of one country in their
# original file order, so the exported file is reproducible between runs.
clean_2 = clean_1.sort_values(by="Countries", kind="stable").reset_index(drop=True)

# Convert every value column (everything after the first two columns,
# "Countries" and "Indicators") to float.
for column in clean_2.columns[2:]:
    values = clean_2[column]
    # Only text columns carry thousands-place commas. Columns that read_csv
    # already parsed as numbers have no .str accessor and need no stripping.
    if not pd.api.types.is_numeric_dtype(values):
        values = values.str.replace(",", "", regex=False)
    clean_2[column] = values.astype(float)

# Export to csv (df is an independent copy, so later edits to it leave clean_2 intact)
df = clean_2.copy()
df.to_csv('WHO_health_expenditure_ready_to_use.csv', index=False, encoding="ISO-8859-1")

