In [20]:
# Import modules
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from zipfile import ZipFile

In [21]:
## Load the World Development Indicators data

# Where the zipped download lives, and where the CSV ends up once unpacked.
# The unpacked copy stays on disk next to the archive after this cell runs.
zip_file_path = Path('../Resources/world_development_indicators.csv.zip')
extracted_csv_path = Path('../Resources/world_development_indicators.csv')

# Unpack just the CSV member into the Resources folder
with ZipFile(zip_file_path, mode='r') as archive:
    archive.extract('world_development_indicators.csv', path='../Resources')

# Load the unpacked CSV; low_memory=False makes pandas read the file in one
# pass instead of chunk by chunk
df = pd.read_csv(
    extracted_csv_path,
    low_memory=False,
    encoding='UTF-8',
)

In [22]:
# Scratch check (left disabled): lists every distinct country name in the raw data.
# NOTE(review): presumably used to look up the exact spellings needed for the
# OECD filter in the next cell - confirm before removing.
#df['Country Name'].unique().tolist()

In [23]:
## Clean the Data

# Country names, spelled as they appear in the dataset, that define the OECD
# subset used for the rest of the notebook.
# NOTE(review): Colombia and Costa Rica are not in this list although they
# appear to be OECD members as well - confirm the omission is intentional.
oecd_countries = [
    'Australia', 'Austria', 'Belgium', 'Canada', 'Chile', 'Czechia', 'Denmark',
    'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland',
    'Israel', 'Italy', 'Japan', 'Korea, Rep.', 'Latvia', 'Lithuania', 'Luxembourg', 'Mexico',
    'Netherlands', 'New Zealand', 'Norway', 'Poland', 'Portugal', 'Slovak Republic',
    'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Turkiye', 'United Kingdom', 'United States'
]


# Keep only the rows that belong to the listed OECD countries
oecd_df = df[df['Country Name'].isin(oecd_countries)]

# isin() silently ignores names that match nothing, so a spelling difference
# between the list and the dataset would drop a country without any warning.
# Fail loudly instead if any listed country is absent from the filtered data.
missing_countries = sorted(set(oecd_countries) - set(oecd_df['Country Name']))
if missing_countries:
    raise ValueError(f"OECD countries not found in the dataset: {missing_countries}")

oecd_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
88028,Australia,AUS,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,
88029,Australia,AUS,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,
88030,Australia,AUS,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,
88031,Australia,AUS,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,
88032,Australia,AUS,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,


In [24]:
# Distinct country names that survived the OECD filter, in order of first appearance
oecd_df['Country Name'].drop_duplicates().tolist()

['Australia',
 'Austria',
 'Belgium',
 'Canada',
 'Chile',
 'Czechia',
 'Denmark',
 'Estonia',
 'Finland',
 'France',
 'Germany',
 'Greece',
 'Hungary',
 'Iceland',
 'Ireland',
 'Israel',
 'Italy',
 'Japan',
 'Korea, Rep.',
 'Latvia',
 'Lithuania',
 'Luxembourg',
 'Mexico',
 'Netherlands',
 'New Zealand',
 'Norway',
 'Poland',
 'Portugal',
 'Slovak Republic',
 'Slovenia',
 'Spain',
 'Sweden',
 'Switzerland',
 'Turkiye',
 'United Kingdom',
 'United States']

In [25]:
# Reshape wide -> long: one row per country / indicator / year.
# The year labels come from the column headers, so 'Year' holds strings here.
id_columns = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
melted_df = oecd_df.melt(id_vars=id_columns, var_name='Year', value_name='Value')

# Reshape long -> wide again, this time with one column per indicator, then
# turn the country / year index levels back into ordinary columns
pivot_df = melted_df.pivot(
    index=['Country Name', 'Country Code', 'Year'],
    columns='Indicator Name',
    values='Value',
).reset_index()

# Preview the reshaped frame
pivot_df.head(20)

Indicator Name,Country Name,Country Code,Year,ARI treatment (% of children under 5 taken to a health provider),Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",...,Women who believe a husband is justified in beating his wife (any of five reasons) (%),Women who believe a husband is justified in beating his wife when she argues with him (%),Women who believe a husband is justified in beating his wife when she burns the food (%),Women who believe a husband is justified in beating his wife when she goes out without telling him (%),Women who believe a husband is justified in beating his wife when she neglects the children (%),Women who believe a husband is justified in beating his wife when she refuses sex with him (%),Women who were first married by age 15 (% of women ages 20-24),Women who were first married by age 18 (% of women ages 20-24),Women's share of population ages 15+ living with HIV (%),Young people (ages 15-24) newly infected with HIV
0,Australia,AUS,1960,,,,,,,,...,,,,,,,,,,
1,Australia,AUS,1961,,,,,,,,...,,,,,,,,,,
2,Australia,AUS,1962,,,,,,,,...,,,,,,,,,,
3,Australia,AUS,1963,,,,,,,,...,,,,,,,,,,
4,Australia,AUS,1964,,,,,,,,...,,,,,,,,,,
5,Australia,AUS,1965,,,,,,,,...,,,,,,,,,,
6,Australia,AUS,1966,,,,,,,,...,,,,,,,,,,
7,Australia,AUS,1967,,,,,,,,...,,,,,,,,,,
8,Australia,AUS,1968,,,,,,,,...,,,,,,,,,,
9,Australia,AUS,1969,,,,,,,,...,,,,,,,,,,


In [26]:
# Lower-case every column label so the keyword match below is case-insensitive
pivot_df.columns = pivot_df.columns.str.lower()

# Indicator columns whose name mentions any of these keywords:
# 'gdp', 'health', 'education', 'alcohol' or 'private'
filtered_columns = pivot_df.filter(regex='gdp|health|education|alcohol|private', axis='columns')

# Identifier columns that every row needs (country name, country code, year)
necessary_columns = pivot_df.loc[:, ['country name', 'country code', 'year']]

# Identifier columns first, followed by the matched indicator columns
indicator_df = pd.concat([necessary_columns, filtered_columns], axis='columns')

# Summarise the result, then preview its first rows
indicator_df.info()
indicator_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2304 entries, 0 to 2303
Columns: 233 entries, country name to women participating in the three decisions (own health care, major household purchases, and visiting family) (% of women age 15-49)
dtypes: float64(230), object(3)
memory usage: 4.1+ MB


Indicator Name,country name,country code,year,ari treatment (% of children under 5 taken to a health provider),"account ownership at a financial institution or with a mobile-money-service provider, primary education or less (% of population ages 15+)","account ownership at a financial institution or with a mobile-money-service provider, secondary education or more (% of population ages 15+)",adjusted savings: education expenditure (% of gni),adjusted savings: education expenditure (current us$),"agriculture, forestry, and fishing, value added (% of gdp)",births attended by skilled health staff (% of total),...,"unemployment with advanced education, male (% of male labor force with advanced education)",unemployment with basic education (% of total labor force with basic education),"unemployment with basic education, female (% of female labor force with basic education)","unemployment with basic education, male (% of male labor force with basic education)",unemployment with intermediate education (% of total labor force with intermediate education),"unemployment with intermediate education, female (% of female labor force with intermediate education)","unemployment with intermediate education, male (% of male labor force with intermediate education)","water productivity, total (constant 2015 us$ gdp per cubic meter of total freshwater withdrawal)","women making their own informed decisions regarding sexual relations, contraceptive use and reproductive health care (% of women age 15-49)","women participating in the three decisions (own health care, major household purchases, and visiting family) (% of women age 15-49)"
0,Australia,AUS,1960,,,,,,,,...,,,,,,,,,,
1,Australia,AUS,1961,,,,,,,,...,,,,,,,,,,
2,Australia,AUS,1962,,,,,,,,...,,,,,,,,,,
3,Australia,AUS,1963,,,,,,,,...,,,,,,,,,,
4,Australia,AUS,1964,,,,,,,,...,,,,,,,,,,


In [28]:
# Identifier columns plus the hand-picked indicators kept in the cleaned dataset
selected_columns = [
    'country name', 'country code', 'year',
    'adjusted savings: education expenditure (current us$)',
    'current health expenditure (% of gdp)',
    'domestic credit to private sector (% of gdp)',
    'gdp (constant 2015 us$)',
    'labor force with advanced education (% of total working-age population with advanced education)',
    'labor force with intermediate education (% of total working-age population with intermediate education)',
    'labor force with basic education (% of total working-age population with basic education)',
    'market capitalization of listed domestic companies (% of gdp)',
    'military expenditure (% of gdp)',
    'out-of-pocket expenditure (% of current health expenditure)',
    'research and development expenditure (% of gdp)',
    'stocks traded, total value (% of gdp)',
]
world_bank_df = indicator_df[selected_columns]

# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise this cell fails on a fresh checkout
output_csv_path = Path('../output/csv/cleaned_world_bank_dataset.csv')
output_csv_path.parent.mkdir(parents=True, exist_ok=True)

# Write the cleaned dataset without the positional index, then preview it
world_bank_df.to_csv(output_csv_path, index=False)
world_bank_df.head(10)

Indicator Name,country name,country code,year,adjusted savings: education expenditure (current us$),current health expenditure (% of gdp),domestic credit to private sector (% of gdp),gdp (constant 2015 us$),labor force with advanced education (% of total working-age population with advanced education),labor force with intermediate education (% of total working-age population with intermediate education),labor force with basic education (% of total working-age population with basic education),market capitalization of listed domestic companies (% of gdp),military expenditure (% of gdp),out-of-pocket expenditure (% of current health expenditure),research and development expenditure (% of gdp),"stocks traded, total value (% of gdp)"
0,Australia,AUS,1960,,,19.116355,204552700000.0,,,,,2.369545,,,
1,Australia,AUS,1961,,,17.629453,209631000000.0,,,,,2.41514,,,
2,Australia,AUS,1962,,,18.647403,212344100000.0,,,,,2.363695,,,
3,Australia,AUS,1963,,,19.283486,225544200000.0,,,,,2.446635,,,
4,Australia,AUS,1964,,,20.111524,241288200000.0,,,,,2.648243,,,
5,Australia,AUS,1965,,,21.126633,255718100000.0,,,,,2.952293,,,
6,Australia,AUS,1966,,,22.380855,261801100000.0,,,,,3.451383,,,
7,Australia,AUS,1967,,,23.095913,278307800000.0,,,,,3.793764,,,
8,Australia,AUS,1968,,,23.997398,292484100000.0,,,,,3.837274,,,
9,Australia,AUS,1969,,,24.822459,313090200000.0,,,,,3.38592,,,
