In [1]:
# Import dependencies
import os
import pandas as pd
import numpy as np
import country_converter as cc

In [2]:
# View data sets
print(os.listdir('assets/data/raw_data'))
print(f"\n{os.listdir('assets/data')}")

['ddf--entities--geo--world_4region.csv', 'income_per_person_gdppercapita_ppp_inflation_adjusted.csv', 'life_expectancy_years.csv', 'population_total.csv']

['all_data.csv', 'income_pivoted.csv', 'life_expectancy_pivoted.csv', 'population_pivoted.csv', 'raw_data']


In [3]:
# Read data into Pandas data frames
# *Pivoted raw data sets using Tableau
income = pd.read_csv('assets/data/income_pivoted.csv')
lex = pd.read_csv('assets/data/life_expectancy_pivoted.csv')
population = pd.read_csv('assets/data/population_pivoted.csv')
regions = pd.read_csv('assets/data/raw_data/ddf--entities--geo--world_4region.csv', encoding='utf-8')

# Preview data frames
display(income.head(3))
display(lex.head(3))
display(population.head(3))
regions.head(3)

Unnamed: 0,Number of Records,year,income,country
0,1,1800,603,Afghanistan
1,1,1800,667,Albania
2,1,1800,715,Algeria


Unnamed: 0,Number of Records,year,life_expectancy,country
0,1,1800,28.2,Afghanistan
1,1,1800,35.4,Albania
2,1,1800,28.8,Algeria


Unnamed: 0,Number of Records,year,population,country
0,1,1800,3280000,Afghanistan
1,1,1800,410000,Albania
2,1,1800,2500000,Algeria


Unnamed: 0,world_4region,color,description,is--world_4region,latitude,longitude,name,name_long,name_short,rank,shape_lores_svg
0,africa,#00d5e9,"The entire African continent, Madagascar and s...",True,-14.33333,28.5,Africa,The African continent including Madagascar & o...,Africa,2,"M322.7,114.7l-1-1.8l-6.5,2.3l-16-4.8l-2.3,1.7l..."
1,americas,#7feb00,"North, South & central America make up roughly...",True,8.9936,-79.51973,The Americas,"North, South & Central America",America,3,"M134.8,152l-11.4,1.8l-3.1-1.7l5.3-1.3l-0.7-1.1..."
2,asia,#ff5872,"Asia as defined by Gapmidner, to make up rough...",True,16.16667,107.83333,Asia,"Australia, Pacific Islands, South & South East...",Asia,1,"M322.9,118.9l22.8,42.5l13.5-5.9l16.8-19l-7.3-6..."


In [4]:
# Merge data frames, drop unnecessary column and drop rows that contain NaN values
income_lex_merged = pd.merge(income, lex)
all_data = pd.merge(income_lex_merged, population).drop(columns=['Number of Records']).dropna()

# Preview data frame
all_data.head()

Unnamed: 0,year,income,country,life_expectancy,population
0,1800,603,Afghanistan,28.2,3280000
1,1800,667,Albania,35.4,410000
2,1800,715,Algeria,28.8,2500000
4,1800,618,Angola,27.0,1570000
5,1800,757,Antigua and Barbuda,33.5,37000


In [5]:
# Create 'region' column by converting the 'country' column values to continents/regions
all_data['region'] = cc.convert(list(all_data['country']), to='continent')

# Check 'region' column values to make sure only the 4 regions defined by Gapminder are included and preview data frame
display(all_data['region'].value_counts())
all_data.head()

Africa     11826
Asia       10512
Europe      8588
America     7274
Oceania     2237
Name: region, dtype: int64

Unnamed: 0,year,income,country,life_expectancy,population,region
0,1800,603,Afghanistan,28.2,3280000,Asia
1,1800,667,Albania,35.4,410000,Europe
2,1800,715,Algeria,28.8,2500000,Africa
4,1800,618,Angola,27.0,1570000,Africa
5,1800,757,Antigua and Barbuda,33.5,37000,America


In [6]:
# Gapminder defines only 4 regions so one of the regions (since there are currently 5 different regions in the 'region' column) needs to be converted into one of the 4 Gapminder defined regions
# Display the Gapminder data set that provides details on the Gapminder defined regions
pd.set_option("display.max_colwidth", -1)
regions[['world_4region', 'description', 'name_long']]

Unnamed: 0,world_4region,description,name_long
0,africa,"The entire African continent, Madagascar and some islands make up roughly a quarter of the world's total land surface.",The African continent including Madagascar & other islands
1,americas,"North, South & central America make up roughly a quarter of the world's total land surface.","North, South & Central America"
2,asia,"Asia as defined by Gapmidner, to make up roughly a quarter of the world's total land surface.","Australia, Pacific Islands, South & South East Asia"
3,europe,"West & East Europe including all of Russia and the Central Asian countries. The group is defined by Gapmidner, to make up roughly a quarter of the world's total land surface.","Central Asia, East & West Europe"


In [7]:
# Gapminder's regions definitions require 'Oceania' values to be converted to 'Asia' values
# Gapminder's 4 regions definitions: https://www.gapminder.org/fw/four-regions/
all_data.replace(to_replace='Oceania', value='Asia', inplace=True)

# Confirm successful conversions
display(all_data['region'].value_counts())

Asia       12749
Africa     11826
Europe     8588 
America    7274 
Name: region, dtype: int64

In [8]:
# Reorder and sort data frame and reset index
all_data = all_data[['year', 'region', 'country', 'life_expectancy', 'income', 'population']].sort_values(by='year').reset_index(drop=True)
all_data.head()

Unnamed: 0,year,region,country,life_expectancy,income,population
0,1800,Asia,Afghanistan,28.2,603,3280000
1,1800,America,Nicaragua,25.4,973,219000
2,1800,Africa,Niger,30.8,446,1240000
3,1800,Africa,Nigeria,30.4,851,12100000
4,1800,Asia,North Korea,26.0,578,4340000


In [9]:
# Final check - no NaNs, min/max year = 1800/2018, and no negative numbers
all_data.describe()

Unnamed: 0,year,life_expectancy,income,population
count,40437.0,40437.0,40437.0,40437.0
mean,1909.2929,43.13218,4619.558424,13711840.0
std,63.308682,16.313553,10210.561501,66056240.0
min,1800.0,1.0,247.0,2130.0
25%,1854.0,31.2,876.0,411000.0
50%,1909.0,35.5,1440.0,1990000.0
75%,1964.0,56.0,3490.0,6500000.0
max,2018.0,84.2,178000.0,1420000000.0


In [10]:
# Write data frame to csv file
all_data.to_csv('assets/data/all_data_cleaned.csv', sep=',', index=False)