## Part 1. Compairing the number of covid confirmed cases across 5 continents

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
covid_confirmed_cases = pd.read_csv("total-cases-covid-19.csv")
covid_confirmed_cases

Unnamed: 0,Entity,Code,Date,Total confirmed cases,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,Afghanistan,AFG,25-Feb-20,1,,,,,
1,Afghanistan,AFG,26-Feb-20,1,,,,,
2,Afghanistan,AFG,27-Feb-20,1,,,,,
3,Afghanistan,AFG,28-Feb-20,1,,,,,
4,Afghanistan,AFG,29-Feb-20,1,,,,,
...,...,...,...,...,...,...,...,...,...
12282,Zimbabwe,ZWE,25-Apr-20,29,,,,,
12283,Zimbabwe,ZWE,26-Apr-20,31,,,,,
12284,Zimbabwe,ZWE,27-Apr-20,31,,,,,
12285,Zimbabwe,ZWE,28-Apr-20,32,,,,,


### Change column name "Total confirmed cases" to "Total_confirmed_cases"

In [3]:
covid_confirmed_cases = covid_confirmed_cases.rename(columns = {'Total confirmed cases ': 'Total_confirmed_cases'})
covid_confirmed_cases.head()

Unnamed: 0,Entity,Code,Date,Total_confirmed_cases,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,Afghanistan,AFG,25-Feb-20,1,,,,,
1,Afghanistan,AFG,26-Feb-20,1,,,,,
2,Afghanistan,AFG,27-Feb-20,1,,,,,
3,Afghanistan,AFG,28-Feb-20,1,,,,,
4,Afghanistan,AFG,29-Feb-20,1,,,,,


### Remove unwanted columns

In [4]:
covid_confirmed_cases.columns

Index(['Entity', 'Code', 'Date', 'Total_confirmed_cases', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'],
      dtype='object')

In [5]:
covid_confirmed_cases = covid_confirmed_cases[['Entity', 'Code', 'Date', 'Total_confirmed_cases']]
covid_confirmed_cases.head()

Unnamed: 0,Entity,Code,Date,Total_confirmed_cases
0,Afghanistan,AFG,25-Feb-20,1
1,Afghanistan,AFG,26-Feb-20,1
2,Afghanistan,AFG,27-Feb-20,1
3,Afghanistan,AFG,28-Feb-20,1
4,Afghanistan,AFG,29-Feb-20,1


### Create a separeate dataframe of confirmed cases of six continents

In [6]:
continents = ["Asia", "Europe", "North America", "South America", "Oceania", "Africa"]
covid_continents= pd.DataFrame()

for continent in continents:
    covid_continents = covid_continents.append(covid_confirmed_cases.loc[covid_confirmed_cases.Entity == continent])

covid_continents

Unnamed: 0,Entity,Code,Date,Total_confirmed_cases
545,Asia,,31-Dec-19,27
546,Asia,,1-Jan-20,27
547,Asia,,2-Jan-20,27
548,Asia,,3-Jan-20,44
549,Asia,,4-Jan-20,44
...,...,...,...,...
125,Africa,,25-Apr-20,29075
126,Africa,,26-Apr-20,30316
127,Africa,,27-Apr-20,31748
128,Africa,,28-Apr-20,33164


- display a plot with 5 legends(continents) showing Date vs. Total_confirmed_cases  
- claculate the log (rate of increase) for each line


## Part 2. Comparing across top 3 continents with the highest total confirmed cases

### Remove rows whose Code is NaN (non countries)

In [7]:
covid_confirmed_cases.Code.unique()

array(['AFG', nan, 'ALB', 'DZA', 'AND', 'AGO', 'AIA', 'ATG', 'ARG', 'ARM',
       'ABW', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR',
       'BEL', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA',
       'VGB', 'BRN', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CPV',
       'CYM', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COG', 'CRI', 'CIV',
       'HRV', 'CUB', 'CUW', 'CYP', 'CZE', 'COD', 'DNK', 'DJI', 'DMA',
       'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'ETH', 'FRO',
       'FLK', 'FJI', 'FIN', 'FRA', 'PYF', 'GAB', 'GMB', 'GEO', 'DEU',
       'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GUM', 'GTM', 'GGY', 'GIN',
       'GNB', 'GUY', 'HTI', 'HND', 'HUN', 'ISL', 'IND', 'IDN', 'IRN',
       'IRQ', 'IRL', 'IMN', 'ISR', 'ITA', 'JAM', 'JPN', 'JEY', 'JOR',
       'KAZ', 'KEN', 'OWID_KOS', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LBR',
       'LBY', 'LIE', 'LTU', 'LUX', 'MKD', 'MDG', 'MWI', 'MYS', 'MDV',
       'MLI', 'MLT', 'MRT', 'MUS', 'MEX', 'MDA', 'MCO', 'MNG', 'MNE',
       'MS

In [8]:
bool_crit = (pd.isnull(covid_confirmed_cases.Code))
print("Number of NaN entries identified: ", sum(bool_crit))
covid_confirmed_cases[bool_crit]

Number of NaN entries identified:  1528


Unnamed: 0,Entity,Code,Date,Total_confirmed_cases
55,Africa,,15-Feb-20,1
56,Africa,,16-Feb-20,1
57,Africa,,17-Feb-20,1
58,Africa,,18-Feb-20,1
59,Africa,,19-Feb-20,1
...,...,...,...,...
12180,"World excl. China, South Korea, Japan and Sing...",,25-Apr-20,2612056
12181,"World excl. China, South Korea, Japan and Sing...",,26-Apr-20,2711959
12182,"World excl. China, South Korea, Japan and Sing...",,27-Apr-20,2794347
12183,"World excl. China, South Korea, Japan and Sing...",,28-Apr-20,2858738


In [9]:
covid_confirmed_cases_countries = covid_confirmed_cases[~bool_crit]
covid_confirmed_cases_countries

Unnamed: 0,Entity,Code,Date,Total_confirmed_cases
0,Afghanistan,AFG,25-Feb-20,1
1,Afghanistan,AFG,26-Feb-20,1
2,Afghanistan,AFG,27-Feb-20,1
3,Afghanistan,AFG,28-Feb-20,1
4,Afghanistan,AFG,29-Feb-20,1
...,...,...,...,...
12282,Zimbabwe,ZWE,25-Apr-20,29
12283,Zimbabwe,ZWE,26-Apr-20,31
12284,Zimbabwe,ZWE,27-Apr-20,31
12285,Zimbabwe,ZWE,28-Apr-20,32


### Select data entries on 29-Apr-20

In [10]:
countries_0429 = covid_confirmed_cases_countries.loc[covid_confirmed_cases_countries.Date == "29-Apr-20"]
countries_0429

Unnamed: 0,Entity,Code,Date,Total_confirmed_cases
54,Afghanistan,AFG,29-Apr-20,1827
181,Albania,ALB,29-Apr-20,750
240,Algeria,DZA,29-Apr-20,3649
287,Andorra,AND,29-Apr-20,748
326,Angola,AGO,29-Apr-20,27
...,...,...,...,...
11739,Vietnam,VNM,29-Apr-20,270
11860,World,OWID_WRL,29-Apr-20,3052370
12204,Yemen,YEM,29-Apr-20,1
12246,Zambia,ZMB,29-Apr-20,95


### Create a dictionary of all countries in the world

In [11]:
pip install pycountry-convert

Note: you may need to restart the kernel to use updated packages.


In [12]:
import pycountry
countries = list(pycountry.countries)
countries_alpha2 = []
countries_alpha3 = []

for i in range(len(countries)):
    countries_alpha2.append(countries[i].alpha_2)
    countries_alpha3.append(countries[i].alpha_3)

countries_alpha3
countries_alpha2
countries

[Country(alpha_2='AW', alpha_3='ABW', name='Aruba', numeric='533'),
 Country(alpha_2='AF', alpha_3='AFG', name='Afghanistan', numeric='004', official_name='Islamic Republic of Afghanistan'),
 Country(alpha_2='AO', alpha_3='AGO', name='Angola', numeric='024', official_name='Republic of Angola'),
 Country(alpha_2='AI', alpha_3='AIA', name='Anguilla', numeric='660'),
 Country(alpha_2='AX', alpha_3='ALA', name='Åland Islands', numeric='248'),
 Country(alpha_2='AL', alpha_3='ALB', name='Albania', numeric='008', official_name='Republic of Albania'),
 Country(alpha_2='AD', alpha_3='AND', name='Andorra', numeric='020', official_name='Principality of Andorra'),
 Country(alpha_2='AE', alpha_3='ARE', name='United Arab Emirates', numeric='784'),
 Country(alpha_2='AR', alpha_3='ARG', name='Argentina', numeric='032', official_name='Argentine Republic'),
 Country(alpha_2='AM', alpha_3='ARM', name='Armenia', numeric='051', official_name='Republic of Armenia'),
 Country(alpha_2='AS', alpha_3='ASM', nam

### Convert country_alpha3 to continent code

In [13]:
from pycountry_convert import country_alpha2_to_continent_code, country_alpha3_to_country_alpha2

for i in range(len(countries_0429.index)):
    country_alpha3 = countries_0429.Code.iloc[i]
    if country_alpha3 not in countries_alpha3:
        continents.append("invalid")
        print(country_alpha3)
    else:
        country_alpha2 = country_alpha3_to_country_alpha2(country_alpha3)
        continents.append(country_alpha2_to_continent_code(country_alpha2) )

continents

OWID_KOS
OWID_WRL


['Asia',
 'Europe',
 'North America',
 'South America',
 'Oceania',
 'Africa',
 'AS',
 'EU',
 'AF',
 'EU',
 'AF',
 'NA',
 'NA',
 'SA',
 'AS',
 'NA',
 'OC',
 'EU',
 'AS',
 'NA',
 'AS',
 'AS',
 'NA',
 'EU',
 'EU',
 'NA',
 'AF',
 'NA',
 'AS',
 'SA',
 'EU',
 'AF',
 'SA',
 'NA',
 'AS',
 'EU',
 'AF',
 'AF',
 'AS',
 'AF',
 'NA',
 'AF',
 'NA',
 'AF',
 'AF',
 'SA',
 'AS',
 'SA',
 'AF',
 'NA',
 'AF',
 'EU',
 'NA',
 'NA',
 'AS',
 'EU',
 'AF',
 'EU',
 'AF',
 'NA',
 'NA',
 'SA',
 'AF',
 'NA',
 'AF',
 'AF',
 'EU',
 'AF',
 'EU',
 'SA',
 'OC',
 'EU',
 'EU',
 'OC',
 'AF',
 'AF',
 'AS',
 'EU',
 'AF',
 'EU',
 'EU',
 'NA',
 'NA',
 'OC',
 'NA',
 'EU',
 'AF',
 'AF',
 'SA',
 'NA',
 'NA',
 'EU',
 'EU',
 'AS',
 'AS',
 'AS',
 'AS',
 'EU',
 'EU',
 'AS',
 'EU',
 'NA',
 'AS',
 'EU',
 'AS',
 'AS',
 'AF',
 'invalid',
 'AS',
 'AS',
 'AS',
 'EU',
 'AS',
 'AF',
 'AF',
 'EU',
 'EU',
 'EU',
 'EU',
 'AF',
 'AF',
 'AS',
 'AS',
 'AF',
 'EU',
 'AF',
 'AF',
 'NA',
 'EU',
 'EU',
 'AS',
 'EU',
 'NA',
 'AF',
 'AF',
 'AS',
 'AF',

### Add column Continent to the dataframe countries_0429

In [14]:
countries_0429 = countries_0429.copy()
countries_0429["Continent"] = continents

countries_0429

ValueError: Length of values does not match length of index

## Age Demographics - all countries

### Median Age by countries

In [76]:
median_age= pd.read_csv('median-age.csv')

In [77]:
median_age.rename(columns={'UN Population Division (Median Age) (2017) (years)':'median age'}, inplace=True)

In [78]:
median_age

Unnamed: 0,Entity,Code,Year,median age
0,Afghanistan,AFG,1950,19.400000
1,Afghanistan,AFG,1955,19.200001
2,Afghanistan,AFG,1960,18.799999
3,Afghanistan,AFG,1965,18.400000
4,Afghanistan,AFG,1970,17.900000
...,...,...,...,...
7466,Zimbabwe,ZWE,2080,36.299999
7467,Zimbabwe,ZWE,2085,37.700001
7468,Zimbabwe,ZWE,2090,39.000000
7469,Zimbabwe,ZWE,2095,40.099998


In [79]:
median_age_2020 = median_age[median_age['Year'] == 2020]

In [80]:
median_age_2020

Unnamed: 0,Entity,Code,Year,median age
14,Afghanistan,AFG,2020,18.600000
45,Africa,,2020,19.799999
76,Albania,ALB,2020,38.000000
107,Algeria,DZA,2020,29.100000
138,Angola,AGO,2020,16.799999
...,...,...,...,...
7330,Western Sahara,ESH,2020,28.400000
7361,World,OWID_WRL,2020,30.900000
7392,Yemen,YEM,2020,20.299999
7423,Zambia,ZMB,2020,17.700001


#### drop rows that contain a particular string 

In [81]:
median_age_2020.Code.unique()

array(['AFG', nan, 'ALB', 'DZA', 'AGO', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS',
       'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ',
       'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA',
       'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CAF', 'TCD', 'OWID_CIS', 'CHL',
       'CHN', 'COL', 'COM', 'COG', 'CRI', 'CIV', 'HRV', 'CUB', 'CUW',
       'CYP', 'CZE', 'COD', 'DNK', 'DJI', 'DOM', 'ECU', 'EGY', 'SLV',
       'GNQ', 'ERI', 'EST', 'ETH', 'FJI', 'FIN', 'FRA', 'GUF', 'PYF',
       'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GLP', 'GUM',
       'GTM', 'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL',
       'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN',
       'JOR', 'KAZ', 'KEN', 'KIR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN',
       'LSO', 'LBR', 'LBY', 'LTU', 'LUX', 'MAC', 'MKD', 'MDG', 'MWI',
       'MYS', 'MDV', 'MLI', 'MLT', 'MTQ', 'MRT', 'MUS', 'MYT', 'OWID_MNS',
       'MEX', 'FSM', 'MDA', 'MNG', 'MNE', 'MAR', 'MOZ', 'MMR', 'NAM',
     

In [82]:
median_age_2020 = median_age_2020[median_age_2020['Code'] != 'OWID_WRL']
median_age_2020 = median_age_2020[median_age_2020['Code'] != 'OWID_MNS']
median_age_2020 = median_age_2020[median_age_2020['Code'] != 'OWID_PYA']
median_age_2020 = median_age_2020[median_age_2020['Code'] != 'OWID_CIS']

In [83]:
median_age_2020.Code.unique()

array(['AFG', nan, 'ALB', 'DZA', 'AGO', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS',
       'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ',
       'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA',
       'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CAF', 'TCD', 'CHL', 'CHN',
       'COL', 'COM', 'COG', 'CRI', 'CIV', 'HRV', 'CUB', 'CUW', 'CYP',
       'CZE', 'COD', 'DNK', 'DJI', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ',
       'ERI', 'EST', 'ETH', 'FJI', 'FIN', 'FRA', 'GUF', 'PYF', 'GAB',
       'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GLP', 'GUM', 'GTM',
       'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND',
       'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR',
       'KAZ', 'KEN', 'KIR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO',
       'LBR', 'LBY', 'LTU', 'LUX', 'MAC', 'MKD', 'MDG', 'MWI', 'MYS',
       'MDV', 'MLI', 'MLT', 'MTQ', 'MRT', 'MUS', 'MYT', 'MEX', 'FSM',
       'MDA', 'MNG', 'MNE', 'MAR', 'MOZ', 'MMR', 'NAM', 'NPL', 'NLD',
       'NCL', '

In [84]:
bool_crit_2 = (pd.isnull(median_age_2020.Code))
print('Number of NaN entries identified:', sum(bool_crit_2))
median_age_2020[bool_crit_2]

Number of NaN entries identified: 37


Unnamed: 0,Entity,Code,Year,median age
45,Africa,,2020,19.799999
293,Asia,,2020,32.099998
355,Australia/New Zealand,,2020,37.900002
1099,Caribbean,,2020,31.9
1161,Central America,,2020,28.299999
1192,Central Asia,,2020,27.9
1781,Eastern Africa,,2020,18.700001
1812,Eastern Asia,,2020,39.599998
1843,Eastern Europe,,2020,40.799999
2091,Europe,,2020,42.700001


In [85]:
median_age_2020_countries = median_age_2020[~bool_crit_2]
median_age_2020_countries

Unnamed: 0,Entity,Code,Year,median age
14,Afghanistan,AFG,2020,18.600000
76,Albania,ALB,2020,38.000000
107,Algeria,DZA,2020,29.100000
138,Angola,AGO,2020,16.799999
169,Antigua and Barbuda,ATG,2020,32.099998
...,...,...,...,...
7206,Vietnam,VNM,2020,32.599998
7330,Western Sahara,ESH,2020,28.400000
7392,Yemen,YEM,2020,20.299999
7423,Zambia,ZMB,2020,17.700001


### Age Structure/breakdown by countries

In [56]:
age_breakdown = pd.read_csv('population-by-broad-age-group.csv')
age_breakdown

Unnamed: 0,Entity,Code,Year,Under-5s,15-24 years,25-64 years,65+ years,5-14 years
0,Afghanistan,AFG,1950,1291622.0,1476233.0,2873065.0,222954.0,1888244.0
1,Afghanistan,AFG,1951,1314241.0,1488524.0,2887631.0,228191.0,1920923.0
2,Afghanistan,AFG,1952,1318908.0,1507272.0,2914594.0,232456.0,1961750.0
3,Afghanistan,AFG,1953,1321524.0,1529407.0,2947925.0,235252.0,2004488.0
4,Afghanistan,AFG,1954,1331640.0,1553939.0,2984840.0,236391.0,2043637.0
...,...,...,...,...,...,...,...,...
15901,Zimbabwe,ZWE,2011,2296548.0,3186709.0,4796283.0,432774.0,3674335.0
15902,Zimbabwe,ZWE,2012,2360671.0,3196932.0,4974148.0,434714.0,3744361.0
15903,Zimbabwe,ZWE,2013,2417880.0,3213123.0,5156228.0,436408.0,3830867.0
15904,Zimbabwe,ZWE,2014,2466295.0,3235973.0,5339527.0,439654.0,3930226.0


In [68]:
age_breakdown.Year.unique()

array([1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960,
       1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015])

In [87]:
age_breakdown_2015 = age_breakdown[age_breakdown['Year'] == 2015]
age_breakdown_2015

Unnamed: 0,Entity,Code,Year,Under-5s,15-24 years,25-64 years,65+ years,5-14 years
65,Afghanistan,AFG,2015,5239401.0,7.062361e+06,1.080950e+07,841098.0,9.784129e+06
131,Africa,,2015,186990405.0,2.305106e+08,4.323211e+08,41273702.0,3.032741e+08
197,Albania,ALB,2015,171783.0,4.752570e+05,1.554614e+06,366492.0,3.552060e+05
263,Algeria,DZA,2015,4663613.0,6.614548e+06,1.949221e+07,2340373.0,6.760782e+06
329,Angola,AGO,2015,5158374.0,5.329040e+06,8.771497e+06,642523.0,7.957871e+06
...,...,...,...,...,...,...,...,...
15641,Western Sahara,ESH,2015,54611.0,9.493500e+04,2.678560e+05,13374.0,9.544000e+04
15707,World,OWID_WRL,2015,673649680.0,1.194506e+09,3.646074e+09,611897166.0,1.256883e+09
15773,Yemen,YEM,2015,4016664.0,5.917403e+06,9.309621e+06,769003.0,6.903516e+06
15839,Zambia,ZMB,2015,2750356.0,3.313496e+06,5.078100e+06,402852.0,4.555783e+06


In [90]:
age_breakdown_2015.Code.unique()

array(['AFG', nan, 'ALB', 'DZA', 'AGO', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS',
       'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ',
       'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA',
       'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CAF', 'TCD', 'OWID_CIS', 'CHL',
       'CHN', 'COL', 'COM', 'COG', 'CRI', 'CIV', 'HRV', 'CUB', 'CUW',
       'CYP', 'CZE', 'COD', 'DNK', 'DJI', 'DOM', 'ECU', 'EGY', 'SLV',
       'GNQ', 'ERI', 'EST', 'ETH', 'FJI', 'FIN', 'FRA', 'GUF', 'PYF',
       'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GUM', 'GTM',
       'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND',
       'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR',
       'KAZ', 'KEN', 'KIR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO',
       'LBR', 'LBY', 'LTU', 'LUX', 'MAC', 'MKD', 'MDG', 'MWI', 'MYS',
       'MDV', 'MLI', 'MLT', 'MTQ', 'MRT', 'MUS', 'MYT', 'OWID_MNS', 'MEX',
       'FSM', 'MDA', 'MNG', 'MNE', 'MAR', 'MOZ', 'MMR', 'NAM', 'NPL',
     

In [92]:
age_breakdown_2015 = age_breakdown_2015[age_breakdown_2015['Code'] != 'OWID_WRL']
age_breakdown_2015 = age_breakdown_2015[age_breakdown_2015['Code'] != 'OWID_MNS']
age_breakdown_2015 = age_breakdown_2015[age_breakdown_2015['Code'] != 'OWID_PYA']
age_breakdown_2015 = age_breakdown_2015[age_breakdown_2015['Code'] != 'OWID_CIS']
age_breakdown_2015.Code.unique()

array(['AFG', nan, 'ALB', 'DZA', 'AGO', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS',
       'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ',
       'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA',
       'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CAF', 'TCD', 'CHL', 'CHN',
       'COL', 'COM', 'COG', 'CRI', 'CIV', 'HRV', 'CUB', 'CUW', 'CYP',
       'CZE', 'COD', 'DNK', 'DJI', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ',
       'ERI', 'EST', 'ETH', 'FJI', 'FIN', 'FRA', 'GUF', 'PYF', 'GAB',
       'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GUM', 'GTM', 'GIN',
       'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN',
       'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ',
       'KEN', 'KIR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR',
       'LBY', 'LTU', 'LUX', 'MAC', 'MKD', 'MDG', 'MWI', 'MYS', 'MDV',
       'MLI', 'MLT', 'MTQ', 'MRT', 'MUS', 'MYT', 'MEX', 'FSM', 'MDA',
       'MNG', 'MNE', 'MAR', 'MOZ', 'MMR', 'NAM', 'NPL', 'NLD', 'NCL',
       'NZL', '

In [93]:
bool_crit_3 = (pd.isnull(age_breakdown_2015.Code))
print('Number of NaN entries identified:', sum(bool_crit_3))
age_breakdown_2015[bool_crit_3]

Number of NaN entries identified: 39


Unnamed: 0,Entity,Code,Year,Under-5s,15-24 years,25-64 years,65+ years,5-14 years
131,Africa,,2015,186990405.0,230510600.0,432321100.0,41273702.0,303274100.0
659,Asia,,2015,368116983.0,717102400.0,2282897000.0,334742675.0,717038500.0
791,Australia/New Zealand,,2015,1853797.0,3802218.0,14967240.0,4240364.0,3550474.0
2375,Caribbean,,2015,3611501.0,7199968.0,21229230.0,4054027.0,7214886.0
2507,Central America,,2015,16443025.0,32420480.0,80036540.0,10839384.0,32895680.0
2573,Central Asia,,2015,7779756.0,12459060.0,33011020.0,3271278.0,12183890.0
3827,Eastern Africa,,2015,64685579.0,81431280.0,132724500.0,12021351.0,108595300.0
3893,Eastern Asia,,2015,96913267.0,209446200.0,963039600.0,181628477.0,184122900.0
3959,Eastern Europe,,2015,17057763.0,32061720.0,171007400.0,43445311.0,29671490.0
4487,Europe,,2015,40069690.0,81676000.0,411983200.0,130378427.0,76706670.0


In [94]:
age_breakdown_2015_countries = age_breakdown_2015[~bool_crit_3]

In [95]:
age_breakdown_2015_countries.Code.unique()

array(['AFG', 'ALB', 'DZA', 'AGO', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS',
       'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ',
       'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA',
       'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CAF', 'TCD', 'CHL', 'CHN',
       'COL', 'COM', 'COG', 'CRI', 'CIV', 'HRV', 'CUB', 'CUW', 'CYP',
       'CZE', 'COD', 'DNK', 'DJI', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ',
       'ERI', 'EST', 'ETH', 'FJI', 'FIN', 'FRA', 'GUF', 'PYF', 'GAB',
       'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GUM', 'GTM', 'GIN',
       'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN',
       'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ',
       'KEN', 'KIR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR',
       'LBY', 'LTU', 'LUX', 'MAC', 'MKD', 'MDG', 'MWI', 'MYS', 'MDV',
       'MLI', 'MLT', 'MTQ', 'MRT', 'MUS', 'MYT', 'MEX', 'FSM', 'MDA',
       'MNG', 'MNE', 'MAR', 'MOZ', 'MMR', 'NAM', 'NPL', 'NLD', 'NCL',
       'NZL', 'NIC',

In [97]:
age_breakdown_2015_countries

Unnamed: 0,Entity,Code,Year,Under-5s,15-24 years,25-64 years,65+ years,5-14 years
65,Afghanistan,AFG,2015,5239401.0,7062361.0,10809505.0,841098.0,9784129.0
197,Albania,ALB,2015,171783.0,475257.0,1554614.0,366492.0,355206.0
263,Algeria,DZA,2015,4663613.0,6614548.0,19492212.0,2340373.0,6760782.0
329,Angola,AGO,2015,5158374.0,5329040.0,8771497.0,642523.0,7957871.0
395,Antigua and Barbuda,ATG,2015,8036.0,16895.0,51910.0,6578.0,16504.0
...,...,...,...,...,...,...,...,...
15377,Vietnam,VNM,2015,7752861.0,15798527.0,49852905.0,6310979.0,13856295.0
15641,Western Sahara,ESH,2015,54611.0,94935.0,267856.0,13374.0,95440.0
15773,Yemen,YEM,2015,4016664.0,5917403.0,9309621.0,769003.0,6903516.0
15839,Zambia,ZMB,2015,2750356.0,3313496.0,5078100.0,402852.0,4555783.0


### Reorder columns

In [151]:
age_breakdown_2015_countries = age_breakdown_2015_countries[['Entity','Code','Year','Under-5s','5-14 years','15-24 years','25-64 years','65+ years']]

In [153]:
age_breakdown_2015_countries

Unnamed: 0,Entity,Code,Year,Under-5s,5-14 years,15-24 years,25-64 years,65+ years
65,Afghanistan,AFG,2015,5239401.0,9784129.0,7062361.0,10809505.0,841098.0
197,Albania,ALB,2015,171783.0,355206.0,475257.0,1554614.0,366492.0
263,Algeria,DZA,2015,4663613.0,6760782.0,6614548.0,19492212.0,2340373.0
329,Angola,AGO,2015,5158374.0,7957871.0,5329040.0,8771497.0,642523.0
395,Antigua and Barbuda,ATG,2015,8036.0,16504.0,16895.0,51910.0,6578.0
...,...,...,...,...,...,...,...,...
15377,Vietnam,VNM,2015,7752861.0,13856295.0,15798527.0,49852905.0,6310979.0
15641,Western Sahara,ESH,2015,54611.0,95440.0,94935.0,267856.0,13374.0
15773,Yemen,YEM,2015,4016664.0,6903516.0,5917403.0,9309621.0,769003.0
15839,Zambia,ZMB,2015,2750356.0,4555783.0,3313496.0,5078100.0,402852.0


In [146]:
#age_data = age_breakdown_2015_countries.merge(median_age_2020_countries, how = 'inner', on = ['Entity'])

In [147]:
#age_data

## Economic Indicators - all countries

### GDP per capita, PPP

In [142]:
GDP = pd.read_csv('API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2_988619.csv', skiprows = 1, header = 1)
GDP

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
0,Aruba,ABW,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,,,,,,,...,35492.618487,35498.982089,37419.892817,38223.372261,38249.054868,38390.271649,39454.629831,,,
1,Afghanistan,AFG,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,,,,,,,...,1626.764793,1806.763930,1874.765634,1897.525938,1886.692977,1896.992520,1934.636754,1955.006208,,
2,Angola,AGO,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,,,,,,,...,6346.395122,6772.528333,6980.423038,7199.245478,7096.600615,6756.935074,6650.584940,6452.355165,,
3,Albania,ALB,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,,,,,,,...,10207.752347,10526.235446,10571.010650,11259.225894,11662.030481,11868.178968,12930.140035,13364.155397,,
4,Andorra,AND,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,Kosovo,XKX,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,,,,,,,...,8222.285712,8547.652384,8903.763429,9194.741819,9781.019679,10208.857900,10756.663461,11348.363449,,
260,"Yemen, Rep.",YEM,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,,,,,,,...,3876.302619,3935.167105,4084.882967,4045.500031,3320.110156,2827.691023,2645.308383,2575.126385,,
261,South Africa,ZAF,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,,,,,,,...,12179.174188,12488.215572,12815.727074,13090.476828,13185.253283,13188.029617,13438.282887,13686.882361,,
262,Zambia,ZMB,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,,,,,,,...,3419.010460,3634.780353,3765.584162,3893.549478,3927.761816,3998.004435,4090.120040,4223.906936,,


In [143]:
GDP_2018 = GDP[['Country Name','Country Code','2018']]
GDP_2018

Unnamed: 0,Country Name,Country Code,2018
0,Aruba,ABW,
1,Afghanistan,AFG,1955.006208
2,Angola,AGO,6452.355165
3,Albania,ALB,13364.155397
4,Andorra,AND,
...,...,...,...
259,Kosovo,XKX,11348.363449
260,"Yemen, Rep.",YEM,2575.126385
261,South Africa,ZAF,13686.882361
262,Zambia,ZMB,4223.906936


In [136]:
bool_crit_4 = (pd.isnull(GDP_2018['2018']))
print('Number of NaN entries identified:', sum(bool_crit_4))
GDP_2018[bool_crit_4]

Number of NaN entries identified: 33


Unnamed: 0,Country Name,Country Code,2018
0,Aruba,ABW,
4,Andorra,AND,
9,American Samoa,ASM,
25,Bermuda,BMU,
36,Channel Islands,CHI,
48,Cuba,CUB,
50,Cayman Islands,CYM,
54,Djibouti,DJI,
67,Eritrea,ERI,
76,Faroe Islands,FRO,


In [154]:
GDP_2018_countries = GDP_2018[~bool_crit_4]
GDP_2018_countries

Unnamed: 0,Country Name,Country Code,2018
1,Afghanistan,AFG,1955.006208
2,Angola,AGO,6452.355165
3,Albania,ALB,13364.155397
5,Arab World,ARB,17570.137596
6,United Arab Emirates,ARE,75075.257411
...,...,...,...
259,Kosovo,XKX,11348.363449
260,"Yemen, Rep.",YEM,2575.126385
261,South Africa,ZAF,13686.882361
262,Zambia,ZMB,4223.906936


### unemployment rate

In [158]:
unemployment = pd.read_csv('unemployment.csv',skiprows = 1, header = 1 )
unemployment

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
0,Aruba,ABW,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,,,,,,,,,,
1,Afghanistan,AFG,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,11.488,11.508,11.534,11.448000,11.387,11.313000,11.184000,11.057,11.118,
2,Angola,AGO,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,7.362,7.379,7.400,7.331000,7.282,7.223000,7.119000,7.019,6.886,
3,Albania,ALB,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,13.481,13.376,15.866,17.490000,17.080,15.220000,13.750000,12.340,12.331,
4,Andorra,AND,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,Kosovo,XKX,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,,,,,,,,,,
260,"Yemen, Rep.",YEM,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,13.080,13.248,13.424,13.467000,13.395,13.307000,13.152000,13.002,12.910,
261,South Africa,ZAF,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,24.653,24.732,24.569,24.898001,25.156,26.551001,27.070999,26.920,28.181,
262,Zambia,ZMB,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,10.343,7.850,8.618,9.340000,10.105,10.882000,11.626000,11.500,11.425,


In [160]:
unemployment_rate = unemployment[['Country Name', 'Country Code','2019']] 
unemployment_rate

Unnamed: 0,Country Name,Country Code,2019
0,Aruba,ABW,
1,Afghanistan,AFG,11.118
2,Angola,AGO,6.886
3,Albania,ALB,12.331
4,Andorra,AND,
...,...,...,...
259,Kosovo,XKX,
260,"Yemen, Rep.",YEM,12.910
261,South Africa,ZAF,28.181
262,Zambia,ZMB,11.425


In [161]:
bool_crit_5 = (pd.isnull(unemployment_rate['2019']))
print('Number of NaN entries identified:', sum(bool_crit_5))
unemployment_rate[bool_crit_5]

Number of NaN entries identified: 31


Unnamed: 0,Country Name,Country Code,2019
0,Aruba,ABW,
4,Andorra,AND,
9,American Samoa,ASM,
10,Antigua and Barbuda,ATG,
25,Bermuda,BMU,
49,Curacao,CUW,
50,Cayman Islands,CYM,
55,Dominica,DMA,
76,Faroe Islands,FRO,
77,"Micronesia, Fed. Sts.",FSM,


In [165]:
unemployment_rate_2019_countries = unemployment_rate[~bool_crit_5]
unemployment_rate_2019_countries

Unnamed: 0,Country Name,Country Code,2019
1,Afghanistan,AFG,11.118000
2,Angola,AGO,6.886000
3,Albania,ALB,12.331000
5,Arab World,ARB,10.337095
6,United Arab Emirates,ARE,2.348000
...,...,...,...
258,Samoa,WSM,8.359000
260,"Yemen, Rep.",YEM,12.910000
261,South Africa,ZAF,28.181000
262,Zambia,ZMB,11.425000
