In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

# EDUCATION (4 COLUMNS)

In [2]:
Education = pd.read_csv('Education.csv', encoding = "ISO-8859-1")
new_header = Education.iloc[0] #grab the first row for the header
Education = Education[1:] #take the data less the header row
Education.columns = new_header #set the header row as the df header
Education.head()

Unnamed: 0,Region/Country/Area,NaN,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",2005,Students enrolled in primary education (thousa...,679013.0,,"United Nations Educational, Scientific and Cul..."
2,1,"Total, all countries or areas",2005,Gross enrollment ratio - Primary (male),104.5,,"United Nations Educational, Scientific and Cul..."
3,1,"Total, all countries or areas",2005,Gross enrollment ratio - Primary (female),99.7,,"United Nations Educational, Scientific and Cul..."
4,1,"Total, all countries or areas",2005,Students enrolled in secondary education (thou...,509274.0,,"United Nations Educational, Scientific and Cul..."
5,1,"Total, all countries or areas",2005,Gross enrollment ratio - Secondary (male),65.8,,"United Nations Educational, Scientific and Cul..."


In [3]:
Education.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6940 entries, 1 to 6940
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Region/Country/Area  6940 non-null   object
 1   nan                  6940 non-null   object
 2   Year                 6940 non-null   object
 3   Series               6940 non-null   object
 4   Value                6940 non-null   object
 5   Footnotes            631 non-null    object
 6   Source               6940 non-null   object
dtypes: object(7)
memory usage: 379.7+ KB


In [4]:
# Remove duplicate column name that caused columns names to shift left
cols = list(Education.columns)
cols[1] = 'country'
Education.columns = cols

In [5]:
Education.head()

Unnamed: 0,Region/Country/Area,country,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",2005,Students enrolled in primary education (thousa...,679013.0,,"United Nations Educational, Scientific and Cul..."
2,1,"Total, all countries or areas",2005,Gross enrollment ratio - Primary (male),104.5,,"United Nations Educational, Scientific and Cul..."
3,1,"Total, all countries or areas",2005,Gross enrollment ratio - Primary (female),99.7,,"United Nations Educational, Scientific and Cul..."
4,1,"Total, all countries or areas",2005,Students enrolled in secondary education (thou...,509274.0,,"United Nations Educational, Scientific and Cul..."
5,1,"Total, all countries or areas",2005,Gross enrollment ratio - Secondary (male),65.8,,"United Nations Educational, Scientific and Cul..."


In [6]:
continents = ['Total, all countries or areas', 'Northern Africa', 'Sub-Saharan Africa', 'Eastern Africa', 
              'Middle Africa', 'Southern Africa', 'Western Africa', 'Northern America', 
              'Latin America & the Caribbean', 'Caribbean', 'Latin America', 'Asia', 'Central Asia', 
              'Eastern Asia', 'South-central Asia', 'South-eastern Asia', 'Southern Asia', 'Western Asia', 
              'Europe', 'Oceania', 'Australia and New Zealand']
for i in continents:
    Education= Education[Education['country'].str.contains(i)==False]

In [7]:
Education=Education[Education['Year'].str.contains('2015')!=False]
Education.head()

Unnamed: 0,Region/Country/Area,country,Year,Series,Value,Footnotes,Source
454,4,Afghanistan,2015,Students enrolled in primary education (thousa...,6199.0,,"United Nations Educational, Scientific and Cul..."
455,4,Afghanistan,2015,Gross enrollment ratio - Primary (male),122.7,,"United Nations Educational, Scientific and Cul..."
456,4,Afghanistan,2015,Gross enrollment ratio - Primary (female),83.5,,"United Nations Educational, Scientific and Cul..."
457,4,Afghanistan,2015,Students enrolled in secondary education (thou...,2651.0,,"United Nations Educational, Scientific and Cul..."
458,4,Afghanistan,2015,Gross enrollment ratio - Secondary (male),65.9,,"United Nations Educational, Scientific and Cul..."


In [8]:
Secondary_female = Education[Education.Series == "Gross enrollment ratio - Secondary (female)" ]
Secondary_female = Secondary_female.rename(columns={'Value': 'Education: Secondary gross enrol. ratio (female per 100 pop.)'})
Secondary_female = pd.DataFrame(Secondary_female[['country', 'Year', 'Education: Secondary gross enrol. ratio (female per 100 pop.)']])
Secondary_female.head()

Unnamed: 0,country,Year,Education: Secondary gross enrol. ratio (female per 100 pop.)
459,Afghanistan,2015,36.8
495,Albania,2015,95.1
630,Antigua and Barbuda,2015,109.3
666,Argentina,2015,110.2
698,Armenia,2015,88.3


In [9]:
Secondary_male = Education[Education.Series == "Gross enrollment ratio - Secondary (male)" ]
Secondary_male = Secondary_male.rename(columns={'Value': 'Education: Secondary gross enrol. ratio (male per 100 pop.)'})
Secondary_male = pd.DataFrame(Secondary_male[['country', 'Year', 'Education: Secondary gross enrol. ratio (male per 100 pop.)']])
Secondary_male.head()

Unnamed: 0,country,Year,Education: Secondary gross enrol. ratio (male per 100 pop.)
458,Afghanistan,2015,65.9
494,Albania,2015,99.5
629,Antigua and Barbuda,2015,107.1
665,Argentina,2015,103.8
697,Armenia,2015,84.0


In [10]:
Tertiary_female = Education[Education.Series == "Gross enrollment ratio - Upper secondary level (female)" ]
Tertiary_female = Tertiary_female.rename(columns={'Value': 'Education: Tertiary gross enrol. ratio (female per 100 pop.)'})
Tertiary_female = pd.DataFrame(Tertiary_female[['country', 'Year', 'Education: Tertiary gross enrol. ratio (female per 100 pop.)']])
Tertiary_female.head()

Unnamed: 0,country,Year,Education: Tertiary gross enrol. ratio (female per 100 pop.)
462,Afghanistan,2015,27.1
498,Albania,2015,89.6
633,Antigua and Barbuda,2015,90.7
669,Argentina,2015,91.1
701,Armenia,2015,95.0


In [11]:
Tertiary_male = Education[Education.Series == "Gross enrollment ratio - Upper secondary level (male)" ]
Tertiary_male = Tertiary_male.rename(columns={'Value': 'Education: Tertiary gross enrol. ratio (male per 100 pop.)'})
Tertiary_male = pd.DataFrame(Tertiary_male[['country', 'Year', 'Education: Tertiary gross enrol. ratio (male per 100 pop.)']])
Tertiary_male.head()

Unnamed: 0,country,Year,Education: Tertiary gross enrol. ratio (male per 100 pop.)
461,Afghanistan,2015,52.6
497,Albania,2015,99.6
632,Antigua and Barbuda,2015,79.0
668,Argentina,2015,79.4
700,Armenia,2015,85.3


# Life Expectancy (2 COLUMNS)

In [12]:
Life_Expectancy = pd.read_csv('population_fertility.csv', encoding = "ISO-8859-1")
new_header = Life_Expectancy.iloc[0] #grab the first row for the header
Life_Expectancy = Life_Expectancy[1:] #take the data less the header row
Life_Expectancy.columns = new_header #set the header row as the df header
Life_Expectancy.head()

Unnamed: 0,Region/Country/Area,NaN,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",2010,Population annual rate of increase (percent),1.2,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ..."
2,1,"Total, all countries or areas",2010,Total fertility rate (children per women),2.6,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ..."
3,1,"Total, all countries or areas",2010,"Infant mortality for both sexes (per 1,000 liv...",41.0,Data refers to a 5-year period preceding the r...,"United Nations Statistics Division, New York, ..."
4,1,"Total, all countries or areas",2010,"Maternal mortality ratio (deaths per 100,000 p...",248.0,,"World Health Organization (WHO), the United Na..."
5,1,"Total, all countries or areas",2010,Life expectancy at birth for both sexes (years),68.9,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ..."


In [13]:
Life_Expectancy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4899 entries, 1 to 4899
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Region/Country/Area  4899 non-null   object
 1   nan                  4899 non-null   object
 2   Year                 4899 non-null   object
 3   Series               4899 non-null   object
 4   Value                4899 non-null   object
 5   Footnotes            4263 non-null   object
 6   Source               4899 non-null   object
dtypes: object(7)
memory usage: 268.0+ KB


In [14]:
# Remove duplicate column name that caused columns names to shift left
cols = list(Life_Expectancy.columns)
cols[1] = 'country'
Life_Expectancy.columns = cols

In [15]:
Life_Expectancy.head()

Unnamed: 0,Region/Country/Area,country,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",2010,Population annual rate of increase (percent),1.2,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ..."
2,1,"Total, all countries or areas",2010,Total fertility rate (children per women),2.6,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ..."
3,1,"Total, all countries or areas",2010,"Infant mortality for both sexes (per 1,000 liv...",41.0,Data refers to a 5-year period preceding the r...,"United Nations Statistics Division, New York, ..."
4,1,"Total, all countries or areas",2010,"Maternal mortality ratio (deaths per 100,000 p...",248.0,,"World Health Organization (WHO), the United Na..."
5,1,"Total, all countries or areas",2010,Life expectancy at birth for both sexes (years),68.9,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ..."


In [16]:
continents = ['Total, all countries or areas', 'Northern Africa', 'Sub-Saharan Africa', 'Eastern Africa', 
              'Middle Africa', 'Southern Africa', 'Western Africa', 'Northern America', 
              'Latin America & the Caribbean', 'Caribbean', 'Latin America', 'Asia', 'Central Asia', 
              'Eastern Asia', 'South-central Asia', 'South-eastern Asia', 'Southern Asia', 'Western Asia', 
              'Europe', 'Oceania', 'Australia and New Zealand']
for i in continents:
    Life_Expectancy= Life_Expectancy[Life_Expectancy['country'].str.contains(i)==False]

In [17]:
Life_Expectancy=Life_Expectancy[Life_Expectancy['Year'].str.contains('2015')!=False]
Life_Expectancy.head()

Unnamed: 0,Region/Country/Area,country,Year,Series,Value,Footnotes,Source
28,2,Africa,2015,Population annual rate of increase (percent),2.6,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ..."
29,2,Africa,2015,Total fertility rate (children per women),4.7,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ..."
30,2,Africa,2015,"Infant mortality for both sexes (per 1,000 liv...",55.9,Data refers to a 5-year period preceding the r...,"United Nations Statistics Division, New York, ..."
31,2,Africa,2015,Life expectancy at birth for both sexes (years),60.2,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ..."
32,2,Africa,2015,Life expectancy at birth for males (years),58.6,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ..."


In [18]:
LE_females = Life_Expectancy[Life_Expectancy.Series == "Life expectancy at birth for females (years)" ]
LE_females = LE_females.rename(columns={'Value': 'Life expectancy at birth (females, years)'})
LE_females = pd.DataFrame(LE_females[['country', 'Year', 'Life expectancy at birth (females, years)']])
LE_females.head()

Unnamed: 0,country,Year,"Life expectancy at birth (females, years)"
33,Africa,2015,61.9
242,Central America,2015,77.6
263,South America,2015,78.1
575,Melanesia,2015,65.5
596,Micronesia,2015,74.7


In [19]:
LE_males = Life_Expectancy[Life_Expectancy.Series == "Life expectancy at birth for males (years)" ]
LE_males = LE_males.rename(columns={'Value': 'Life expectancy at birth (males, years)'})
LE_males = pd.DataFrame(LE_males[['country', 'Year', 'Life expectancy at birth (males, years)']])
LE_males.head()

Unnamed: 0,country,Year,"Life expectancy at birth (males, years)"
32,Africa,2015,58.6
241,Central America,2015,71.6
262,South America,2015,71.3
574,Melanesia,2015,62.7
595,Micronesia,2015,69.1


# Drinking Water & Sanitation

In [20]:
Water_Sanitation = pd.read_csv('Water_Sanitation.csv', encoding = "ISO-8859-1")
new_header = Water_Sanitation.iloc[0] #grab the first row for the header
Water_Sanitation = Water_Sanitation[1:] #take the data less the header row
Water_Sanitation.columns = new_header #set the header row as the df header
Water_Sanitation.head()

Unnamed: 0,Region/Country/Area,NaN,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",2010,"Safely managed drinking water sources, urban (...",84.0,,World Health Organization (WHO) and United Nat...
2,1,"Total, all countries or areas",2010,"Safely managed drinking water sources, rural (...",46.3,,World Health Organization (WHO) and United Nat...
3,1,"Total, all countries or areas",2010,"Safely managed drinking water sources, total (...",65.8,,World Health Organization (WHO) and United Nat...
4,1,"Total, all countries or areas",2010,"Safely managed sanitation facilities, urban (P...",51.6,,World Health Organization (WHO) and United Nat...
5,1,"Total, all countries or areas",2010,"Safely managed sanitation facilities, rural (P...",27.5,,World Health Organization (WHO) and United Nat...


In [21]:
Water_Sanitation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 1 to 2160
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Region/Country/Area  2160 non-null   object
 1   nan                  2160 non-null   object
 2   Year                 2160 non-null   object
 3   Series               2160 non-null   object
 4   Value                2160 non-null   object
 5   Footnotes            12 non-null     object
 6   Source               2160 non-null   object
dtypes: object(7)
memory usage: 118.3+ KB


In [22]:
# Remove duplicate column name that caused columns names to shift left
cols = list(Water_Sanitation.columns)
cols[1] = 'country'
Water_Sanitation.columns = cols
Water_Sanitation.head()

Unnamed: 0,Region/Country/Area,country,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",2010,"Safely managed drinking water sources, urban (...",84.0,,World Health Organization (WHO) and United Nat...
2,1,"Total, all countries or areas",2010,"Safely managed drinking water sources, rural (...",46.3,,World Health Organization (WHO) and United Nat...
3,1,"Total, all countries or areas",2010,"Safely managed drinking water sources, total (...",65.8,,World Health Organization (WHO) and United Nat...
4,1,"Total, all countries or areas",2010,"Safely managed sanitation facilities, urban (P...",51.6,,World Health Organization (WHO) and United Nat...
5,1,"Total, all countries or areas",2010,"Safely managed sanitation facilities, rural (P...",27.5,,World Health Organization (WHO) and United Nat...


In [23]:
continents = ['Total, all countries or areas', 'Northern Africa', 'Sub-Saharan Africa', 'Eastern Africa', 
              'Middle Africa', 'Southern Africa', 'Western Africa', 'Northern America', 
              'Latin America & the Caribbean', 'Caribbean', 'Latin America', 'Asia', 'Central Asia', 
              'Eastern Asia', 'South-central Asia', 'South-eastern Asia', 'Southern Asia', 'Western Asia', 
              'Europe', 'Oceania', 'Australia and New Zealand']
for i in continents:
    Water_Sanitation= Water_Sanitation[Water_Sanitation['country'].str.contains(i)==False]

In [24]:
Water_Sanitation=Water_Sanitation[Water_Sanitation['Year'].str.contains('2015')!=False]
Water_Sanitation.head()

Unnamed: 0,Region/Country/Area,country,Year,Series,Value,Footnotes,Source
138,13,Central America,2015,"Safely managed drinking water sources, rural (...",42.1,,World Health Organization (WHO) and United Nat...
139,13,Central America,2015,"Safely managed drinking water sources, total (...",46.6,,World Health Organization (WHO) and United Nat...
140,13,Central America,2015,"Safely managed sanitation facilities, urban (P...",38.7,,World Health Organization (WHO) and United Nat...
141,13,Central America,2015,"Safely managed sanitation facilities, total (P...",31.3,,World Health Organization (WHO) and United Nat...
151,5,South America,2015,"Safely managed drinking water sources, urban (...",83.7,,World Health Organization (WHO) and United Nat...


In [25]:
water_urban = Water_Sanitation[Water_Sanitation.Series == "Safely managed drinking water sources, urban (Proportion of population with access)" ]
water_urban  = water_urban .rename(columns={'Value': 'Pop. using improved drinking water (urban, %)'})
water_urban = pd.DataFrame(water_urban[['country', 'Year', 'Pop. using improved drinking water (urban, %)']])
water_urban .head()

Unnamed: 0,country,Year,"Pop. using improved drinking water (urban, %)"
151,South America,2015,83.7
341,Micronesia,2015,72.7
352,Polynesia,2015,85.2
365,Afghanistan,2015,32.4
389,Algeria,2015,79.5


In [26]:
water_rural = Water_Sanitation[Water_Sanitation.Series == "Safely managed drinking water sources, rural (Proportion of population with access)" ]
water_rural = water_rural .rename(columns={'Value': 'Pop. using improved drinking water (rural, %)'})
water_rural = pd.DataFrame(water_rural[['country', 'Year', 'Pop. using improved drinking water (rural, %)']])
water_rural .head()

Unnamed: 0,country,Year,"Pop. using improved drinking water (rural, %)"
138,Central America,2015,42.1
152,South America,2015,51.2
342,Micronesia,2015,30.5
353,Polynesia,2015,61.2
366,Afghanistan,2015,20.4


In [27]:
sanitation_urban = Water_Sanitation[Water_Sanitation.Series == "Safely managed drinking water sources, urban (Proportion of population with access)" ]
sanitation_urban  = sanitation_urban.rename(columns={'Value': 'Pop. using improved sanitation facilities (urban, %)'})
sanitation_urban = pd.DataFrame(sanitation_urban[['country', 'Year', 'Pop. using improved sanitation facilities (urban, %)']])
sanitation_urban.head()

Unnamed: 0,country,Year,"Pop. using improved sanitation facilities (urban, %)"
151,South America,2015,83.7
341,Micronesia,2015,72.7
352,Polynesia,2015,85.2
365,Afghanistan,2015,32.4
389,Algeria,2015,79.5


In [28]:
sanitation_rural = Water_Sanitation[Water_Sanitation.Series == "Safely managed drinking water sources, rural (Proportion of population with access)" ]
sanitation_rural = sanitation_rural.rename(columns={'Value': 'Pop. using improved sanitation facilities (rural, %)'})
sanitation_rural= pd.DataFrame(sanitation_rural[['country', 'Year', 'Series','Pop. using improved sanitation facilities (rural, %)']])
sanitation_rural.head()

Unnamed: 0,country,Year,Series,"Pop. using improved sanitation facilities (rural, %)"
138,Central America,2015,"Safely managed drinking water sources, rural (...",42.1
152,South America,2015,"Safely managed drinking water sources, rural (...",51.2
342,Micronesia,2015,"Safely managed drinking water sources, rural (...",30.5
353,Polynesia,2015,"Safely managed drinking water sources, rural (...",61.2
366,Afghanistan,2015,"Safely managed drinking water sources, rural (...",20.4


# Population Age

In [29]:
Age = pd.read_csv('Population_Density.csv', encoding = "ISO-8859-1")
new_header = Age.iloc[0] #grab the first row for the header
Age = Age[1:] #take the data less the header row
Age.columns = new_header #set the header row as the df header
Age.head()

Unnamed: 0,Region/Country/Area,NaN,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",2010,Population mid-year estimates (millions),6956.82,,"United Nations Population Division, New York, ..."
2,1,"Total, all countries or areas",2010,Population mid-year estimates for males (milli...,3507.7,,"United Nations Population Division, New York, ..."
3,1,"Total, all countries or areas",2010,Population mid-year estimates for females (mil...,3449.12,,"United Nations Population Division, New York, ..."
4,1,"Total, all countries or areas",2010,Sex ratio (males per 100 females),101.7,,"United Nations Population Division, New York, ..."
5,1,"Total, all countries or areas",2010,Population aged 0 to 14 years old (percentage),27.0,,"United Nations Population Division, New York, ..."


In [30]:
Age.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7260 entries, 1 to 7260
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Region/Country/Area  7260 non-null   object
 1   nan                  7260 non-null   object
 2   Year                 7260 non-null   object
 3   Series               7260 non-null   object
 4   Value                7260 non-null   object
 5   Footnotes            2282 non-null   object
 6   Source               7260 non-null   object
dtypes: object(7)
memory usage: 397.2+ KB


In [31]:
# Remove duplicate column name that caused columns names to shift left
cols = list(Age.columns)
cols[1] = 'country'
Age.columns = cols

In [32]:
continents = ['Total, all countries or areas', 'Northern Africa', 'Sub-Saharan Africa', 'Eastern Africa', 
              'Middle Africa', 'Southern Africa', 'Western Africa', 'Northern America', 
              'Latin America & the Caribbean', 'Caribbean', 'Latin America', 'Asia', 'Central Asia', 
              'Eastern Asia', 'South-central Asia', 'South-eastern Asia', 'Southern Asia', 'Western Asia', 
              'Europe', 'Oceania', 'Australia and New Zealand']
for i in continents:
    Age= Age[Age['country'].str.contains(i)==False]

In [33]:
Age=Age[Age['Year'].str.contains('2015')!=False]
Age.head()

Unnamed: 0,Region/Country/Area,country,Year,Series,Value,Footnotes,Source
38,2,Africa,2015,Population mid-year estimates (millions),1182.44,,"United Nations Population Division, New York, ..."
39,2,Africa,2015,Population mid-year estimates for males (milli...,590.28,,"United Nations Population Division, New York, ..."
40,2,Africa,2015,Population mid-year estimates for females (mil...,592.16,,"United Nations Population Division, New York, ..."
41,2,Africa,2015,Sex ratio (males per 100 females),99.7,,"United Nations Population Division, New York, ..."
42,2,Africa,2015,Population aged 0 to 14 years old (percentage),41.1,,"United Nations Population Division, New York, ..."


In [34]:
Age_0 = Age[Age.Series == "Population aged 0 to 14 years old (percentage)" ]
Age_0= Age_0.rename(columns={'Value': 'Population age distribution (0-14 years, %)'})
Age_0 = pd.DataFrame(Age_0[['country', 'Year', 'Population age distribution (0-14 years, %)']])
Age_0.head()

Unnamed: 0,country,Year,"Population age distribution (0-14 years, %)"
42,Africa,2015,41.1
252,Americas,2015,23.2
372,Central America,2015,28.8
402,South America,2015,24.4
852,Melanesia,2015,36.1


In [35]:
Age_60 = Age[Age.Series == "Population aged 60+ years old (percentage)" ]
Age_60= Age_60.rename(columns={'Value': 'Population age distribution (60+ years, %)'})
Age_60 = pd.DataFrame(Age_60[['country','Year', 'Population age distribution (60+ years, %)']])
Age_60.head()

Unnamed: 0,country,Year,"Population age distribution (60+ years, %)"
43,Africa,2015,5.3
253,Americas,2015,14.8
373,Central America,2015,9.5
403,South America,2015,11.9
853,Melanesia,2015,5.9
