In [1]:
#Dependencies
import pandas as pd
import requests

# Fetch population data from the Our World in Data server.
# A custom User-Agent header is sent along with the CSV request.
df = pd.read_csv(
    "https://ourworldindata.org/grapher/population.csv?v=1&csvType=full&useColumnShortNames=true",
    storage_options={'User-Agent': 'Our World In Data data fetch/1.0'},
)

# Fetch the metadata.
# The timeout stops this cell from hanging forever if the server never answers
# (requests has no default timeout), and raise_for_status() surfaces HTTP
# errors here instead of as a confusing JSON decode error on an error page.
metadata_response = requests.get(
    "https://ourworldindata.org/grapher/population.metadata.json?v=1&csvType=full&useColumnShortNames=true",
    timeout=30,
)
metadata_response.raise_for_status()
metadata = metadata_response.json()

In [9]:
# Keep only the observations from 2000 through 2023 (both ends inclusive)
year_in_range = df['Year'].between(2000, 2023)
filtered_df = df.loc[year_in_range].reset_index(drop=True)

filtered_df.head(30)

Unnamed: 0,Entity,Code,Year,population_historical
0,Afghanistan,AFG,2000,20130279
1,Afghanistan,AFG,2001,20284252
2,Afghanistan,AFG,2002,21378081
3,Afghanistan,AFG,2003,22733007
4,Afghanistan,AFG,2004,23560598
5,Afghanistan,AFG,2005,24404520
6,Afghanistan,AFG,2006,25424050
7,Afghanistan,AFG,2007,25909790
8,Afghanistan,AFG,2008,26482577
9,Afghanistan,AFG,2009,27466056


In [13]:
# Rename columns and drop the ISO code column in one non-mutating chain.
# Re-assigning instead of using inplace=True keeps this cell safe to re-run:
# rename() skips labels that are no longer present, and errors="ignore"
# stops drop() from raising KeyError once "Code" has already been removed.
filtered_df = (
    filtered_df
    .rename(columns={
        "Entity": "Country",
        "population_historical": "Population",
    })
    # "Code" is not needed here: the merge later in this notebook joins on Country and Year
    .drop(columns="Code", errors="ignore")
)

# Preview cleaned data
filtered_df.head(23)

Unnamed: 0,Country,Year,Population
0,Afghanistan,2000,20130279
1,Afghanistan,2001,20284252
2,Afghanistan,2002,21378081
3,Afghanistan,2003,22733007
4,Afghanistan,2004,23560598
5,Afghanistan,2005,24404520
6,Afghanistan,2006,25424050
7,Afghanistan,2007,25909790
8,Afghanistan,2008,26482577
9,Afghanistan,2009,27466056


In [15]:
# Preview the unique entity names in the cleaned data as a sanity check
country_column = filtered_df["Country"]
unique_countries = country_column.unique()
print(unique_countries)

['Afghanistan' 'Africa' 'Africa (UN)' 'Akrotiri and Dhekelia' 'Albania'
 'Algeria' 'American Samoa' 'Americas (UN)' 'Andorra' 'Angola' 'Anguilla'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Asia' 'Asia (UN)'
 'Asia (excl. China and India)' 'Australia' 'Austria' 'Azerbaijan'
 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Benin' 'Bermuda' 'Bhutan' 'Bolivia' 'Bonaire Sint Eustatius and Saba'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'British Virgin Islands'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon'
 'Canada' 'Cape Verde' 'Cayman Islands' 'Central African Republic' 'Chad'
 'Chile' 'China' 'Colombia' 'Comoros' 'Congo' 'Cook Islands' 'Costa Rica'
 "Cote d'Ivoire" 'Croatia' 'Cuba' 'Curacao' 'Cyprus' 'Czechia'
 'Democratic Republic of Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'East Timor' 'Ecuador' 'Egypt' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Europe'
 'Europe (UN)' '

In [17]:
# Allow-list of entity names to keep (a hand-picked example list of countries).
# NOTE(review): this is not a complete ISO 3166-1 list. Entities printed by the
# previous cell, such as 'Central African Republic', "Cote d'Ivoire" and
# 'Democratic Republic of Congo', are absent here, so the isin() filter below
# drops their rows. Confirm this is intended.
valid_countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Anguilla', 'Antigua and Barbuda', 
    'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
    'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
    'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 
    'Cape Verde', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Costa Rica', 'Croatia', 'Cuba', 
    'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 
    'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Germany',
    'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 
    'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 
    'Kazakhstan', 'Kenya', 'Kosovo', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 
    'Libya', 'Lithuania', 'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Mexico', 
    'Moldova', 'Monaco', 'Mongolia', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands', 
    'New Zealand', 'Nicaragua', 'Nigeria', 'North Macedonia', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Panama', 
    'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 
    'Rwanda', 'Saint Kitts and Nevis', 'Saint Lucia', 'Samoa', 'San Marino', 'Saudi Arabia', 'Senegal', 'Serbia', 
    'Seychelles', 'Singapore', 'Slovakia', 'Slovenia', 'Solomon Islands', 'South Africa', 'South Korea', 'South Sudan', 
    'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Sweden', 'Switzerland', 'Syria', 'Taiwan', 'Tajikistan', 'Tanzania', 
    'Thailand', 'Togo', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Uganda', 'Ukraine', 'United Arab Emirates', 
    'United Kingdom', 'United States', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 
    'Zimbabwe'
]

# Keep only the rows whose 'Country' value appears in valid_countries
is_valid_country = filtered_df['Country'].isin(valid_countries)
df_final = filtered_df.loc[is_valid_country].reset_index(drop=True)

# Show which country names survived the filter
all_countries = df_final["Country"].unique()
print(all_countries)

['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Anguilla'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei' 'Bulgaria'
 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada' 'Cape Verde'
 'Chad' 'Chile' 'China' 'Colombia' 'Comoros' 'Congo' 'Costa Rica'
 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji' 'Finland' 'France' 'Gabon' 'Gambia'
 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau'
 'Guyana' 'Haiti' 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia'
 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jordan'
 'Kazakhstan' 'Kenya' 'Kosovo' 'Kuwait' 'Kyrgyzstan' 'Laos' 'Latvia'
 'Lebanon' 'Lesotho' 'Liberia' 'L

In [21]:
# Inspect the first 35 rows to confirm the filter worked
df_final.head(n=35)

Unnamed: 0,Country,Year,Population
0,Afghanistan,2000,20130279
1,Afghanistan,2001,20284252
2,Afghanistan,2002,21378081
3,Afghanistan,2003,22733007
4,Afghanistan,2004,23560598
5,Afghanistan,2005,24404520
6,Afghanistan,2006,25424050
7,Afghanistan,2007,25909790
8,Afghanistan,2008,26482577
9,Afghanistan,2009,27466056


In [23]:
# Write the cleaned population table to CSV, leaving out the row index
df_final.to_csv(path_or_buf="data/population_data.csv", index=False)

In [51]:
# Paths to the previously cleaned country data and the cleaned population data
country_path = "data/all_countries_merged_data.csv"
pop_path = "data/population_data.csv"

# Load both CSV files into DataFrames
country_df, pop_df = (pd.read_csv(csv_path) for csv_path in (country_path, pop_path))

In [53]:
# Show the first five rows of the country data
country_df.head(n=5)

Unnamed: 0,Country,Code,Year,GDP per Capita,Health_Expenditure,Gini_Coefficient,Income Class,Population density per square kilometer
0,Afghanistan,AFG,2000,1617.8264,No data,No data,Low-income countries,30.863773
1,Afghanistan,AFG,2001,1454.1108,No data,No data,Low-income countries,31.099846
2,Afghanistan,AFG,2002,1774.3087,87.39655,No data,Low-income countries,32.776905
3,Afghanistan,AFG,2003,1815.9282,86.26034,No data,Low-income countries,34.85428
4,Afghanistan,AFG,2004,1776.9182,93.952965,No data,Low-income countries,36.123142


In [35]:
# Show the first five rows of the population data
pop_df.head(n=5)

Unnamed: 0,Country,Year,Population
0,Afghanistan,2000,20130279
1,Afghanistan,2001,20284252
2,Afghanistan,2002,21378081
3,Afghanistan,2003,22733007
4,Afghanistan,2004,23560598


In [37]:
# Outer-join the population table onto the country table by (Country, Year)
All_count_df = pd.merge(country_df, pop_df, how="outer", on=["Country", "Year"])

In [39]:
# Show the first five rows of the merged table
All_count_df.head(n=5)

Unnamed: 0,Country,Code,Year,GDP per Capita,Health_Expenditure,Gini_Coefficient,Income Class,Population density per square kilometer,Population
0,Afghanistan,AFG,2000,1617.8264,No data,No data,Low-income countries,30.863773,20130279.0
1,Afghanistan,AFG,2001,1454.1108,No data,No data,Low-income countries,31.099846,20284252.0
2,Afghanistan,AFG,2002,1774.3087,87.39655,No data,Low-income countries,32.776905,21378081.0
3,Afghanistan,AFG,2003,1815.9282,86.26034,No data,Low-income countries,34.85428,22733007.0
4,Afghanistan,AFG,2004,1776.9182,93.952965,No data,Low-income countries,36.123142,23560598.0


In [41]:
# Give the indicator columns consistent, human-readable names
column_renames = {
    "Health_Expenditure": "Health Expenditure",
    "Gini_Coefficient": "GINI Coefficient",
    "Population density per square kilometer": "Population Density",
}
All_count_df = All_count_df.rename(columns=column_renames)

# Preview cleaned data
All_count_df.head()

Unnamed: 0,Country,Code,Year,GDP per Capita,Health Expenditure,GINI Coefficient,Income Class,Population Density,Population
0,Afghanistan,AFG,2000,1617.8264,No data,No data,Low-income countries,30.863773,20130279.0
1,Afghanistan,AFG,2001,1454.1108,No data,No data,Low-income countries,31.099846,20284252.0
2,Afghanistan,AFG,2002,1774.3087,87.39655,No data,Low-income countries,32.776905,21378081.0
3,Afghanistan,AFG,2003,1815.9282,86.26034,No data,Low-income countries,34.85428,22733007.0
4,Afghanistan,AFG,2004,1776.9182,93.952965,No data,Low-income countries,36.123142,23560598.0


In [43]:
# Allow-list of entity names to keep in the merged table (a hand-picked example list of countries).
# NOTE(review): this literal repeats the valid_countries list defined earlier in
# this notebook and is not a complete ISO 3166-1 list. If one copy changes, the
# other must change with it. TODO: confirm whether the duplication is intended.
valid_countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Anguilla', 'Antigua and Barbuda', 
    'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
    'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
    'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 
    'Cape Verde', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Costa Rica', 'Croatia', 'Cuba', 
    'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 
    'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Germany',
    'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 
    'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 
    'Kazakhstan', 'Kenya', 'Kosovo', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 
    'Libya', 'Lithuania', 'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Mexico', 
    'Moldova', 'Monaco', 'Mongolia', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands', 
    'New Zealand', 'Nicaragua', 'Nigeria', 'North Macedonia', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Panama', 
    'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 
    'Rwanda', 'Saint Kitts and Nevis', 'Saint Lucia', 'Samoa', 'San Marino', 'Saudi Arabia', 'Senegal', 'Serbia', 
    'Seychelles', 'Singapore', 'Slovakia', 'Slovenia', 'Solomon Islands', 'South Africa', 'South Korea', 'South Sudan', 
    'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Sweden', 'Switzerland', 'Syria', 'Taiwan', 'Tajikistan', 'Tanzania', 
    'Thailand', 'Togo', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Uganda', 'Ukraine', 'United Arab Emirates', 
    'United Kingdom', 'United States', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 
    'Zimbabwe'
]
# Keep only the merged rows whose 'Country' value appears in valid_countries
is_listed_country = All_count_df['Country'].isin(valid_countries)
country_v1_df = All_count_df.loc[is_listed_country].reset_index(drop=True)

# Show which country names remain after filtering
all_count = country_v1_df["Country"].unique()
print(all_count)

['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Anguilla'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei' 'Bulgaria'
 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada' 'Cape Verde'
 'Chad' 'Chile' 'China' 'Colombia' 'Comoros' 'Congo' 'Costa Rica'
 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji' 'Finland' 'France' 'Gabon' 'Gambia'
 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau'
 'Guyana' 'Haiti' 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia'
 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jordan'
 'Kazakhstan' 'Kenya' 'Kosovo' 'Kuwait' 'Kyrgyzstan' 'Laos' 'Latvia'
 'Lebanon' 'Lesotho' 'Liberia' 'L

In [45]:
# Inspect the last 20 rows of the filtered, merged table
country_v1_df.tail(n=20)

Unnamed: 0,Country,Code,Year,GDP per Capita,Health Expenditure,GINI Coefficient,Income Class,Population Density,Population
4180,Zimbabwe,ZWE,2004,2828.8455,No data,No data,Low-income countries,31.965483,12365847.0
4181,Zimbabwe,ZWE,2005,2642.1743,No data,No data,Low-income countries,32.269314,12483384.0
4182,Zimbabwe,ZWE,2006,2519.8293,No data,No data,Low-income countries,32.664864,12636402.0
4183,Zimbabwe,ZWE,2007,2395.9907,No data,No data,Low-income countries,33.098156,12804022.0
4184,Zimbabwe,ZWE,2008,1949.0364,No data,No data,Low-income countries,33.49902,12959096.0
4185,Zimbabwe,ZWE,2009,2152.7952,No data,No data,Low-income countries,33.973736,13142740.0
4186,Zimbabwe,ZWE,2010,2572.77,189.67076,No data,Low-income countries,34.526306,13356502.0
4187,Zimbabwe,ZWE,2011,2897.1008,168.25894,0.43153575,Low-income countries,35.143784,13595372.0
4188,Zimbabwe,ZWE,2012,3299.2598,162.58827,No data,Low-income countries,35.718853,13817838.0
4189,Zimbabwe,ZWE,2013,3357.1282,179.45686,No data,Low-income countries,36.22531,14013761.0


In [47]:
# Write the final merged table to CSV, leaving out the row index
country_v1_df.to_csv(path_or_buf="data/Final_Countries.csv", index=False)

In [49]:
# Serialize the final table as a JSON array with one object per row
All_Countries_json = country_v1_df.to_json(orient="records")

# Write the JSON string to disk
with open("data/All_Countries.json", mode="w") as json_file:
    json_file.write(All_Countries_json)