In [1]:
# Libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Importing the dataset
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_past_and_projected_future_population'

# Send the GET request; the timeout stops the cell from hanging forever
# if the server never answers
response = requests.get(url, timeout=30)

# Stop here on anything but HTTP 200: merely printing the status would leave
# `soup` undefined and make the following cells fail with a NameError
if response.status_code != 200:
    raise RuntimeError(f"There was an error: {response.status_code}")

# Parse the page
soup = BeautifulSoup(response.content, 'html.parser')

In [3]:
# Collect every <table> element that carries the 'wikitable' CSS class
tables = soup.find_all('table', class_='wikitable')

# Report how many tables were found
num_tables = len(tables)
print('The number of tables is ', num_tables)

The number of tables is  4


In [4]:
# Function to convert the table to dataframe
def table_to_df(table):
    """Convert a parsed HTML table into a DataFrame of cell strings.

    Every ``<tr>`` becomes one row made of the stripped text of its
    ``<td>``/``<th>`` cells. The header row is not treated specially: it
    appears in the data like any other row.

    Column labels are the text of all ``<th>`` cells in the table. When that
    count does not match the widest row (for example because body rows carry
    ``<th>`` row headers), the labels fall back to the ``<th>`` cells of the
    first row that has any.

    Parameters
    ----------
    table : object exposing ``find_all`` (e.g. a BeautifulSoup ``<table>`` tag)

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>``, all values as strings.

    Raises
    ------
    ValueError
        Raised by pandas when the labels still do not match the row width.
    """
    table_rows = table.find_all('tr')

    # Extracting rows (header row included)
    rows = [
        [cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])]
        for tr in table_rows
    ]

    # Extracting column headers
    headers = [header.get_text(strip=True) for header in table.find_all('th')]

    # Fall back to the first header row when the table-wide <th> list cannot
    # label the rows (it also picked up <th> cells outside the header row)
    width = max((len(row) for row in rows), default=0)
    if len(headers) != width:
        for tr in table_rows:
            header_cells = tr.find_all('th')
            if header_cells:
                headers = [cell.get_text(strip=True) for cell in header_cells]
                break

    # Creating the dataframe
    return pd.DataFrame(rows, columns=headers)

In [5]:
# Convert every scraped table into a DataFrame, keeping the page order
dataframes = [table_to_df(table) for table in tables]

In [6]:
# Number of DataFrames built from the scraped tables
len(dataframes)

4

In [7]:
# Pick the two tables used in the analysis by their position in the list:
#   index 2 -> population from 1985 to 2015
#   index 3 -> population from 2020 to 2050
df_pop_85_15, df_pop_20_50 = dataframes[2], dataframes[3]

In [8]:
# Preview the first 5 rows of the 1985-2015 table
df_pop_85_15.head(5)

Unnamed: 0,Country (or dependent territory),1985,%,1990,%.1,1995,%.2,2000,%.3,2005,%.4,2010,%.5,2015,%.6
0,Country (or dependent territory),1985,%,1990,%,1995,%,2000,%,2005,%,2010,%,2015,%
1,Afghanistan,13120,-2.70,13569,0.67,19446,7.46,22462,2.93,26335,3.23,29121,2.03,32565,2.26
2,Albania,2957,2.05,3245,1.88,3159,-0.54,3159,0.00,3025,-0.86,2987,-0.25,3030,0.28
3,Algeria,22009,3.19,25191,2.74,28322,2.37,30639,1.58,32918,1.45,35950,1.78,39543,1.92
4,American Samoa,39,3.57,48,4.09,54,2.69,58,1.39,57,-0.28,56,-0.53,55,-0.41


In [9]:
# Preview the first 5 rows of the 2020-2050 table
df_pop_20_50.head(5)

Unnamed: 0,Country (or dependent territory),2020,%,2025,%.1,2030,%.2,2035,%.3,2040,%.4,2045,%.5,2050,%.6
0,Country (or dependent territory),2020,%,2025,%,2030,%,2035,%,2040,%,2045,%,2050,%
1,Afghanistan,36644,2.39,41118,2.33,45665,2.12,50195,1.91,54717,1.74,59256,1.61,63796,1.49
2,Albania,3075,0.30,3105,0.20,3103,-0.01,3063,-0.26,2994,-0.45,2913,-0.55,2825,-0.61
3,Algeria,42973,1.68,45842,1.30,48149,0.99,50118,0.80,52030,0.75,53894,0.71,55445,0.57
4,American Samoa,54,-0.21,54,-0.17,53,-0.32,52,-0.45,51,-0.45,50,-0.29,50,-0.06


In [10]:
# Row label 0 is a copy of the column headers in both tables: remove it
df_pop_85_15 = df_pop_85_15.drop(labels=[0], axis='index')
df_pop_20_50 = df_pop_20_50.drop(labels=[0], axis='index')

In [11]:
# Check that the repeated header row is gone from the 1985-2015 table
df_pop_85_15.head(5)

Unnamed: 0,Country (or dependent territory),1985,%,1990,%.1,1995,%.2,2000,%.3,2005,%.4,2010,%.5,2015,%.6
1,Afghanistan,13120,-2.7,13569,0.67,19446,7.46,22462,2.93,26335,3.23,29121,2.03,32565,2.26
2,Albania,2957,2.05,3245,1.88,3159,-0.54,3159,0.0,3025,-0.86,2987,-0.25,3030,0.28
3,Algeria,22009,3.19,25191,2.74,28322,2.37,30639,1.58,32918,1.45,35950,1.78,39543,1.92
4,American Samoa,39,3.57,48,4.09,54,2.69,58,1.39,57,-0.28,56,-0.53,55,-0.41
5,Andorra,45,5.84,53,3.41,64,3.7,66,0.58,77,3.18,85,2.12,86,0.25


In [12]:
# Check that the repeated header row is gone from the 2020-2050 table
df_pop_20_50.head(5)

Unnamed: 0,Country (or dependent territory),2020,%,2025,%.1,2030,%.2,2035,%.3,2040,%.4,2045,%.5,2050,%.6
1,Afghanistan,36644,2.39,41118,2.33,45665,2.12,50195,1.91,54717,1.74,59256,1.61,63796,1.49
2,Albania,3075,0.3,3105,0.2,3103,-0.01,3063,-0.26,2994,-0.45,2913,-0.55,2825,-0.61
3,Algeria,42973,1.68,45842,1.3,48149,0.99,50118,0.8,52030,0.75,53894,0.71,55445,0.57
4,American Samoa,54,-0.21,54,-0.17,53,-0.32,52,-0.45,51,-0.45,50,-0.29,50,-0.06
5,Andorra,86,0.01,86,-0.12,85,-0.21,83,-0.3,82,-0.46,79,-0.69,75,-0.93


In [13]:
# Discard every column labelled '%' (the label is repeated, so all of them go)
df_pop_85_15 = df_pop_85_15.drop(columns='%')
df_pop_20_50 = df_pop_20_50.drop(columns='%')

In [14]:
# Confirm which columns remain once the '%' columns are dropped
df_pop_20_50.columns

Index(['Country (or dependent territory)', '2020', '2025', '2030', '2035',
       '2040', '2045', '2050'],
      dtype='object')

In [15]:
# The 1985 column is not used in the analysis: remove it
df_pop_85_15 = df_pop_85_15.drop(columns='1985')

In [16]:
# Keep only the country name and the 2020 figures; later years are not needed
unused_years = ['2025', '2030', '2035', '2040', '2045', '2050']
df_pop_20_50 = df_pop_20_50.drop(columns=unused_years)
df_pop_20_50.head()

Unnamed: 0,Country (or dependent territory),2020
1,Afghanistan,36644
2,Albania,3075
3,Algeria,42973
4,American Samoa,54
5,Andorra,86


In [17]:
# Print the (rows, columns) shape of each dataframe before merging them
print('The size of df_pop_85_15 is ', df_pop_85_15.shape)
print('The size of df_pop_20_50 is ', df_pop_20_50.shape)

The size of df_pop_85_15 is  (227, 7)
The size of df_pop_20_50 is  (228, 2)


In [18]:
# Rename the first column to 'Country' in both frames
# (reassignment instead of inplace=True, so no frame is mutated in place)
df_pop_85_15 = df_pop_85_15.rename(columns={'Country (or dependent territory)': 'Country'})
df_pop_20_50 = df_pop_20_50.rename(columns={'Country (or dependent territory)': 'Country'})

# Combine the two frames on the 'Country' column. 'Country' stays a regular
# column: the former set_index('Country') calls discarded their result and
# had no effect, so they were removed.
# how='outer' keeps countries present in only one table; their missing years
# become NaN. The year columns do not overlap, so no suffix is applied.
df_country_population = df_pop_85_15.merge(df_pop_20_50, on='Country', how='outer')

# Show the first rows of the merged dataframe
df_country_population.head(10)

Unnamed: 0,Country,1990,1995,2000,2005,2010,2015,2020
0,Afghanistan,13569,19446,22462,26335,29121,32565,36644
1,Albania,3245,3159,3159,3025,2987,3030,3075
2,Algeria,25191,28322,30639,32918,35950,39543,42973
3,American Samoa,48,54,58,57,56,55,54
4,Andorra,53,64,66,77,85,86,86
5,Angola,9486,11000,12683,14770,17043,19626,22485
6,Anguilla,9,10,12,14,15,17,19
7,Antigua and Barbuda,65,69,76,82,87,93,99
8,Argentina,33036,35274,37336,39182,41344,43432,45379
9,Armenia,3530,3131,3101,3085,3072,3057,3022


In [19]:
# Check the (rows, columns) shape of the merged dataframe
df_country_population.shape

(228, 8)

In [20]:
# Flag rows that are exact copies of an earlier row
duplicate_rows = df_country_population.duplicated(keep='first')

# Report how many rows were flagged
print("Number of duplicated rows", duplicate_rows.sum(), sep="\n")

Number of duplicated rows
0


In [21]:
# Labels of the columns that hold at least one missing value
nan_columns = df_country_population.columns[df_country_population.isna().any()]

# Report them, or state that nothing is missing
if nan_columns.empty:
    print("No columns with NaN values.")
else:
    print("Columns with NaN values:")
    print(nan_columns)

Columns with NaN values:
Index(['1990', '1995', '2000', '2005', '2010', '2015'], dtype='object')


In [22]:
# Show every row that has at least one missing value
print(df_country_population.loc[df_country_population.isna().any(axis=1)])

    Country 1990 1995 2000 2005 2010 2015 2020
227   Aruba  NaN  NaN  NaN  NaN  NaN  NaN  120


In [23]:
# Remove rows containing missing values (a single row at this point)
df_country_population.dropna(axis=0, inplace=True)

In [24]:
# Check the (rows, columns) shape after dropping rows with missing values
df_country_population.shape

(227, 8)

In [25]:
# Upper-case the country names
df_country_population = df_country_population.assign(
    Country=df_country_population['Country'].str.upper()
)

# Preview the result
df_country_population.head()

Unnamed: 0,Country,1990,1995,2000,2005,2010,2015,2020
0,AFGHANISTAN,13569,19446,22462,26335,29121,32565,36644
1,ALBANIA,3245,3159,3159,3025,2987,3030,3075
2,ALGERIA,25191,28322,30639,32918,35950,39543,42973
3,AMERICAN SAMOA,48,54,58,57,56,55,54
4,ANDORRA,53,64,66,77,85,86,86


In [26]:
# Strip commas from every column after the first one (the population figures)
population_columns = df_country_population.columns[1:]
df_country_population[population_columns] = (
    df_country_population[population_columns]
    .replace(to_replace=',', value='', regex=True)
)

# Preview the result
df_country_population.head()

Unnamed: 0,Country,1990,1995,2000,2005,2010,2015,2020
0,AFGHANISTAN,13569,19446,22462,26335,29121,32565,36644
1,ALBANIA,3245,3159,3159,3025,2987,3030,3075
2,ALGERIA,25191,28322,30639,32918,35950,39543,42973
3,AMERICAN SAMOA,48,54,58,57,56,55,54
4,ANDORRA,53,64,66,77,85,86,86


In [27]:
# Function to convert non numerical values to numerals
def compute_population(df, scale=1000):
    """Return a copy of ``df`` with numeric, scaled population columns.

    Every column after the first one is converted with ``pd.to_numeric``;
    values that cannot be parsed become NaN (``errors='coerce'``). The
    converted values are then multiplied by ``scale``.

    The input frame is left untouched: the work is done on a copy, so calling
    the function again on the same input gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        First column is left as is; all others are treated as population.
    scale : int or float, default 1000
        Multiplier applied after conversion.
        NOTE(review): presumably the source figures are in thousands - confirm.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``.
    """
    result = df.copy()

    # Select population columns (excluding the country names)
    population_cols = result.columns[1:]

    # Convert to numbers, then scale
    result[population_cols] = (
        result[population_cols].apply(pd.to_numeric, errors='coerce') * scale
    )

    return result
    
# Convert the population columns to scaled numbers, then preview the result
df_country_population = compute_population(df=df_country_population)
df_country_population.head(5)

Unnamed: 0,Country,1990,1995,2000,2005,2010,2015,2020
0,AFGHANISTAN,13569000,19446000,22462000,26335000,29121000,32565000,36644000
1,ALBANIA,3245000,3159000,3159000,3025000,2987000,3030000,3075000
2,ALGERIA,25191000,28322000,30639000,32918000,35950000,39543000,42973000
3,AMERICAN SAMOA,48000,54000,58000,57000,56000,55000,54000
4,ANDORRA,53000,64000,66000,77000,85000,86000,86000


In [28]:
# Shorter alias for the cleaned dataframe (same object, not a copy)
df = df_country_population

In [29]:
# Reshape from wide (one column per year) to long format:
# one row per (Country, Year) pair with its Population value
df_melted = pd.melt(
    frame=df,
    id_vars=['Country'],
    var_name='Year',
    value_name='Population',
)

In [30]:
# Write the long-format table to 'country_population.csv' (relative path).
# NOTE(review): with the default index=True the row index is also written as
# an unnamed first column - pass index=False if consumers of the file do not
# need it (confirm before changing the file layout).
df_melted.to_csv('country_population.csv')