<img width="8%" alt="Worldometer.jpg" src="https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/.github/assets/logos/Worldometer.jpg" style="border-radius: 15%">

# Worldometer - World population evolution and projections
<a href="https://bit.ly/3JyWIk6">Give Feedback</a> | <a href="https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=bug&template=bug_report.md&title=Worldometer+-+World+population+evolution+and+projections:+Error+short+description">Bug report</a>

**Tags:** #worldometer #opendata #population #snippet #plotly

**Author:** [Jeremy Ravenel](https://www.linkedin.com/in/ACoAAAJHE7sB5OxuKHuzguZ9L6lfDHqw--cdnJg/)

**Last update:** 2023-10-14 (Created: 2021-05-05)

**Description:** This notebook provides an overview of the current and projected population of the world, as tracked by Worldometer.

## Input

### Import libraries

In [None]:
from io import StringIO

import pandas as pd
import plotly.express as px
import requests
from bs4 import BeautifulSoup

### Data to scrape tables

In [None]:
# Pages whose HTML tables are scraped and merged into one data set.
DATA_URLS = [
    "https://www.worldometers.info/world-population/world-population-by-year/",
    "https://www.worldometers.info/world-population/world-population-projections/",
]

# Update this list to have latest cols from worldometers
# Column headers as they appear on the pages, with spaces removed (they are
# compared against the space-stripped headers of every table on a page).
TABLE_COLS = [
    "Year",
    "WorldPopulation",
    "YearlyChange",
    "NetChange",
    # "\u00b2" is the superscript-two character; written as an escape so the
    # label cannot be corrupted again by a wrong file encoding.
    "Density(P/Km\u00b2)",
]

## Model

### Functions to scrape tables from several pages and merge them

In [None]:
# Generic functions

def preprocess_cols(cols):
    """Return the column labels as strings with every space character removed.

    Args:
        cols: Iterable of column labels. Labels that are not strings (for
            example the integer labels of a header-less table) are converted
            with ``str()`` first instead of raising ``AttributeError``.

    Returns:
        A new list of space-free string labels, in the same order as ``cols``.
    """
    return [str(col).replace(' ', '') for col in cols]

def scrap_table(url, table_cloumns):
    """Download ``url`` and return the first HTML table with the expected headers.

    Args:
        url: Address of the page to fetch.
        table_cloumns: Expected column names with spaces removed. The
            misspelled parameter name is kept for backward compatibility.

    Returns:
        The first DataFrame parsed from the page whose space-stripped column
        names equal ``table_cloumns`` (its columns are renamed to that
        normalised form), or ``None`` when the page holds no such table.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    page = requests.get(url, timeout=30)
    # Surface HTTP errors explicitly instead of parsing an error page.
    page.raise_for_status()
    try:
        # Passing literal HTML to read_html is deprecated; wrap it in a buffer.
        dfs = pd.read_html(StringIO(page.text))
    except ValueError:
        # read_html raises ValueError when the document contains no tables.
        return None
    for df in dfs:
        # Need this as df columns have space inconsistency
        columns_in_page = preprocess_cols(df.columns.to_list())
        if columns_in_page == table_cloumns:
            df.columns = columns_in_page
            return df
    return None


def merge_tables_from_urls(urls, table_columns):
    """Scrape the matching table from every URL and stack the results.

    Args:
        urls: Iterable of page addresses, processed in order.
        table_columns: Expected column names, forwarded to ``scrap_table``.

    Returns:
        The scraped tables concatenated row-wise in ``urls`` order (original
        row indexes are kept), the single table itself when only one page
        yielded a match, or ``None`` when no page yielded a match.
    """
    scraped = (scrap_table(url, table_columns) for url in urls)
    frames = [frame for frame in scraped if frame is not None]
    if not frames:
        return None
    if len(frames) == 1:
        return frames[0]
    # Concatenate once instead of re-copying the accumulated rows per URL.
    return pd.concat(frames)

### Create function to display graph

In [None]:
def create_graph(x_label, y_label, table, title="", graph_type=px.line):
    """Build a figure with ``graph_type`` and display it.

    Args:
        x_label: Name of the column plotted on the x axis.
        y_label: Name of the column plotted on the y axis.
        table: Data passed as the first argument to ``graph_type``.
        title: Title given to the figure.
        graph_type: Callable that builds the figure (``px.line`` by default).
    """
    plot_options = {"x": x_label, "y": y_label, "title": title}
    figure = graph_type(table, **plot_options)
    figure.show()

In [None]:
# Print population graph from year to year
def display_population_graph(table, x_from=None, x_to=None, graph_type=px.line):
    """Plot the second ``TABLE_COLS`` column against the first for a value range.

    Args:
        table: Data containing at least the first two ``TABLE_COLS`` columns.
        x_from: Lowest x value to keep (inclusive); ``None`` means no lower bound.
        x_to: Highest x value to keep (inclusive); ``None`` means no upper bound.
        graph_type: Figure-building callable forwarded to ``create_graph``.

    Raises:
        ValueError: If no rows remain after applying the bounds.
    """
    x_label = TABLE_COLS[0]
    y_label = TABLE_COLS[1]
    # Filter on the configured x column rather than a hard-coded ``Year``
    # attribute so the function keeps working if TABLE_COLS is updated.
    if x_from is not None:
        table = table[table[x_label] >= x_from]
    if x_to is not None:
        table = table[table[x_label] <= x_to]
    if table.empty:
        raise ValueError(
            f"No rows with {x_label} between {x_from} and {x_to}; nothing to plot."
        )
    # Use min/max so the title is right whatever the row order of ``table``.
    x_values = table[x_label]
    title = f"{y_label} by {x_label}, between {x_values.min()} and {x_values.max()}"
    create_graph(x_label, y_label, table, title, graph_type)

### Fetch tables, sort the result and remove duplicate data

In [None]:
table = merge_tables_from_urls(DATA_URLS, TABLE_COLS)

# merge_tables_from_urls returns None when no page contained a table with the
# expected headers; stop here with an actionable message instead of failing
# later with an AttributeError on None.
if table is None:
    raise RuntimeError(
        "No table matching TABLE_COLS was found in DATA_URLS; "
        "check that TABLE_COLS still matches the column headers on the pages."
    )

# Newest year first. A stable sort keeps rows that share a year in the order
# they were concatenated, so keep="first" deterministically keeps the row
# coming from the earliest URL in DATA_URLS.
table = (
    table
    .sort_values(by=[TABLE_COLS[0]], ascending=False, kind="stable")
    .drop_duplicates(subset=TABLE_COLS[0], keep="first")
)

## Output

### Display the graph between -5000 and 2100

In [None]:
# NOTE: display_population_graph has no return statement, so chart1 is always
# None; the chart itself is rendered as a side effect of the call.
chart1 = display_population_graph(table)

### Display the graph between 1800 and 2020

In [None]:
# Keep only the rows whose year lies in 1800-2020 (both bounds inclusive).
display_population_graph(table, x_from=1800, x_to=2020)

### Display the graph between 2000 and 2100

In [None]:
# Keep only the rows whose year lies in 2000-2100 (both bounds inclusive).
display_population_graph(table, x_from=2000, x_to=2100)

### Display a bar chart between 1950 and 2100

The graph type can be changed by passing a Plotly Express function as `graph_type` (e.g. `graph_type=px.line`, `graph_type=px.bar`).

In [None]:
# Same helper, drawn with px.bar instead of the default px.line; rows are
# limited to years 1950-2100 (both bounds inclusive).
display_population_graph(table, x_from=1950, x_to=2100, graph_type=px.bar)