In [5]:
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import os

In [6]:
# Scrape targets, organised as continent -> subregion -> list of country slugs.
#
# How scrapp_worldometers (defined in this notebook) consumes this mapping:
#   * each leaf string is spliced into the request URL as
#     "https://www.worldometers.info/world-population/" + slug + "-population/"
#   * the continent and subregion keys become nested output directory names
#   * the slug also becomes the CSV file name ("<slug>.csv")
# so every string here is runtime data; renaming one changes both the URL
# requested and the path written.
#
# NOTE(review): whether each slug matches a live page cannot be checked from
# this file. A slug the site rejects only produces an "Error fetching ..."
# message and that country is skipped, so review the scrape log for gaps.
regions = {
    # --- Asia ---
    'asia': {
        'southern-asia': [
            'india', 'pakistan', 'bangladesh', 'iran',
            'afghanistan', 'nepal', 'sri-lanka', 'bhutan', 'maldives'
        ],
        'eastern-asia': [
            'china', 'japan', 'south-korea', 'north-korea',
            'taiwan', 'china-hong-kong-sar', 'china-macao-sar', 'mongolia'
        ],
        'south-eastern-asia': [
            'indonesia', 'philippines', 'vietnam', 'thailand',
            'myanmar', 'malaysia', 'cambodia', 'laos',
            'singapore', 'timor-leste', 'brunei-darussalam'
        ],
        'western-asia': [
            'turkey', 'iraq', 'yemen', 'saudi-arabia', 'syria',
            'jordan', 'united-arab-emirates', 'azerbaijan',
            'israel', 'lebanon', 'state-of-palestine', 'oman',
            'kuwait', 'georgia', 'qatar', 'armenia',
            'bahrain', 'cyprus'
        ],
        'central-asia': [
            'kazakhstan', 'uzbekistan', 'turkmenistan',
            'tajikistan', 'kyrgyzstan'
        ]
    },
    # --- Africa ---
    'africa': {
        'eastern-africa': [
            'ethiopia', 'tanzania', 'kenya', 'uganda',
            'mozambique', 'madagascar', 'malawi', 'zambia',
            'somalia', 'zimbabwe', 'rwanda', 'burundi',
            'south-sudan', 'eritrea', 'mauritius', 'djibouti',
            'comoros', 'reunion', 'mayotte', 'seychelles'
        ],
        'western-africa': [
            'nigeria', 'ghana', 'cote-d-ivoire', 'niger',
            'mali', 'burkina-faso', 'senegal', 'guinea',
            'benin', 'togo', 'sierra-leone', 'liberia',
            'mauritania', 'gambia', 'guinea-bissau',
            'cabo-verde', 'saint-helena'
        ],
        'northern-africa': [
            'egypt', 'sudan', 'algeria', 'morocco',
            'tunisia', 'libya', 'western-sahara'
        ],
        'middle-africa': [
            'democratic-republic-of-the-congo', 'angola', 'cameroon', 'chad',
            'congo', 'central-african-republic', 'gabon',
            'equatorial-guinea', 'sao-tome-and-principe'
        ],
        'southern-africa': [
            'south-africa', 'namibia', 'botswana',
            'lesotho', 'swaziland'
        ]
    },
    # --- Europe ---
    'europe': {
        'eastern-europe': [
            'russia', 'ukraine', 'poland', 'romania',
            'czech-republic', 'hungary', 'belarus',
            'bulgaria', 'slovakia', 'moldova'
        ],
        'western-europe': [
            'germany', 'france', 'netherlands', 'belgium',
            'austria', 'switzerland', 'luxembourg', 'liechtenstein', 'monaco'
        ],
        'southern-europe': [
            'italy', 'spain', 'portugal', 'greece',
            'croatia', 'slovenia', 'serbia', 'bosnia-and-herzegovina',
            'north-macedonia', 'albania', 'montenegro', 'andorra',
            'san-marino', 'malta'
        ],
        'northern-europe': [
            'uk', 'ireland', 'denmark',
            'sweden', 'finland', 'norway', 'iceland',
            'estonia', 'latvia', 'lithuania'
        ]
    },
    # --- Latin America and the Caribbean ---
    'latin-america-and-the-caribbean': {
        'south-america': [
            'brazil', 'colombia', 'argentina', 'peru',
            'venezuela', 'chile', 'ecuador', 'bolivia',
            'paraguay', 'uruguay', 'guyana', 'suriname'
        ],
        'central-america': [
            'mexico', 'guatemala', 'honduras', 'nicaragua',
            'el-salvador', 'costa-rica', 'panama', 'belize'
        ],
        'caribbean': [
            'cuba', 'haiti', 'dominican-republic', 'jamaica',
            'trinidad-and-tobago', 'bahamas', 'barbados',
            'saint-lucia', 'saint-vincent-and-the-grenadines',
            'grenada', 'antigua-and-barbuda', 'dominica',
            'saint-kitts-and-nevis'
        ]
    },
    # --- Northern America (single subregion with the same name as its parent) ---
    'northern-america': {
        'northern-america': [
            'us', 'canada', 'bermuda'
        ]
    },
    # --- Oceania ---
    'oceania': {
        'australia-and-new-zealand': [
            'australia', 'new-zealand'
        ],
        'melanesia': [
            'papua-new-guinea', 'fiji', 'solomon-islands',
            'vanuatu'
        ],
        'polynesia': [
            'samoa', 'tonga', 'tuvalu'
        ],
        'micronesia': [
            'micronesia', 'marshall-islands', 'palau',
            'nauru', 'kiribati'
        ]
    }
}

In [7]:
def scrapp_worldometers(regions, folder_path, timeout=30):
    """Scrape per-country population tables from worldometers.info into CSV files.

    For every country slug in ``regions`` the page
    ``https://www.worldometers.info/world-population/<slug>-population/`` is
    fetched, the first ``<table>`` whose class contains ``"table"`` is parsed,
    and the result is written to
    ``<folder_path>/<continent>/<subregion>/<slug>.csv``.

    Countries that cannot be fetched or parsed are reported on stdout and
    skipped; they never abort the run.

    Parameters
    ----------
    regions : dict
        Nested mapping ``{continent: {subregion: [country_slug, ...]}}``.
    folder_path : str
        Root directory for the CSV output; subdirectories are created on demand.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the
        whole scrape. Defaults to 30 seconds.

    Returns
    -------
    list of tuple
        One ``(label, file_path)`` pair per CSV written, in iteration order,
        where ``label`` is ``country_slug.title()`` (e.g. ``'Sri-Lanka'``).
    """
    base_url = "https://www.worldometers.info/world-population/"
    country_option_list = []

    for continent, subregions in regions.items():
        for subregion, countries in subregions.items():
            for country in countries:
                url = base_url + country + "-population/"
                print(f"Scraping: {url}")

                try:
                    # Without a timeout requests waits indefinitely on a dead socket.
                    response = requests.get(url, timeout=timeout)
                    response.raise_for_status()
                except requests.exceptions.RequestException as e:
                    print(f"Error fetching {url}: {e}")
                    continue

                soup = BeautifulSoup(response.content, "html.parser")
                table = soup.find("table", attrs={"class": lambda x: x and "table" in x})

                if table is None:
                    print(f"Warning: Population table not found for {country}. Skipping.")
                    continue

                headers = [th.text.strip() for th in table.find_all("th")]

                # html.parser does not insert a missing <tbody>, so fall back to
                # the table element itself instead of crashing on None.
                body = table.tbody if table.tbody is not None else table

                rows = []
                for tr in body.find_all("tr"):
                    # Strip thousands separators so numeric columns stay parseable.
                    cells = [td.text.strip().replace(",", "") for td in tr.find_all("td")]
                    if cells:  # header-only rows carry no <td> cells
                        rows.append(cells)

                try:
                    df = pd.DataFrame(rows, columns=headers)
                except ValueError as e:
                    # Raised when a row has more cells than there are headers.
                    print(f"Warning: Could not build table for {country}: {e}. Skipping.")
                    continue

                if 'Year' in df.columns:
                    # Cells are strings; sort by numeric value, not lexicographically.
                    df = df.sort_values(
                        'Year',
                        key=lambda col: pd.to_numeric(col, errors='coerce'),
                    )
                else:
                    print(f"Warning: 'Year' column not found for {country}. Skipping sorting.")

                # Ensure subfolder exists for continent/subregion
                dir_path = os.path.join(folder_path, continent, subregion)
                os.makedirs(dir_path, exist_ok=True)

                file_path = os.path.join(dir_path, f"{country}.csv")
                df.to_csv(file_path, index=False)

                country_option_list.append((country.title(), file_path))

    return country_option_list

In [9]:
# Scrape every configured country; CSVs are written beneath ../raw_data and
# the returned (label, file_path) pairs are kept for later cells.
country_option_list = scrapp_worldometers(
    regions=regions,
    folder_path='../raw_data',
)

Scraping: https://www.worldometers.info/world-population/india-population/
Scraping: https://www.worldometers.info/world-population/pakistan-population/
Scraping: https://www.worldometers.info/world-population/bangladesh-population/
Scraping: https://www.worldometers.info/world-population/iran-population/
Scraping: https://www.worldometers.info/world-population/afghanistan-population/
Scraping: https://www.worldometers.info/world-population/nepal-population/
Scraping: https://www.worldometers.info/world-population/sri-lanka-population/
Scraping: https://www.worldometers.info/world-population/bhutan-population/
Scraping: https://www.worldometers.info/world-population/maldives-population/
Scraping: https://www.worldometers.info/world-population/china-population/
Scraping: https://www.worldometers.info/world-population/japan-population/
Scraping: https://www.worldometers.info/world-population/south-korea-population/
Scraping: https://www.worldometers.info/world-population/north-korea-popu