In [1]:
from bs4 import BeautifulSoup
import logging
import re
import requests

In [2]:
class CountryLoader:
    """Load and clean the list of United Nations Member States.

    Names are scraped from the UN "Member States" page (``base_url``) and then
    shortened to the names they are more commonly referred to by.
    """

    # Official UN names -> the names they are more commonly referred to by in
    # international contexts. Keys are compared through _mapping_key, so case,
    # repeated whitespace and apostrophe style (' vs ’) do not matter.
    _NAME_MAPPING = {
        "Democratic People's Republic of Korea": 'North Korea',
        'Democratic Republic of the Congo': 'DR Congo',
        "Lao People's Democratic Republic": 'Laos',
        'Republic of Korea': 'South Korea',
        'Republic of Moldova': 'Moldova',
        'Russian Federation': 'Russia',
        'Syrian Arab Republic': 'Syria',
        'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
        'United Republic of Tanzania': 'Tanzania',
    }

    def __init__(self):
        """Store the source URL and set up the logger used by every method."""
        self.base_url = 'https://www.un.org/en/about-us/member-states'

        # Set up logger to log messages for various events and errors.
        # Log messages with a severity level of INFO or higher.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def fetch_and_parse(self, url: str) -> 'BeautifulSoup':
        """Fetch ``url`` and return its content parsed with the 'lxml' parser.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-2xx status code. The failure is logged before re-raising.
        """
        try:
            # Send a browser-like User-Agent with the request
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'lxml')

        except requests.RequestException as e:
            self.logger.error(f'Failed to fetch URL {url}: {str(e)}')
            raise

    def extract_countries(self, soup: 'BeautifulSoup') -> list:
        """Extract the raw country names from the parsed member-states page.

        Args:
            soup: Parsed HTML of the member-states page.

        Returns:
            The non-empty, whitespace-stripped country names in page order.
            An empty list (with a logged warning) if the container holds none.

        Raises:
            ValueError: if the container div is missing, i.e. the page layout
                has changed. Any error is logged before being re-raised.
        """
        countries = []

        try:
            # 'mb-2' is a unique CSS class, not present elsewhere in the HTML
            # This div contains the countries inside 'col-md-12' divs
            # Names are contained in h2 elements with class 'mb-0'
            block = soup.find('div', class_='mb-2') # Works as of 22nd January 2025

            if block is None:
                # Fail with a clear message instead of dereferencing None below
                raise ValueError(
                    f'No div with class "mb-2" found, URL {self.base_url} structure has likely been changed'
                )

            for country in block.find_all('h2', class_='mb-0'): # Works as of 22nd January 2025
                name = country.text.strip()
                if name:
                    countries.append(name)

            # Check if the countries list is populated
            if not countries:
                self.logger.warning('No country names found in "mb-2" block')

        except Exception as e:
            self.logger.error(f'An error occured while extracting country names: {str(e)}')
            raise

        return countries

    @staticmethod
    def _mapping_key(name: str) -> str:
        """Return ``name`` normalised for lookups in ``_NAME_MAPPING``."""
        # Unify the typographic apostrophe, collapse whitespace, ignore case
        return ' '.join(name.replace('\u2019', "'").split()).casefold()

    def clean_country_name(self, name: str) -> str:
        """Standardise a country name for clarity and consistency.

        Official names that are more commonly referred to by another name are
        replaced using ``_NAME_MAPPING``. Designations in parentheses and
        everything from the first comma onwards are then removed, since they
        aren't relevant in the game.

        Args:
            name: Country name as extracted from the page.

        Returns:
            The cleaned name with no leading, trailing or repeated whitespace.
        """
        lookup = {
            self._mapping_key(official): common
            for official, common in self._NAME_MAPPING.items()
        }
        name = lookup.get(self._mapping_key(name), name)

        # Match a sequence of characters in parentheses and remove it
        name = re.sub(r'\([^)]*\)', '', name)

        # Remove everything after (and including) a comma
        name = re.sub(r',.*', '', name)

        # Removing a designation leaves stray spaces behind ('Bolivia ')
        return ' '.join(name.split())

    def load_countries(self) -> list:
        """Fetch, extract and clean the country names.

        Returns:
            The cleaned country names in page order.

        Raises:
            Exception: any failure from fetching or extraction is logged and
                re-raised unchanged.
        """
        try:
            soup = self.fetch_and_parse(self.base_url)
            countries = self.extract_countries(soup)
            countries = [self.clean_country_name(country) for country in countries]

            return countries

        except Exception as e:
            self.logger.error(f'Failed to load countries: {str(e)}')
            raise

In [3]:
def load_country_data(): # -> List[str]
    """Create a CountryLoader and return the list of countries it loads."""
    return CountryLoader().load_countries()

# Smoke-test the country loader: report how many names were extracted and
# show the first ten of them in alphabetical order
try:
    countries = load_country_data()
    print(f'Successfully extracted {len(countries)} countries')
    print(*sorted(countries[:10]), sep='\n')

except Exception as e:
    print(f'Error during testing: {str(e)}')

Successfully extracted 192 countries
Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Australia
Austria


In [4]:
class CityLoader:
    """Load the most populated cities as listed by World Population Review.

    City names are scraped from ``base_url``. The loader returns every city it
    finds; callers slice the result if they only want the top N.
    """

    def __init__(self):
        """Store the source URL and set up the logger used by every method."""
        self.base_url = 'https://worldpopulationreview.com/cities'

        # Set up logger to log messages for various events and errors.
        # Log messages with a severity level of INFO or higher.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def fetch_and_parse(self, url: str) -> 'BeautifulSoup':
        """Fetch ``url`` and return its content parsed with the 'lxml' parser.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-2xx status code. The failure is logged before re-raising.
        """
        try:
            # Send a browser-like User-Agent with the request
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'lxml')

        except requests.RequestException as e:
            self.logger.error(f'Failed to fetch URL {url}: {str(e)}')
            raise

    def extract_cities(self, soup: 'BeautifulSoup') -> list:
        """Extract the city names from the parsed cities page.

        Args:
            soup: Parsed HTML of the cities page.

        Returns:
            The non-empty, whitespace-stripped city names in page order.
            An empty list (with a logged warning) if the container holds none.

        Raises:
            ValueError: if the container div is missing, i.e. the page layout
                has changed. Any error is logged before being re-raised.
        """
        cities = []

        try:
            # 'my-6' is a unique CSS class, not present elsewhere in the HTML
            # This div contains the names of cities inside <a class="text-wpr-link">
            block = soup.find('div', class_='my-6') # Works as of 22nd January 2025

            if block is None:
                # Fail with a clear message instead of dereferencing None below
                raise ValueError(
                    f'No div with class "my-6" found, URL {self.base_url} structure has likely been changed'
                )

            for city in block.find_all('a', class_='text-wpr-link'): # Works as of 22nd January 2025
                name = city.text.strip()
                if name:
                    cities.append(name)

            # Check if the cities list is populated
            if not cities:
                self.logger.warning('No city names found in "my-6" block')

        except Exception as e:
            self.logger.error(f'An error occured while extracting city names: {str(e)}')
            raise

        return cities

    def load_cities(self) -> list:
        """Fetch the cities page and return the extracted city names.

        Returns:
            The city names in page order.

        Raises:
            Exception: any failure from fetching or extraction is logged and
                re-raised unchanged.
        """
        try:
            soup = self.fetch_and_parse(self.base_url)
            cities = self.extract_cities(soup)

            return cities

        except Exception as e:
            self.logger.error(f'Failed to load cities: {str(e)}')
            raise

In [5]:
def load_city_data(): # -> List[str]
    """Create a CityLoader and return the list of cities it loads."""
    return CityLoader().load_cities()

# Smoke-test the city loader: keep the first 500 cities, report the count and
# show ten of the names
try:
    cities = load_city_data()[:500]
    print(f'Successfully extracted {len(cities)} cities')
    print(*cities[:10], sep='\n') # Printing 10 city names for testing

except Exception as e:
    print(f'Error during testing: {str(e)}')

Successfully extracted 500 cities
Tokyo
Delhi
Shanghai
Dhaka
Sao Paulo
Cairo
Mexico City
Beijing
Mumbai
Osaka
