In [40]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


# URL of the website to scrape
url = 'https://data.worldbank.org/country'

# Send a GET request to the website
response = requests.get(url)

# Check if the request was successful
if response.status_code != 200:
    print(f"Failed to retrieve content: {response.status_code}")
else:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the list of countries and relevant data
    countries = []
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            a_tag = li.find('a')
            if a_tag and '/country/' in a_tag['href']:
                country_name = a_tag.text.strip()
                country_link = a_tag['href']
                countries.append({
                    'name': country_name,
                    'link': f"https://data.worldbank.org{country_link}"
                })

    # Convert to DataFrame
    df_countries = pd.DataFrame(countries)



In [41]:
df_countries.head

<bound method NDFrame.head of                       name                                               link
0              Afghanistan  https://data.worldbank.org/country/afghanistan...
1                  Albania  https://data.worldbank.org/country/albania?vie...
2                  Algeria  https://data.worldbank.org/country/algeria?vie...
3           American Samoa  https://data.worldbank.org/country/american-sa...
4                  Andorra  https://data.worldbank.org/country/andorra?vie...
..                     ...                                                ...
212  Virgin Islands (U.S.)  https://data.worldbank.org/country/virgin-isla...
213     West Bank and Gaza  https://data.worldbank.org/country/west-bank-a...
214            Yemen, Rep.  https://data.worldbank.org/country/yemen-rep?v...
215                 Zambia  https://data.worldbank.org/country/zambia?view...
216               Zimbabwe  https://data.worldbank.org/country/zimbabwe?vi...

[217 rows x 2 columns]>

In [23]:
def get_country_data(country_url):
    response = requests.get(country_url)
    if response.status_code != 200:
        print(f"Failed to retrieve content for {country_url}: {response.status_code}")
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')

    # Initialize a dictionary to store the data
    country_data = {
        'Net migration': 'N/A',
        'Life expectancy at birth': 'N/A',
        'Population, total': 'N/A',
        'Population growth': 'N/A'
    }

    # Look for specific data in the page
    for indicator in soup.find_all('div', class_='indicator-item__wrapper'):
        title_tag = indicator.find('div', class_='indicator-item__title')
        if title_tag:
            title_link = title_tag.find('a')
            value_tag = indicator.find('div', class_='indicator-item__data-info')
            if title_link and value_tag:
                title = title_link.text.strip()
                value_span = value_tag.find('span')
                if value_span:
                    value = value_span.text.strip()

                    if 'Net migration' in title:
                        country_data['Net migration'] = value
                    elif 'Life expectancy at birth' in title:
                        country_data['Life expectancy at birth'] = value
                    elif 'Population, total' in title:
                        country_data['Population, total'] = value
                    elif 'Population growth' in title:
                        country_data['Population growth'] = value

    return country_data

In [42]:
# Add additional data to the dataframe
additional_data = []
for index, row in df_countries.iterrows():
    country_data = get_country_data(row['link'])
    additional_data.append(country_data)

# Create a new dataframe with the additional data
df_additional_data = pd.DataFrame(additional_data)

# Combine the country data with the additional data
df_countries = pd.concat([df_countries, df_additional_data], axis=1)




In [43]:
df_countries.head

<bound method NDFrame.head of                       name                                               link  \
0              Afghanistan  https://data.worldbank.org/country/afghanistan...   
1                  Albania  https://data.worldbank.org/country/albania?vie...   
2                  Algeria  https://data.worldbank.org/country/algeria?vie...   
3           American Samoa  https://data.worldbank.org/country/american-sa...   
4                  Andorra  https://data.worldbank.org/country/andorra?vie...   
..                     ...                                                ...   
212  Virgin Islands (U.S.)  https://data.worldbank.org/country/virgin-isla...   
213     West Bank and Gaza  https://data.worldbank.org/country/west-bank-a...   
214            Yemen, Rep.  https://data.worldbank.org/country/yemen-rep?v...   
215                 Zambia  https://data.worldbank.org/country/zambia?view...   
216               Zimbabwe  https://data.worldbank.org/country/zimbabwe?vi...  

In [44]:
df_countries.to_csv('worldbank_countries_with_data.csv', index=False)

In [45]:
def clean_numeric_data(value):
    try:
        if isinstance(value, str):
            value = value.replace(',', '')  # Remove commas from numbers
        float_val = float(value)
        if float_val == float('inf') or float_val == float('-inf') or pd.isna(float_val):
            return None
        return float_val
    except (ValueError, TypeError):
        return value
for column in df_countries.columns:
    if df_countries[column].dtype in ['float64', 'int64']:
        df_countries[column] = df_countries[column].apply(clean_numeric_data)

In [46]:
df_countries.head

<bound method NDFrame.head of                       name                                               link  \
0              Afghanistan  https://data.worldbank.org/country/afghanistan...   
1                  Albania  https://data.worldbank.org/country/albania?vie...   
2                  Algeria  https://data.worldbank.org/country/algeria?vie...   
3           American Samoa  https://data.worldbank.org/country/american-sa...   
4                  Andorra  https://data.worldbank.org/country/andorra?vie...   
..                     ...                                                ...   
212  Virgin Islands (U.S.)  https://data.worldbank.org/country/virgin-isla...   
213     West Bank and Gaza  https://data.worldbank.org/country/west-bank-a...   
214            Yemen, Rep.  https://data.worldbank.org/country/yemen-rep?v...   
215                 Zambia  https://data.worldbank.org/country/zambia?view...   
216               Zimbabwe  https://data.worldbank.org/country/zimbabwe?vi...  

In [47]:
df_countries.to_csv('worldbank_countries_with_data.csv', index=False)

In [48]:
import pandas as pd
import numpy as np
import time
from geopy.geocoders import Nominatim

# Load your dataframe (assuming df_countries is already loaded)
df = df_countries

# Ensure all data can be serialized to JSON by converting NaNs to None and handling commas in numbers
def clean_numeric_data(value):
    try:
        if isinstance(value, str):
            value = value.replace(',', '')  # Remove commas from numbers
        float_val = float(value)
        if float_val == float('inf') or float_val == float('-inf') or pd.isna(float_val):
            return None
        return float_val
    except (ValueError, TypeError):
        return value

# Apply the cleaning function to the appropriate columns
numeric_columns = ['Net migration', 'Life expectancy at birth', 'Population, total', 'Population growth']
for column in numeric_columns:
    df[column] = df[column].apply(clean_numeric_data)

# Replace any remaining NaN values with None
df = df.replace({np.nan: None})

# Initialize the geolocator
geolocator = Nominatim(user_agent="worldbank_data_geocoder", timeout=10)

# Function to get latitude and longitude
def get_lat_lon(country_name):
    try:
        location = geolocator.geocode(country_name)
        if location:
            return location.latitude, location.longitude
        else:
            return None, None
    except Exception as e:
        print(f"Error getting location for {country_name}: {e}")
        return None, None

# Add latitude and longitude to the dataframe
df['latitude'] = None
df['longitude'] = None

for index, row in df.iterrows():
    print(f"Geocoding {row['name']}...")
    lat, lon = get_lat_lon(row['name'])
    df.at[index, 'latitude'] = lat
    df.at[index, 'longitude'] = lon
    time.sleep(1)  # To respect the service's usage policy

# Save the updated data with geo-locations
df.to_csv('worldbank_countries_with_geo_data.csv', index=False)


Geocoding Afghanistan...
Geocoding Albania...
Geocoding Algeria...
Geocoding American Samoa...
Geocoding Andorra...
Geocoding Angola...
Geocoding Antigua and Barbuda...
Geocoding Argentina...
Geocoding Armenia...
Geocoding Aruba...
Geocoding Australia...
Geocoding Austria...
Geocoding Azerbaijan...
Geocoding Bahamas, The...
Geocoding Bahrain...
Geocoding Bangladesh...
Geocoding Barbados...
Geocoding Belarus...
Geocoding Belgium...
Geocoding Belize...
Geocoding Benin...
Geocoding Bermuda...
Geocoding Bhutan...
Geocoding Bolivia...
Geocoding Bosnia and Herzegovina...
Geocoding Botswana...
Geocoding Brazil...
Geocoding British Virgin Islands...
Geocoding Brunei Darussalam...
Geocoding Bulgaria...
Geocoding Burkina Faso...
Geocoding Burundi...
Geocoding Cabo Verde...
Geocoding Cambodia...
Geocoding Cameroon...
Geocoding Canada...
Geocoding Cayman Islands...
Geocoding Central African Republic...
Geocoding Chad...
Geocoding Channel Islands...
Geocoding Chile...
Geocoding China...
Geocoding C

In [52]:
df.head

<bound method NDFrame.head of                       name                                               link  \
0              Afghanistan  https://data.worldbank.org/country/afghanistan...   
1                  Albania  https://data.worldbank.org/country/albania?vie...   
2                  Algeria  https://data.worldbank.org/country/algeria?vie...   
3           American Samoa  https://data.worldbank.org/country/american-sa...   
4                  Andorra  https://data.worldbank.org/country/andorra?vie...   
..                     ...                                                ...   
212  Virgin Islands (U.S.)  https://data.worldbank.org/country/virgin-isla...   
213     West Bank and Gaza  https://data.worldbank.org/country/west-bank-a...   
214            Yemen, Rep.  https://data.worldbank.org/country/yemen-rep?v...   
215                 Zambia  https://data.worldbank.org/country/zambia?view...   
216               Zimbabwe  https://data.worldbank.org/country/zimbabwe?vi...  

In [54]:
df['Life expectancy at birth'] = pd.to_numeric(df['Life expectancy at birth'], errors='coerce')
# Function to calculate Life Expectancy Index
def calculate_lei(life_expectancy):
    if pd.isna(life_expectancy):
        return None
    return (life_expectancy - 20) / (85 - 20)

# Apply the function to the dataframe
df['Life Expectancy Index (LEI)'] = df['Life expectancy at birth'].apply(calculate_lei)

# Placeholder values for education and income indices
df['Education Index (EI)'] = 0.75  # Placeholder
df['Income Index (II)'] = 0.75  # Placeholder

# Function to calculate HDI
def calculate_hdi(row):
    if pd.isna(row['Life Expectancy Index (LEI)']):
        return None
    return (row['Life Expectancy Index (LEI)'] * row['Education Index (EI)'] * row['Income Index (II)']) ** (1/3)

# Apply the function to the dataframe
df['HDI'] = df.apply(calculate_hdi, axis=1)

 
df.head

<bound method NDFrame.head of                       name                                               link  \
0              Afghanistan  https://data.worldbank.org/country/afghanistan...   
1                  Albania  https://data.worldbank.org/country/albania?vie...   
2                  Algeria  https://data.worldbank.org/country/algeria?vie...   
3           American Samoa  https://data.worldbank.org/country/american-sa...   
4                  Andorra  https://data.worldbank.org/country/andorra?vie...   
..                     ...                                                ...   
212  Virgin Islands (U.S.)  https://data.worldbank.org/country/virgin-isla...   
213     West Bank and Gaza  https://data.worldbank.org/country/west-bank-a...   
214            Yemen, Rep.  https://data.worldbank.org/country/yemen-rep?v...   
215                 Zambia  https://data.worldbank.org/country/zambia?view...   
216               Zimbabwe  https://data.worldbank.org/country/zimbabwe?vi...  

In [56]:
df['Population, total'] = df['Population, total'].apply(lambda x: x * 1_000_000_000 if x < 10 else x)
df.to_csv('worldbank_countries_with_geo_data.csv', index=False)