# Energy & Development Database - Data Cleaning

*Author: Leilanie Rubinstein*

## Introduction

This notebook imports and cleans data from the World Bank and the Energy Institute for ingestion into a SQL database.

In [9]:
# Load libraries
import os

import numpy as np
import pandas as pd

# Define data directories
data_dir = os.path.abspath("data")
wb_dir = os.path.join(data_dir, "world_bank")
ei_dir = os.path.join(data_dir, "energy_institute")
clean_dir = os.path.join(data_dir, "cleaned_data")

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before any cleaned file is written to it.
os.makedirs(clean_dir, exist_ok=True)

Country names do not match across the datasets. The function below standardizes them.

In [10]:
def standardize_country_names(df, country_col="Country"):
    """Standardize country names across datasets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a column of country labels. It is not modified.
    country_col : str, default "Country"
        Name of the column holding the country labels.

    Returns
    -------
    pandas.DataFrame
        A new frame in which known alternative spellings are replaced, labels
        are stripped of surrounding whitespace, and rows whose label is
        missing, is a listed regional aggregate, or starts with a numeric
        prefix are removed.

    Raises
    ------
    KeyError
        If ``country_col`` is missing from a non-empty frame.
    """
    country_mapping = {
        "CongoRep": "Congo, Rep.",
        "Ivory Coast": "Côte d'Ivoire",
        "Cape Verde": "Cabo Verde",
        "Gambia, The": "Gambia",
        "Sao Tome and Principe": "São Tomé and Príncipe",
        "Egypt, Arab Rep.": "Egypt",
        # Remove region/category prefixes from Africa electricity data
        "Region_Low Income Fragile": None,
        "Region_Low Income Non-Fragile": None,
        "Region_Middle Income": None,
        "Region_Sub-Saharan Africa": None,
        "Economic Community_CEMAC": None,
        "Economic Community_COMESA": None,
        "Economic Community_EAC": None,
        "Economic Community_ECOWAS": None,
        "Economic Community_SADC": None,
        "Power Pool_Central Africa": None,
        "Power Pool_Eastern Africa": None,
        "Power Pool_Southern Africa": None,
        "Power Pool_Western Africa": None,
        "Capacity Category_Under 200MW": None,
        "Capacity Category_Over 1000MW": None,
        "Capacity Category_200MW-1000MW": None,
    }

    if country_col not in df.columns:
        # A completely empty frame (no rows, no columns) has nothing to clean;
        # hand back a copy instead of failing on the column lookup.
        if df.empty:
            return df.copy()
        raise KeyError(country_col)

    def extract_clean_country(country_str):
        """Normalize a single label; return None when the row must be dropped."""
        if pd.isna(country_str):
            return None

        # Coerce to text so non-string labels do not break the checks below
        label = str(country_str).strip()

        # Remove entries that start with numbers (like "034. Low Income, Fragile")
        # NOTE(review): a label such as "001. Benin_SBEE" also starts with three
        # digits, so it is dropped here before the extraction branch below is
        # reached - confirm that this ordering is intended.
        if label[:3].replace('.', '').isdigit():
            return None

        # If it contains '. ' and '_', extract the country name between them
        if ". " in label and "_" in label:
            start = label.find(". ") + 2
            end = label.find("_")
            if start < end:
                extracted = label[start:end].strip()
                # Apply mapping to extracted name too
                return country_mapping.get(extracted, extracted)

        return label

    # Work on a copy so the caller's DataFrame is never modified in place
    out = df.copy()

    # Apply mapping; regional aggregates are mapped to None
    out[country_col] = out[country_col].replace(country_mapping)

    # Remove rows that were mapped to None (regional aggregates)
    out = out[out[country_col].notna()].copy()

    # Clean up the remaining labels one by one
    out[country_col] = out[country_col].apply(extract_clean_country)

    # Remove any remaining None values
    return out[out[country_col].notna()]

### Import International Development Association Data

In [11]:
# Read the IDA Results Measurement System extract from the World Bank folder
ida_source_path = os.path.join(
    wb_dir, "P_Data_Extract_From_IDA_Results_Measurement_System.xlsx"
)
df = pd.read_excel(ida_source_path)

In [12]:
# Keep only rows that have a country label and are not export footer lines
name_present = df["Country Name"].notna()
is_footer = df["Country Name"].str.contains(
    "Data from database|Last Updated", na=False
)
df = df[name_present & ~is_footer]

# Harmonize country spellings with the other datasets
df = standardize_country_names(df, "Country Name")

# Discard rows in which every column from the third one onward is missing
df = df.dropna(how="all", subset=df.columns[2:])

# Reshape to long format: one row per country / series / year
years = list(map(str, range(2006, 2021)))
id_vars = ["Country Name", "Series Name"]
df_long = pd.melt(
    df, id_vars=id_vars, value_vars=years, var_name="Year", value_name="Value"
)

# Year labels become integers; values that are not numeric become NaN
df_long["Year"] = df_long["Year"].astype(int)
df_long["Value"] = pd.to_numeric(df_long["Value"], errors="coerce")

ida_output_path = os.path.join(clean_dir, "00_cleaned_indicators.csv")
df_long.to_csv(ida_output_path, index=False)
print(f"Saved {len(df_long)} rows to cleaned_indicators.csv")

# Check IDA country names
ida_names = df_long['Country Name']
print("IDA Dataset - Unique Countries:")
print(sorted(ida_names.unique()))
print(f"Total: {df_long['Country Name'].nunique()} countries")

Saved 33690 rows to cleaned_indicators.csv
IDA Dataset - Unique Countries:
['Afghanistan', 'Bangladesh', 'Benin', 'Bhutan', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Central African Republic', 'Chad', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.', "Cote d'Ivoire", 'Djibouti', 'Dominica', 'East Asia & Pacific (IDA total)', 'Eritrea', 'Ethiopia', 'Europe & Central Asia (IDA total)', 'Fiji', 'Gambia', 'Ghana', 'Grenada', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'IDA blend', 'IDA countries classified as fragile situations', 'IDA countries not classified as fragile situations', 'IDA only', 'IDA total', 'Kenya', 'Kiribati', 'Kosovo', 'Kyrgyz Republic', 'Lao PDR', 'Latin America & Caribbean (IDA total)', 'Lesotho', 'Liberia', 'Madagascar', 'Malawi', 'Maldives', 'Mali', 'Marshall Islands', 'Mauritania', 'Micronesia, Fed. Sts.', 'Middle East & North Africa (IDA total)', 'Mozambique', 'Myanmar', 'Nepal', 'Nicaragua', 'Niger', 'Nigeria', 'Pakistan', 'Papua 

### Import World Bank Africa Electricity Infrastructure Data

In [13]:
# Read the Africa Infrastructure electricity extract from the World Bank folder
afr_source_path = os.path.join(
    wb_dir, "P_Data_Extract_From_Africa_Infrastructure_Electricity.xlsx"
)
afr_df = pd.read_excel(afr_source_path)

# Quick look at the size of the table and the raw country labels
sample_names = afr_df["Country Name"].head(10).tolist()
print(f"Original data shape: {afr_df.shape}")
print("Sample country names:")
print(sample_names)

Original data shape: (3125, 12)
Sample country names:
['001. Benin_SBEE', '001. Benin_SBEE', '001. Benin_SBEE', '001. Benin_SBEE', '001. Benin_SBEE', '001. Benin_SBEE', '001. Benin_SBEE', '001. Benin_SBEE', '001. Benin_SBEE', '001. Benin_SBEE']


In [14]:
# Remove metadata: rows without a label, and export footer/source lines
country_text = afr_df["Country Name"].astype(str)
is_metadata = (
    country_text.str.contains("Data from database", na=False)
    | country_text.str.contains("Last Updated", na=False)
    | country_text.str.startswith("Source:", na=False)
)
afr_df_clean = afr_df[afr_df["Country Name"].notna() & ~is_metadata]

# Extract country names
def extract_country_name(country_str):
    """Pull the country out of a label such as "001. Benin_SBEE".

    Returns None for missing input. When the label has a ". " that precedes a
    "_", the text between them is returned, with three known alternative
    spellings replaced. Any other label is returned as stripped text.
    """
    if pd.isna(country_str):
        return None

    label = str(country_str).strip()

    # Spellings that must be aligned with the other datasets
    aliases = {
        'CongoRep': 'Congo, Rep.',
        'Ivory Coast': "Côte d'Ivoire",
        'Cape Verde': 'Cabo Verde',
    }

    dot_at = label.find(". ")
    underscore_at = label.find("_")

    # Without both separators there is nothing to extract
    if dot_at == -1 or underscore_at == -1:
        return label

    # The country sits right after ". " and must end before the first "_"
    name_start = dot_at + 2
    if name_start >= underscore_at:
        return label

    candidate = label[name_start:underscore_at].strip()
    return aliases.get(candidate, candidate)

# Derive the 'Country' column on a fresh copy, then drop rows without a country
extracted_names = afr_df_clean['Country Name'].apply(extract_country_name)
afr_df_clean = afr_df_clean.assign(Country=extracted_names)
afr_df_clean = afr_df_clean.dropna(subset=['Country'])

# Filter out regional aggregates
def is_actual_country(country_name):
    """Check if this is an actual country, not a regional aggregate.

    Parameters
    ----------
    country_name : object
        Label to test; missing values are accepted.

    Returns
    -------
    bool
        False for missing or blank labels, labels that start with a numeric
        prefix, and labels containing a regional/category keyword; True
        otherwise.
    """
    if pd.isna(country_name):
        return False

    country_str = str(country_name).strip()

    # A blank label cannot be a country
    if not country_str:
        return False

    # Remove entries that start with numbers (regional categories)
    if country_str[:3].replace('.', '').replace(' ', '').isdigit():
        return False

    # Descriptive phrases are compared case-insensitively, so both
    # "Middle Income" and "middle Income" are recognised.
    category_phrases = (
        'low income', 'middle income', 'resource rich',
        'power pool', 'countries with', 'predominantly',
    )
    lowered = country_str.lower()
    if any(phrase in lowered for phrase in category_phrases):
        return False

    # Acronyms stay case-sensitive: a case-insensitive "SSA" would wrongly
    # reject "Guinea-Bissau".
    regional_acronyms = ('SSA', 'CEMAC', 'COMESA', 'EAC', 'ECOWAS', 'SADC')
    if any(acronym in country_str for acronym in regional_acronyms):
        return False

    return True

# Keep only rows whose 'Country' passes the country check
country_mask = afr_df_clean['Country'].apply(is_actual_country)
afr_df_clean = afr_df_clean[country_mask]

# Columns to reshape: six yearly columns plus the separate "MRV2" column
year_columns = [str(year) for year in range(2004, 2010)]
mrv_columns = ["MRV2"]
id_vars = ["Country", "Series Name"]

# Yearly columns -> long format, values coerced to numbers
afr_df_years = pd.melt(
    afr_df_clean,
    id_vars=id_vars,
    value_vars=year_columns,
    var_name="Year",
    value_name="Value",
)
afr_df_years["Value"] = pd.to_numeric(afr_df_years["Value"], errors="coerce")

# "MRV2" column -> long format, values coerced to numbers
afr_df_mrv = pd.melt(
    afr_df_clean,
    id_vars=id_vars,
    value_vars=mrv_columns,
    var_name="Year",
    value_name="Value",
)
afr_df_mrv["Value"] = pd.to_numeric(afr_df_mrv["Value"], errors="coerce")

# Stack both pieces and discard rows without a usable value
afr_df_long = pd.concat([afr_df_years, afr_df_mrv], ignore_index=True).dropna(
    subset=["Value"]
)

# Function to determine aggregation method
def determine_agg_method(series_name):
    """Return "sum" when the indicator name contains a sum keyword, else "mean".

    The comparison ignores case and matches keywords as substrings.
    """
    lowered_name = series_name.lower()
    for keyword in (
        "customer", "revenue", "collection amount", "output",
        "capacity", "sales", "number", "total", "installed",
    ):
        if keyword in lowered_name:
            return "sum"
    return "mean"

# Group and aggregate data
# The sum/mean choice must depend on the indicator ("Series Name"). A function
# passed to .agg() receives each group's values as a Series named after the
# aggregated column ("Value"), not after the group key, so the rule cannot be
# evaluated inside the aggregation. Compute both statistics per group and pick
# the right one from the "Series Name" column afterwards.
group_keys = ["Country", "Series Name", "Year"]
value_stats = (
    afr_df_long.groupby(group_keys)["Value"]
    .agg(["sum", "mean", "size"])
    .reset_index()
)
use_sum = value_stats["Series Name"].map(determine_agg_method) == "sum"

agg_df = value_stats[group_keys].copy()
agg_df["Value"] = value_stats["sum"].where(use_sum, value_stats["mean"])

# Add observation count (number of rows behind each aggregated value)
agg_df["N_Observations"] = value_stats["size"]

# Clean up series names: drop parenthesised parts and surrounding whitespace
agg_df["Series Name"] = agg_df["Series Name"].str.replace(r"\(.*?\)", "", regex=True)
agg_df["Series Name"] = agg_df["Series Name"].str.strip()

# Save the cleaned data
agg_df.to_csv(
    os.path.join(clean_dir, "01_africa_electricity_infrastructure.csv"), index=False
)
print(f"\nSaved {len(agg_df)} rows to 01_africa_electricity_infrastructure.csv")

# Check Africa Electricity country names
print("\nAfrica Electricity Dataset - Unique Countries:")
print(sorted(agg_df['Country'].unique()))
print(f"Total: {agg_df['Country'].nunique()} countries")


Saved 2942 rows to 01_africa_electricity_infrastructure.csv

Africa Electricity Dataset - Unique Countries:
['Benin', 'Burkina Faso', 'Cabo Verde', 'Cameroon', 'Chad', 'Congo, Dem. Rep.', 'Congo, Rep.', "Cote d'Ivoire", 'Ethiopia', 'Ghana', 'Kenya', 'Lesotho', 'Madagascar', 'Malawi', 'Mali', 'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'Senegal', 'South Africa', 'Sudan', 'Tanzania', 'Uganda', 'Zambia']
Total: 26 countries


### Import Energy Institute Data

In [15]:
def get_african_countries():
    """Build and return the set of country names treated as African.

    Some countries appear under more than one spelling (for example "Congo"
    and "Congo, Rep.", or "Egypt" and "Egypt, Arab Rep.").
    """
    names = (
        "Algeria", "Angola",
        "Benin", "Botswana", "Burkina Faso", "Burundi",
        "Cameroon", "Cape Verde", "Central African Republic", "Chad",
        "Comoros", "Congo", "Congo, Rep.", "Congo, Dem. Rep.", "Côte d'Ivoire",
        "Djibouti",
        "Egypt", "Egypt, Arab Rep.", "Equatorial Guinea", "Eritrea",
        "Eswatini", "Ethiopia",
        "Gabon", "Gambia", "Ghana", "Guinea", "Guinea-Bissau",
        "Kenya",
        "Lesotho", "Liberia", "Libya",
        "Madagascar", "Malawi", "Mali", "Mauritania", "Mauritius",
        "Morocco", "Mozambique",
        "Namibia", "Niger", "Nigeria",
        "Rwanda",
        "São Tomé and Príncipe", "Senegal", "Seychelles", "Sierra Leone",
        "Somalia", "South Africa", "South Sudan", "Sudan",
        "Tanzania", "Togo", "Tunisia",
        "Uganda",
        "Zambia", "Zimbabwe",
    )
    return set(names)


def clean_energy_sheet(sheet_data, sheet_name, countries=None):
    """Clean one Energy Institute sheet and keep only the requested countries.

    Parameters
    ----------
    sheet_data : pandas.DataFrame
        Raw sheet content; the row holding the year labels is searched for
        among its first rows.
    sheet_name : str
        Indicator label written to the "Series Name" column of the result.
    countries : collection of str, optional
        Country names to keep. Defaults to ``get_african_countries()``.

    Returns
    -------
    pandas.DataFrame
        Long-format table with columns "Country", "Series Name", "Year" and
        "Value", without rows whose value is missing. An empty DataFrame is
        returned when no header row, country column, year column or matching
        country is found.
    """
    first_year, last_year = 1960, 2030  # accepted range of year labels
    header_search_rows = 5  # only the first rows are scanned for the header
    min_year_cells = 3  # a header row needs at least this many year labels

    def parse_int_label(label):
        """Return 2005 for 2005, "2005", 2005.0 or "2005.0"; otherwise None."""
        text = str(label).strip()
        # Only strip a trailing ".0". Removing ".0" anywhere would turn a data
        # value such as 200.05 into the year-like text "2005".
        if text.endswith(".0"):
            text = text[:-2]
        return int(text) if text.isdecimal() else None

    def is_year(label):
        """True when the label is a whole number inside the accepted range."""
        number = parse_int_label(label)
        return number is not None and first_year <= number <= last_year

    df = sheet_data.copy()

    # Find header row by looking for years in the data
    header_row = next(
        (
            i
            for i in range(min(header_search_rows, len(df)))
            if sum(is_year(cell) for cell in df.iloc[i]) >= min_year_cells
        ),
        None,
    )
    if header_row is None:
        return pd.DataFrame()

    # Promote the header row to column labels and keep the rows below it
    df.columns = df.iloc[header_row]
    df = df.iloc[header_row + 1 :]

    # The country column is the first one whose label is not a whole number
    country_col = next(
        (col for col in df.columns if parse_int_label(col) is None), None
    )
    if country_col is None:
        return pd.DataFrame()

    # Year columns are those whose label is a year in the accepted range
    year_cols = [col for col in df.columns if is_year(col)]
    if not year_cols:
        return pd.DataFrame()

    # Select country + year columns
    df = df[[country_col] + year_cols]
    df = df.rename(columns={country_col: "Country"})

    # Clean country names and filter for the requested countries
    df["Country"] = df["Country"].astype(str).str.strip()
    if countries is None:
        countries = get_african_countries()
    df_filtered = df[df["Country"].isin(countries)]

    if df_filtered.empty:
        return pd.DataFrame()

    # Convert to long format
    df_long = df_filtered.melt(
        id_vars=["Country"], value_vars=year_cols, var_name="Year", value_name="Value"
    )

    # Placeholders such as "..", "-" and "" are not numeric, so coercion
    # turns them (and any other non-numeric text) into NaN.
    df_long["Value"] = pd.to_numeric(df_long["Value"], errors="coerce")

    # Year labels all passed is_year above, so each one parses to an int
    df_long["Year"] = df_long["Year"].map(parse_int_label)

    df_long["Series Name"] = sheet_name

    # Remove missing values
    df_long = df_long.dropna(subset=["Value"])

    return df_long[["Country", "Series Name", "Year", "Value"]]

In [16]:
def process_energy_institute_data(file_path):
    """Load the Energy Institute workbook and combine the cleaned sheets.

    Every sheet of the workbook at ``file_path`` is read. Each sheet listed
    below that is present is cleaned with ``clean_energy_sheet`` under its
    indicator label. Non-empty results are concatenated, exact duplicate rows
    are dropped and a short summary is printed. An empty DataFrame is returned
    when no sheet yields data.
    """
    # Workbook sheet name -> indicator label used in "Series Name"
    indicator_by_sheet = {
        "Oil Consumption - barrels": "Oil Consumption (thousand barrels daily)",
        "Oil Production - barrels": "Oil Production (thousand barrels daily)",
        "Gas Consumption - Bcm": "Natural Gas Consumption (Bcm)",
        "Gas Production - Bcm": "Natural Gas Production (Bcm)",
        "Primary energy cons - EJ": "Primary Energy Consumption (EJ)",
        "Carbon Dioxide from Energy": "CO2 Emissions from Energy (Mt)",
        "Electricity Generation - TWh": "Electricity Generation (TWh)",
        "Nuclear Generation - TWh": "Nuclear Generation (TWh)",
        "Hydro Generation - TWh": "Hydro Generation (TWh)",
        "Solar Generation - TWh": "Solar Generation (TWh)",
        "Wind Generation - TWh": "Wind Generation (TWh)",
        "Coal Production - EJ": "Coal Production (EJ)",
        "Coal Consumption - EJ": "Coal Consumption (EJ)",
    }

    # sheet_name=None loads every sheet into a dict keyed by sheet name
    workbook = pd.read_excel(file_path, sheet_name=None)

    frames = []
    for sheet_key in indicator_by_sheet:
        # Sheets missing from the workbook are skipped
        if sheet_key not in workbook:
            continue
        cleaned = clean_energy_sheet(workbook[sheet_key], indicator_by_sheet[sheet_key])
        if cleaned.empty:
            continue
        frames.append(cleaned)

    # Nothing usable was found in any sheet
    if not frames:
        return pd.DataFrame()

    # Combine all data and remove duplicates
    final_df = pd.concat(frames, ignore_index=True).drop_duplicates()

    print(
        f"Final result: {len(final_df)} records for {final_df['Country'].nunique()} African countries"
    )
    print(f"Indicators: {final_df['Series Name'].nunique()}")
    print(
        f"Year range: {final_df['Year'].min():.0f} - {final_df['Year'].max():.0f}"
    )

    return final_df

# Apply function to EI data
# Build the path from ei_dir, like the other loaders build theirs from wb_dir,
# so the file is located through the configured data directory instead of a
# separate path relative to the current working directory.
ei_data = process_energy_institute_data(
    os.path.join(ei_dir, "Statistical Review of World Energy Data.xlsx")
)
ei_data = standardize_country_names(ei_data, "Country")
ei_data.to_csv(os.path.join(clean_dir, "02_energy_institute_africa.csv"), index=False)

Final result: 3339 records for 14 African countries
Indicators: 13
Year range: 1965 - 2023


In [17]:
# Compare country names across all datasets, using the cleaned CSV files
ida_clean = pd.read_csv(os.path.join(clean_dir, "00_cleaned_indicators.csv"))
africa_clean = pd.read_csv(
    os.path.join(clean_dir, "01_africa_electricity_infrastructure.csv")
)
energy_clean = pd.read_csv(os.path.join(clean_dir, "02_energy_institute_africa.csv"))

ida_countries = set(ida_clean['Country Name'].unique())
africa_countries = set(africa_clean['Country'].unique())
energy_countries = set(energy_clean['Country'].unique())

# Number of distinct names per dataset
print("COUNTRY NAME COMPARISON")
print(f"IDA countries: {len(ida_countries)}")
print(f"Africa Electricity countries: {len(africa_countries)}")
print(f"Energy Institute countries: {len(energy_countries)}")

# Names present in all three datasets
print("\nCOUNTRIES IN ALL DATASETS")
common_countries = ida_countries.intersection(africa_countries, energy_countries)
print(sorted(common_countries))
print(f"Total overlap: {len(common_countries)} countries")

# Names that appear in exactly one dataset
print("\nCOUNTRIES ONLY IN IDA")
print(sorted(ida_countries.difference(africa_countries, energy_countries)))

print("\nONLY IN AFRICA ELECTRICITY")
print(sorted(africa_countries.difference(ida_countries, energy_countries)))

print("\nONLY IN ENERGY INSTITUTE")
print(sorted(energy_countries.difference(ida_countries, africa_countries)))

COUNTRY NAME COMPARISON
IDA countries: 85
Africa Electricity countries: 26
Energy Institute countries: 14

COUNTRIES IN ALL DATASETS
['Chad', 'Nigeria', 'Sudan']
Total overlap: 3 countries

COUNTRIES ONLY IN IDA
['Afghanistan', 'Bangladesh', 'Bhutan', 'Burundi', 'Cambodia', 'Central African Republic', 'Comoros', 'Djibouti', 'Dominica', 'East Asia & Pacific (IDA total)', 'Eritrea', 'Europe & Central Asia (IDA total)', 'Fiji', 'Gambia', 'Grenada', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'IDA blend', 'IDA countries classified as fragile situations', 'IDA countries not classified as fragile situations', 'IDA only', 'IDA total', 'Kiribati', 'Kosovo', 'Kyrgyz Republic', 'Lao PDR', 'Latin America & Caribbean (IDA total)', 'Liberia', 'Maldives', 'Marshall Islands', 'Mauritania', 'Micronesia, Fed. Sts.', 'Middle East & North Africa (IDA total)', 'Myanmar', 'Nepal', 'Nicaragua', 'Pakistan', 'Papua New Guinea', 'Samoa', 'Sierra Leone', 'Solomon Islands', 'Somalia', 'South Asia (