In [24]:
import pandas as pd
from pandas import Series, DataFrame
import string
import country_converter as coco
import os

Original data sources
- https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html
- Statistical Review of World Energy - all data, 1965-2021
- https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/xlsx/energy-economics/statistical-review/bp-stats-review-2022-all-data.xlsx 


Find file paths

In [25]:
## Report where the notebook is being run from
notebook_dir = os.getcwd()
print("Current working directory:", notebook_dir)

## All data folders sit one level above the working directory
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))

## Folder holding the raw input files
raw_data_path = os.path.join(project_root, 'raw_data')

## Folder where the cleaned csv is saved
cleaned_data_path = os.path.join(project_root, 'cleaned_data')

## Folder for inflation data
inflation_data_path = os.path.join(project_root, 'inflation')

Current working directory: /Users/jennagreene/Documents/GitHub/HATCH_data/reading_files


In [26]:
# Build the path to the BP Excel workbook inside the raw_data folder.
# Nothing is read here; the sheet-loading functions below open this path.
target_file = 'bp-stats-review-2022-all-data.xlsx'
target_file_path = os.path.join(raw_data_path, target_file)

Define function

In [27]:
def read_bp(sheet):
    """Read one sheet of the BP workbook and return a cleaned, labelled table.

    The sheet is read from ``target_file_path`` using the third spreadsheet
    row as the header and the first column as the row labels. Cells holding
    '-', '^' or '♦' are treated as missing values.

    Processing steps:
      1. Keep only the columns whose header is a plain ``int`` (presumably
         the year columns - confirm against the workbook).
      2. Drop rows without any data.
      3. Drop aggregate rows (labels containing 'Total', 'Other', 'OECD',
         ...), keeping only the exact label 'Total World'.
      4. Add 'Country Name', 'Unit', 'Data Source', 'Spatial Scale' and
         'Country Code' columns.
      5. Rename 'Total World' to 'World' and standardise the unit names.

    Parameters
    ----------
    sheet : str
        Worksheet name, passed to ``pandas.read_excel`` as ``sheet_name``.

    Returns
    -------
    pandas.DataFrame
        One row per retained label, with the int-labelled columns followed by
        the descriptive columns, on a fresh 0..n-1 index.
    """
    # Read data from Excel file into DataFrame
    df = pd.read_excel(target_file_path,
                       sheet_name=sheet, header=2,
                       index_col=0, na_values=['-', '^', '♦'])

    # Remove non-integer columns, then rows that are entirely empty
    non_int_cols = [col for col in df.columns if type(col) is not int]
    df = df.drop(columns=non_int_cols).dropna(how='all')

    # Substrings that mark a row as a regional/aggregate total
    aggregate_markers = ('Total', 'Rest of World', 'Other', 'European Union',
                         'OECD', 'Central America', 'Eastern Africa',
                         'Middle Africa', 'Western Africa', 'OPEC')

    def is_aggregate(label):
        # 'Total World' is the one aggregate that is kept (as the global row)
        return label != 'Total World' and any(m in label for m in aggregate_markers)

    # Filter once, by position. Dropping label-by-label inside the loop raised
    # KeyError when a label matched two markers or occurred twice in the index.
    keep_positions = [pos for pos, label in enumerate(df.index)
                      if not is_aggregate(label)]
    df = df.iloc[keep_positions].copy()

    # Extract country name from index (digits stripped from both ends)
    df['Country Name'] = [label.strip(string.digits) for label in df.index]

    # Extract unit from index name
    df['Unit'] = df.index.name.rstrip('*').strip(string.digits)

    # Add data source, spatial scale and country code
    df['Data Source'] = 'BP'
    # Labels given a fixed code instead of being passed to country_converter
    fixed_codes = {'USSR': 'SU', 'Netherlands Antilles': 'AN', 'Total World': 'World'}
    df['Spatial Scale'] = ['Global' if label == 'Total World' else 'National'
                           for label in df.index]
    df['Country Code'] = [fixed_codes[label] if label in fixed_codes
                          else coco.convert(names=label, to='iso2')
                          for label in df.index]

    # Only the name column can hold 'Total World'; no need to scan the whole frame
    df['Country Name'] = df['Country Name'].replace('Total World', 'World')

    # Replace units with standardized names (exact matches only)
    df['Unit'] = df['Unit'].replace({
        'Petajoules': 'petajoules', 'Million tonnes': 'million metric tons',
        'Million tonnes ': 'million metric tons', 'million tonnes ': 'million metric tons',
        'Thousand tonnes': 'thousand metric tons', 'Tonnes': 'metric tons',
        'Terawatt-hours': 'TWh', 'Billion cubic metres': 'billion cubic metres',
        'Thousand barrels daily': 'thousand barrels/day'})

    return df.reset_index(drop=True)

In [28]:
def tech_name(df, tech):
    """Label ``df`` with a technology name and index it by a composite ID.

    Adds a 'Technology Name' column filled with ``tech``, then builds an 'ID'
    of the form ``<Technology Name>_<Metric>_<Country Code>`` from the existing
    'Metric' and 'Country Code' columns and makes it the index.

    The frame is modified in place and the same object is returned.
    """
    df['Technology Name'] = tech

    # Assemble the ID from its three parts, joined with underscores
    technology, metric, code = df['Technology Name'], df['Metric'], df['Country Code']
    row_ids = technology + '_' + metric + '_' + code
    df['ID'] = row_ids

    # Promote the ID column to the index
    df.set_index('ID', inplace=True)
    return df

In [29]:
# Accumulator for the per-technology DataFrames built in the cells below
bp = list()

Oil Production

In [30]:
# Load the 'Oil Production - Tonnes' sheet, tag the metric, then build the IDs
bp_oil_production = tech_name(
    read_bp('Oil Production - Tonnes').assign(Metric='Annual Production'),
    'Oil Production',
)
bp.append(bp_oil_production)

Oil refining

In [31]:
# Load the 'Oil - Refining capacity' sheet, tag the metric, then build the IDs
bp_oil_refining = tech_name(
    read_bp('Oil - Refining capacity').assign(Metric='Total Capacity'),
    'Oil Refining Capacity',
)
bp.append(bp_oil_refining)

Gas

In [32]:
# Load the 'Gas Production - Bcm' sheet, tag the metric, then build the IDs
bp_gas = tech_name(
    read_bp('Gas Production - Bcm').assign(Metric='Annual Production'),
    'Natural Gas Production',
)
bp.append(bp_gas)

Coal

In [33]:
# Load the 'Coal Production - Tonnes' sheet, tag the metric, then build the IDs
bp_coal = tech_name(
    read_bp('Coal Production - Tonnes').assign(Metric='Annual Production'),
    'Coal Production',
)
bp.append(bp_coal)

Nuclear

In [34]:
# Load the 'Nuclear Generation - TWh' sheet, tag the metric, then build the IDs
bp_nuclear = tech_name(
    read_bp('Nuclear Generation - TWh').assign(Metric='Annual Production'),
    'Nuclear Energy',
)
bp.append(bp_nuclear)

Hydro

In [35]:
# Load the 'Hydro Generation - TWh' sheet, tag the metric, then build the IDs
bp_hydro = tech_name(
    read_bp('Hydro Generation - TWh').assign(Metric='Annual Production'),
    'Hydroelectricity',
)
bp.append(bp_hydro)

Renewables

In [36]:
# Load the 'Renewable power - TWh' sheet, tag the metric, then build the IDs
bp_renewable = tech_name(
    read_bp('Renewable power - TWh').assign(Metric='Annual Production'),
    'Renewable Power',
)
bp.append(bp_renewable)

Electricity generation

In [37]:
# Load the 'Electricity Generation' sheet, tag the metric, then build the IDs
bp_electricity_gen = tech_name(
    read_bp('Electricity Generation').assign(Metric='Annual Production'),
    'Electricity',
)
bp.append(bp_electricity_gen)

In [38]:
# bp_lithium = read_bp('Lithium Production-Reserves')
# bp_lithium['Metric'] = 'Annual Production'
# bp_lithium = tech_name(bp_lithium, 'Lithium Mine Production')
# bp.append(bp_lithium)

In [39]:
# bp_cobalt = read_bp('Cobalt Production-Reserves')
# bp_cobalt['Metric'] = 'Annual Production'
# bp_cobalt = tech_name(bp_cobalt, 'Cobalt Mine Production')
# bp.append(bp_cobalt)

In [40]:
# bp_rare_earth = read_bp('Rare Earth Production-Reserves')
# bp_rare_earth['Metric'] = 'Annual Production'
# bp_rare_earth = tech_name(bp_rare_earth, 'Rare Earth Mine Production')
# bp.append(bp_rare_earth)

In [41]:
# bp_graphite = read_bp('Graphite Production-Reserves')
# bp_graphite['Metric'] = 'Annual Production'
# bp_graphite = tech_name(bp_graphite, 'Graphite Mine Production')
# bp.append(bp_graphite)

In [42]:
def read_bp2(sheet, skipfooter=40):
    """Read one sheet of the BP workbook, ignoring rows at the bottom.

    Works like a national-only sheet loader: the sheet is read from
    ``target_file_path`` using the third spreadsheet row as the header and the
    first column as the row labels, with '-', '^' and '♦' treated as missing.
    Every aggregate row is dropped (including 'Total World', since it contains
    'Total'), so all returned rows are labelled 'National'.

    Parameters
    ----------
    sheet : str
        Worksheet name, passed to ``pandas.read_excel`` as ``sheet_name``.
    skipfooter : int, optional
        Number of rows at the bottom of the sheet to ignore, passed to
        ``pandas.read_excel``. Defaults to 40, the value previously
        hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        One row per retained label, with the int-labelled columns followed by
        'Country Name', 'Unit', 'Data Source', 'Spatial Scale' and
        'Country Code', on a fresh 0..n-1 index.
    """
    # Define list of missing values to be treated as NaN
    missing_values = ['-', '^', '♦']

    # Read data from Excel file into DataFrame
    df = pd.read_excel(target_file_path, sheet_name=sheet, header=2, index_col=0,
                       na_values=missing_values, skipfooter=skipfooter)

    # Remove non-integer columns, then rows that are entirely empty
    non_int_cols = [col for col in df.columns if type(col) is not int]
    df = df.drop(columns=non_int_cols).dropna(how='all')

    # Substrings that mark a row as a regional/aggregate total
    aggregate_markers = ('Total', 'Rest of World', 'Other', 'European Union',
                         'OECD', 'Central America', 'Eastern Africa',
                         'Middle Africa', 'Western Africa', 'OPEC')

    # Filter once, by position. Dropping label-by-label inside the loop raised
    # KeyError when a label matched two markers or occurred twice in the index.
    keep_positions = [pos for pos, label in enumerate(df.index)
                      if not any(m in label for m in aggregate_markers)]
    df = df.iloc[keep_positions].copy()

    # Extract country name from index (digits stripped from both ends)
    df['Country Name'] = [label.strip(string.digits) for label in df.index]

    # Extract unit from index name
    df['Unit'] = df.index.name.rstrip('*').strip(string.digits)

    # Add data source and spatial scale
    df['Data Source'] = 'BP'
    df['Spatial Scale'] = 'National'

    # Map country names to ISO2 country codes; these two labels get a fixed
    # code instead of being passed to country_converter
    fixed_codes = {'USSR': 'SU', 'Netherlands Antilles': 'AN'}
    df['Country Code'] = [fixed_codes[label] if label in fixed_codes
                          else coco.convert(names=label, to='iso2')
                          for label in df.index]

    # Replace units with standardized names (exact matches only)
    df['Unit'] = df['Unit'].replace({
        'Petajoules': 'petajoules', 'Million tonnes': 'million metric tons',
        'Million tonnes ': 'million metric tons', 'million tonnes ': 'million metric tons',
        'Thousand tonnes': 'thousand metric tons', 'Tonnes': 'metric tons',
        'Terawatt-hours': 'TWh', 'Billion cubic metres': 'billion cubic metres',
        'Thousand barrels daily': 'thousand barrels/day'})

    # Return the processed DataFrame on a fresh index
    return df.reset_index(drop=True)

Biofuels

In [43]:
# Load the 'Biofuels production - PJ' sheet, tag the metric, then build the IDs
bp_biofuels = tech_name(
    read_bp2('Biofuels production - PJ').assign(Metric='Annual Production'),
    'Biofuels Production',
)
bp.append(bp_biofuels)

Combine all data 

In [44]:
# Stack the per-technology frames into a single table.
# NOTE: this rebinds `bp` from the list of frames to one DataFrame, so the
# cell cannot be re-run on its own without first rebuilding the list above.
bp = pd.concat(bp)

In [45]:
# The data columns are the ones labelled with a plain int; every other
# column name is a string added during cleaning.
year_columns = [col for col in bp.columns if type(col) is int]

# Drop rows that have no value in any int-labelled column. This is done per
# row rather than per index label, so an empty row is removed even when
# another row shares its ID.
bp.dropna(how='all', subset=year_columns, inplace=True)

Save file

In [46]:
output_file = 'bp.csv'
output_file_path = os.path.join(cleaned_data_path, output_file)

# Make sure the output folder exists; to_csv does not create missing folders
os.makedirs(cleaned_data_path, exist_ok=True)

bp.to_csv(output_file_path)
print("Data saved to:", output_file_path)


Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/cleaned_data/bp.csv
