# Energy Data Preprocessing

In [36]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from scipy.stats.stats import pearsonr

## Load Data

In [37]:
def prepare_data(df, column_name):
    """
    Reshape one wide BP statistics sheet into a long (country, year, value) table.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with a "country" column and one column per year. Any
        additional columns whose label is not exactly four characters long
        are discarded.
    column_name : str
        Name given to the value column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns "country", "year" and `column_name`, with "year" and
        `column_name` converted to numeric dtypes. The input frame is not
        modified.

    Raises
    ------
    ValueError
        Propagated from `pd.to_numeric` if a remaining year label or value
        cannot be parsed as a number.
    """
    country = df["country"]
    # Drop rows without a country label and rows whose label contains "Total"
    # or "OECD". na=False makes the handling of missing labels explicit
    # instead of relying on NaN propagation through the boolean operators.
    keep = country.notna() & ~country.str.contains("Total|OECD", na=False)

    tidy = (
        df.loc[keep]
        .set_index("country")
        .stack()
        # The legacy stack() drops missing cells but the newer implementation
        # keeps them; dropping explicitly gives the same result on both.
        .dropna()
        .rename(column_name)
        # Name the index levels directly rather than depending on the
        # auto-generated "level_1" label, which only appears when the column
        # axis is unnamed.
        .rename_axis(index=["country", "year"])
        .reset_index()
    )

    # Keep only labels that are four characters long (the year columns).
    # .copy() gives an independent frame so the assignments below neither
    # warn nor write into a slice.
    tidy = tidy.loc[tidy["year"].astype(str).str.len() == 4].copy()

    # Plain column assignment replaces the column and therefore its dtype;
    # `df.loc[:, col] = ...` sets values in place on pandas >= 2.0 and would
    # leave both columns as object dtype.
    tidy["year"] = pd.to_numeric(tidy["year"])
    tidy[column_name] = pd.to_numeric(tidy[column_name])
    return tidy

#### CO2

Units: Million tonnes of CO2

In [38]:
# Read the "Carbon Dioxide Emissions" sheet (header=2: column names come from
# the third row), rename the column headed by the unit label to "country",
# and reshape to long form with the values stored under "co2".
co2 = prepare_data(
    pd.read_excel(
        "../../data/usage/bp-stats-review-2019-all-data.xlsx",
        sheet_name="Carbon Dioxide Emissions",
        header=2,
    ).rename(columns={"Million tonnes of carbon dioxide": "country"}),
    "co2",
)

#### Total Energy

Units: Mtoe (million tonnes oil equivalent)

In [39]:
# Read the "Primary Energy Consumption" sheet (header=2: column names come
# from the third row), rename the column headed by the unit label to
# "country", and reshape to long form under "energy_consumption".
energy = prepare_data(
    pd.read_excel(
        "../../data/usage/bp-stats-review-2019-all-data.xlsx",
        sheet_name="Primary Energy Consumption",
        header=2,
    ).rename(columns={"Million tonnes oil equivalent": "country"}),
    "energy_consumption",
)

#### Total Renewables

Units: Mtoe

In [40]:
# Read the "Renewables - Mtoe" sheet (header=2: column names come from the
# third row), rename the column headed by the unit label to "country", and
# reshape to long form under "renewable_generation".
renew = prepare_data(
    pd.read_excel(
        "../../data/usage/bp-stats-review-2019-all-data.xlsx",
        sheet_name="Renewables - Mtoe",
        header=2,
    ).rename(columns={"Million tonnes oil equivalent": "country"}),
    "renewable_generation",
)

#### Solar

Units: TWh

In [41]:
# Read the "Solar Generation - TWh" sheet (header=2: column names come from
# the third row), rename the column headed by the unit label to "country",
# and reshape to long form under "solar_generation".
solar = prepare_data(
    pd.read_excel(
        "../../data/usage/bp-stats-review-2019-all-data.xlsx",
        sheet_name="Solar Generation - TWh",
        header=2,
    ).rename(columns={"Terawatt-hours": "country"}),
    "solar_generation",
)

#### Wind

Units: TWh

In [42]:
# Read the wind generation sheet (header=2: column names come from the third
# row), rename the column headed by the unit label to "country", and reshape
# to long form under "wind_generation".
# NOTE: the sheet name is passed with its trailing space exactly as written.
wind = prepare_data(
    pd.read_excel(
        "../../data/usage/bp-stats-review-2019-all-data.xlsx",
        sheet_name="Wind Generation - TWh ",
        header=2,
    ).rename(columns={"Terawatt-hours": "country"}),
    "wind_generation",
)

#### Hydroelectricity

Units: TWh

In [43]:
# Read the "Hydro Generation - TWh" sheet (header=2: column names come from
# the third row), rename the column headed by the unit label to "country",
# and reshape to long form under "hydro_generation".
hydro = prepare_data(
    pd.read_excel(
        "../../data/usage/bp-stats-review-2019-all-data.xlsx",
        sheet_name="Hydro Generation - TWh",
        header=2,
    ).rename(columns={"Terawatt-hours": "country"}),
    "hydro_generation",
)

#### Geothermal, biomass, other

Units: TWh

In [44]:
# Read the "Geo Biomass Other - TWh" sheet (header=2: column names come from
# the third row), rename the column headed by the unit label to "country",
# and reshape to long form under "geo_bio_other_generation".
other = prepare_data(
    pd.read_excel(
        "../../data/usage/bp-stats-review-2019-all-data.xlsx",
        sheet_name="Geo Biomass Other - TWh",
        header=2,
    ).rename(columns={"Terawatt-hours": "country"}),
    "geo_bio_other_generation",
)

## Combine

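1 Mtoe ≈ 4.4 TWh of electricity generated (BP's approximate conversion factor for a modern power station; the direct thermal equivalent is 1 Mtoe = 11.63 TWh)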

In [61]:
# Left-join every other series onto the CO2 table by (country, year).
# Because each join is a left join starting from co2, only (country, year)
# pairs present in co2 are kept; series missing a pair contribute NaN.
df = (
    co2
    .pipe(pd.merge, energy, on=["country", "year"], how="left")
    .pipe(pd.merge, renew, on=["country", "year"], how="left")
    .pipe(pd.merge, solar, on=["country", "year"], how="left")
    .pipe(pd.merge, wind, on=["country", "year"], how="left")
    .pipe(pd.merge, hydro, on=["country", "year"], how="left")
    .pipe(pd.merge, other, on=["country", "year"], how="left")
)

In [62]:
# Preview the first five rows of the merged table.
df.head(n=5)

Unnamed: 0,country,year,co2,energy_consumption,renewable_generation,solar_generation,wind_generation,hydro_generation,geo_bio_other_generation
0,Canada,1965,259.855545,115.910849,0.0,0.0,0.0,117.122939,0.0
1,Canada,1966,271.067865,122.991657,0.0,0.0,0.0,128.821091,0.0
2,Canada,1967,284.90544,129.026541,0.0,0.0,0.0,133.125586,0.0
3,Canada,1968,307.276993,137.695457,0.0,0.0,0.0,136.320812,0.0
4,Canada,1969,319.22662,145.303023,0.0,0.0,0.0,148.226676,0.0
