In [1]:
from attribute_table import AttributeTable
import datetime
import importlib
import matplotlib.pyplot as plt
import model_attributes as ma
import model_afolu as mafl
import model_ippu as mi
import model_circular_economy as mc
import model_energy as me
import model_electricity as ml
import model_socioeconomic as se
from model_socioeconomic import Socioeconomic
import numpy as np
import os, os.path
import pandas as pd
import re
import setup_analysis as sa
import support_classes as sc
import support_functions as sf
import time
from typing import *
import warnings

# Re-import the project modules so that edits made on disk are picked up
# without restarting the kernel. Note that support_classes (sc) is imported
# above but is not reloaded here.
importlib.reload(ma)
importlib.reload(sa)
importlib.reload(sf)
importlib.reload(mafl)
importlib.reload(mc)
importlib.reload(mi)
importlib.reload(me)
importlib.reload(se)
importlib.reload(ml)

# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation and chained-assignment warnings raised by the
# cells below.
warnings.filterwarnings("ignore")




In [2]:
##  IMPORT SOME ATTRIBUTES, MODELS, AND SHARED VARIABLES

# attribute tables used throughout the notebook (fuels, regions, technologies,
# time periods, time slices)
# NOTE(review): the region table is looked up without the "dim_" prefix while
# the time period table uses it - confirm both keys exist in dict_attributes
attr_fuel = sa.model_attributes.get_attribute_table(f"{sa.model_attributes.subsec_name_enfu}")
attr_region = sa.model_attributes.dict_attributes.get(f"{sa.model_attributes.dim_region}")
attr_technology = sa.model_attributes.get_attribute_table(f"{sa.model_attributes.subsec_name_entc}")
attr_time_period = sa.model_attributes.dict_attributes.get(f"dim_{sa.model_attributes.dim_time_period}")
attr_time_slice = sa.model_attributes.dict_attributes.get(f"time_slice")

# support classes
time_periods = sc.TimePeriods(sa.model_attributes)
regions = sc.Regions(sa.model_attributes)

# set some fields (column names shared by the cells below)
field_country = "Country"
field_date_string = "date_string"
field_fraction_production = "fraction_production"
field_generation = "generation_gwh"
field_gwp = "max_generation_gwp"
field_iso = "iso_code3"
field_iso_region_attr = "iso_alpha_3"
field_key = "GHD_ID"
field_latitude = "latitude_population_centroid_2020"
field_longitude = "longitude_population_centroid_2020"
field_month = "month"
field_ndays = "n_days"
field_technology = "technology"
field_wb_global_region = "world_bank_global_region"
field_year = "year"

# map each region key to its upper-cased ISO alpha-3 code, then build the
# reverse map (ISO code -> region key) with sf.reverse_dict
dict_country_to_iso = dict((k, v.upper()) for k, v in attr_region.field_maps.get(f"{attr_region.key}_to_{field_iso_region_attr}").items())
dict_iso_to_country = sf.reverse_dict(dict_country_to_iso)
all_iso = list(dict_iso_to_country.keys())



# call variables from the electric model
# (Julia is not initialized; the models are only used here for their variable
# names)
model_elec = ml.ElectricEnergy(sa.model_attributes, sa.dir_jl, sa.dir_ref_nemo, initialize_julia = False)
model_energy = me.NonElectricEnergy(sa.model_attributes)

# Notebook Contents

This notebook includes several basic datasets:
- Basic energy costs (not the most current)
- Residual Capacities and assumed technology lifetimes
- Baseline Minimum Production Shares (MinShareProduction)
- Electricity Transmission Loss
- Fuel Prices

##  Energy Costs from EIA
- source: https://www.eia.gov/outlooks/aeo/assumptions/pdf/table_8.2.pdf

In [37]:
# Read EIA table 8.2 (see the link in the section header) and average the
# capital, variable, and fixed costs by technology category.
# NOTE(review): absolute, user-specific paths are hard-coded for both the
# input workbook and the output CSV.
fp_read = "/Users/jsyme/Documents/Projects/git_jbus/lac_decarbonization/ref/data_tables_and_derivations/ENERGY/eia_outlooks_table_8.2.xlsx"
df_eia = pd.read_excel(fp_read, skiprows = 1).rename(columns = {
    "Unnamed: 0": "tech", 
    "Unnamed: 1": "year_start",
    "Total Overnight Cost (2021$/kW)": "capital_cost",
    "Variable O&M (2021 $/MWh)": "variable_cost",
    "Fixed O&M 2021$/kW-y)": "fixed_cost"
})

# NOTE(review): "cat_technology" is not created by the rename above, so it is
# presumably a column already present in the workbook - verify
fields_group = ["cat_technology"]
fields_mean = [f"{x}_cost" for x in ["capital", "variable", "fixed"]]

# aggregation spec: keep the first value of each grouping field and take the
# mean of each cost field
dict_agg = dict(zip(fields_group, ["first" for x in fields_group]))
dict_agg.update(dict(zip(fields_mean, ["mean" for x in fields_mean])))

# the group index is discarded because the grouping field is carried through
# the aggregation as a regular column
df_eia = df_eia.groupby(fields_group).agg(dict_agg).reset_index(drop = True).sort_values(by = ["cat_technology"])
df_eia.to_csv("/Users/jsyme/Desktop/tmp.csv", index = None, encoding = "UTF-8")

##  Build Residual Capacity Data
- See inline source commenting
- WRI Global Power Plant database: https://datasets.wri.org/dataset/globalpowerplantdatabase
    - Global Energy Observatory, Google, KTH Royal Institute of Technology in Stockholm, Enipedia, World Resources Institute. 2018. Global Power Plant Database. Published on Resource Watch and Google Earth Engine; http://resourcewatch.org/ https://earthengine.google.com/
- Power plant-level data may be incomplete, so plant-level capacities are scaled up to aggregate statistics from the UN: http://data.un.org/Data.aspx?d=EDATA&f=cmID%3AEC
- Ocean (wave and tidal) rough lifetimes and efficiencies from 
    - Are Wave and Tidal Energy Plants New Green Technologies? Mélanie Douziech, Stefanie Hellweg, and Francesca Verones. Environmental Science & Technology 2016 50 (14), 7870-7878, DOI: 10.1021/acs.est.6b00156

In [525]:
#########################################################
###                                                   ###
###    BUILD NEMOMOD ResidualCapacity INITIAL DATA    ###
###                                                   ###
#########################################################

# get data (plant-level records from the global power plant database)
# NOTE(review): absolute, user-specific paths are hard-coded in this cell
#fp_data = "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/LAC_global_power_plant_database.csv"
fp_data = "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/Energy/global_power_plant_database_v_1_3/global_power_plant_database.csv"
df_data = pd.read_csv(fp_data)

# some cleaning of ISO codes
# (assign the result back: a chained `df[col].replace(..., inplace = True)`
# does not modify the frame when pandas copy-on-write is active)
df_data["country"] = df_data["country"].replace({"KOS": "XKX"})

##  integrate aggregate production from UN data to scale up Residual Capacities 
df_un_pp_agg = pd.read_csv("/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/Energy/UNdata_Export_20230307_234559434.csv")


# disabled: optionally restrict to plants that report an estimated generation
if False:
    df_data.dropna(
        how = "all", 
        subset = ["estimated_generation_gwh_2017", "estimated_generation_gwh_2016", "estimated_generation_gwh_2015", "estimated_generation_gwh_2014", "estimated_generation_gwh_2013"],
        inplace = True
    )

# assumed lifetimes (baseline) - add sources to attribute table
# NOTE: this dictionary is kept for its source references only; it is
# overwritten just below by the lifetimes stored in the technology attribute
# table
dict_lifetimes = {
    "Biomass": 25, # https://www.nrel.gov/analysis/tech-footprint.html
    "Other": 50, 
    "Gas": 25, # 22, but set to 25 https://www.eia.gov/todayinenergy/detail.php?id=34172
    "Hydro": 100, # https://www.nrel.gov/docs/fy04osti/34916.pdf
    "Oil": 40, 
    "Nuclear": 30, # https://www.iaea.org/sites/default/files/29402043133.pdf
    "Coal": 50, # https://www.nature.com/articles/s41467-019-12618-3
    "Solar": 30, # https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwiExIeGvL35AhVPKkQIHR1ABJMQFnoECBsQAw&url=https%3A%2F%2Fnews.energysage.com%2Fhow-long-do-solar-panels-last%2F&usg=AOvVaw0rJ8w3zaBIP4b83sJgsKcr
    "Wind": 20, # https://nepis.epa.gov/Exe/ZyNET.exe/P100IL8K.TXT?ZyActionD=ZyDocument&Client=EPA&Index=2011+Thru+2015&Docs=&Query=&Time=&EndTime=&SearchMethod=1&TocRestrict=n&Toc=&TocEntry=&QField=&QFieldYear=&QFieldMonth=&QFieldDay=&IntQFieldOp=0&ExtQFieldOp=0&XmlQuery=&File=D%3A%5Czyfiles%5CIndex%20Data%5C11thru15%5CTxt%5C00000010%5CP100IL8K.txt&User=ANONYMOUS&Password=anonymous&SortMethod=h%7C-&MaximumDocuments=1&FuzzyDegree=0&ImageQuality=r75g8/r75g8/x150y150g16/i425&Display=hpfr&DefSeekPage=x&SearchBack=ZyActionL&Back=ZyActionS&BackDesc=Results%20page&MaximumPages=1&ZyEntry=1&SeekPage=x&ZyPURL
    "Waste": 30, # https://www.pbs.org/newshour/science/is-burning-trash-a-good-way-to-dispose-of-it-waste-incineration-in-charts,
    "Geothermal": 30, # https://geothermal-energy-journal.springeropen.com/articles/10.1186/s40517-021-00183-2
    "Ocean": 34 # mean of 5 plants, from https://pubs.acs.org/doi/10.1021/acs.est.6b00156 ()
}

# real lifetimes are available here
# (presumably keyed by technology category, e.g. "pp_gas" - verify against the
# attribute table)
attr_entc = sa.model_attributes.get_attribute_table(sa.model_attributes.subsec_name_entc)
dict_lifetimes = attr_entc.field_maps.get("cat_technology_to_operational_life")


# TEMPORARY (20230424): DROP `OTHER` POWER PLANTS (AFFECTS ONE IN ALBANIA GLOBALLY) AND COGENERATION (UK and USA ONLY—SISEPUEDE NEEDS WORK TO IMP)
fuels_drop = ["Storage"]
fuels_try_before_drop = ["Cogeneration", "Other"]
# setup a dictionary to map some fuels in the database to SISEPUEDE fuels
dict_fuel_repls = {"Petcoke": "Coal"}

# FOR PURPOSES OF INITIAL STATES, SET PETCOKE TO COAL
df_data["primary_fuel"] = df_data["primary_fuel"].replace(dict_fuel_repls)


##  FOR OTHER POWER PLANTS, USE FIRST AVAILABLE NON-PRIMARY FUEL 

# setup regex for other fuel columns; assume fewer than 10 are specified
# (raw string so that `\d` is not parsed as an invalid string escape)
regex_other_fuel = re.compile(r"other_fuel(\d$)")

def get_other_fuel_from_other(
    row: pd.Series,
    dict_repl_fuel: Union[Dict[str, str], None] = None,
    fuels_drop: Union[List[str], None] = None,
    regex_fuel: re.Pattern = re.compile(r"other_fuel(\d$)")
) -> Union[str, None]:
    """
    Using a row from input data frame, return a fuel based on "other_fuel" if
        primary_fuel is invalid. The "other_fuel" fields are scanned in sorted
        field-name order and the first acceptable fuel is returned; a fuel is
        acceptable if it is a string and is either not in fuels_drop or is a
        key of dict_repl_fuel. Returns None if no field matches regex_fuel or
        if no acceptable fuel is found.
        
    
    Function Arguments
    ------------------
    - row: Pandas series representing a row from a data frame
    
    Keyword Arguments
    -----------------
    - dict_repl_fuel: dictionary of fuels to replace with other fuels. The
        replacement is applied to the fuel that is returned. If None, no
        replacements are made.
    - fuels_drop: optional list of fuels to drop (skip)
    - regex_fuel: regular expression used to define other_fuels in the row/df
    """
    # normalize optional arguments so None behaves like "nothing specified"
    dict_repl_fuel = dict_repl_fuel if isinstance(dict_repl_fuel, dict) else {}
    fuels_drop = fuels_drop if isinstance(fuels_drop, list) else []
    
    # use the regex passed by the caller, not a module-level global
    fields_other_fuel = sorted(
        x for x in row.index 
        if isinstance(x, str) and (regex_fuel.match(x) is not None)
    )
    
    for field in fields_other_fuel:
        fuel = row[field]
        
        # missing entries (NaN/None) are never valid fuels
        if not isinstance(fuel, str):
            continue
        
        if (fuel not in fuels_drop) or (fuel in dict_repl_fuel):
            return dict_repl_fuel.get(fuel, fuel)
    
    return None



# Plants whose primary fuel is one of fuels_try_before_drop get a replacement
# fuel from their "other_fuel" fields where one is available.
mask_try_before_drop = df_data["primary_fuel"].isin(fuels_try_before_drop).to_numpy()

if mask_try_before_drop.any():
    
    vec_new_pf = np.array(df_data["primary_fuel"])
    
    # row POSITIONS (not index labels) of the affected plants; these are used
    # below with .iloc and to index the numpy array
    inds = np.where(mask_try_before_drop)[0]
    
    for i in inds:
        
        fuel_new = get_other_fuel_from_other(
            df_data.iloc[i], 
            dict_repl_fuel = dict_fuel_repls,
            fuels_drop = fuels_drop + fuels_try_before_drop
        )
        
        if fuel_new is not None:
            vec_new_pf[i] = fuel_new
        elif df_data["name"].iloc[i] == "Sol":
            # special case: a plant named "Sol" with no usable secondary fuel
            # is treated as solar
            vec_new_pf[i] = "Solar"
     
    df_data["primary_fuel"] = vec_new_pf



# CONVERT TO FORMAT COMPATIBLE WITH SISEPUEDE

df_data["primary_fuel"] = df_data["primary_fuel"].replace(
    {
        "Hydro": "Hydropower",
        "Waste": "Waste Incineration",
        "Wave and Tidal": "Ocean"
    }
)

# drop any remaining instances of invalid fuels
df_data = df_data[
    ~df_data["primary_fuel"].isin(fuels_drop + fuels_try_before_drop)
].reset_index(drop = True)

# rename each fuel to a technology-style name: lower case, spaces replaced
# with underscores, and prefixed with "pp_"
all_fuel = list(set(df_data["primary_fuel"]))
dict_repl_fuel = dict(
    (fuel, "pp_" + fuel.lower().replace(" ", "_")) for fuel in all_fuel
)
df_data["primary_fuel"] = df_data["primary_fuel"].replace(dict_repl_fuel)



#############################################
#    FILL IN MISSING COMMISSIONING YEARS    #
#############################################

#
# TO FILL MISSING COMMISSION YEARS, GET MEAN COMMISSION YEAR FOR PLANTS BY TYPE
# - USE RANDOM NUMBERS WITH A SEED
# - NEED TO ESTIMATE WHEN EXISTING PLANTS GO OFFLINE
# - CAN IMPROVE WITH BETTER INFORMATION LATER
#

# add some really rough numbers for commissioning years for some plants where there are NO commissioning year data
dict_years_commission = {
    "pp_ocean": {
        # https://en.wikipedia.org/wiki/European_Marine_Energy_Centre
        #     "ANDRITZ HYDRO Hammerfest installed their 1MW HS1000 tidal energy converter in 2011"
        "Hammerfest (EMEC)": 2011,
        # https://en.wikipedia.org/wiki/European_Marine_Energy_Centre
        #     "The test site was officially opened by Scotland's First Minister in September 2007"
        "Fall of Warness Tidal Demonstrator (EMEC)": 2007,
        # https://en.wikipedia.org/wiki/Wave_Hub
        "Hayle Wave Hub (Test Site)": 2010,
        # https://www.nsenergybusiness.com/projects/meygen-tidal-power-project/
        #    Offshore installation works for the initial 6MW project was completed in October 2016, while the first electricity was exported to the grid in the month that followed
        "Inner Sound Phase 1A (MeyGen)": 2016,
    }
}

all_plants = list(set(df_data["primary_fuel"]))
dict_mean_commission_year_by_plant = {}
dict_mean_commission_year_by_plant_by_country = {}
dict_std_commission_year_by_plant = {}
dict_std_commission_year_by_plant_by_country = {}

# get global averages (mean rounded to an integer year; population std)
for plant in all_plants:

    df_tmp = df_data[df_data["primary_fuel"] == plant]   

    if len(df_tmp) == 0:
        continue

    years_commission = df_tmp["commissioning_year"].dropna().to_numpy()

    # no reported years for this plant type anywhere: use the rough
    # hand-entered years above, if there are any
    if len(years_commission) == 0:
        years_commission = np.array(
            list(dict_years_commission.get(plant, {}).values())
        )

    if len(years_commission) == 0:
        # previously failed with an opaque "cannot convert float NaN to
        # integer" error
        raise ValueError(
            f"No commissioning years are available for plant type '{plant}'; add entries to dict_years_commission."
        )

    yr_std_commission = np.std(years_commission)
    yr_mean_commission = int(np.round(np.mean(years_commission)))

    dict_mean_commission_year_by_plant.update({plant: yr_mean_commission})
    dict_std_commission_year_by_plant.update({plant: yr_std_commission})
            

##  GET MEANS BY COUNTRY

# group on a scalar key (not a one-element list) so that iteration yields the
# plain country code; pandas >= 2.0 yields 1-tuples for a list key, and the
# string lookups into the per-country dictionaries would then silently miss
df_data_grouped = df_data.groupby(field_country.lower())

for iso, df in df_data_grouped:

    dict_mean_commission_year_by_plant_by_country.update({iso: {}})
    dict_std_commission_year_by_plant_by_country.update({iso: {}})
    
    for plant in all_plants:
        
        df_tmp = df[df["primary_fuel"] == plant]   
        
        if len(df_tmp) == 0:
            continue
            
        years_commission = df_tmp["commissioning_year"].dropna().to_numpy()
        
        if len(years_commission) == 0:
            # no reported years in this country: fall back to global values
            yr_mean_commission = dict_mean_commission_year_by_plant.get(plant)
            yr_std_commission = dict_std_commission_year_by_plant.get(plant)
        
        else: 
            yr_std_commission = np.std(years_commission)
            yr_mean_commission = int(np.round(np.mean(years_commission)))
        
        dict_mean_commission_year_by_plant_by_country[iso].update({plant: yr_mean_commission})
        dict_std_commission_year_by_plant_by_country[iso].update({plant: yr_std_commission})
            
            
# initialize some components
countries_iso = sorted(set(df_data[field_country.lower()]))
df_years = pd.DataFrame({"year": range(1920, 2056)})
# sampled commissioning years are capped at this year
max_year_commission = 2020

# set a seed - I just chose 50 - and get some last-line numbers for sampling
np.random.seed(50)
commission_year_no_info = np.mean(df_data["commissioning_year"].dropna()).astype(int)
std_no_info = np.std(df_data["commissioning_year"].dropna()).astype(int)

# one wide frame (year x country rows, one column per plant type) per country;
# appended as built so the list never depends on names defined in other cells
df_out_total = []


for country_iso in countries_iso:
    
    df_tmp = df_data[df_data[field_country.lower()] == country_iso].copy().reset_index(drop = True)
    
    # fill missing commissioning years by sampling from a normal distribution
    # (df_tmp has a fresh 0..n-1 index, so labels and positions coincide)
    inds_na_commissions = df_tmp.index[df_tmp["commissioning_year"].isna()]
    
    for ind in inds_na_commissions:
        plant = str(df_tmp.loc[ind, "primary_fuel"])
        
        mu = dict_mean_commission_year_by_plant_by_country.get(country_iso)
        mu = mu.get(plant) if (mu is not None) else commission_year_no_info
        
        sd = dict_std_commission_year_by_plant_by_country.get(country_iso)
        sd = sd.get(plant) if (sd is not None) else std_no_info
        
        rand_yr = int(min(np.random.normal(mu, sd), max_year_commission))
        # single .loc assignment; a chained `df[col].iloc[ind] = ...` may
        # write to a temporary copy instead of df_tmp
        df_tmp.loc[ind, "commissioning_year"] = rand_yr
        

    # one frame per plant with a row for every year it is assumed to operate
    df_years_tmp = []
    
    for i in range(len(df_tmp)):
        plant = str(df_tmp["primary_fuel"].iloc[i])
        commission_year = int(df_tmp["commissioning_year"].iloc[i])
        lifetime = dict_lifetimes.get(plant)
        capacity = float(df_tmp["capacity_mw"].iloc[i])
        
        df_years_tmp.append(
            pd.DataFrame({
                "year": range(commission_year, commission_year + lifetime), 
                "capacity": capacity,
                "plant": plant
            })
        )
            
    # total operating capacity by year and plant type
    df_years_tmp = pd.concat(df_years_tmp, axis = 0)
    df_years_tmp = df_years_tmp.groupby(["year", "plant"]).agg({"year": "first", "plant": "first", "capacity": "sum"}).reset_index(drop = True)
    
    # restrict to the years in df_years and attach the country
    df_years_out = pd.merge(df_years.copy(), df_years_tmp, how = "left")
    df_years_out["capacity"] = df_years_out["capacity"].fillna(0)
    df_years_out = df_years_out.dropna(how = "any", subset = ["plant"]).sort_values(by = ["year", "plant"]).reset_index(drop = True)
    df_years_out[field_country.lower()] = dict_iso_to_country.get(country_iso)
    
    # long -> wide; keyword arguments are required by pandas >= 2.0
    df_years_out = (
        pd.pivot(
            df_years_out,
            index = ["year", field_country.lower()], 
            columns = ["plant"], 
            values = "capacity"
        )
        .reset_index()
        .dropna(subset = [field_country.lower()])
    )
    
    # rebuild column by column as a plain frame, filling year/plant gaps with 0
    df_out = pd.DataFrame()
    for k in df_years_out.columns:
        df_out[k] = df_years_out[k].copy().fillna(0.0)
        
    df_out_total.append(df_out)
    
# plant types missing from a country become 0 after concatenation
df_out_total = pd.concat(df_out_total, axis = 0).fillna(0)


##  FORMAT VARIABLES FOR INGESTION

# electricity model instance (Julia is not initialized); used for the name of
# the residual capacity model variable
model_elec = ml.ElectricEnergy(
    sa.model_attributes, 
    sa.dir_jl,
    sa.dir_ref_nemo,
    initialize_julia = False
)
modvar_residual_capacity = model_elec.modvar_entc_nemomod_residual_capacity

# technology categories that are present as columns, and the model variable
# field built for each of them
cols_available = df_out_total.columns
fields_rnm = [cat for cat in attr_entc.key_values if cat in cols_available]
fields_new = sa.model_attributes.build_varlist(
    "Energy Technology", 
    modvar_residual_capacity,
    restrict_to_category_values = fields_rnm
)
dict_rnm = dict(zip(fields_rnm, fields_new))

#
#  do units conversion (capacities above are read from a "capacity_mw" field)
#

units_target = sa.model_attributes.get_variable_characteristic(
    modvar_residual_capacity, 
    sa.model_attributes.varchar_str_unit_power
)
scalar = sa.model_attributes.get_power_equivalent("mw", units_target)

for field in fields_rnm:
    df_out_total[field] = df_out_total[field].to_numpy()*scalar


# rename to model variable fields, then order as index fields followed by the
# sorted data fields
df_out_total = df_out_total.rename(columns = dict_rnm)
fields_ind = [fld for fld in ("year", "country") if fld in df_out_total.columns]
fields_dat = sorted(fld for fld in df_out_total.columns if fld not in fields_ind)

df_out_total = df_out_total[fields_ind + fields_dat]




    

In [531]:
###############################################################
###                                                         ###
###    ADD UN AGGREGATE DATA TO SCALE UP WHERE NECESSARY    ###
###                                                         ###
###############################################################

# total plant-level capacity per row (sum across the residual capacity fields)
field_total_capacity = f"total_capacity_{units_target}"
df_out_total[field_total_capacity] = df_out_total[fields_new].sum(axis = 1)


#attr_region.table[attr_region.table[attr_region.key] == "montenegro"]

# map country names as they appear in the source data to the names used as
# replacements below (presumably the category names in the region attribute
# table - verify)
dict_repl_un = {
    "Bolivia (Plur. State of)": "Bolivia",
    "Central African Rep.": "Central African Republic",
    "China, Hong Kong SAR": "Hong Kong SAR, China",
    "China, Macao SAR": "Macao SAR, China",
    "Côte d'Ivoire": "Cote d'Ivoire",
    "Congo": "Republic of the Congo",
    "Congo, Rep.": "Republic of the Congo",
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    "Dem. Rep. of the Congo": "Democratic Republic of the Congo",
    "Curaçao": "Curacao",
    "Ethiopia PDR": "Ethiopia",
    "Faeroe Islands": "Faroe Islands",
    "Gambia, The": "Gambia",
    "Iran (Islamic Republic of)": "Iran",
    "Iran (Islamic Rep. of)": "Iran",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
    "Korea, Dem. People's Rep.": "Democratic People's Republic of Korea",
    "Korea, Dem.Ppl's.Rep.": "Democratic People's Republic of Korea",
    "Korea": "Republic of Korea",
    "Korea, Republic of": "Republic of Korea",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Lao PDR": "Lao People's Democratic Republic",
    "Lao People's Dem. Rep.": "Lao People's Democratic Republic",
    "Micronesia, Fed. Sts.": "Micronesia (Federated States of)",
    "Micronesia (Fed. States of)": "Micronesia (Federated States of)",
    "Moldova": "Republic of Moldova",
    "St. Kitts and Nevis": "Saint Kitts and Nevis",
    "St. Kitts-Nevis": "Saint Kitts and Nevis",
    "St. Martin (French part)": "Saint-Martin (French part)",
    "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
    "St. Vincent-Grenadines": "Saint Vincent and the Grenadines",
    "Slovak Republic": "Slovakia",
    "St. Lucia": "Saint Lucia",
    "Tanzania": "United Republic of Tanzania",
    "Türkiye": "Turkey",
    "United Rep. of Tanzania": "United Republic of Tanzania",
    "United States": "United States of America",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Venezuela (Bolivar. Rep.)": "Venezuela",
    "Vietnam": "Viet Nam",
    "Virgin Islands (U.S.)": "United States Virgin Islands",
    "United States Virgin Is.": "United States Virgin Islands",
    "Yemen, Rep.": "Yemen",
}

# ok to drop (these names are not mapped above; rows that do not match a
# region are removed by the inner merges on the region table further below)
# Anguilla - Britain
# Bonaire, St Eustatius, Saba - Netherlands
# Cook Islands - Free association with New Zealand
# Ethiopia, incl. Eritrea - no longer exists
# Falkland - na
# French Guiana - France
# Guernsey - British Crown Dependency
# Jersey - British Crown Dependency
# Martinique - France
# Mayotte - France
# Montserrat - Britain
# Niue - "free association with New Zealand"
# Réunion - France
# Serbia and Montenegro - no longer exists
# St. Helena and Depend. - UK dependency
# St. Pierre-Miquelon - France
# Wallis and Futuna - France


# some basic fields
field_capacity = "capacity"
field_commodity = "Commodity - Transaction"
field_plant_type = "plant_type_un"

# map power plants to grouping in UN (all combustion-based technologies share
# the "combustible fuels" group)
dict_plant_to_subgroup = {
    "pp_biogas": "Electricity - total net installed capacity of electric power plants, combustible fuels",
    "pp_biomass": "Electricity - total net installed capacity of electric power plants, combustible fuels",
    "pp_coal": "Electricity - total net installed capacity of electric power plants, combustible fuels",
    "pp_coal_ccs": "Electricity - total net installed capacity of electric power plants, combustible fuels",
    "pp_geothermal": "Electricity - total net installed capacity of electric power plants, geothermal",
    "pp_hydropower": "Electricity - total net installed capacity of electric power plants, hydro",
    "pp_gas": "Electricity - total net installed capacity of electric power plants, combustible fuels",
    "pp_gas_ccs": "Electricity - total net installed capacity of electric power plants, combustible fuels",
    "pp_nuclear": "Electricity - total net installed capacity of electric power plants, nuclear",
    "pp_ocean": "Electricity - total net installed capacity of electric power plants, tide, wave, marine",
    "pp_oil": "Electricity - total net installed capacity of electric power plants, combustible fuels",
    "pp_solar": "Electricity - total net installed capacity of electric power plants, solar",
    "pp_waste_incineration": "Electricity - total net installed capacity of electric power plants, combustible fuels",
    "pp_wind": "Electricity - total net installed capacity of electric power plants, wind"
}

# reduce to scale to aggregate installed capacity (total across all plants)
# NOTE: the filters below use the literal column name stored in
# field_commodity
df_capacity_un_total = df_un_pp_agg[
    df_un_pp_agg["Commodity - Transaction"].isin(
        [
            "Electricity - total net installed capacity of electric power plants, main activity & autoproducer"
        ]
    )
]

# installed capacity rows for the UN subgroups referenced in the map above
df_capacity_un_by_subgroup = df_un_pp_agg[
    df_un_pp_agg["Commodity - Transaction"].isin(
        dict_plant_to_subgroup.values()
    )
]



##  CLEAN THE DATA

field_cat_nam = "category_name"
# rename UN columns to the field names used in this notebook
dict_rnm = {
    "Country or Area": field_cat_nam, #attr_region.key,
    "Year": field_year,
    "Quantity": field_capacity,
    field_commodity: field_plant_type
}
# explicit list: a dict_keys view is not a reliable column indexer
fields_un_keep = list(dict_rnm.keys())

df_capacity_un_total = (
    df_capacity_un_total[fields_un_keep]
    .rename(columns = dict_rnm)
    .drop(field_plant_type, axis = 1)
)
df_capacity_un_by_subgroup = (
    df_capacity_un_by_subgroup[fields_un_keep]
    .rename(columns = dict_rnm)
)

# replace country names
# (assigned back; a chained `df[col].replace(..., inplace = True)` does not
# modify the frame when pandas copy-on-write is active)
df_capacity_un_total[field_cat_nam] = df_capacity_un_total[field_cat_nam].replace(dict_repl_un)
df_capacity_un_by_subgroup[field_cat_nam] = df_capacity_un_by_subgroup[field_cat_nam].replace(dict_repl_un)


# UN data are in mw
units_target = sa.model_attributes.get_variable_characteristic(
    model_elec.modvar_entc_nemomod_residual_capacity, 
    sa.model_attributes.varchar_str_unit_power
)
scalar = sa.model_attributes.get_power_equivalent("mw", units_target)


# merge in region key; the inner join drops names with no match in the region
# attribute table
df_region_keys = attr_region.table[[field_cat_nam, attr_region.key]]

df_capacity_un_by_subgroup = (
    pd.merge(
        df_capacity_un_by_subgroup,
        df_region_keys,
        how = "inner"
    )
    .drop([field_cat_nam], axis = 1)
)

df_capacity_un_total = (
    pd.merge(
        df_capacity_un_total,
        df_region_keys,
        how = "inner"
    )
    .drop([field_cat_nam], axis = 1)
)

# standardize index fields - subgroups (integer year, time period, capacity in
# target units)
df_capacity_un_by_subgroup[field_year] = np.array(df_capacity_un_by_subgroup[field_year]).astype(int)
df_capacity_un_by_subgroup[attr_time_period.key] = df_capacity_un_by_subgroup[field_year].apply(time_periods.year_to_tp)
df_capacity_un_by_subgroup[field_capacity] = np.array(df_capacity_un_by_subgroup[field_capacity])*scalar
# standardize index fields - total
df_capacity_un_total[field_year] = np.array(df_capacity_un_total[field_year]).astype(int)
df_capacity_un_total[attr_time_period.key] = df_capacity_un_total[field_year].apply(time_periods.year_to_tp)
df_capacity_un_total[field_capacity] = np.array(df_capacity_un_total[field_capacity])*scalar


##  BUILD SCALARS

# pair plant-level total capacity with the UN aggregate for every year/country
# that has both
field_scale_residual_capacity = "scalar_residual_capacity"
df_get_scalars = pd.merge(
    df_out_total[[field_year, field_country.lower(), field_total_capacity]],
    df_capacity_un_total.rename(
        columns = {
            field_capacity: f"{field_capacity}_un",
            attr_region.key: field_country.lower()
        }
    ),
    how = "left"
).dropna()

# ratio of UN capacity to plant-level capacity; NaN (0/0) and +inf (x/0) are
# set to 0 before bounding
# NOTE: `nan` is passed by keyword; the second positional argument of
# np.nan_to_num is `copy`, not the NaN replacement value
# NOTE(review): sf.vec_bounds presumably restricts values to [1, inf), so
# capacities are only ever scaled up - confirm
df_get_scalars[field_scale_residual_capacity] = sf.vec_bounds(
    np.nan_to_num(
        np.array(df_get_scalars[f"{field_capacity}_un"])/np.array(df_get_scalars[field_total_capacity]),
        nan = 0.0,
        posinf = 0.0
    ),
    (1, np.inf)
)



# get scalars by region to convert power plants to aggregate metrics from UN
df_scalars_by_region_cln = []
df_left = pd.DataFrame({field_year: list(range(min(df_get_scalars["year"]), max(df_out_total[field_year]) + 1))})

# group on a scalar key so that `region` is the plain region name (pandas >=
# 2.0 yields 1-tuples when the key is a one-element list)
for region, df in df_get_scalars.groupby(field_country.lower()):
    
    df = df.sort_values(by = [field_year], ascending = False)

    # expand to every year in df_left
    df = pd.merge(
        df_left, 
        df[[field_year, field_country.lower(), field_scale_residual_capacity]], 
        how = "left"
    )

    df[field_country.lower()] = region
    # carry the last known scalar forward, then fill leading gaps backward
    # (equivalent to interpolate with "pad" then "bfill", without the
    # deprecated method names or chained in-place modification)
    df[field_scale_residual_capacity] = df[field_scale_residual_capacity].ffill().bfill()

    df_scalars_by_region_cln.append(df)

df_scalars_by_region = pd.concat(df_scalars_by_region_cln, axis = 0).reset_index(drop = True)



# MERGE INTO DF_OUT_TOTAL

df_out_total_rescaled = pd.merge(
    df_out_total,
    df_scalars_by_region,
    how = "left"
)
# fill rows that have no scalar with the next available value (same result as
# interpolate(method = "bfill"), assigned back instead of chained in-place)
# NOTE(review): the fill runs over the whole frame, so a country without UN
# data takes the scalar of the next country in row order, and trailing rows
# with nothing after them stay NaN - confirm this is intended
df_out_total_rescaled[field_scale_residual_capacity] = (
    df_out_total_rescaled[field_scale_residual_capacity].bfill()
)

# finally, provide rescale
for fld in fields_new:
    df_out_total_rescaled[fld] = np.array(df_out_total_rescaled[fld])*np.array(df_out_total_rescaled[field_scale_residual_capacity])


# switch the country field to the region key and attach ISO codes
df_out_total_rescaled = df_out_total_rescaled.rename(
    columns = {
        field_country.lower(): attr_region.key
    }
)

df_out_total_rescaled = pd.merge(
    df_out_total_rescaled,
    attr_region.table[[attr_region.key, field_iso_region_attr]],
    how = "left"
)

df_out_total_rescaled = df_out_total_rescaled[[field_year, attr_region.key, field_iso_region_attr] + fields_new]


# write the residual capacity inputs (switch left on)
if True:
    df_out_total_rescaled.to_csv(
        sa.fp_csv_nemomod_residual_capacity_inputs, 
        index = None,
        encoding = "UTF-8"
    )

# disabled exploration: countries with any reported generation (kept as
# comments; the former string literal contained an invalid "\d" escape)
#
# regex_gen_capacity = re.compile(r"generation_gwh_(\d*$)")
# fields_gen_capac = [x for x in df_data.columns if regex_gen_capacity.match(x) is not None]
#
# df_data.dropna(subset = fields_gen_capac, how = "all")["country"].unique()
# #df_data.columns
    



In [303]:
# re-import support_functions so on-disk edits are picked up without
# restarting the kernel
importlib.reload(sf)

<module 'support_functions' from '/Users/jsyme/Documents/Projects/git_jbus/lac_decarbonization/python/support_functions.py'>

In [None]:
# Disabled scratch code, stored as a string literal so that it does not run.
# It references df_production_fractions_annual, which is not defined in the
# cells above - NOTE(review): presumably defined later in the notebook; confirm
# before re-enabling.
"""
df_production_fractions_mean = sf.explode_merge(
        pd.DataFrame({
            field_year: sorted(list(df_production_fractions_annual[field_year].unique()))
        }),
        pd.DataFrame({
            field_iea_product: sorted(list(df_production_fractions_annual[field_iea_product].unique()))
        })
    )
    df_production_fractions_mean = sf.explode_merge(
        df_production_fractions_mean,
        pd.DataFrame({
            field_country: sorted(list(df_production_fractions_annual[field_country].unique()))
        })
    )
    df_production_fractions_mean = (
        pd.merge(
            df_production_fractions_mean, 
            df_production_fractions_annual,
            how = "left"
        )
        #.drop([field_year], axis = 1)
        .fillna(0.0)
    )
"""

#  Build MinShareProduction data 
- Currently read in aggregate, based on Monthly Data from IEA
- Used to ensure historical production aligns

**NOTE** will require integrating additional code to aggregate monthly data
- Based on IEA monthly electricity generation data
- See https://www.iea.org/data-and-statistics/data-product/monthly-electricity-statistics#monthly-electricity-statistics


In [643]:
# NOTE: IEA puts these out monthly, easy to update regularly
# NOTE(review): absolute, user-specific path
fp_prod_elec = "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/Energy/MES_012023.csv"


##  FIELDS

# column names in the IEA monthly electricity statistics (MES) table
field_iea_balance = "Balance"
field_iea_country = "Country"
field_iea_product = "Product"
field_iea_time = "Time"
field_iea_unit = "Unit"
field_iea_value = "Value"


##  SOME DICTIONARIES

# replace IEA products with SISEPUEDE powerplants
# NOTE(review): there is no entry that maps to pp_wind, and "Other Renewables"
# is mapped to pp_ocean - confirm this is intended
dict_repl_iea_product = {
    "Coal, Peat and Manufactured Gases": "pp_coal",
    "Combustible Renewables": "pp_biomass",
    "Oil and Petroleum Products": "pp_oil",
    "Natural Gas": "pp_gas",
    "Hydro": "pp_hydropower",
    "Solar": "pp_solar",
    "Geothermal": "pp_geothermal",
    "Nuclear": "pp_nuclear",
    "Other Renewables": "pp_ocean"
}

# World Bank global region -> regions (presumably a set of region keys per
# World Bank region; verify against sf.group_df_as_dict), and the reverse map
# from each region key to its World Bank global region
dict_wb_region_to_regions = sf.group_df_as_dict(
    attr_region.table,
    [field_wb_global_region],
    fields_out_set = attr_region.key
)
dict_region_to_wb_region = attr_region.field_maps.get(f"{attr_region.key}_to_{field_wb_global_region}")


###################
#    FUNCTIONS    #
###################

def time_str_to_month_year(
    date_str: str,
    format_str: str = "%B %Y"
) -> Tuple[int, int]:
    """
    Parse date_str according to format_str and return its calendar
        month and year as a (month, year) tuple of ints.

    Function Arguments
    ------------------
    - date_str: string containing the date to parse

    Keyword Arguments
    -----------------
    - format_str: datetime.strptime format used to read date_str. The
        default reads a full month name followed by a four-digit year
    """
    parsed = datetime.datetime.strptime(date_str, format_str)
    month, year = parsed.month, parsed.year

    return (month, year)



def clean_production_df(
    df_in: pd.DataFrame,
    cats_drop: Union[List[str], None] = None,
    field_country: str = field_country,
    field_iso: str = field_iso,
    field_technology: str = field_technology,
    **kwargs,
) -> pd.DataFrame:
    """
    Clean electricity production data frame df_in: add an ISO code field
        derived from the country field, drop the country field, and
        optionally drop rows associated with some technologies.

    Returns a new DataFrame; df_in itself is not modified, so the function
        can be called repeatedly on the same input.

    Function Arguments
    ------------------
    - df_in: data frame containing field_country (and field_technology if
        cats_drop is specified)

    Keyword Arguments
    -----------------
    - cats_drop: optional list of values in field_technology to drop. If
        None, no rows are dropped (and the index is left as is)
    - field_country: field in df_in storing country names
    - field_iso: name of the field to write ISO codes to
    - field_technology: field in df_in storing technologies
    - **kwargs: passed to get_isos_from_iea
    """
    vec_iso = get_isos_from_iea(
        df_in,
        field_country = field_country,
        **kwargs
    )

    # assign() and drop() both return new frames, so the caller's object is
    # left untouched
    df_out = (
        df_in
        .assign(**{field_iso: vec_iso})
        .drop(columns = [field_country])
    )

    if cats_drop is not None:
        df_out = df_out[
            ~df_out[field_technology].isin(cats_drop)
        ].reset_index(drop = True)

    return df_out



def get_and_format_electricity_production(
    fp_in: str,
    balance_elec: str = "Net Electricity Production",
    dict_repl_iea_product: Dict[str, str] = dict_repl_iea_product,
    field_balance: str = field_iea_balance,
    field_month: str = field_month,
    field_product: str = field_iea_product,
    field_time: str = field_iea_time,
    field_year: str = field_year,
    year_min: int = 2010,
) -> Union[pd.DataFrame, None]:
    """
    Read the IEA Monthly Electricity Statistics (MES) table at fp_in, keep
        the electricity production balance, map IEA products to SISEPUEDE
        generation technologies, and split the time field into month and
        year fields. Returns None if fp_in does not exist.

    Function Arguments
    ------------------
    - fp_in: path to input data frame

    Keyword Arguments
    -----------------
    - balance_elec: value in field_balance associated with electricity production
    - dict_repl_iea_product: dictionary mapping IEA Product values to SISEPUEDE
        electricity generation technologies (used for allocation). Products
        that do not map to one of its values are dropped
    - field_balance: field storing IEA balances in the MES table
    - field_month: field with months (added to output)
    - field_product: field storing IEA input products (fuels) associated with
        electricity production
    - field_time: field storing IEA time (dropped from output)
    - field_year: field with year (added to output)
    - year_min: minimum year to use for average factor
        NOTE: accepted for interface compatibility but not currently applied
        anywhere in this function
    """

    if not os.path.exists(fp_in):
        return None


    ##  READ IN PRODUCTION AND ADD/CLEAN SOME FIELDS

    # the data header follows 8 preamble lines in the file that is read
    df_production = pd.read_csv(
        fp_in,
        encoding = "cp1252",
        skiprows = 8
    )

    # select balance that is needed
    df_production = (
        df_production[
            df_production[field_balance].isin([balance_elec])
        ]
        .drop(columns = [field_balance])
    )

    # replace products with technologies; assign the result back rather than
    # calling replace(inplace = True) on the selected column, which operates on
    # a temporary object and is not guaranteed to update the frame
    df_production = df_production.assign(
        **{field_product: df_production[field_product].replace(dict_repl_iea_product)}
    )

    # drop any products that are unneeded; the index is reset so that it lines
    # up with the month/year frame that is concatenated below
    df_production = df_production[
        df_production[field_product].isin(list(dict_repl_iea_product.values()))
    ].reset_index(drop = True)

    # add month/year
    df_month_year = pd.DataFrame(
        list(
            df_production[field_time].apply(time_str_to_month_year)
        ),
        columns = [field_month, field_year]
    )
    df_production = pd.concat(
        [
            df_production.drop(field_time, axis = 1),
            df_month_year
        ],
        axis = 1
    )

    return df_production
    
    
    
def get_electricity_production_dictionary(
    df_in: Union[pd.DataFrame, str],    
    dict_rename_output: Union[Dict[str, str], None] = None,
    field_country: str = field_iea_country,
    field_month: str = field_month,
    field_product: str = field_iea_product,
    field_unit: str = field_iea_unit,
    field_value: str = field_iea_value,
    field_year: str = field_year,
    key_annual_prod_proportions: str = "annual_production_proportions",
    key_avg_prod_proportions: str = "average_annual_production_proportions",
    model_attributes: ma.ModelAttributes = sa.model_attributes,
    time_periods: Union[sc.TimePeriods, None] = None,
    **kwargs
) -> Union[Dict[str, pd.DataFrame], None]:
    """
    Return a dictionary of different averages for electricity production.
        Returns None if df_in is neither a DataFrame nor a path to a file
        that exists.

    The dictionary contains two data frames:
        * key_annual_prod_proportions: for each country and year, the
            fraction of annual production associated with each product
        * key_avg_prod_proportions: for each country, the sum of the annual
            fractions of each product divided by the number of years
            available for that country (a product that is absent in a year
            therefore contributes 0 for that year)

    Only years in which all 12 months are present are used.

    Function Arguments
    ------------------
    - df_in: data frame containing production data OR file path to 
        production data to read in (IEA MES file)

    Keyword Arguments
    -----------------
    - dict_rename_output: dictionary to rename output fields. If None, 
        returns DataFrame with IEA fields
    - field_country: field storing IEA country/regions
    - field_month: field with months
    - field_product: field storing IEA input products (fuels) associated 
        with electricity production
    - field_unit: field storing IEA Units
    - field_value: field storing IEA values
    - field_year: field with year
    - key_annual_prod_proportions: output dictionary key storing annual 
        production proportions by SISEPUEDE power plant type
    - key_avg_prod_proportions: output dictionary key storing average 
        production proportions by SISEPUEDE power plant type (across years)
    - model_attributes: ModelAttributes object used to determine time period 
        and key fields
    - time_periods: optional TimePeriods object used to map years to time 
        periods
    - **kwargs: passed to get_and_format_electricity_production if df_in is 
        a string
    """

    ##  INITIALIZATION

    attr_time_period = model_attributes.dict_attributes.get(f"dim_{model_attributes.dim_time_period}")
    time_periods = sc.TimePeriods(model_attributes) if (time_periods is None) else time_periods

    if isinstance(df_in, str):
        df_in = get_and_format_electricity_production(
            df_in,
            field_month = field_month,
            field_product = field_product,
            field_year = field_year,
            **kwargs
        )
    elif not isinstance(df_in, pd.DataFrame):
        df_in = None

    if df_in is None:
        return None


    ##  GET AGGREGATIONS

    # initialize output
    dict_out = {}

    # check acceptable years: keep those with all 12 months. Group on the
    # scalar field (not a one-element list) so that the group key is the year
    # itself rather than a 1-tuple, which would never match in isin() below
    years_keep = [
        yr
        for yr, df_yr in df_in.groupby(field_year)
        if len(set(df_yr[field_month])) == 12
    ]

    # total production by fuel (product) for each year
    df_production_annual_total = sf.simple_df_agg(
        df_in[
            df_in[field_year].isin(years_keep)
        ].drop([field_unit, field_month], axis = 1),
        [
            field_country, 
            field_product,
            field_year
        ],
        {
            field_value: "sum"
        }
    )

    # get production fractions by year and update dictionary; use the field
    # arguments passed to the function instead of the module-level IEA defaults
    df_production_fractions_annual = sf.get_cols_as_grouped_proportions(
        df_production_annual_total, 
        [field_value], 
        [field_country, field_year],
        drop_if_zero_sum = True
    )
    dict_out.update({key_annual_prod_proportions: df_production_fractions_annual})


    # get averages across years and add to output dictionary

    df_production_fractions_mean = []

    for _, df_country in df_production_fractions_annual.groupby(field_country):
        df_cur = sf.simple_df_agg(
            df_country,
            [
                field_country,
                field_product
            ],
            {
                field_value: "sum"
            }
        )

        # divide by the number of years available for the country (not the
        # number of years in which the product appears)
        n_years = len(df_country[field_year].unique())
        df_cur[field_value] = np.array(df_cur[field_value])/n_years
        df_production_fractions_mean.append(df_cur)


    df_production_fractions_mean = (
        pd.concat(df_production_fractions_mean, axis = 0)
        .reset_index(drop = True)
    )
    dict_out.update({key_avg_prod_proportions: df_production_fractions_mean})


    ##  SOME UPDATES TO EACH DATAFRAME

    for df_tmp in dict_out.values():

        # add time period (only the annual table carries a year field)
        if field_year in df_tmp.columns:
            df_tmp[attr_time_period.key] = df_tmp[field_year].apply(time_periods.year_to_tp)

        # rename, restricting the dictionary to fields that are present
        if dict_rename_output is not None:
            dict_rnm_tmp = dict(
                (k, v) for k, v in dict_rename_output.items() if k in df_tmp.columns
            )
            df_tmp.rename(columns = dict_rnm_tmp, inplace = True)


    return dict_out




#
#   ADD TO REGIONS CLASS
#
def get_isos_from_iea(
    df_in: pd.DataFrame,
    attr_region: AttributeTable = attr_region,
    field_country: str = field_country,
    field_iso_attr_region: str = field_iso_region_attr,
) -> np.ndarray:
    """
    Map IEA countries in field_country to ISO codes. Names are lower-cased,
        spaces are replaced with underscores, a few IEA-specific names are
        swapped for the names used in attr_region, and the result is looked
        up in the region-to-ISO field map. Names that cannot be mapped are
        returned in their normalized (lower-case, underscored) form.

    Function Arguments
    ------------------
    - df_in: data frame containing field_country

    Keyword Arguments
    -----------------
    - attr_region: region attribute table providing the field map
        "<key>_to_<field_iso_attr_region>"
    - field_country: field in df_in storing IEA country names
    - field_iso_attr_region: field in attr_region storing ISO codes
    """
    name_field_map = f"{attr_region.key}_to_{field_iso_attr_region}"
    dict_country_to_iso = attr_region.field_maps.get(name_field_map)

    # some generic replacements
    dict_repl_consumption = dict(
        [
            ("czech_republic", "czechia"),
            ("korea", "republic_of_korea"),
            ("people's_republic_of_china", "china"),
            ("republic_of_turkiye", "turkey"),
            ("slovak_republic", "slovakia"),
            ("united_states", "united_states_of_america"),
        ]
    )

    def _country_to_iso(country: str) -> str:
        # normalize -> apply IEA-specific replacement -> look up ISO code
        region = country.lower().replace(" ", "_")
        region = dict_repl_consumption.get(region, region)

        return dict_country_to_iso.get(region, region)

    return np.array([_country_to_iso(x) for x in list(df_in[field_country])])



##############
#    MAIN    #
##############

# retrieve production proportions, renaming IEA fields to the technology and
# production fraction fields used below
dict_rnm_elec_prods = dict(
    [
        (field_iea_product, field_technology),
        (field_iea_value, field_fraction_production),
    ]
)
dfs_production_by_country = get_electricity_production_dictionary(
    fp_prod_elec,
    dict_rename_output = dict_rnm_elec_prods,
)
df_production_by_country = dfs_production_by_country.get("annual_production_proportions")
df_avg_production_by_country = dfs_production_by_country.get("average_annual_production_proportions")



##  CLEAN FIELDS AND DATA FRAMES

#  drop integrated techs for now (annual table first, then the cross-year averages)
df_production_by_country, df_avg_production_by_country = (
    clean_production_df(df_production_by_country, cats_drop = cats_entc_drop),
    clean_production_df(df_avg_production_by_country, cats_drop = cats_entc_drop),
)





# set sets of tech & isos available
all_technology = sorted(list(df_production_by_country[field_technology].unique()))
all_iso_defined_in_production = sorted(list(df_production_by_country[field_iso].unique()))

# get all years and techs to merge to (full year x technology x iso grid)
years_merge = range(
    min(df_production_by_country[field_year]), 
    max(df_production_by_country[field_year]) + 1
)
df_left = pd.DataFrame({field_year: years_merge})
df_left = sf.explode_merge(
    df_left,
    pd.DataFrame({field_technology: all_technology})
)
df_left = sf.explode_merge(
    df_left,
    pd.DataFrame({field_iso: all_iso_defined_in_production})
)

# merge to all years/techs available; combinations without data are NaN here
df_production_by_country = pd.merge(
    df_left, 
    df_production_by_country,
    how = "left"
)
# recompute the time period for every row (rows added by the merge have none)
df_production_by_country[sa.model_attributes.dim_time_period] = df_production_by_country[field_year].apply(time_periods.year_to_tp).astype(int)

# pivot technologies wide
df_production_by_country = sf.pivot_df_clean(
    df_production_by_country,
    [field_technology],
    [field_fraction_production]
)

# interpolate (backfill) missing years
df_production_by_country_list = []
# group on the scalar field so group keys are plain ISO strings in all pandas versions
df_production_by_country_grouped = df_production_by_country.groupby(field_iso)
fields_data = [x for x in df_production_by_country if x not in [field_iso, field_year, attr_time_period.key]]

for iso, df in df_production_by_country_grouped:

    # work on an explicit copy rather than writing into the groupby slice
    df = df.copy()

    # rows whose available fractions already sum to 1 are complete, so any
    # missing technology in those rows is a true 0 and must not be interpolated
    vec_complete = np.abs(df[fields_data].sum(axis = 1) - 1.0) < 0.000001
    df.loc[vec_complete, fields_data] = df.loc[vec_complete, fields_data].fillna(0.0)

    # interpolate years without data, then back-fill any leading gaps;
    # bfill() replaces the deprecated interpolate(method = "bfill")
    df[fields_data] = df[fields_data].interpolate()
    df[fields_data] = df[fields_data].bfill()

    df_production_by_country_list.append(df)

df_production_by_country = pd.concat(df_production_by_country_list, axis = 0).reset_index(drop = True)

# df_out starts as a list of frames; it is concatenated after future years are added
df_out = [
    df_production_by_country
]



##  NEXT, EXPAND TO ALL YEARS

# years after the last year with production data, through the last year in the
# time period attribute table (an empty range if the data already reach it)
years_merge = range(
    max(df_production_by_country[field_year]) + 1, 
    max(attr_time_period.table[field_year]) + 1
)
# rebuild the year x technology x iso grid for those years
df_left = pd.DataFrame({field_year: years_merge})
df_left = sf.explode_merge(
    df_left,
    pd.DataFrame({field_technology: all_technology})
)
df_left = sf.explode_merge(
    df_left,
    pd.DataFrame({field_iso: all_iso_defined_in_production})
)

# use averages for all future dates 
# (the merge uses the columns the two frames share, so each average is repeated
# for every future year)
df_production_by_country_append = pd.merge(
    df_left,
    df_avg_production_by_country,
    how = "left"
)


# pivot technologies wide; iso/technology combinations without an average become 0
df_production_by_country_append = sf.pivot_df_clean(
    df_production_by_country_append,
    [field_technology],
    [field_fraction_production]
).fillna(0.0)

# clean the time period
df_production_by_country_append[sa.model_attributes.dim_time_period] = df_production_by_country_append[
    field_year
].apply(time_periods.year_to_tp).astype(int)

# df_out is still a list of frames at this point (historical years come first)
df_out += [
    df_production_by_country_append
]


# concatenate
# NOTE: df_out is rebound from a list to a DataFrame here, so this section cannot
# be re-run on its own without first re-running the code that builds the list
df_out = (
    pd.concat(df_out, axis = 0)
    .sort_values(by = [field_iso, field_year])
    .reset_index(drop = True)
)




# use global average in absence of anything else
# isos_missing: ISOs known to `regions` that have no production data
# isos_avail: every value of field_iso in the production data (may include
#   non-ISO aggregates that get_isos_from_iea could not map)
isos_missing = sorted(list(set(regions.all_isos) - set(all_iso_defined_in_production)))
isos_avail = set(df_production_by_country[field_iso])
# value of field_iso used as the last-resort source of data
# NOTE(review): presumably the normalized name of the IEA total aggregate in the
# MES table -- confirm that rows with this value exist in df_out
iso_dummy_default = "iea_total"

# for regions that have no IEA/OECD data, try to relate to closest region
# (maps a World Bank region with no data to the World Bank region whose average
# is used instead)
dict_try_wb_region_to_related_wb_region = {
    "Middle East & North Africa": "Latin America & Caribbean", 
    "Sub-Saharan Africa": "Latin America & Caribbean",
    "South Asia": "East Asia & Pacific"
}

# some fields to use in the aggregation
flds_group = [field_year, attr_time_period.key]
flds_data = [x for x in df_out.columns if (x != field_iso) and (x not in flds_group)]


if len(isos_missing) > 0:

    df_append = [df_out]

    for iso_missing in isos_missing:

        # initialize some potential components
        # exactly one of these is set by the branches below: df_cur (a ready-made
        # frame) or iso_replace (an iso whose rows are copied from df_out)
        df_cur = None
        iso_replace = None

        # ISOs in the same World Bank region that have data
        region_wb = regions.get_world_bank_region(iso_missing)
        isos_wb = set([regions.return_region_or_iso(x, return_type = "iso") for x in dict_wb_region_to_regions.get(region_wb)])
        isos_valid = list(isos_wb & isos_avail)

        if len(isos_valid) > 0:
            
            # get closest region within global WB region
            iso_replace = regions.get_closest_region(
                iso_missing,
                regions_valid = isos_valid,
                type_input = "iso",
                type_return = "iso"
            )
            
            print(f"No MSP found for region '{iso_missing}' -- using value from closest neighbor (in WB Region) '{iso_replace}' ")
            
        elif region_wb in dict_try_wb_region_to_related_wb_region.keys():

            # no data in the ISO's own WB region: use the mean, by year and time
            # period, of the related WB region
            region_wb = dict_try_wb_region_to_related_wb_region.get(region_wb)
            df_cur = regions.aggregate_df_by_wb_global_region(
                df_out,
                region_wb,
                flds_group,
                dict((x, "mean") for x in flds_data),
                field_iso = field_iso
            )
            
            print(f"No MSP found for region '{iso_missing}' -- using regional average from WB Region '{region_wb}'")


        else:
            # default to global IEA average
            iso_replace = iso_dummy_default


        # if a replacement iso was chosen, copy its rows; otherwise keep the
        # regional aggregate built above
        df_cur = (
            df_out[
                df_out[field_iso] == iso_replace
            ].copy().reset_index(drop = True)
            if (df_cur is None) and (iso_replace is not None)
            else df_cur
        ) 

        # relabel the borrowed rows as the missing ISO
        df_cur[field_iso] = iso_missing
        df_append.append(df_cur)

    df_out = pd.concat(df_append, axis = 0)
    # drop anything that is not an ISO known to `regions`
    # NOTE: this filter is only applied when at least one ISO was missing
    df_out = (
        df_out[df_out[field_iso].isin(regions.all_isos)]
        .sort_values(by = [field_iso, field_year])
        .reset_index(drop = True)
    )
    

    
    
    
    





##  FORMAT OUTPUT DATASET

fields_group = [field_year, attr_time_period.key, field_iso]
fields_data = [x for x in attr_technology.key_values if x in df_out.columns]

# name as MSP variable
modvar = model_elec.modvar_entc_nemomod_min_share_production
subsec = model_elec.model_attributes.get_variable_subsector(modvar)
fields_new = sa.model_attributes.build_varlist(
    subsec, 
    modvar,
    restrict_to_category_values = fields_data
)

# technology category -> model variable field
dict_rnm = dict(zip(fields_data, fields_new))


# group on the scalar field so that the group key is the ISO string itself
# (grouping on a one-element list yields 1-tuple keys in pandas >= 2)
df_out_grouped = df_out.groupby(field_iso)
df_out_new = []
# NOTE(review): dfk only retains the frame for Costa Rica for manual inspection
# and is not used below -- looks like a debugging leftover
dfk = None
for i, df in df_out_grouped:

    yrs = df[fields_group].copy()

    # total share across technologies; assign() returns a new frame instead of
    # writing a column into the groupby slice
    df = df.assign(TMP = df[fields_data].sum(axis = 1))

    # keep the full set of years but blank out (NaN) rows whose shares sum to 0
    # so that they are filled by the interpolation below
    df = pd.merge(
        yrs,
        df[df["TMP"] > 0],
        how = "left"
    )
    dfk = df if (i == "CRI") else dfk

    # perform interpolations, then back-fill any leading gaps; bfill()
    # replaces the deprecated interpolate(method = "bfill")
    df[fields_data] = df[fields_data].interpolate()
    df[fields_data] = df[fields_data].bfill()

    df_out_new.append(df)


df_out = pd.concat(df_out_new, axis = 0).reset_index(drop = True)
df_out = df_out[fields_group + fields_data].rename(columns = dict_rnm)

# FINALLY--0 OUT SOME VALUES
fields_zero = sa.model_attributes.build_varlist(
    None,
    modvar,
    restrict_to_category_values = ["pp_waste_incineration", "pp_biogas", "pp_biomass"]
)
fields_zero = [x for x in fields_zero if x in df_out.columns]

df_out[fields_zero] = 0

# write the baseline minimum share of production table
df_out.to_csv(
    sa.fp_csv_nemomod_minimum_share_of_production_baselines,
    index = None,
    encoding = "UTF-8"
)
    
    

No MSP found for region 'ABW' -- using value from closest neighbor (in WB Region) 'COL' 
No MSP found for region 'AFG' -- using value from closest neighbor (in WB Region) 'IND' 
No MSP found for region 'AGO' -- using regional average from WB Region 'Latin America & Caribbean'
No MSP found for region 'ALB' -- using value from closest neighbor (in WB Region) 'MKD' 
No MSP found for region 'AND' -- using value from closest neighbor (in WB Region) 'FRA' 
No MSP found for region 'ARE' -- using value from closest neighbor (in WB Region) 'MLT' 
No MSP found for region 'ARM' -- using value from closest neighbor (in WB Region) 'TUR' 
No MSP found for region 'ASM' -- using value from closest neighbor (in WB Region) 'NZL' 
No MSP found for region 'ATG' -- using value from closest neighbor (in WB Region) 'COL' 
No MSP found for region 'AZE' -- using value from closest neighbor (in WB Region) 'TUR' 
No MSP found for region 'BDI' -- using regional average from WB Region 'Latin America & Caribbean'
N

# Get Electric Transmission Loss data
- source of CSV (World Bank): https://data.worldbank.org/indicator/EG.ELC.LOSS.ZS

In [661]:
# World Bank indicator file; the header row follows 3 preamble lines that are skipped
# NOTE(review): absolute, user-specific path -- this cell only runs on a machine
# where this file exists
df_losses = pd.read_csv(
    "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/Energy/API_EG.ELC.LOSS.ZS_DS2_en_csv_v2_4898900/API_EG.ELC.LOSS.ZS_DS2_en_csv_v2_4898900.csv",
    skiprows = 3
)

# some filtering: keep the loss indicator, rename the country code to the ISO
# field, and drop columns that are entirely empty
field_cc = "Country Code"
indicator = "Electric power transmission and distribution losses (% of output)"
df_losses = (
    df_losses[
        df_losses["Indicator Name"] == indicator
    ]
    .reset_index(drop = True)
    .rename(columns = {field_cc: field_iso})
    .dropna(how = "all", axis = 1)
)

# get variable name
subsec = sa.model_attributes.get_variable_subsector(
    model_elec.modvar_enfu_transmission_loss_frac_electricity
)
field_var = sa.model_attributes.build_varlist(
    subsec,
    model_elec.modvar_enfu_transmission_loss_frac_electricity,
    restrict_to_category_values = [model_elec.cat_enfu_elec]
)[0]


# get data and melt (year columns are the ones with purely numeric names)
fields_data = [x for x in df_losses.columns if str(x).isnumeric()]
df_losses = pd.melt(
    df_losses[[field_iso] + fields_data],
    [field_iso],
    fields_data,
    value_name = field_var,
    var_name = field_year
).dropna().reset_index(drop = True)
# convert strings to int
df_losses[field_year] = df_losses[field_year].astype(int)

# get full set of years to merge into 
year_min = 2010
years_merge = range(
    year_min, 
    time_periods.year_max + 1
)
df_left = pd.DataFrame({field_year: years_merge})
df_left = sf.explode_merge(
    df_left,
    pd.DataFrame({field_iso: all_iso})
)

# group and iterate; group on the scalar field so that the group key is the ISO
# string itself -- with a one-element list the key is a 1-tuple in pandas >= 2,
# which is never in all_iso and would leave nothing to concatenate
dfs_losses = df_losses.groupby(field_iso)
df_out = []
for i, df in dfs_losses:

    if i not in all_iso:
        continue

    # mean of the (up to) 5 most recent observations, used to fill years
    # without data
    df_cur = df.sort_values(by = [field_year], ascending = False)
    mv = np.mean(np.array(df_cur[field_var])[0:5])

    df_cur = pd.merge(df_left[df_left[field_iso] == i], df_cur, how = "left")
    df_cur = df_cur[df_cur[field_year] >= year_min].fillna(mv)

    # add time period and reduce to fraction (source is in percent)
    df_cur[attr_time_period.key] = df_cur[field_year].apply(time_periods.year_to_tp)
    df_cur[field_var] = sf.vec_bounds(np.array(df_cur[field_var])/100, (0.0, 1.0))

    df_out.append(df_cur)

df_out = pd.concat(df_out, axis = 0)



# find regions without transmission loss data
isos_avail = set(df_out[field_iso])
isos_missing = sorted(list(set(regions.all_isos) - isos_avail))

# for regions whose World Bank region has no loss data at all, use the average
# of a related World Bank region
dict_try_wb_region_to_related_wb_region = {
    "Middle East & North Africa": "Latin America & Caribbean", 
    "Sub-Saharan Africa": "Latin America & Caribbean",
    "South Asia": "East Asia & Pacific"
}

# some fields to use in the aggregation
flds_group = [field_year, attr_time_period.key]
flds_data = [x for x in df_out.columns if (x != field_iso) and (x not in flds_group)]


if len(isos_missing) > 0:

    df_append = [df_out]

    for iso_missing in isos_missing:

        # exactly one of these is set by the branches below: df_cur (a ready-made
        # frame) or iso_replace (an iso whose rows are copied from df_out)
        df_cur = None
        iso_replace = None

        # ISOs in the same World Bank region that have data
        region_wb = regions.get_world_bank_region(iso_missing)
        isos_wb = set([regions.return_region_or_iso(x, return_type = "iso") for x in dict_wb_region_to_regions.get(region_wb)])
        isos_valid = list(isos_wb & isos_avail)

        if len(isos_valid) > 0:

            # get closest region within global WB region
            iso_replace = regions.get_closest_region(
                iso_missing,
                regions_valid = isos_valid,
                type_input = "iso",
                type_return = "iso"
            )

            # message names the quantity handled in this cell (it previously
            # reported a missing "MSP", copied from the production cell)
            print(f"No transmission loss found for region '{iso_missing}' -- using value from closest neighbor (in WB Region) '{iso_replace}' ")

        elif region_wb in dict_try_wb_region_to_related_wb_region.keys():

            # mean, by year and time period, of the related WB region
            region_wb = dict_try_wb_region_to_related_wb_region.get(region_wb)
            df_cur = regions.aggregate_df_by_wb_global_region(
                df_out,
                region_wb,
                flds_group,
                dict((x, "mean") for x in flds_data),
                field_iso = field_iso
            )

            print(f"No transmission loss found for region '{iso_missing}' -- using regional average from WB Region '{region_wb}'")


        else:
            # last resort: rows stored under the dummy default iso
            # NOTE(review): iso_dummy_default is defined in the production cell
            # and df_out only contains ISOs in all_iso here, so this branch
            # probably selects no rows -- confirm the intended fallback
            iso_replace = iso_dummy_default


        # if a replacement iso was chosen, copy its rows; otherwise keep the
        # regional aggregate built above
        df_cur = (
            df_out[
                df_out[field_iso] == iso_replace
            ].copy().reset_index(drop = True)
            if (df_cur is None) and (iso_replace is not None)
            else df_cur
        ) 

        # relabel the borrowed rows as the missing ISO
        df_cur[field_iso] = iso_missing
        df_append.append(df_cur)

    df_out = pd.concat(df_append, axis = 0)



fields_ord = [field_iso, field_year, attr_time_period.key, field_var]
df_out = df_out[fields_ord].sort_values(by = [field_iso, field_year]).reset_index(drop = True)

# write the transmission loss table
df_out.to_csv(
    sa.fp_csv_nemomod_transmission_losses,
    index = None,
    encoding = "UTF-8"
)

No MSP found for region 'ABW' -- using value from closest neighbor (in WB Region) 'CUW' 
No MSP found for region 'AFG' -- using value from closest neighbor (in WB Region) 'PAK' 
No MSP found for region 'AND' -- using value from closest neighbor (in WB Region) 'FRA' 
No MSP found for region 'ASM' -- using value from closest neighbor (in WB Region) 'NZL' 
No MSP found for region 'ATG' -- using value from closest neighbor (in WB Region) 'TTO' 
No MSP found for region 'BDI' -- using value from closest neighbor (in WB Region) 'TZA' 
No MSP found for region 'BFA' -- using value from closest neighbor (in WB Region) 'TGO' 
No MSP found for region 'BHS' -- using value from closest neighbor (in WB Region) 'CUB' 
No MSP found for region 'BLZ' -- using value from closest neighbor (in WB Region) 'HND' 
No MSP found for region 'BMU' -- using value from closest neighbor (in WB Region) 'CAN' 
No MSP found for region 'BRB' -- using value from closest neighbor (in WB Region) 'TTO' 
No MSP found for regi

##  FUEL COSTS
- fuel costs come from Edmundo
- tonne/barrel of oil comes from https://sciencing.com/convert-metric-tons-barrels-8220711.html
    - 0.14459225
- m3/barrel comes from https://www.metric-conversions.org
    - 0.158987

In [682]:
# NOTE(review): absolute, user-specific path -- this cell only runs on a machine
# where this file exists
df_fuel = pd.read_csv("/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/Energy/fuel_prices_from_edmundo_20230306_with_thermal_hydrogen.csv")

warnings.filterwarnings("ignore")

# set some field names
field_fuel = "fuel"
field_price = "price"
field_unit = "UNIT"
field_unit_denominator = "unit_denominator"
field_tmp_scalar = "scalar"

# some other fuel conversion costs
oil_tonne_per_barrel = 0.14459225
oil_m3_per_barrel = 0.158987
oil_tonne_per_m3 = oil_tonne_per_barrel/oil_m3_per_barrel

# drop some fields (only those that are present)
fields_drop = [x for x in df_fuel.columns if x in ["Unnamed: 0", "Country.Name", "unit_type"]]
df_fuel = df_fuel.drop(columns = fields_drop)

# expand fuel before filling NAs: every (fuel, unit) pair is crossed with every
# iso so that countries without a price for a fuel get a row with a NaN price
df_fuel_all = sf.explode_merge(
    df_fuel[[field_fuel, field_unit_denominator]].drop_duplicates(),
    df_fuel[[field_iso]].drop_duplicates()
)
df_fuel = pd.merge(
    df_fuel_all,
    df_fuel, 
    how = "left"
)


# loop over fuels and replace oil units if necessary; group on the scalar field
# so that the group key is the fuel name itself -- with a one-element list the
# key is a 1-tuple in pandas >= 2 and the membership test below never matches
df_fuel_grouped = df_fuel.groupby(field_fuel)
fuels_mass_to_volume = ["fuel_oil", "fuel_crude"]

df_fuel_new = []

for fuel, df in df_fuel_grouped:

    # work on an explicit copy; updates are assigned back to columns rather than
    # applied with inplace methods on column selections of a groupby slice
    df = df.copy()

    if fuel in fuels_mass_to_volume:
        if set(df[field_unit_denominator]) == set({"tonne"}):
            # (USD/tonne) * (tonne/m3) = USD/m3
            df[field_unit_denominator] = df[field_unit_denominator].replace({"tonne": "m3"})
            df[field_price] = np.array(df[field_price])*oil_tonne_per_m3
        else:
            print(f"Check fuel {fuel}: not entered in tonnes")

    # get regional mean: mean of the distinct non-missing prices among ISOs that
    # are keys of dict_iso_to_country
    price_mean = np.array(list(set(
        df[
            df[field_iso].isin(dict_iso_to_country.keys()) & 
            ~df[field_price].isna()
        ][field_price]
    ))).mean()

    df[field_price] = df[field_price].fillna(price_mean)

    df_fuel_new.append(df)

df_fuel = pd.concat(df_fuel_new, axis = 0).reset_index(drop = True)
# snapshot of the filled prices before unit conversion
df_fuelc = df_fuel.copy()


# replace input units: label written to the unit field once prices have been
# converted
# NOTE(review): the labels are hard-coded and are later compared against the
# units of the price variables -- confirm they match units_thermal,
# units_volumetric, and units_gravimetric
dict_repl_units = {
    'Mwh': "mmbtu", 
    "mwh": "mmbtu",
    'liter': "m3", 
    'tonne': "tonne", 
    'MWH': "mmbtu", 
    '1000 liters': "m3", 
    '1000 liter': "m3", 
    'MBtu': "mmbtu",
    "mmbtu": "mmbtu"
}

dict_repl_num_units = {
    "Total price (USD/unit using PPP)": 1,
    "Total price (USD/unit)": 1
}

# add scalars in terms of old per new

# get variable units
units_gravimetric = sa.model_attributes.get_variable_characteristic(
    model_energy.modvar_enfu_price_gravimetric,
    sa.model_attributes.varchar_str_unit_mass
)
units_thermal = sa.model_attributes.get_variable_characteristic(
    model_energy.modvar_enfu_price_thermal,
    sa.model_attributes.varchar_str_unit_energy
)
units_volumetric = sa.model_attributes.get_variable_characteristic(
    model_energy.modvar_enfu_price_volumetric,
    sa.model_attributes.varchar_str_unit_volume
)

# prices are divided by these scalars
# (presumably the number of target units per input unit -- confirm against the
# get_*_equivalent methods)
dict_repl_units_scalars = {
    'Mwh': sa.model_attributes.get_energy_equivalent("mwh", units_thermal), 
    # "mwh" is renamed in dict_repl_units, so it needs a scalar here as well
    'mwh': sa.model_attributes.get_energy_equivalent("mwh", units_thermal),
    'liter': sa.model_attributes.get_volume_equivalent("litre", units_volumetric), 
    'tonne': sa.model_attributes.get_mass_equivalent("tonne", units_gravimetric), 
    'MWH': sa.model_attributes.get_energy_equivalent("mwh", units_thermal),
    '1000 liters': sa.model_attributes.get_volume_equivalent("m3", units_volumetric),
    '1000 liter': sa.model_attributes.get_volume_equivalent("m3", units_volumetric), 
    "m3": sa.model_attributes.get_volume_equivalent("m3", units_volumetric),
    'MBtu': sa.model_attributes.get_energy_equivalent("mbtu", units_thermal),
    'mmbtu': sa.model_attributes.get_energy_equivalent("mmbtu", units_thermal),
}

# map each row's unit to its scalar. map() yields NaN for units without a
# scalar; replace() would instead leave the unit string in place and the
# division below would fail with an opaque TypeError
vec_scalar = df_fuel[field_unit_denominator].map(dict_repl_units_scalars)
units_unmapped = sorted(set(
    df_fuel[
        vec_scalar.isna() & df_fuel[field_unit_denominator].notna()
    ][field_unit_denominator]
))
if len(units_unmapped) > 0:
    raise ValueError(f"No conversion scalar available for unit(s) {units_unmapped}")

df_fuel[field_tmp_scalar] = vec_scalar.astype(float)
df_fuel[field_price] = np.array(df_fuel[field_price])/np.array(df_fuel[field_tmp_scalar])
# assign back rather than calling replace(inplace = True) on the selected column
df_fuel[field_unit_denominator] = df_fuel[field_unit_denominator].replace(dict_repl_units)



# loop over fuels again, group to build variable; group on the scalar field so
# that the group key is the fuel name itself -- with a one-element list the key
# is a 1-tuple in pandas >= 2 and never matches a category below
df_fuel_grouped = df_fuel.groupby(field_fuel)

cats_mass = sa.model_attributes.get_variable_categories(model_energy.modvar_enfu_price_gravimetric)
cats_thermal = sa.model_attributes.get_variable_categories(model_energy.modvar_enfu_price_thermal)
cats_volume = sa.model_attributes.get_variable_categories(model_energy.modvar_enfu_price_volumetric)

# should all be the same, but just for consistency's sake
subsec_mass = sa.model_attributes.get_variable_subsector(model_energy.modvar_enfu_price_gravimetric)
subsec_thermal = sa.model_attributes.get_variable_subsector(model_energy.modvar_enfu_price_thermal)
subsec_volume = sa.model_attributes.get_variable_subsector(model_energy.modvar_enfu_price_volumetric)

# fuel -> model variable field, plus the fuels for which no variable was found
dict_repl = {}
fuels_unresolved = []

# (unit, categories, subsector, variable) for each price type; checked in this
# order, with a later match taking precedence over an earlier one
price_var_specs = [
    (units_gravimetric, cats_mass, subsec_mass, model_energy.modvar_enfu_price_gravimetric),
    (units_thermal, cats_thermal, subsec_thermal, model_energy.modvar_enfu_price_thermal),
    (units_volumetric, cats_volume, subsec_volume, model_energy.modvar_enfu_price_volumetric),
]

for fuel, df in df_fuel_grouped:

    new_val = None

    # the unit of the first row is taken as the unit of the fuel
    unit_fuel = str(df[field_unit_denominator].iloc[0])

    for unit_cur, cats_cur, subsec_cur, modvar_cur in price_var_specs:
        if (unit_fuel == unit_cur) and (fuel in cats_cur):
            new_val = sa.model_attributes.build_varlist(
                subsec_cur,
                modvar_cur,
                restrict_to_category_values = [fuel]
            )[0]

    if new_val is None:
        fuels_unresolved.append(fuel)
    else:
        dict_repl.update({fuel: new_val})


# assign back rather than calling replace(inplace = True) on the selected column
df_fuel[field_fuel] = df_fuel[field_fuel].replace(dict_repl)

# pivot and reorder
df_fuel = sf.pivot_df_clean(
    df_fuel[[field_iso, field_fuel, field_price]],
    [field_fuel],
    [field_price]
)

# repeat the prices of each known ISO for every time period
df_fuel = sf.explode_merge(
    attr_time_period.table[[attr_time_period.key]],
    df_fuel[
        df_fuel[field_iso].isin(regions.all_isos)
    ]
).sort_values(by = [field_iso, attr_time_period.key]).reset_index(drop = True)


# index fields first, then the remaining fields in sorted order
fields_index = [field_iso, attr_time_period.key]
df_fuel = df_fuel[fields_index + sorted([x for x in df_fuel.columns if x not in fields_index])]

# write the fuel cost table
df_fuel.to_csv(
    sa.fp_csv_nemomod_fuel_costs,
    index = None, 
    encoding = "UTF-8"
)

