In [4]:
import batch_data_support_regions as bds_reg
import geopy.distance
import os, os.path
import numpy as np
import pandas as pd
import model_attributes as ma
from attribute_table import AttributeTable
import model_afolu as mafl
import model_ippu as mi
import model_circular_economy as mc
import model_energy as me
import model_electricity as ml
import model_socioeconomic as se
import setup_analysis as sa
import sisepuede as ssp
import support_functions as sf
import importlib
import time
import warnings
import matplotlib.pyplot as plt
import sql_utilities as sq
from typing import *
import sqlalchemy
import sql_utilities as sqlutil
import re
# re-import the project modules so that the versions currently on disk are
# used without restarting the kernel
# NOTE(review): `ml` (model_electricity) is imported above and used below but
#   is not reloaded here--confirm whether that is intentional
importlib.reload(ma)
importlib.reload(sa)
importlib.reload(sf)
importlib.reload(mafl)
importlib.reload(mc)
importlib.reload(mi)
importlib.reload(me)
importlib.reload(se)

# silence every warning category for the remainder of the notebook
warnings.filterwarnings("ignore")

##  For Seasonal Variation in Hydropower Capacity Factors, derive from global model of hydropower generation developed by Wan et al. (2021)

- Wan, W., Zhao, J., Popat, E., Herbert, C., & Döll, P. (2021). Analyzing the impact of streamflow drought on hydroelectricity production: A global-scale study. Water Resources Research, 57, e2020WR028087. https://doi.org/10.1029/2020WR028087

- Code (modified to read variable "days" from a CSV, see below) available from 
    - https://energy.duke.edu/content/global-hydropower-database, which leads to
    - https://figshare.com/articles/dataset/Global_Hydropower_Database_GHD_/11283758/3?file=22767863


In [272]:
##  IMPORT SOME ATTRIBUTES, MODELS, AND SHARED VARIABLES

# attribute tables used below: regions, time periods, and time slices
attr_region, attr_time_period, attr_time_slice = (
    sa.model_attributes.dict_attributes.get(key)
    for key in (
        f"{sa.model_attributes.dim_region}",
        f"dim_{sa.model_attributes.dim_time_period}",
        "time_slice",
    )
)

# electricity model; the NemoMod field names it defines are used below
model_elec = ml.ElectricEnergy(
    sa.model_attributes,
    sa.dir_jl,
    sa.dir_ref_nemo,
    initialize_julia = False
)

# map each region to its (upper case) ISO alpha-3 code, then reverse the
# dictionary to map each code to its region
dict_country_to_iso = {
    region: iso.upper()
    for region, iso in attr_region.field_maps.get(f"{attr_region.key}_to_iso_alpha_3").items()
}
dict_iso_to_country = sf.reverse_dict(dict_country_to_iso)


# number of days in each month of a non-leap year, keyed by month number
# (used in a number of places)
dict_n_days_per_month = dict(
    zip(
        range(1, 13),
        [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    )
)

# average days per month, used as weights when only monthly data are
# available; February is set to 28.25 to account for leap years
dict_num_days_per_month_weights = {**dict_n_days_per_month, 2: 28.25}

# setup some fields
# (column names shared by the hydropower and solar sections of this notebook)
field_capacity = "capacity_mw"
field_capacity_factor = "capacity_factor"  # prefix used by format_cf() to name capacity factor columns
field_cfs = "cf_scalar"
field_country = "Country"
field_date_string = "date_string"
field_generation = "generation_gwh"
field_gwp = "max_generation_gwp"  # filled below as capacity * n_days * 24/1000
field_iso = "iso_code3"
field_iso_region_attr = "iso_alpha_3"
field_lat_region = "latitude_population_centroid_2020"
field_lon_region = "longitude_population_centroid_2020"
field_key = "GHD_ID"  # id column kept when melting the simulated generation table
field_month = "month"
field_ndays = "n_days"
field_weight_month = "weight_month"
field_weight_tg1 = "weight_tg1"
field_year = "year"




##  TIME GROUP MANIPULATIONS

attr_tg1 = sa.model_attributes.dict_attributes.get("ts_group_1")

# months in each time slice group 1 (tg1); the attribute table stores them
# as a pipe-delimited string of month numbers
dict_tg1_to_months = {
    tg1: [int(mo) for mo in str_months.split("|")]
    for tg1, str_months in attr_tg1.field_maps.get(f"{attr_tg1.key}_to_months").items()
}

# invert the dictionary to map each month to its tg1
dict_month_to_tg1 = {
    mo: tg1
    for tg1, months in dict_tg1_to_months.items()
    for mo in months
}

# within-year weight of each tg1: total (average) number of days in its months
dict_tg1_num_days_weights = {
    tg1: sum([dict_num_days_per_month_weights.get(mo) for mo in dict_tg1_to_months.get(tg1)])
    for tg1 in attr_tg1.key_values
}
    


    
#
#    READ DATA FROM Wan et al. Package and monthly results file, "Plant_monthly_V1.csv"
#

# NOTE(review): absolute, user-specific path--this cell only runs on a machine
#   with this directory; consider a configurable data directory
dir_data = "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/Energy/wan_et_al_hydro_model/11283758"
# plant database (sheet "2 GHD with estimations"); not referenced again in the
# visible cells of this notebook
dfs_power_plants = pd.read_excel(os.path.join(dir_data, "Plant_Database.xlsx"), sheet_name = "2 GHD with estimations")
# simulated generation by plant; the date columns are melted to long format below
df_generated_simulated = pd.read_csv(os.path.join(dir_data, "Plant_monthly_V1.csv"), low_memory = False)


##  NOTE: the "days" variable used in `HP_model.R` had to be overwritten since
##  the dayseries() function called there does not seem to exist. The code
##  below creates that series (it is also used later in this notebook) and
##  writes it to "n_days.csv" in `dir_data`.
n_y = 41
y_0 = 1975

# one row per (year, month) for the years y_0, ..., y_0 + n_y (inclusive)
df_days_per_month = pd.DataFrame(
    [
        [
            y,
            m,
            # full Gregorian leap-year rule (not only y%4 == 0), so the series
            # stays correct if the year range is changed to include a
            # non-leap century year such as 1900 or 2100
            (
                29
                if ((m == 2) and (y%4 == 0) and ((y%100 != 0) or (y%400 == 0)))
                else dict_n_days_per_month.get(m)
            ),
        ]
        for y in range(y_0, y_0 + n_y + 1)
        for m in range(1, 13)
    ],
    columns = ["year", "month", "n_days"]
)
df_days_per_month.to_csv(os.path.join(dir_data, "n_days.csv"), index = None, encoding = "UTF-8")



#############################
#    SOME USED FUNCTIONS    #
#############################

def get_closest_region(
    region: str,
    attr_region: AttributeTable,
    field_iso: str = "iso_alpha_3",
    field_lat: str = "latitude_population_centroid_2020",
    field_lon: str = "longitude_population_centroid_2020",
    missing_flag: float = -999,
    regions_valid: Union[List[str], None] = None,
    type_input: str = "region",
    type_return: str = "region",
) -> Union[str, None]:
    """
    Based on latitude/longitude of population centers, find the 
        closest neighboring region. Returns None if `region` is not 
        found, if its latitude/longitude is unavailable, or if no 
        candidate region has a valid, strictly positive distance.
    
    
    Function Arguments
    ------------------
    - region: region to search for closest neighbor
    - attr_region: attribute table for regions
    
    Keyword Arguments
    -----------------
    - field_iso: iso field in attr_region
    - field_lat: field storing latitude
    - field_lon: field storing longitude
    - missing_flag: flag indicating a missing value. Currently unused 
        (kept for backward compatibility); coordinates outside 
        [-90, 90]/[-180, 180] are treated as invalid instead
    - regions_valid: optional list of regions to restrict search to. If None,
        searches through all regions specified in attr_region. Entries are 
        interpreted as region names if type_input == "region" and as ISO 
        codes otherwise
    - type_input: input region type. Either "region" or "iso" (any other 
        value is treated as "region")
    - type_return: return type. Either "region" or "iso" (any other value 
        is treated as "region")
    """
    
    ##  INITIALIZATION
    
    type_return = "region" if (type_return not in ["region", "iso"]) else type_return
    type_input = "region" if (type_input not in ["region", "iso"]) else type_input
    
    # get some dictionaries
    dict_region_to_lat = attr_region.field_maps.get(f"{attr_region.key}_to_{field_lat}")
    dict_region_to_lon = attr_region.field_maps.get(f"{attr_region.key}_to_{field_lon}")
    dict_iso_to_region = attr_region.field_maps.get(f"{field_iso}_to_{attr_region.key}")
    dict_region_to_iso = attr_region.field_maps.get(f"{attr_region.key}_to_{field_iso}")
    
    # check region/lat/lon
    region = dict_iso_to_region.get(region) if (type_input == "iso") else region
    region = region if (region in attr_region.key_values) else None
    lat, lon = dict_region_to_lat.get(region), dict_region_to_lon.get(region)
    
    # return None if one of the dimensions is missing
    if (lat is None) or (lon is None) or (region is None):
        return None
    
    
    ##  FILTER TABLE AND APPLY DISTANCES
    
    if (regions_valid is None):
        regions_valid = attr_region.key_values 
    else:
        regions_valid = (
            [x for x in attr_region.key_values if x in (regions_valid)]
            if type_input == "region"
            else [x for x in attr_region.key_values if dict_region_to_iso.get(x) in (regions_valid)]
        )
        
    df_regions = attr_region.table[
        attr_region.table[attr_region.key].isin(regions_valid)
    ].copy().reset_index(drop = True)
    
    # nothing to search through
    if len(df_regions) == 0:
        return None
    
    # distance (km) from `region` to a candidate; -1.0 flags an invalid
    # latitude/longitude on either side
    def f(tup: Tuple[float, float]) -> float:
        y, x = tuple(tup)
        
        out = (
            -1.0
            if (min(y, lat) < -90) or (max(y, lat) > 90) or (min(x, lon) < -180) or (max(x, lon) > 180)
            else geopy.distance.geodesic((lat, lon), (y, x)).km
        )
        
        return out
    

    vec_dists = np.array(df_regions[[field_lat, field_lon]].apply(f, raw = True, axis = 1))
    
    # strictly positive distances only: drops invalid candidates (-1.0) and 
    # the region itself/co-located regions (0.0)
    valid_dists = vec_dists[vec_dists > 0.0]
    if len(valid_dists) == 0:
        return None
    
    # take the minimum over the *valid* distances; the minimum over all of 
    # vec_dists would select an invalid (-1.0) or zero-distance candidate 
    # whenever one is present
    m = min(valid_dists)
    w = np.where(vec_dists == m)[0]
    
    out = list(df_regions[attr_region.key])[w[0]]
    out = dict_region_to_iso.get(out) if (type_return == "iso") else out

    return out


In [156]:
##  CONVERT SIMULATIONS TO LONG FILE


# pattern for the "<digits>-<digits>" column names of the simulated generation
# table; a raw string is used so that "\d" is not an invalid string escape
regex_match_dates = re.compile(r"(\d*)-(\d*)")
cat_name_hydro = "pp_hydropower"
cat_name_solar = "pp_solar"


def format_cf(cf:str, field_prepend: str = field_capacity_factor) -> str:
    """
    Build the name of a capacity factor field by joining `field_prepend` 
        and `cf` with an underscore.
    
    Function Arguments
    ------------------
    - cf: name to append (e.g., a technology category)
    
    Keyword Arguments
    -----------------
    - field_prepend: prefix placed before the underscore
    """
    return "{}_{}".format(field_prepend, cf)


# columns of the simulation table whose names match the date pattern
fields_date = list(
    filter(
        lambda x: regex_match_dates.match(x) is not None,
        df_generated_simulated.columns
    )
)

# reshape wide to long: total generation (GWh--see script) per plant and date
df_generated = (
    df_generated_simulated[[field_key, field_country, "Install_Act"] + fields_date]
    .melt(
        id_vars = [field_key, field_country, "Install_Act"],
        value_vars = fields_date,
        var_name = "date_string",
        value_name = field_generation
    )
    .rename(columns = {"Install_Act": field_capacity})
)


##  CLEAN DATES TO PREPARE AGGREGATION

# function to convert date columns to paired year/month
def ds_to_date(
    ds: str,
    regex_check: re.Pattern = regex_match_dates
) -> Union[Tuple[int, ...], None]:
    """
    Convert a date string of the form "<year>-<month>" to a tuple of 
        integers. Returns None if `ds` does not match `regex_check`.
    
    Function Arguments
    ------------------
    - ds: date string to convert
    
    Keyword Arguments
    -----------------
    - regex_check: compiled pattern that `ds` must match (from the start 
        of the string) to be converted
    """
    
    # check and split the same string representation so that a non-string 
    # input that passes the check cannot fail on .split()
    ds = str(ds)
    if regex_check.match(ds) is None:
        return None
    
    return tuple(int(x) for x in ds.split("-"))

# split each date string into year/month columns, swap them in for the date
# string, and attach the number of days in each year/month
df_year_month = pd.DataFrame(
    df_generated[field_date_string].apply(ds_to_date).tolist(),
    columns = [field_year, field_month]
)
df_generated = (
    pd.concat([df_generated, df_year_month], axis = 1)
    .drop(columns = [field_date_string])
    .merge(df_days_per_month)
)

# add potential: capacity running for every hour of the month, scaled by 1/1000
df_generated[field_gwp] = (
    np.array(df_generated[field_capacity])
    * np.array(df_generated[field_ndays])
    * 24
    / 1000
)



##  FIRST AGGREGATION -- TOTAL GENERATION/GENERATION POTENTIAL BY COUNTRY, YEAR, AND MONTH

fields_group = [field_country, field_year, field_month]
fields_agg = [field_generation, field_gwp]

# keep the grouping fields (first value) and total the generation fields
dict_agg = {
    **{x: "first" for x in fields_group},
    **{x: "sum" for x in fields_agg},
}

# aggregate, then add an estimated capacity factor (generation/potential)
df_generated_by_country = (
    df_generated[list(dict_agg)]
    .groupby(fields_group)
    .agg(dict_agg)
    .reset_index(drop = True)
)
df_generated_by_country[format_cf(cat_name_hydro)] = (
    np.array(df_generated_by_country[field_generation])
    / np.array(df_generated_by_country[field_gwp])
)


##  SECOND AGGREGATION -- MEAN CAPACITY FACTOR BY COUNTRY AND MONTH

# number of most recent years to keep 
n_years_keep = 20

fields_group = [field_country, field_month]
fields_agg = [format_cf(cat_name_hydro)]
dict_agg = dict((x, "first") for x in fields_group)
dict_agg.update(dict((x, "mean") for x in fields_agg))

df_gen_for_averages_init = df_generated_by_country.groupby(fields_group)
df_gen_for_averages = []

for i, df in df_gen_for_averages_init:
    
    # keep the n_years_keep most recent years available for this group, 
    # *including* the latest one. The upper bound is inclusive: building the 
    # years from range(..., yr_max) dropped the most recent year and kept 
    # only n_years_keep - 1 years
    yr_max = max(df[field_year])
    
    df_gen_for_averages.append(
        df[df[field_year].between(yr_max - n_years_keep + 1, yr_max)]
    )

    
##  GET HYDROPOWER CAPACITY FACTOR ESTIMATES

# mean capacity factor by country and month over the retained years
df_capacity_factor_hydro = (
    pd.concat(df_gen_for_averages, axis = 0)
    .groupby(fields_group)
    .agg(dict_agg)
    .reset_index(drop = True)
)

# format country names like region keys (lower case, underscores), then map 
# them to ISO codes. The result is assigned back to the column: calling 
# replace(inplace = True) on a column selection is chained assignment, which 
# is not guaranteed to modify the DataFrame (and does not under pandas 
# Copy-on-Write)
df_capacity_factor_hydro[field_country] = [
    x.lower().replace(" ", "_") for x in df_capacity_factor_hydro[field_country]
]
df_capacity_factor_hydro[field_country] = (
    df_capacity_factor_hydro[field_country].replace(dict_country_to_iso)
)
df_capacity_factor_hydro = df_capacity_factor_hydro.rename(columns = {field_country: field_iso})

# filter out countries that did not map to an ISO code of a defined region
df_capacity_factor_hydro = df_capacity_factor_hydro[
    df_capacity_factor_hydro[field_iso].isin(list(dict_country_to_iso.values()))
].reset_index(drop = True)



#################################
#    FORMAT FOR TIME SLICING    #
#################################


# adjust fields: work on a copy, tag each month with its time slice group 1
# (tg1), add the month and tg1 day weights used for aggregation, then scale
# each monthly capacity factor by its share of the days in its tg1
df_capacity_factor_hydro_by_tg1 = (
    df_capacity_factor_hydro
    .copy()
    .assign(
        **{
            model_elec.field_nemomod_tg1: lambda df_cur: df_cur[field_month].replace(dict_month_to_tg1)
        }
    )
    .assign(
        **{
            field_weight_month: lambda df_cur: df_cur[field_month].replace(dict_num_days_per_month_weights),
            field_weight_tg1: lambda df_cur: df_cur[model_elec.field_nemomod_tg1].replace(dict_tg1_num_days_weights),
        }
    )
    .assign(
        **{
            format_cf(cat_name_hydro): lambda df_cur: (
                np.array(df_cur[format_cf(cat_name_hydro)])
                * np.array(df_cur[field_weight_month])
                / np.array(df_cur[field_weight_tg1])
            )
        }
    )
)

# summing the scaled values within each region/tg1 gives the day-weighted mean
df_cf_avg_hydro_by_tg = sf.simple_df_agg(
    df_capacity_factor_hydro_by_tg1,
    [field_iso, model_elec.field_nemomod_tg1],
    {format_cf(cat_name_hydro): "sum"}
)




##  NEXT, COMBINE SEASONAL VARIATION WITH ANNUAL CAPACITY FACTORS TO ESTIMATE CAPACITY FACTORS BY TIME SLICE

# get hydro CF by region/time slice: attach the tg1 averages to the time slice
# table (merged on the columns the two tables share), keep the output fields,
# and order by region and time slice
df_cf_hydro_by_ts = (
    attr_time_slice.table
    .copy()
    .drop(columns = ["description"])
    .merge(df_cf_avg_hydro_by_tg, how = "outer")
    [[field_iso, attr_time_slice.key, format_cf(cat_name_hydro)]]
    .sort_values(by = [field_iso, attr_time_slice.key])
)


# fill in missing regions: ISO codes defined for the regions that have no
# hydropower capacity factors
missing_regions = sorted(set(dict_iso_to_country.keys()) - set(df_cf_hydro_by_ts[field_iso]))

# use closest neighbor (by population centroid) with data if unavailable
if missing_regions:
    
    df_append = [df_cf_hydro_by_ts]
    
    for region in missing_regions:
        
        iso_closest = get_closest_region(
            region,
            attr_region,
            regions_valid = list(set(df_cf_hydro_by_ts[field_iso])),
            type_input = "iso",
            type_return = "iso"
        )
        
        # no usable neighbor--leave this region out
        if iso_closest is None:
            continue
        
        # copy the neighbor's rows and relabel them with the missing region
        df_cur = (
            df_cf_hydro_by_ts[df_cf_hydro_by_ts[field_iso] == iso_closest]
            .copy()
            .reset_index(drop = True)
            .assign(**{field_iso: region})
        )
        df_append.append(df_cur)
        
    df_cf_hydro_by_ts = pd.concat(df_append).reset_index(drop = True)

    
# finally, format for input table: convert ISO codes back to region names and 
# rename the fields to those used by the electricity model. The replaced 
# values are assigned back to the column, since replace(inplace = True) on a 
# column selection is chained assignment and is not guaranteed to modify the 
# DataFrame (it does not under pandas Copy-on-Write)
df_cf_hydro_by_ts[field_iso] = df_cf_hydro_by_ts[field_iso].replace(dict_iso_to_country)
df_cf_hydro_by_ts = (
    df_cf_hydro_by_ts
    .rename(
        columns = {
            field_iso: model_elec.field_nemomod_region,
            format_cf(cat_name_hydro): cat_name_hydro,
            attr_time_slice.key: model_elec.field_nemomod_time_slice
        }
    )
    .reset_index(drop = True)
)




#  Next, generate Solar Capacity Factors from World Bank/Solar Atlas data
- Country-wide annual averages are available from WB/Solar Atlas
- Use Sunrise/Sunset model coupled with assumptions about time to solar peak (and time after sunrise/before sunset before generation) at each region's population centroid to generate a diurnal irradiance curve (0 at night, e.g.)
- Then, combine diurnal irradiance with country-wide average to generate time_slice capacity factors for solar generation

In [316]:
# read the "Monthly data" sheet of the Solargis PV potential workbook, skipping
# its first row
# NOTE(review): absolute, user-specific path--this cell only runs on a machine
#   with this file; consider a configurable data directory
df_solar_data = pd.read_excel(
    "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/Energy/solargis_pvpotential_countryranking_2020_data.xlsx", 
    sheet_name = "Monthly data", 
    skiprows = [0]
)

# some fields
field_hour = "hour"
field_hour_group = "hour_group"
field_time_of_day = "time_of_day"
field_weight = "weight"

# month name -> month number; the names are used below to select the monthly
# columns of the solar data
dict_month_nm_to_num = {
    nm: num
    for num, nm in enumerate(
        [
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ],
        start = 1
    )
}




##  CLEAN AND REFORMAT

dict_rnm = {"ISO_A3": field_iso}

# keep the ISO code and the monthly columns, then reshape to one row per
# region and month
df_capacity_factor_solar = (
    df_solar_data[["ISO_A3"] + list(dict_month_nm_to_num.keys())]
    .rename(columns = dict_rnm)
    .melt(
        id_vars = [field_iso],
        value_vars = list(dict_month_nm_to_num.keys()),
        var_name = field_month,
        value_name = format_cf(cat_name_solar)
    )
)

# adjust fields
# month names -> month numbers. The result is assigned back to the column: 
# replace(inplace = True) on a column selection is chained assignment, which 
# is not guaranteed to modify the DataFrame (it does not under pandas 
# Copy-on-Write); the month-based replacements below would then silently do 
# nothing
df_capacity_factor_solar[field_month] = df_capacity_factor_solar[field_month].replace(dict_month_nm_to_num)
# NOTE(review): dividing by 24 presumably converts a daily value into a 
#   capacity factor--confirm the units of the source data
df_capacity_factor_solar[format_cf(cat_name_solar)] /= 24
df_capacity_factor_solar[model_elec.field_nemomod_tg1] = df_capacity_factor_solar[field_month].replace(dict_month_to_tg1)

# add weights for aggregation
df_capacity_factor_solar[field_weight_month] = df_capacity_factor_solar[field_month].replace(dict_num_days_per_month_weights)
df_capacity_factor_solar[field_weight_tg1] = df_capacity_factor_solar[model_elec.field_nemomod_tg1].replace(dict_tg1_num_days_weights)

# scale each monthly value by its share of the days in its tg1 so that the 
# sum within a tg1 is the day-weighted mean
df_capacity_factor_solar[format_cf(cat_name_solar)] = np.array(
    df_capacity_factor_solar[format_cf(cat_name_solar)]
)*np.array(
    df_capacity_factor_solar[field_weight_month]
)/np.array(df_capacity_factor_solar[field_weight_tg1])


df_cf_avg_solar_by_tg = sf.simple_df_agg(
    df_capacity_factor_solar,
    [field_iso, model_elec.field_nemomod_tg1],
    {format_cf(cat_name_solar): "sum"}
)




#################################################################################################
#   READ IN SOLAR HOUR GROUP FACTOR SCALARS BY CAPACITY FACTOR REGION (THESE ARE TEMPORARY)     #
#################################################################################################

# set up the regular expression to match hour groups on
def regex_by_hour_group(
    hour_group: int
) -> re.Pattern:
    """
    Build the compiled regular expression used to identify time slices that 
        belong to hour group `hour_group`. The pattern requires (from the 
        start of the string) any non-digit characters, a "w", any non-digit 
        characters, and then `hour_group` at the end of the string.
    
    Function Arguments
    ------------------
    - hour_group: hour group to build the pattern for
    """
    # raw f-string so that "\D" is passed to the regex engine as written 
    # instead of being treated as an (invalid) string escape
    return re.compile(rf"(\D*)w(\D*){hour_group}$")

# map time slices to hour group
attr_hour = sa.model_attributes.dict_attributes.get("hour")
all_hour_groups = sorted(set(attr_hour.table[field_hour_group]))

# a time slice is assigned to every hour group whose pattern it matches; if it
# matches more than one, the last hour group (in sorted order) is kept
dict_time_slice_to_hour_group = {
    time_slice: hg
    for hg, regex in zip(all_hour_groups, map(regex_by_hour_group, all_hour_groups))
    for time_slice in attr_time_slice.key_values
    if regex.match(time_slice) is not None
}

# initialize the output in terms of hour group: time slice table (without the
# description) plus the hour group of each time slice
df_cf_avg_solar_by_hour_group_base = attr_time_slice.table.copy().drop(columns = ["description"])
df_cf_avg_solar_by_hour_group_base[field_hour_group] = (
    df_cf_avg_solar_by_hour_group_base[attr_time_slice.key]
    .replace(dict_time_slice_to_hour_group)
)


    
##  NEXT, COMBINE SEASONAL VARIATION WITH ANNUAL CAPACITY FACTORS TO ESTIMATE CAPACITY FACTORS BY TIME SLICE

# accumulator for the per-region tables; filled by the loop over regions at the
# end of this cell and then replaced by their concatenation
df_cf_solar_by_ts = []
# not assigned again in the visible cells of this notebook
df_cf_solar_by_ts_wide = None

def combine_avg_cf_with_variability_solar(
    iso_region: str,
    df_cf_avg_solar_by_tg: pd.DataFrame,
    df_cf_avg_solar_by_hour_group_base: pd.DataFrame,
    field_cf_avg: str = format_cf(cat_name_solar),
    field_cfs: str = field_cfs,
    field_iso_avg: str = field_iso,
    field_iso_attr_region: str = field_iso_region_attr,
    field_hour_group: str = field_hour_group,
    field_lat: str = field_lat_region,
    field_lon: str = field_lon_region,
    field_nemomod_tg1: str = model_elec.field_nemomod_tg1,
    field_weight: str = field_weight, 
    model_attributes: ma.ModelAttributes = sa.model_attributes,
) -> Union[pd.DataFrame, None]:
    """
    For region `iso_region` (ISO alpha-3 code), build solar capacity factors
        by time slice by scaling average capacity factors to coincide with
        seasonal and hourly variability. Returns a DataFrame with the time
        slice, the capacity factor (in `field_cf_avg`), and the region, or 
        None if the region is not found in the region attribute table or if 
        the seasonal solar component cannot be built for it.
        
    NOTE: If a region is not present in the DataFrame specifying regional 
        average capacity factors, then the closest region, by population 
        centroid, is used (see get_closest_region()).

    
    Function Arguments
    ------------------
    - iso_region: ISO alpha-3 code of the region to build curve for
    - df_cf_avg_solar_by_tg: DataFrame containing the average solar 
        capacity factor by NemoMod TimeSlice Group 1 (tg1), which 
        represents seasonal variation in solar availability
    - df_cf_avg_solar_by_hour_group_base: DataFrame containing a map
        of time slice to hour group (maps across time slice group names 
        to hour groups)
    
    Keyword Arguments
    -----------------
    - field_cf_avg: field in df_cf_avg_solar_by_tg that stores the 
        average region-wide solar capacity factor
    - field_cfs: field used to store capacity factor scalar
    - field_hour_group: field in time slice attribute (from 
        model_attributes) that contains the hour group
    - field_iso_attr_region: iso field in the region attribute table
    - field_iso_avg: iso field in df_cf_avg_solar_by_tg
    - field_lat: field in attr_region.table that stores the latitude of
        the population centroid (used to determine solar curve)
    - field_lon: field in attr_region.table that stores the longitude of
        the population centroid (used to determine solar curve)
    - field_nemomod_tg1: field to use for time slice group 1 (NemoMod)
    - field_weight: field in df_cf_avg_solar_by_hour_group_base that 
        stores the weight of each time_slice
    - model_attributes: ModelAttributes object used to retrieve the region, 
        time slice group 1, and time slice attribute tables and passed to
        region_solar.build_solar_cf_seasonal_component_by_hour()
    """
    
    attr_region = model_attributes.dict_attributes.get("region")
    attr_tg1 = model_attributes.dict_attributes.get("ts_group_1")
    attr_time_slice = model_attributes.dict_attributes.get("time_slice")
    
    
    ##  GET AVERAGE CAPACITY FACTORS FOR SOLAR
    
    regions_valid = list(set(df_cf_avg_solar_by_tg[field_iso_avg]))
    
    # region used to pull the averages: the region itself if it has data, 
    # otherwise its closest neighbor that does
    iso_region_filt = (
        get_closest_region(
            iso_region,
            attr_region, 
            regions_valid = regions_valid,
            type_input = "iso",
            type_return = "iso",
        )
        if iso_region not in regions_valid
        else iso_region
    )
    
    df_cf_avg_solar_by_tg_cur = df_cf_avg_solar_by_tg[
        df_cf_avg_solar_by_tg[field_iso_avg] == iso_region_filt
    ].drop([field_iso_avg], axis = 1)
        
        
        
    ##  GET SOLAR IRRADIANCE SCALARS BY HOUR AND AGGREGATE, FOR EACH TS GROUP 1, TO HOUR GROUP
    
    # iso/latitude/longitude of the region; return None (rather than raising
    # an IndexError) if the region is not in the attribute table
    df_region_info = attr_region.table[
        attr_region.table[field_iso_attr_region] == iso_region
    ]
    if len(df_region_info) == 0:
        return None
    
    # build the solar region object using the region_solar object
    region_info_cur = list(
        df_region_info[[field_iso_attr_region, field_lat, field_lon]].iloc[0]
    )
    region_solar_obj = bds_reg.region_solar(*region_info_cur)
    
    # use the model_attributes passed to this function (not the global 
    # sa.model_attributes) so that the attribute tables read above and the 
    # seasonal component are built from the same object
    df_solar_factor_by_season = region_solar_obj.build_solar_cf_seasonal_component_by_hour(model_attributes)
    
    # any issue with lat/lon, return None
    if df_solar_factor_by_season is None:
        return None
    
    # long by hour group/tg1 (note: `field_hour` is read from the notebook 
    # namespace), then averaged over the hours in each hour group
    df_solar_factor_by_season_agg = df_solar_factor_by_season.drop([field_hour], axis = 1).melt(
        [field_hour_group],
        attr_tg1.key_values,
        var_name = field_nemomod_tg1
    )

    df_solar_factor_by_season_agg = sf.simple_df_agg(
        df_solar_factor_by_season_agg,
        [field_hour_group, field_nemomod_tg1],
        {"value": "mean"}
    )

    # merge into aggregate
    df_solar_factor_by_season_agg = pd.merge(
        df_cf_avg_solar_by_hour_group_base,
        df_solar_factor_by_season_agg.rename(columns = {"value": field_cfs}),
        how = "left"
    )
    

    # add in aggregate average 
    df_solar_factor_by_season_agg = pd.merge(
        df_solar_factor_by_season_agg,
        df_cf_avg_solar_by_tg_cur,
        how = "left"
    )
    
    
    ##  NEXT, ITERATE OVER TIME SLICE GROUP 1 TO RESCALE VARIATION TIME SERIES TO MATCH REGION-WIDE AVERAGES
    
    # group by NemoMod time slice group 1 (seasons)
    dfs_group = df_solar_factor_by_season_agg.groupby([field_nemomod_tg1])
        
    df_cf_solar_by_ts_by_region = []
    
    for _, df in dfs_group:

        vec_weight = np.array(df[field_weight])
        vec_scalar = np.array(df[field_cfs])
        vec_cf = np.array(df[field_cf_avg])

        # target average capacity factor for this tg1
        target = vec_cf[0]

        # weighted average of the unscaled curve over the group's time slices
        vec_total = vec_weight*(vec_scalar * vec_cf)/vec_weight.sum()
        total = vec_total.sum()
        
        # rescale so that the weighted average matches the target; if the 
        # unscaled curve is identically zero there is nothing to rescale, so 
        # use 0 instead of dividing by zero (which gave NaN capacity factors)
        scalar = target/total if (total != 0) else 0.0

        # update capacity factor, bounded to [0, 1]
        vec_cf_new = sf.vec_bounds(vec_scalar * vec_cf * scalar, (0, 1.0))

        df_add = df[[attr_time_slice.key]].copy()
        df_add[field_cf_avg] = vec_cf_new

        df_cf_solar_by_ts_by_region.append(df_add)

    df_cf_solar_by_ts_by_region = pd.concat(df_cf_solar_by_ts_by_region, axis = 0).reset_index(drop = True)
    df_cf_solar_by_ts_by_region[attr_region.key] = iso_region

    return df_cf_solar_by_ts_by_region



# loop over regions to build solar capacity factor by time slice
for iso_region, region in dict_iso_to_country.items():

    # only regions defined in the region attribute table are built
    if region not in attr_region.key_values:
        continue

    # long table of capacity factors by time slice for this region; appended
    # as-is, so the list can contain None for regions that could not be built
    df_cf_solar_by_ts_by_region = combine_avg_cf_with_variability_solar(
        iso_region,
        df_cf_avg_solar_by_tg,
        df_cf_avg_solar_by_hour_group_base
    )
    df_cf_solar_by_ts.append(df_cf_solar_by_ts_by_region)

# get solar cf by time slice for all available regions--stack the per-region
# tables into a single long table
df_cf_solar_by_ts = pd.concat(df_cf_solar_by_ts, axis = 0).reset_index(drop = True)


# finally, format for input table: convert ISO codes back to region names and 
# rename the fields to those used by the electricity model. The replaced 
# values are assigned back to the column, since replace(inplace = True) on a 
# column selection is chained assignment and is not guaranteed to modify the 
# DataFrame (it does not under pandas Copy-on-Write)
df_cf_solar_by_ts[attr_region.key] = df_cf_solar_by_ts[attr_region.key].replace(dict_iso_to_country)
df_cf_solar_by_ts = df_cf_solar_by_ts.rename(
    columns = { 
        attr_region.key: model_elec.field_nemomod_region,
        format_cf(cat_name_solar): cat_name_solar,
        attr_time_slice.key: model_elec.field_nemomod_time_slice
    }
)
        
    

##  Get other, constant Capacity Factors
- 2008-2012 regional averages by technology:
    - https://www.eia.gov/todayinenergy/detail.php?id=22832
- additional information on Ocean from https://www.nrel.gov/analysis/tech-cap-factor.html
- use https://www.pnas.org/doi/10.1073/pnas.2205429119 for other generation sources (biomass, wind, geothermal, fossil, nuclear)
- Assume 0.5 in absence of other information



In [329]:
# technology attribute table (not referenced again in the visible cells)
attribute_tech = sa.model_attributes.dict_attributes.get("cat_technology")

# constant capacity factor by technology; each entry becomes a constant column
# of the capacity factor table built below
# NOTE(review): per the notes above this cell, 0.5 is the value assumed in the
#   absence of other information--confirm the source of each remaining value
dict_techs_to_capacity_factors = {
    "pp_biogas": 0.5,
    "pp_biomass": 0.37,
    "pp_coal": 0.36,
    #"pp_coal_ccs": 0.36,
    "pp_gas": 0.36,
    #"pp_gas_ccs": 0.36,
    "pp_geothermal": 0.67,
    "pp_nuclear": 0.8,
    "pp_ocean": 0.25,
    "pp_oil": 0.36,
    "pp_waste_incineration": 0.5,
    "pp_wind": 0.26
}



##  Build full dataframe of capacity factor inputs

In [330]:
# build full: join the hydropower and solar capacity factors on the columns
# they share, then add one constant column per remaining technology
df_capacity_factor = (
    pd.merge(df_cf_hydro_by_ts, df_cf_solar_by_ts)
    .assign(**dict_techs_to_capacity_factors)
)

In [336]:
# write the capacity factor table (without the index) to the path registered
# under "CapacityFactor" in sa.dict_fp_csv_nemomod
df_capacity_factor.to_csv(
    sa.dict_fp_csv_nemomod.get("CapacityFactor"),
    index = None,
    encoding = "UTF-8"
)

In [287]:
## build capacity factors by country

# read back the table stored at the "CapacityFactor" path (the same path the
# previous cell writes to)
df_capacity_factor_base = pd.read_csv(sa.dict_fp_csv_nemomod.get("CapacityFactor"));


