# Format Batch Data for SISEPUEDE Data
- Transform generated batch data into inputs usable by the data input pipelines

In [2]:
import os, os.path
import numpy as np
import pandas as pd
import model_attributes as ma
from attribute_table import AttributeTable
import setup_analysis as sa
import support_classes as sc
import support_functions as sf
import importlib
import time
from typing import *
import time




In [32]:
# read in some data sets
#  set up a dictionary that maps datasets to maximum historical year
# NOTE(review): no data sets are read in this cell as shown; the comments
#  above appear to describe code that is no longer here--confirm.

# re-import support_classes so that the name `sc` refers to the reloaded module
importlib.reload(sc)

<module 'support_classes' from '/Users/jsyme/Documents/Projects/git_jbus/lac_decarbonization/python/support_classes.py'>

In [169]:


# initialize placeholders
all_regions = None
dict_isos, dict_sets = {}, {}

# field names shared across the notebook
field_country, field_iso, field_region, field_year = (
    "country",
    "iso_code3",
    "nation",
    "year",
)
fields_drop = [field_iso, field_region, field_year] # only apply later
field_time_period = sa.model_attributes.dim_time_period

# get some attributes from the model attributes dictionary
attr_region, attr_sector, attr_strat, attr_time_period = (
    sa.model_attributes.dict_attributes.get(key)
    for key in (
        "region",
        "abbreviation_sector",
        f"dim_{sa.model_attributes.dim_strategy_id}",
        f"dim_{sa.model_attributes.dim_time_period}",
    )
)

# support objects built from the shared model attributes
time_periods = sc.TimePeriods(sa.model_attributes)
regions = sc.Regions(sa.model_attributes)

# local clone of the sisepuede_data repository (output target)
dir_git_sisepuede_data = "/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data"

  for i, df in df_in_grouped:


In [171]:
def file_to_sd_dirs(
    fp_csv: Union[str, None],
    fields_ind: List[str],
    years_historical: List[int],
    dict_rename: Union[Dict[str, str], None] = None,
    key_historical: str = "historical",
    key_projected: str = "projected",
    model_attributes: ma.ModelAttributes = sa.model_attributes,
    regions: Union[sc.Regions, None] = None,
    time_periods: Union[sc.TimePeriods, None] = None,
) -> Union[Dict[str, Dict[str, pd.DataFrame]], None]:
    """
    Read a CSV file and split every model variable field it contains into
        a historical and a projected data frame.

    Returns a dictionary of the form

        {
            field: {
                key_historical: df_historical,
                key_projected: df_projected
            }
        }

    where each data frame contains the index fields plus the variable field.
        Rows whose year is in years_historical are historical; all other
        rows are projected.

    Returns None if:
        * fp_csv is None or does not exist
        * years_historical is not list-like or is empty
        * the file contains neither a year field nor a time period field
        * no index fields or no model variable fields are found in the file

    Function Arguments
    ------------------
    - fp_csv: file path of CSV to read
    - fields_ind: fields to use as index (present in all output CSVs).
        NOTE: if dict_rename is not None and any index fields are
        renamed by dict_rename, then fields_ind should specify renaming
        targets.
    - years_historical: years to consider as historical

    Keyword Arguments
    -----------------
    - dict_rename: optional dictionary used to rename fields that are read
        in.
        NOTE: Renaming occurs *before* extracting fields_ind, so fields_ind
            should reference target renamed fields if applicable
    - key_historical: dictionary key to use for historical (used in output
        directory structure)
    - key_projected: dictionary key to use for projected (used in output
        directory structure)
    - model_attributes: model_attributes.ModelAttributes object used to
        reference and coordinate variables
    - regions: optional support_classes.Regions object to use to coordinate
        regions and ISO codes. Not used in the body beyond initialization.
    - time_periods: optional support_classes.TimePeriods object to use to
        coordinate time periods and years
    """

    # some quick checks on the inputs
    if fp_csv is None:
        return None

    if not os.path.exists(fp_csv):
        return None

    if not sf.islistlike(years_historical):
        return None

    if len(years_historical) == 0:
        return None

    # check time periods/regions (each defaults to its own support class)
    dict_vars = model_attributes.dict_variables_to_model_variables
    regions = sc.Regions(model_attributes) if (regions is None) else regions
    time_periods = sc.TimePeriods(model_attributes) if (time_periods is None) else time_periods

    # read and check time indexing--add years if only time period is included
    df_csv = pd.read_csv(fp_csv)
    has_year = time_periods.field_year in df_csv.columns
    has_time_period = time_periods.field_time_period in df_csv.columns
    if not (has_year or has_time_period):
        return None

    if not has_year:
        df_csv = time_periods.tps_to_years(df_csv)

    # rename csv fields; only keys that are present in the file are applied
    dict_rnm = (
        {k: v for k, v in dict_rename.items() if k in df_csv.columns}
        if isinstance(dict_rename, dict)
        else {}
    )
    df_csv = df_csv.rename(columns = dict_rnm)

    # the year field may itself have been renamed
    field_year = dict_rnm.get(time_periods.field_year, time_periods.field_year)
    if field_year not in df_csv.columns:
        return None

    # get fields and return None if invalid
    fields_ind = [x for x in fields_ind if x in df_csv.columns]
    fields_dat = [x for x in df_csv.columns if x in dict_vars]
    if min(len(fields_dat), len(fields_ind)) == 0:
        return None

    # build the historical mask once from the full table so that the split
    #   works even if the year field is not one of the index fields
    mask_historical = df_csv[field_year].isin(years_historical)

    # initialize and build output
    dict_out = {}
    for fld in fields_dat:

        df_ext = df_csv[fields_ind + [fld]]

        dict_out[fld] = {
            key_historical: df_ext[mask_historical].reset_index(drop = True),
            key_projected: df_ext[~mask_historical].reset_index(drop = True),
        }

    return dict_out
    


def generate_sisepuede_data_inputs(
    dir_batch: str,
    dir_sisepuede_data: str,
    dict_years_historical: Union[Dict[str, List[int]], List[int]],
    dirs_ignore: Union[List[str], None] = None,
    ext_read: str = "csv",
    field_iso_out: str = field_iso,
    field_region_out: str = "Nation",
    field_time_period: str = field_time_period,
    field_year: str = field_year,
    field_year_out: str = "Year",
    fps_ignore: Union[List[str], None] = None,
    key_historical: str = "historical",
    key_projected: str = "projected",
    model_attributes: ma.ModelAttributes = sa.model_attributes,
    regions: Union[sc.Regions, None] = None,
    time_periods: Union[sc.TimePeriods, None] = None,
    write_q: bool = True,
) -> Tuple[Dict[str, Dict[str, pd.DataFrame]], Dict[str, Dict[str, str]]]:
    """
    Using directory dir_batch (in sisepuede repository), generate inputs
        for sisepuede_data repo.

    Every file with extension ext_read found in an immediate subdirectory of
        dir_batch is passed to file_to_sd_dirs(); each model variable field
        found is split into historical and projected tables, which are
        (optionally) written to

        dir_sisepuede_data/SECTOR/FIELD/input_to_sisepuede/KEY/FIELD.csv

        where KEY is key_historical or key_projected.

    Returns a tuple of two dictionaries, both keyed by field:

        (
            {field: {key_historical: df, key_projected: df}},
            {field: {key_historical: path, key_projected: path}}
        )

    NOTE: if the same field occurs in more than one file, the file read last
        overwrites earlier entries (read order follows os.listdir).

    Function Arguments
    ------------------
    - dir_batch: directory storing batch data using lac_decarbonization structure
    - dir_sisepuede_data: directory to write to
    - dict_years_historical: dictionary mapping a file name to years
        historical OR a list of integer years to consider historical

    Keyword Arguments
    -----------------
    - dirs_ignore: list of subdirectories (as paths joined to dir_batch) to
        ignore
    - ext_read: extension of input files to read
    - field_iso_out: name of the ISO code field in output files
    - field_region_out: name of the region field in output files
    - field_time_period: currently unused; retained for interface
        compatibility
    - field_year: currently unused; retained for interface compatibility
    - field_year_out: name of the year field in output files
    - fps_ignore: optional file paths to ignore
    - key_historical: dictionary key to use for historical (used in output
        directory structure)
    - key_projected: dictionary key to use for projected (used in output
        directory structure)
    - model_attributes: model attributes object to use for variable checking
    - regions: optional support_classes.Regions object to use to coordinate
        regions and ISO codes
    - time_periods: optional support_classes.TimePeriods object to use to
        coordinate time periods and years
    - write_q: write output data to files
    """

    dict_vars = model_attributes.dict_variables_to_model_variables
    regions = sc.Regions(model_attributes) if (regions is None) else regions
    time_periods = sc.TimePeriods(model_attributes) if (time_periods is None) else time_periods

    # some field initialization; index fields are specified as rename targets
    fields_ind = [field_year_out, field_iso_out]
    dict_rename = {
        model_attributes.dim_region: field_region_out,
        "country": field_region_out,
        "nation": field_region_out,
        regions.field_iso: field_iso_out,
        time_periods.field_year: field_year_out
    }

    # normalize the ignore collections so that None (the default) is valid
    dirs_skip = set(dirs_ignore) if sf.islistlike(dirs_ignore) else set()
    fps_skip = set(fps_ignore) if sf.islistlike(fps_ignore) else set()

    # directory checks--make output if it does not exist + get subdirectories
    #   to check for available data
    os.makedirs(dir_sisepuede_data, exist_ok = True)
    subdirs = [
        x for x in os.listdir(dir_batch)
        if os.path.join(dir_batch, x) not in dirs_skip
    ]

    dict_out = {}
    dict_paths = {}

    for subdir in subdirs:
        fp_subdir = os.path.join(dir_batch, subdir)
        if not os.path.isdir(fp_subdir):
            continue

        fns_read = [x for x in os.listdir(fp_subdir) if x.endswith(f".{ext_read}")]

        for fn in fns_read:

            fp_read = os.path.join(fp_subdir, fn)
            if fp_read in fps_skip:
                continue

            # historical years are specified per file name or globally
            years_historical = (
                dict_years_historical.get(fn)
                if isinstance(dict_years_historical, dict)
                else dict_years_historical
            )

            dict_read = file_to_sd_dirs(
                fp_read,
                fields_ind,
                years_historical,
                dict_rename = dict_rename,
                key_historical = key_historical,
                key_projected = key_projected,
                model_attributes = model_attributes,
                regions = regions,
                time_periods = time_periods,
            )

            if dict_read is None:
                continue

            # get variable information and build output paths by sector/field
            for fld in dict_read:
                modvar = dict_vars.get(fld)
                sector = model_attributes.get_variable_subsector(modvar)
                sector = model_attributes.get_subsector_attribute(sector, "sector")

                fp_out_base = os.path.join(dir_sisepuede_data, sector, fld, "input_to_sisepuede")
                dict_paths[fld] = {
                    key_historical: os.path.join(fp_out_base, key_historical, f"{fld}.csv"),
                    key_projected: os.path.join(fp_out_base, key_projected, f"{fld}.csv"),
                }

            dict_out.update(dict_read)

    # NOTE(review): debugging leftovers--copies of the outputs are exposed as
    #   module-level globals. Retained so that any code reading dict1/dict2
    #   keeps working; consider removing and using the return value instead.
    global dict1
    global dict2

    dict1 = dict_out.copy()
    dict2 = dict_paths.copy()

    # write outputs?
    if write_q:

        for fld, dict_dfs_cur in dict_out.items():

            dict_paths_cur = dict_paths.get(fld)

            for key in [key_historical, key_projected]:

                df_write = dict_dfs_cur.get(key)
                if df_write is None:
                    continue

                # make sure the target directory exists before writing
                fp = dict_paths_cur.get(key)
                os.makedirs(os.path.dirname(fp), exist_ok = True)

                df_write.to_csv(
                    fp,
                    index = False,
                    encoding = "UTF-8"
                )

                print(f"DataFrame successfully written to '{fp}'")

    return dict_out, dict_paths
    

    
    

# subdirectories of the batch data directory that are skipped entirely
dirs_ignore = [sa.dir_rbd_baseline_transition_probs]

# individual files that are skipped
fps_ignore = [
    sa.fp_csv_afolu_import_exports,
    sa.fp_csv_initial_industrial_production,
    sa.fp_csv_transition_probability_estimation_annual,
    sa.fp_csv_transition_probability_estimation_mean,
    sa.fp_csv_transition_probability_estimation_mean_recent,
    sa.fpt_csv_transition_probability_estimation_mean_with_growth,
    sa.fpt_pkl_transition_probability_estimation_mean_with_growth_assumptions,
    sa.fp_csv_industrial_production_scalar,
    sa.fp_csv_elasticity_of_industrial_production,
]

# build the per-field tables and write them to the sisepuede_data directory
dict_dfs, dict_paths = generate_sisepuede_data_inputs(
    dir_batch = sa.dir_ref_batch_data,
    dir_sisepuede_data = dir_git_sisepuede_data,
    # TEMPORARY SETUP FOR HISTORICAL YEARS - NEED TO REWRITE TO PULL FROM DICT AND/OR YAML
    dict_years_historical = range(2010, 2021),
    dirs_ignore = dirs_ignore,
    field_region_out = "Nation",
    field_iso_out = field_iso,
    field_year_out = "Year",
    fps_ignore = fps_ignore,
    model_attributes = sa.model_attributes,
    regions = regions,
    time_periods = time_periods,
    write_q = True,
)
    


DataFrame successfully written to '/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data/IPPU/ef_ippu_tonne_c5f12_per_tonne_production_chemicals/input_to_sisepuede/historical/ef_ippu_tonne_c5f12_per_tonne_production_chemicals.csv'
DataFrame successfully written to '/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data/IPPU/ef_ippu_tonne_c5f12_per_tonne_production_chemicals/input_to_sisepuede/projected/ef_ippu_tonne_c5f12_per_tonne_production_chemicals.csv'
DataFrame successfully written to '/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data/IPPU/ef_ippu_tonne_nf3_per_tonne_production_chemicals/input_to_sisepuede/historical/ef_ippu_tonne_nf3_per_tonne_production_chemicals.csv'
DataFrame successfully written to '/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data/IPPU/ef_ippu_tonne_nf3_per_tonne_production_chemicals/input_to_sisepuede/projected/ef_ippu_tonne_nf3_per_tonne_production_chemicals.csv'
DataFrame successfully written to '/Users/jsyme/Documents/Projects/git_jbus/si

In [12]:
# calibration table of IPPU variables (first two rows of the sheet skipped)
df_ip = pd.read_excel(
    os.path.join(
        sa.dir_ref,
        "preliminary_calibration_info",
        "df_ip_var_calib.xlsx",
    ),
    skiprows = 2,
)
df_efs = pd.read_csv(sa.fp_csv_ippu_fc_efs)

# fields of df_efs that are listed as variables in df_ip
#   NOTE(review): the result is not assigned, so it is discarded
[x for x in df_efs.columns if x in list(df_ip["Variable"])]

# variables defined for the IPPU sector
vars_need = sa.model_attributes.build_variable_dataframe_by_sector(
    "IPPU",
    include_time_periods = False,
)

# in the calibration table only / in the model attributes only
vars_del = set(df_ip["Variable"]).difference(vars_need["variable"])
vars_add = set(vars_need["variable"]).difference(df_ip["Variable"])

for k in sorted(vars_add):
    print(k)

In [3]:
# read the two calibrated input files that are compared below
df1 = pd.read_csv(
    "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/calibrated_input_files_from_edmundo/en/data_complete_future_2023_04_27.csv"
)
df2 = pd.read_csv(
    "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/calibrated_input_files_from_edmundo/en/data_complete_future_2023_05_05.csv"
)

In [10]:
field_iso = "iso_code3"

# restrict both tables to Brazil
df1a, df2a = (
    df[df[field_iso].isin(["BRA"])].reset_index(drop = True)
    for df in (df1, df2)
)

# fields present in both tables, excluding the region identifiers
fields_share = [
    x
    for x in sorted(set(df1.columns).intersection(df2.columns))
    if x not in ("Nation", field_iso)
]

# shared fields whose value in the final row differs between the two tables
vec = []
for x in fields_share:
    if float(df1a[x].iloc[-1]) != float(df2a[x].iloc[-1]):
        vec.append(x)

# all fields of the second table that mention hydrocarbon
vec2 = [col for col in df2a.columns if "hydrocarbon" in col]


In [11]:
# show the last rows of the fields in vec2 for the first table (df1a)
df1a[vec2].tail()

Unnamed: 0,exports_enfu_pj_fuel_hydrocarbon_gas_liquids,frac_enfu_fuel_demand_imported_pj_fuel_hydrocarbon_gas_liquids,frac_inen_energy_agriculture_and_livestock_hydrocarbon_gas_liquids,frac_inen_energy_cement_hydrocarbon_gas_liquids,frac_inen_energy_chemicals_hydrocarbon_gas_liquids,frac_inen_energy_electronics_hydrocarbon_gas_liquids,frac_inen_energy_glass_hydrocarbon_gas_liquids,frac_inen_energy_lime_and_carbonite_hydrocarbon_gas_liquids,frac_inen_energy_metals_hydrocarbon_gas_liquids,frac_inen_energy_other_product_manufacturing_hydrocarbon_gas_liquids,...,frac_inen_energy_recycled_plastic_hydrocarbon_gas_liquids,frac_inen_energy_recycled_rubber_and_leather_hydrocarbon_gas_liquids,frac_inen_energy_recycled_textiles_hydrocarbon_gas_liquids,frac_inen_energy_recycled_wood_hydrocarbon_gas_liquids,frac_inen_energy_rubber_and_leather_hydrocarbon_gas_liquids,frac_inen_energy_textiles_hydrocarbon_gas_liquids,frac_inen_energy_wood_hydrocarbon_gas_liquids,frac_scoe_heat_energy_commercial_municipal_hydrocarbon_gas_liquids,frac_scoe_heat_energy_other_se_hydrocarbon_gas_liquids,frac_scoe_heat_energy_residential_hydrocarbon_gas_liquids
35,10.350032,0.174164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.056032,0.177418,0.060975
36,10.350032,0.174164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.056032,0.177418,0.060975
37,10.350032,0.174164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.056032,0.177418,0.060975
38,10.350032,0.174164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.056032,0.177418,0.060975
39,10.350032,0.174164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.056032,0.177418,0.060975


In [12]:
# show the last rows of the fields in vec2 for the second table (df2a)
df2a[vec2].tail()

Unnamed: 0,exports_enfu_pj_fuel_hydrocarbon_gas_liquids,frac_enfu_fuel_demand_imported_pj_fuel_hydrocarbon_gas_liquids,frac_inen_energy_agriculture_and_livestock_hydrocarbon_gas_liquids,frac_inen_energy_cement_hydrocarbon_gas_liquids,frac_inen_energy_chemicals_hydrocarbon_gas_liquids,frac_inen_energy_electronics_hydrocarbon_gas_liquids,frac_inen_energy_glass_hydrocarbon_gas_liquids,frac_inen_energy_lime_and_carbonite_hydrocarbon_gas_liquids,frac_inen_energy_metals_hydrocarbon_gas_liquids,frac_inen_energy_other_product_manufacturing_hydrocarbon_gas_liquids,...,frac_inen_energy_recycled_plastic_hydrocarbon_gas_liquids,frac_inen_energy_recycled_rubber_and_leather_hydrocarbon_gas_liquids,frac_inen_energy_recycled_textiles_hydrocarbon_gas_liquids,frac_inen_energy_recycled_wood_hydrocarbon_gas_liquids,frac_inen_energy_rubber_and_leather_hydrocarbon_gas_liquids,frac_inen_energy_textiles_hydrocarbon_gas_liquids,frac_inen_energy_wood_hydrocarbon_gas_liquids,frac_scoe_heat_energy_commercial_municipal_hydrocarbon_gas_liquids,frac_scoe_heat_energy_other_se_hydrocarbon_gas_liquids,frac_scoe_heat_energy_residential_hydrocarbon_gas_liquids
34,10.350032,0.174164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.01198,0.01198,0.083318
35,10.350032,0.174164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.01198,0.01198,0.083318
36,10.350032,0.174164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.01198,0.01198,0.083318
37,10.350032,0.174164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.01198,0.01198,0.083318
38,10.350032,0.174164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.01198,0.01198,0.083318
