In [1]:
# add the local SISEPUEDE repository clones to sys.path so they can be imported
import sys
import pathlib
import warnings
warnings.filterwarnings("ignore")


# Root directory that holds the local clones of the SISEPUEDE repositories.
path_git = pathlib.Path("/Users/usuario/git")

# Append each clone to sys.path (once) so the imports below resolve.
# The last entry previously read "sisepuede_juypyter" (transposed letters),
# which does not match the "sisepuede_jupyter" repository named where its
# module is imported further down, so that clone was never put on sys.path.
for subdir in [
    "sisepuede",
    "sisepuede_data_pipeline",
    "sisepuede_jupyter",
]:
    path_cur = path_git.joinpath(subdir)
    if str(path_cur) not in sys.path:
        sys.path.append(str(path_cur))

# Kept as a shortcut to the data pipeline clone.
path_pipeline = path_git.joinpath("sisepuede_data_pipeline")


import importlib
import matplotlib.pyplot as plt
import numpy as np
import os, os.path
import pandas as pd
import sisepuede.legacy.data_api as api
import sisepuede.manager.sisepuede_examples as sxl
import sisepuede.manager.sisepuede_file_structure as sfs
import sisepuede.manager.sisepuede_models as sm
import sisepuede.plotting.plots as spp
import sisepuede.utilities._plotting as spu
import sisepuede.core.support_classes as sc
import sisepuede.utilities._toolbox as sf
import time
from typing import *

# from sisepuede_data_pipeline
import lib.process_utilities as pu
import lib.sisepuede_data_constructs as dc
import lib._util as lutil

# from sisepuede_jupyter
import temp_update_fields_from_wv_to_main as temp 


In [4]:
##  DIRECTORIES

# everything is resolved relative to the directory the kernel was started in
path_cur = pathlib.Path.cwd()
path_data = path_cur / "data"
# path_transformations = path_cur / "transformations_peru"


##  FILES

path_data_calib = path_data / "input_all_sectors_peru.csv"
path_data_built = path_data / "sisepuede_raw_global_inputs_peru.csv"


##  PIPELINE OBJECTS

file_struct = sfs.SISEPUEDEFileStructure()
examples = sxl.SISEPUEDEExamples()

# shortcuts used throughout the notebook
df_example_input = examples("input_data_frame")
matt = file_struct.model_attributes
regions = sc.Regions(matt)
time_periods = sc.TimePeriods(matt)


##  GLOBALS

# regions to build: every region key whose "un_sub_region" attribute is
# "Latin America and the Caribbean", in sorted order
tab = regions.attributes.table
_in_lac = tab["un_sub_region"].isin(["Latin America and the Caribbean"])
_REGIONS_BUILD = sorted(tab.loc[_in_lac, regions.key].unique())

# the same regions, passed through return_region_or_iso with return_type "iso"
_REGIONS_ISO = [
    regions.return_region_or_iso(region, return_type = "iso")
    for region in _REGIONS_BUILD
]


# Set up the old repository and read its data

In [9]:
# Local clone of the legacy batch data repository.
path_repo_old = "/Users/usuario/git/sisepuede_data"
repo_old = api.SISEPUEDEBatchDataRepository(path_repo_old, matt)

# Read the legacy data into a single frame.
# NOTE(review): `None` presumably means "read every available field" -- confirm
# against SISEPUEDEBatchDataRepository.read.
print("Getting old repository data...")
df_old = repo_old.read(None)
print("Old repository data complete.")

Getting old repository data...
Getting old repository data...


# Set up the new repository

In [10]:
# Re-import the data constructs module so that edits made to it on disk are
# picked up without restarting the kernel.
importlib.reload(dc)

# Data constructs backed by the pipeline's SQLite output database.
path_database = "/Users/usuario/git/sisepuede_data_pipeline/sisepuede_inputs.sqlite"
construct = dc.SISEPUEDEDataConstructs(path_output_database = path_database)

# Pipeline repository pointing at a local directory.
path_repo = pathlib.Path("/Users/usuario/SISEPUEDE_DATA_REPOSITORY")
config_repo = {"local": {"path": str(path_repo)}}
repo = pu.Repository(config_repo)

# Build the input table from the database, restricted to the build regions.
df_from_pipeline = construct.build_inputs_from_database(regions_keep = _REGIONS_BUILD)


In [46]:
# Inspect the ABW rows that have a gasoline fuel cost.
# NOTE(review): `df_base` is not assigned at notebook level in any cell shown
# above (only as a local inside function_combine) -- this cell presumably relies
# on state from an earlier kernel session; confirm or define it explicitly.
(
    df_base
    .loc[lambda frame: frame[regions.field_iso].isin(["ABW"])]
    .dropna(subset = ["cost_enfu_fuel_gasoline_usd_per_m3"])
)

Unnamed: 0,iso_alpha_3,year,ef_lndu_conv_croplands_to_croplands_gg_co2_ha,ef_lndu_conv_croplands_to_forests_mangroves_gg_co2_ha,ef_lndu_conv_croplands_to_forests_primary_gg_co2_ha,ef_lndu_conv_croplands_to_forests_secondary_gg_co2_ha,ef_lndu_conv_croplands_to_grasslands_gg_co2_ha,ef_lndu_conv_croplands_to_other_gg_co2_ha,ef_lndu_conv_croplands_to_settlements_gg_co2_ha,ef_lndu_conv_croplands_to_wetlands_gg_co2_ha,...,pij_lndu_wetlands_to_other,pij_lndu_wetlands_to_settlements,pij_lndu_wetlands_to_wetlands,cost_enfu_fuel_biofuels_usd_per_m3,cost_enfu_fuel_crude_usd_per_m3,cost_enfu_fuel_diesel_usd_per_m3,cost_enfu_fuel_furnace_gas_usd_per_m3,cost_enfu_fuel_gasoline_usd_per_m3,cost_enfu_fuel_kerosene_usd_per_m3,cost_enfu_fuel_oil_usd_per_m3
55,ABW,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,521.090317,367.362555,1840.438927,13725.64626,2070.217029,274.088907,524.80365
56,ABW,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,521.090317,367.362555,1840.438927,13725.64626,2070.217029,274.088907,524.80365
57,ABW,2017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,521.090317,367.362555,1840.438927,13725.64626,2070.217029,274.088907,524.80365
58,ABW,2018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,521.090317,367.362555,1840.438927,13725.64626,2070.217029,274.088907,524.80365
59,ABW,2019.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,521.090317,367.362555,1840.438927,13725.64626,2070.217029,274.088907,524.80365
60,ABW,2020.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,521.090317,367.362555,1840.438927,13725.64626,2070.217029,274.088907,524.80365
61,ABW,2021.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,521.090317,367.362555,1840.438927,13725.64626,2070.217029,274.088907,524.80365
62,ABW,2022.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,521.090317,367.362555,1840.438927,13725.64626,2070.217029,274.088907,524.80365
63,ABW,2023.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,521.090317,367.362555,1840.438927,13725.64626,2070.217029,274.088907,524.80365
64,ABW,2024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,521.090317,367.362555,1840.438927,13725.64626,2070.217029,274.088907,524.80365


In [83]:

def function_combine(
    df_repo_new: pd.DataFrame,
    df_repo_old: pd.DataFrame,
    df_example: pd.DataFrame,
    region_iso: str,
) -> pd.DataFrame:
    """Build the input table for one region by layering three sources.

    The sources are combined in this order:

        1. rows for `region_iso` from the old repository, keeping only
           columns that have no missing values for that region;
        2. pipeline data, brought in with `sf.match_df_to_target_df` on
           year and ISO code;
        3. model input fields still absent after (1) and (2), taken from the
           example input frame and joined on the time period field.

    The result is finally passed through `temp.update_fields`.

    The function also reads notebook-level objects: `repo_old`, `regions`,
    `time_periods`, `construct`, `matt`, `sf` and `temp`.

    Function Arguments
    ------------------
    df_repo_new : pd.DataFrame
        Data built from the pipeline database.
    df_repo_old : pd.DataFrame
        Data read from the old repository; must contain the columns named by
        `repo_old.field_repo_iso` and `repo_old.field_repo_year`. Anything
        that is not a DataFrame falls back to the notebook-level `df_old`
        (see the note in the body).
    df_example : pd.DataFrame
        Example input frame used to fill fields missing from both
        repositories; must contain `time_periods.field_time_period`.
    region_iso : str
        ISO code of the region to build.

    Returns
    -------
    pd.DataFrame
        Combined input table for `region_iso`.
    """

    # Backward compatibility: this function used to ignore its DataFrame
    # arguments and read notebook globals, and existing cells pass a
    # placeholder (a list) as `df_repo_old`. Keep those calls working.
    if not isinstance(df_repo_old, pd.DataFrame):
        df_repo_old = df_old


    ##  FORMAT A BASE

    df_base = (
        df_repo_old[
            df_repo_old[repo_old.field_repo_iso].isin([region_iso])
        ]
        .copy()
        .rename(
            columns = {
                repo_old.field_repo_iso: regions.field_iso,
                repo_old.field_repo_year: time_periods.field_year,
            }
        )
    )

    # keep only columns that are fully populated for this region
    df_base = df_base.dropna(axis = 1, how = "any", )
    df_base[time_periods.field_year] = df_base[time_periods.field_year].astype(int)


    ##  ADD IN PIPELINE DATA

    # NOTE(review): match_df_to_target_df is assumed to align the pipeline
    # data to the base on the listed key fields -- confirm in sf.
    df_out = sf.match_df_to_target_df(
        df_base,
        df_repo_new,
        [
            construct.time_periods.field_year,
            regions.field_iso,
        ],
        overwrite_only = False,
    )

    df_out[time_periods.field_year] = df_out[time_periods.field_year].astype(int)
    # presumably adds the time period field used for the merge below
    df_out = time_periods.years_to_tps(df_out, )


    ##  PULL MISSING FIELDS FROM EXAMPLE DF

    # model input fields that neither repository supplied for this region
    fields_input = set(matt.all_variable_fields_input)
    fields_from_ex = [
        x for x in df_example.columns
        if (x not in df_out.columns) and (x in fields_input)
    ]

    # the time period field is the only column shared by both frames, so the
    # join key is spelled out rather than left to pandas' implicit default
    df_out = pd.merge(
        df_out,
        df_example[fields_from_ex + [time_periods.field_time_period]],
        on = time_periods.field_time_period,
        how = "inner",
    )


    ##  TEMPORARY SCRIPT FOR MOVING FROM working_version TO latest full version

    df_out = temp.update_fields(
        df_out,
        matt,
    )

    return df_out


# quick check on a single region
function_combine(
    df_from_pipeline,
    df_old,
    df_example_input,
    "ABW",
)

(36, 2405)

In [87]:
# Build the combined input table for every region, then stack the results.
# The per-region frames are collected under their own name so that `df_out`
# is only ever a DataFrame, and the old repository frame (not the accumulator
# list) is what gets passed as the second argument.
dfs_by_region = []

for iso in _REGIONS_ISO:
    df_cur = function_combine(
        df_from_pipeline,
        df_old,
        df_example_input,
        iso,
    )
    # make sure every row carries the ISO code of the region it was built for
    df_cur[regions.field_iso] = iso

    dfs_by_region.append(df_cur)

df_out = sf._concat_df(dfs_by_region, )

