In [1]:
# add path to sisepuede to sys.path in python
import sys
import pathlib
import warnings
warnings.filterwarnings("ignore")


# Base directory under which the subdirectories listed below live
path_git = pathlib.Path("/Users/usuario/git")

# Append each subdirectory to sys.path, skipping any entry that is already present
for subdir in ("sisepuede", "sisepuede_data_pipeline", "sisepuede_juypyter"):
    path_cur = path_git / subdir
    if str(path_cur) in sys.path:
        continue
    sys.path.append(str(path_cur))

# Shortcut to the data pipeline subdirectory
path_pipeline = path_git / "sisepuede_data_pipeline"


import importlib
import matplotlib.pyplot as plt
import numpy as np
import os, os.path
import pandas as pd
import sisepuede.legacy.data_api as api
import sisepuede.manager.sisepuede_examples as sxl
import sisepuede.manager.sisepuede_file_structure as sfs
import sisepuede.manager.sisepuede_models as sm
import sisepuede.plotting.plots as spp
import sisepuede.utilities._plotting as spu
import sisepuede.core.support_classes as sc
import sisepuede.utilities._toolbox as sf
import time
from typing import *


import lib.process_utilities as pu
import lib.sisepuede_data_constructs as dc
import lib._util as lutil




##  Option to determine which columns are Peru-specific - 20250807

In [33]:
##  SETUP DIRECTORIES

# Working directory for this notebook. A preceding assignment from os.getcwd()
# was overwritten by this line before ever being read, so only the effective
# value is kept.
path_cur = pathlib.Path("/Users/usuario/git/sisepuede_region_nbs/generic_regions")
path_data = path_cur.joinpath("data")
# NOTE(review): the directory and file names below still say "peru" while
# _REGION_NAME is set to Tanzania -- confirm these are the intended paths.
path_transformations = path_cur.joinpath("transformations_peru")


##  SETUP FILES

path_data_calib = path_data.joinpath("input_all_sectors_peru.csv")
path_data_built = path_data.joinpath("sisepuede_raw_global_inputs_peru.csv")


##  SETUP SOME PIPELINE STUFF


file_struct = sfs.SISEPUEDEFileStructure()

# assign some shortcuts: model attributes plus the Regions/TimePeriods
# support objects built from them
matt = file_struct.model_attributes
regions = sc.Regions(matt)
time_periods = sc.TimePeriods(matt)


##  SOME GLOBALS

# region to build inputs for, and its ISO code as returned by the Regions object
_REGION_NAME = "united_republic_of_tanzania"
_REGION_ISO = regions.return_region_or_iso(_REGION_NAME, return_type = "iso", )


In [11]:
# Reload lib.sisepuede_data_constructs so edits made since the first import are used
importlib.reload(dc)

# Data constructs object, pointed at the sqlite file given as path_output_database
construct = dc.SISEPUEDEDataConstructs(
    path_output_database = "/Users/usuario/git/sisepuede_data_pipeline/sisepuede_inputs.sqlite",
)

# Repository configured with a single "local" entry holding the repository path
path_repo = pathlib.Path("/Users/usuario/SISEPUEDE_DATA_REPOSITORY")
repo = pu.Repository({"local": {"path": str(path_repo)}})

In [3]:
# Batch data repository from the legacy data API, built from the sisepuede_data
# directory and the model attributes
repo_old = api.SISEPUEDEBatchDataRepository("/Users/usuario/git/sisepuede_data", matt)

In [4]:
# Read from the legacy repository; the result is filtered by ISO code in a later cell.
# NOTE(review): passing None presumably requests every available field -- confirm
# against SISEPUEDEBatchDataRepository.read.
df_old = repo_old.read(None)

In [41]:
# NOTE(review): temporary helper module imported mid-notebook; consider moving
# this import to the imports cell once the helper is stable.
import temp_update_fields_from_wv_to_main as temp

examples = sxl.SISEPUEDEExamples()
df_example_input = examples("input_data_frame")


##  BUILD BASE INPUTS FOR THE REGION

# get from original repo: keep only the target region's rows and map the repo's
# ISO/year column names onto the Regions/TimePeriods field names
df_base = (
    df_old[
        df_old[repo_old.field_repo_iso].isin([_REGION_ISO])
    ]
    .copy()
    .rename(
        columns = {
            repo_old.field_repo_iso: regions.field_iso,
            repo_old.field_repo_year: time_periods.field_year,
        }
    )
)

# get from pipeline
df_from_pipeline = construct.build_inputs_from_database(
    regions_keep = [_REGION_NAME],
)


# add in pipeline-based data, keyed on the year field; the ISO column is
# dropped from the pipeline frame before it is passed in
df_out = sf.match_df_to_target_df(
    df_base[
        df_base[regions.field_iso].isin([_REGION_ISO])
    ],

    df_from_pipeline
    .drop(columns = [regions.field_iso]),

    [
        construct.time_periods.field_year,
    ],
    overwrite_only = False,
)

df_out[time_periods.field_year] = df_out[time_periods.field_year].astype(int)


# convert years to time periods, then drop the year column
df_out = (
    time_periods.years_to_tps(
        df_out,
    )
    .drop(columns = time_periods.field_year)
)


##  PULL FROM EXAMPLE DF

# input variable fields present in the example frame but absent from the
# region's frame
fields_missing = [
    x for x in df_example_input.columns
    if (x not in df_out.columns)
    and (x in matt.all_variable_fields_input)
]

# every missing field is pulled from the example. The previous prefix filter
# was disabled (its condition was always true) and the follow-up
# `not in df_out.columns` filter repeated the test already applied above, so
# both are removed; the list contents are unchanged.
fields_from_ex = list(fields_missing)

# merge in from ex; the time period field is the only column shared by the two
# frames, so state the key and join type explicitly instead of relying on
# pandas' implicit common-column detection
df_out = pd.merge(
    df_out,
    df_example_input[fields_from_ex + [time_periods.field_time_period]],
    how = "inner",
    on = time_periods.field_time_period,
)



##  TEMPORARY SCRIPT FOR MOVING FROM working_version TO latest full version

df_out = temp.update_fields(
    df_out,
    matt,
)

# add in key
df_out[regions.key] = _REGION_NAME


In [53]:
# Destination file under data/output, named with the lower-cased ISO code
path_out = path_data.joinpath("output", f"sisepuede_data_raw_v0_{_REGION_ISO.lower()}.csv")

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing
path_out.parent.mkdir(parents = True, exist_ok = True)

# write without the index column (index = False is the documented boolean form
# of the previous falsy `None`)
df_out.to_csv(
    path_out,
    encoding = "UTF-8",
    index = False,
)

PosixPath('/Users/usuario/git/sisepuede_region_nbs/article_6_tanzania_sri_lanka_peru/data')