In [1]:
from attribute_table import AttributeTable
import importlib
import ingestion as ing
import inspect
import logging
import matplotlib.pyplot as plt
#from model_attributes import *
import model_attributes as ma
import model_afolu as mafl
import model_ippu as mi
import model_circular_economy as mc
import model_electricity as ml
import model_energy as me
import model_socioeconomic as se
import numpy as np
import os, os.path
import pandas as pd
import re
import setup_analysis as sa
import sisepuede_data_api as api
import sisepuede_models as sm
import support_classes as sc
import support_functions as sf
import time
from typing import *
import warnings


warnings.filterwarnings("ignore")

  for desig, df in df_by_designation:


###  Get key datasets

In [905]:
##  COMPONENTS FOR READING TABLES
# Reload support_classes so edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(sc)


# some directories
# NOTE(review): these are absolute, machine-specific paths; the cell will not
# run unchanged on another machine.
dir_data_afolu = "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/AFOLU/"
dir_repo_data = "/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data"


# file paths
# NOTE(review): `dir_data` is not assigned anywhere in this cell (only
# `dir_data_afolu` is); presumably it is left over from another cell or was
# meant to be `dir_data_afolu` -- confirm before a Restart & Run All.

fp_attr_luc = os.path.join(dir_data, "attribute_fao_land_use_category_glcshare.csv")
fp_ipcc_forest = os.path.join(sa.dir_ref, "data_tables_and_derivations", "AFOLU", "ipcc_afolu_c4_forested_land_tables.xlsx")
fp_ipcc_grassland = os.path.join(sa.dir_ref, "data_tables_and_derivations", "AFOLU", "ipcc_afolu_c6_grassland_tables.xlsx")
fp_kcc_cw = os.path.join(dir_data_afolu, "attribute_kcc.csv")


# some shared fields
# Column names shared by the functions defined below; several of them are
# bound as default argument values when those functions are defined, so this
# cell has to run before the function definitions.

field_cats_sisepuede = "sisepuede_categories"
field_continent = "continent"
field_count = "count"
field_country = "Country"
field_crop_area = "crop_area_ha"
field_crop_area_ca = "crop_area_ha_cons_agrc"
field_domain = "domain"
field_ecological_zone = "ecological_zone1"
field_frac_ca = "frac_crops_cons_agrc"
field_growth_natural = "biomass_net_growth_natural_dm_tonnes_per_ha_per_year"
field_growth_plantation = "biomass_net_growth_plantation_dm_tonnes_per_ha_per_year"
field_kcc = "kcc"
field_luc = "luc"
field_luc_attr_name = "land_use_category_name"
# NOTE(review): the value below is spelled with three "s" ("kasssam"); confirm
# whether that matches the column name in the source data or is a typo.
field_region_kassam = "region_kasssam"
field_storage_grassland = "biomass_storage_grasslands_dm_tonnes_per_ha"
field_storage_grassland_total = "biomass_storage_above_and_below_grasslands_dm_tonnes_per_ha"
field_storage_natural = "biomass_storage_forest_natural_dm_tonnes_per_ha"
field_storage_plantation = "biomass_storage_forest_plantation_dm_tonnes_per_ha"
field_type_factor = "factor_type"
field_type_forest = "forest_type"
field_type_forest_ipcc = "ipcc_forest"


# some derivative classes
# AFOLU model built from the shared model attributes; its socioeconomic
# sub-model is exposed under its own name for convenience.

model_afolu = mafl.AFOLU(sa.model_attributes)
model_socioeconomic = model_afolu.model_socioeconomic

# Region/time-period helpers and the batch data repository rooted at
# dir_repo_data, all built from the same model attributes.
regions = sc.Regions(sa.model_attributes)
time_periods = sc.TimePeriods(sa.model_attributes)
repo = api.SISEPUEDEBatchDataRepository(
    dir_repo_data,
    sa.model_attributes
)

# Attribute tables looked up by the AGRC / FRST / LNDU subsector names.
attr_agrc = sa.model_attributes.get_attribute_table(f"{sa.model_attributes.subsec_name_agrc}")
attr_frst = sa.model_attributes.get_attribute_table(f"{sa.model_attributes.subsec_name_frst}")
attr_lndu = sa.model_attributes.get_attribute_table(f"{sa.model_attributes.subsec_name_lndu}")


# some lists
# years_hist covers 2010 through 2020 inclusive (range end is exclusive);
# years_proj is every year in time_periods.all_years that is not in years_hist.

years_hist = list(range(2010, 2021))
years_proj = [x for x in time_periods.all_years if x not in years_hist]



####################################
#    DEFINE SOME DATA FUNCTIONS    #
####################################

def build_kcc_luc_agg_file(
    dir_read: str,
    regex_match: re.Pattern,
    model_attributes: ma.ModelAttributes,
    field_types: Union[Dict[str, str], None] = None,
) -> pd.DataFrame:
    """
    Build an aggregate data frame of KCC and land use classification (LUC) by
        ISO code by concatenating every per-ISO CSV found in dir_read

    Function Arguments
    ------------------
    - dir_read: directory containing the files to concatenate
    - regex_match: regex for files containing the kcc/luc indices by ISO code.
        The first capture group is written to the ISO field of each file's
        rows
    - model_attributes: model attributes to use for assigning ISO field
    
    Keyword Arguments
    -----------------
    - field_types: optional map of string to Pandas data type to apply for 
        fields. Fields that are missing from a file, or that cannot be cast, 
        are left as read
        
    Returns
    -------
    A single DataFrame with the rows of every matching file (files are read in
        sorted file-name order) and the ISO field added

    Raises
    ------
    - ValueError: if no file in dir_read matches regex_match
    """
    
    field_iso = sc.Regions(model_attributes).field_iso
    
    df_out = []

    for fl in sorted(os.listdir(dir_read)):
        
        # match once and reuse it both as the filter and to get the iso code
        match = regex_match.match(fl)
        if match is None:
            continue
        
        df_cur = pd.read_csv(os.path.join(dir_read, fl))
        df_cur[field_iso] = match.groups()[0]
        
        if isinstance(field_types, dict):
            for field, tp in field_types.items():
                
                if field not in df_cur.columns:
                    continue
                
                try:
                    df_cur[field] = df_cur[field].astype(tp)
                except Exception:
                    # casting is best effort: a column that cannot be 
                    # converted (e.g., NaN to int) keeps its original dtype
                    continue
            
        df_out.append(df_cur)
        
    
    if len(df_out) == 0:
        # pd.concat would raise an uninformative ValueError on an empty list
        raise ValueError(
            f"No files in '{dir_read}' match the pattern '{regex_match.pattern}'."
        )
        
    df_out = pd.concat(df_out, axis = 0)
    
    return df_out



def get_carbon_factors_forest(
    fp_ipcc_forest: str,
    sheet_name: str = "table 4.12_2019R",
) -> pd.DataFrame:
    """
    Read in IPCC V4 Table 4.12 from Excel and clean
    
    Function Arguments
    ------------------
    - fp_ipcc_forest: path to Excel file
        
    Keyword Arguments
    -----------------
    - sheet_name: sheet name in fp_ipcc_forest
    
    Returns
    -------
    The sheet as a DataFrame with cleaned field names, the biomass 
        storage/growth columns renamed to the shared field_* names, and the 
        string entries in those columns converted to numbers:
        
        * a plain number string ("12.5", "-3", "1e-3") becomes a float
        * a range ("10-20") becomes the mean of its end points
        * anything else (e.g., "-" or "n/a") becomes NaN
    """
    # read and clean
    df_carbon_factors = pd.read_excel(fp_ipcc_forest, sheet_name = sheet_name)
    df_carbon_factors = sf.clean_field_names(df_carbon_factors)
    
    # rename
    dict_rnm = {
        "above__ground_biomass_in_natural_forests_(tonnes_d_m__ha_1)": field_storage_natural,
        "above__ground_biomass_in_forest_plantation_s_(tonnes_d_m__ha_1)": field_storage_plantation,
        "above__ground_net_biomass_growth_in_natural_forests_(tonnes_d_m__ha_1_yr_1)": field_growth_natural,
        "above__ground_net_biomass_growth_in_forest_plantations_(tonnes_d_m__ha_1_yr_1)": field_growth_plantation,
        "status/\ncondition": field_type_forest,
    }
    df_carbon_factors.rename(columns = dict_rnm, inplace = True)
    
    
    def _clean_entry(val: Any) -> Any:
        """
        Convert a single table entry; numbers and non-strings pass through.
        """
        if sf.isnumber(val) or not isinstance(val, str):
            return val
        
        # try a plain number first so that negative values and scientific
        # notation are not mistaken for a range
        try:
            return float(val)
        except ValueError:
            pass
        
        # check if range is specified
        if "-" in val:
            try:
                return np.mean([float(x) for x in val.split("-")])
            except ValueError:
                # e.g., a bare "-" used as a missing-value placeholder
                return np.nan
        
        return np.nan
    
    
    # clean fields
    fields_clean = [
        field_storage_natural,
        field_storage_plantation,
        field_growth_natural,
        field_growth_plantation
    ]
    for field in fields_clean:
        df_carbon_factors[field] = [
            _clean_entry(x) for x in df_carbon_factors[field]
        ]
    
    return df_carbon_factors



def get_carbon_factors_grassland(
    fp_ipcc_grassland: str,
    field_kcc: str = field_kcc,
    field_zone: str = "ipcc_climate_zone",
    sheet_name: str = "table 6.4_2006",
) -> pd.DataFrame:
    """
    Load IPCC V4 Table 6.4 (grassland biomass) from Excel and expand it from
        IPCC grassland climate zones to Kopen climate classification codes
    
    Function Arguments
    ------------------
    - fp_ipcc_grassland: path to Excel file
        
    Keyword Arguments
    -----------------
    - field_kcc: name of the output field storing the kcc code
    - field_zone: field storing climate zone in the (cleaned) input table; it
        is used as the merge key and dropped from the output
    - sheet_name: sheet name in fp_ipcc_grassland
    
    Returns
    -------
    DataFrame with one row per kcc code in the manual crosswalk below, joined
        (left) to the factors of its IPCC grassland climate zone
    """
    
    # manual crosswalk: IPCC grassland climate zone -> kopen climate codes
    dict_ipcc_grass_climates_to_kcc = {
        "Boreal - Dry & Wet": [411, 412, 413, 414, 421, 422, 423, 424, 431, 432, 433, 434],
        "Cold Temperate - Dry": [212, 222, 313, 323, 510, 520],
        "Warm Temperate - Dry": [211, 221, 311, 312, 321, 322],
        "Cold Temperate - Wet": [333],
        "Warm Temperate - Wet": [331, 332],
        # tropical distinctions: https://en.wikipedia.org/wiki/Tropical_climate
        "Tropical - Wet": [110, 130],
        "Tropical - Dry": [120]
    }
    
    # read the sheet, normalize field names, and discard the error columns
    df_factors = sf.clean_field_names(
        pd.read_excel(fp_ipcc_grassland, sheet_name = sheet_name)
    )
    fields_error = [fld for fld in df_factors.columns if "error" in fld]
    df_factors = df_factors.drop(columns = fields_error)
    
    # move to the shared storage field names
    df_factors = df_factors.rename(
        columns = {
            "peak_above_ground_biomass_(tonnes_d_m__ha_1)": field_storage_grassland,
            "total_(above_ground_and_below_ground)_non_woody_biomass": field_storage_grassland_total,
        }
    )
    
    # invert the crosswalk to kcc code -> zone and lay it out as the domain
    dict_code_to_zone = {
        code: zone
        for zone, codes in dict_ipcc_grass_climates_to_kcc.items()
        for code in codes
    }
    df_domain = pd.DataFrame(
        {
            field_kcc: list(dict_code_to_zone.keys()),
            field_zone: list(dict_code_to_zone.values()),
        }
    )
    
    # attach the factors by zone (shared columns), then drop the zone
    df_out = df_domain.merge(df_factors, how = "left")
    df_out = df_out.drop(columns = [field_zone])
    
    return df_out



def get_climate_data(
    fp_climate_counts: str,
    fp_kcc_attribute: str,
    field_codenum_kcc: str = "code_num",
    field_iso_climate: str = "ISO_A3",
    field_key_kcc: str = field_kcc,
) -> Union[Tuple[pd.DataFrame, AttributeTable], None]:
    """
    Using climate counts and classification crosswalk, return table
        of climates + attribute table of kcc
        
    Function Arguments
    ------------------
    - fp_climate_counts: file path to climate count CSV
    - fp_kcc_attribute: file path to kopen climate classifcation attribute
        (must contain field_key_kcc or field_codenum_kcc)
        
    Keyword Arguments
    -----------------
    - field_codenum_kcc: field in kcc attribute with code number; renamed to
        field_key_kcc if present
    - field_iso_climate: field storing iso code in the climate counts; renamed
        to regions.field_iso (module-level `regions`) if present
    - field_key_kcc: field storing Kopen Climate Classification code, used as
        the integer key in the climate counts and as the attribute table key
        
    Returns
    -------
    Tuple (df_climate, attr_kcc). Either element is None if its file does not
        exist. If both exist and share field_key_kcc, the attribute table is
        left-merged into df_climate.
    """
    
    df_climate = (
        pd.read_csv(fp_climate_counts)
        if os.path.exists(fp_climate_counts)
        else None
    )
    
    df_kcc = (
        pd.read_csv(fp_kcc_attribute, sep = ",")
        if os.path.exists(fp_kcc_attribute)
        else None
    )
    
    # actions to climate counts
    if df_climate is not None:
        if field_key_kcc in df_climate.columns:
            df_climate[field_key_kcc] = np.array(df_climate[field_key_kcc]).astype(int)
            
        if field_iso_climate in df_climate.columns:
            df_climate = df_climate.rename(
                columns = {field_iso_climate: regions.field_iso}
            )
    
    # actions to attribute
    attr_kcc = None
    if df_kcc is not None:
        
        if field_codenum_kcc in df_kcc.columns:
            df_kcc = df_kcc.rename(
                columns = {field_codenum_kcc: field_key_kcc}
            )
        
        attr_kcc = AttributeTable(
            df_kcc,
            field_key_kcc,
        )
        
    # merge attributes into the counts only when the key is on both sides;
    # otherwise pd.merge would fall back to whatever columns happen to match
    if (df_climate is not None) and (attr_kcc is not None):
        key_is_shared = (
            (field_key_kcc in attr_kcc.table.columns)
            and (field_key_kcc in df_climate.columns)
        )
        if key_is_shared:
            df_climate = pd.merge(
                df_climate,
                attr_kcc.table,
                how = "left"
            )
        
    
    return df_climate, attr_kcc



def get_iso_land_use_climate_data(
    dir_read: str,
    regex_match: re.Pattern,
    attr_kcc: AttributeTable,
    attr_luc: AttributeTable,
    model_attributes: ma.ModelAttributes,
    field_kcc: str = field_kcc,
    field_luc: str = field_luc,
    field_types: Union[Dict[str, str], None] = None,
) -> pd.DataFrame:
    """
    Build a complete data frame of KCC and land use classification (LUC) by
        ISO code + attributes 
    
    Function Arguments
    ------------------
    - dir_read: directory containing the files to concatenate
    - regex_match: regex for files containing the kcc/luc indices by ISO code
    - attr_kcc: attribute table for Kopen Climate Classification
    - attr_luc: attribute table for land use classes
    - model_attributes: model attributes to use for assigning ISO field
    
    Keyword Arguments
    -----------------
    - field_kcc: output field for Kopen Climate Classification (in compiled 
        DataFrame built by build_kcc_luc_agg_file())
    - field_luc: output field for land use classification (in compiled DataFrame 
        built by build_kcc_luc_agg_file())
    - field_types: optional map of string to Pandas data type to apply for fields
    
    Returns
    -------
    The compiled per-ISO DataFrame, sorted by ISO field and the module-level 
        field_count, with the kcc and luc attribute tables left-merged in
    """
    
    # use the same ISO field that build_kcc_luc_agg_file() writes, rather than
    # the one of the module-level `regions`, so the sort key always exists
    field_iso = sc.Regions(model_attributes).field_iso
    
    # read data from storage and build
    df_kcc_and_luc_by_iso = build_kcc_luc_agg_file(
        dir_read, 
        regex_match,
        model_attributes,
        field_types = field_types,
    )
    
    df_kcc_and_luc_by_iso = df_kcc_and_luc_by_iso.sort_values(
        by = [
            field_iso,
            field_count
        ]
    )
    
    # merge in attributes; each table's key is renamed to the matching output
    # field and the join runs on the columns the two frames share
    for attr, field in [(attr_kcc, field_kcc), (attr_luc, field_luc)]:
        df_kcc_and_luc_by_iso = pd.merge(
            df_kcc_and_luc_by_iso,
            attr.table.rename(columns = {attr.key: field}),
            how = "left"
        )

    return df_kcc_and_luc_by_iso



def get_luc_name_to_sisepuede_cats(
    attr_luc: AttributeTable,
    delim: str = ";",
    field_categories_sisepuede: str = field_cats_sisepuede,
    field_luc_attr_name: str = field_luc_attr_name,
) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
    """
    Build dictionary mapping land use category to list of sisepuede land use
        categories & inverse
    
    Function Arguments
    ------------------
    - attr_luc: land use category attribute table
    
    Keyword Arguments
    -----------------
    - delim: delimiter in entries in field_categories_sisepuede
    - field_categories_sisepuede: field in attr_luc.table containing sisepuede
        categories
    - field_luc_attr_name: field in attr_luc.table containing land use category
        name
        
    Returns
    -------
    Tuple of two dictionaries:
    
        (
            dict_luc_to_sc, # land use category name -> list of sisepuede cats
            dict_sc_to_luc, # sisepuede cat -> list of land use category names
        )
        
    Both the name and the category entry are passed through str() before use,
        so entries are split exactly as written (no whitespace stripping).
    """
    
    dict_luc_to_sc = {}
    dict_sc_to_luc = {}
    
    for _, row in attr_luc.table.iterrows():
        
        luc = str(row[field_luc_attr_name])
        sisepuede_cats = str(row[field_categories_sisepuede]).split(delim)
        
        dict_luc_to_sc[luc] = sisepuede_cats
        
        # inverse map: start a list on first sight, append afterwards
        for cat in sisepuede_cats: 
            dict_sc_to_luc.setdefault(cat, []).append(luc)
        
    return dict_luc_to_sc, dict_sc_to_luc



        
# read tables
# climate cell counts by country + attribute table of kcc codes
df_climate, attr_kcc = get_climate_data(
    sa.fp_csv_kcc_cell_counts_by_country_kcc,
    fp_kcc_cw
)


# attribute for land use classification
attr_luc = AttributeTable(
    fp_attr_luc,
    "land_use_category_index"
)

# get land use by climate
# NOTE(review): `dir_data` is not assigned in the visible cells -- confirm
# where it comes from before a Restart & Run All.
# The pattern is a raw string so that `\D` reaches the regex engine as written
# instead of relying on Python passing an invalid string escape through; the
# compiled pattern is unchanged. Its capture group is used as the ISO code.
df_lndu_climate_counts = get_iso_land_use_climate_data(
    os.path.join(dir_data, "KCC_LUC_aggs_by_country"),
    re.compile(r"kcc_and_lndusecat_by_iso_(\D*)_agg.csv"),
    attr_kcc, 
    attr_luc,
    sa.model_attributes,
    field_kcc = field_kcc,
    field_luc = field_luc,
    field_types = {
        field_count: "int",
        field_kcc: "int",
        field_luc: "int",
    }
)

# get some forest and grassland carbon factors
df_carbon_factors_forest = get_carbon_factors_forest(fp_ipcc_forest)
df_carbon_factors_grassland = get_carbon_factors_grassland(fp_ipcc_grassland)

# get some dictionaries (land use category name <-> sisepuede categories)
dict_luc_to_cats_lndu, dict_cat_lndu_to_lucs = get_luc_name_to_sisepuede_cats(attr_luc)


In [907]:
# Scratch inspection of the storage means that get_factor_info() publishes as
# the global df_means_forest_storage_0. This cell depends on clean_dfm(), which
# is defined in a later cell, and it overwrites its own input, so re-running
# it aggregates an already-aggregated frame.
df_means_forest_storage_0 = clean_dfm(
    dfm = df_means_forest_storage_0,
    field_agg = field_storage_natural,
    field_count = field_count,
    field_kcc = field_kcc,
    field_luc_attr_name = field_luc_attr_name,
    field_type_forest = field_type_forest,
)

# pivot by forest type; the result is the cell's displayed output
sf.pivot_df_clean(
    df_means_forest_storage_0,
    [field_type_forest],
    [field_growth_natural, field_luc_attr_name],
)

Unnamed: 0,biomass_storage_forest_natural_dm_tonnes_per_ha,count,kcc,Primary,Secondary\n>20 years,Secondary\n≤20 years
0,25.6,22,211,,,Tree Covered Areas
1,42.9,58891,221,,,Tree Covered Areas
2,44.0,22,211,,Tree Covered Areas,
3,55.7,393,130,,,Mangroves
4,55.7,1660992,130,,,Tree Covered Areas
5,71.5,1772,120,Mangroves,Mangroves,Mangroves
6,71.5,2490921,120,Tree Covered Areas,Tree Covered Areas,Tree Covered Areas
7,74.6,25999,322,Tree Covered Areas,Tree Covered Areas,Tree Covered Areas
8,75.7,465,110,,,Mangroves
9,75.7,1647292,110,,,Tree Covered Areas


In [926]:
# Scratch inspection of the growth means that get_factor_info() publishes as
# the global df_means_forest_growth_0; depends on clean_dfm() from a later cell.
# NOTE(review): the aggregated field passed here is field_storage_natural, but
# get_factor_info() builds the growth frame from the index fields plus the
# growth field only -- presumably field_growth_natural was intended; confirm.
df = clean_dfm(
        df_means_forest_growth_0,
        field_storage_natural,
        field_count,
        field_kcc,
        field_luc_attr_name,
        field_type_forest,
    )

# NOTE(review): this pivot is neither assigned nor the last expression of the
# cell, so its result is discarded.
sf.pivot_df_clean(
    df,
    [field_type_forest, field_luc_attr_name],
    [field_growth_natural]
)
# last expression: displays the un-cleaned global, not `df`
df_means_forest_growth_0

Unnamed: 0,count,kcc,land_use_category_name,forest_type
0,465,110,Mangroves,Primary
1,465,110,Mangroves,Secondary\n>20 years
2,465,110,Mangroves,Secondary\n≤20 years
3,1647292,110,Tree Covered Areas,Primary
4,1647292,110,Tree Covered Areas,Secondary\n>20 years
5,1647292,110,Tree Covered Areas,Secondary\n≤20 years
6,1772,120,Mangroves,Primary
7,1772,120,Mangroves,Secondary\n>20 years
8,1772,120,Mangroves,Secondary\n≤20 years
9,2490921,120,Tree Covered Areas,Primary


In [959]:
# Land use category names whose grassland biomass storage is zeroed out in
# get_factor_info() (used there as the default for `lucs_zero`).
# NOTE(review): the inline comment below sits on the "Baresoil" entry but
# talks about cropland -- it may belong to the "Cropland" line; confirm.
lucs_with_no_biomass = [
    "Artificial Surfaces",
    "Baresoil", # biomass is estimated directly in cropland, so assume all carbon is lost
    "Cropland",
    "Snow and glaciers",
    "Waterbodies",
]



# map land use categories (applicable) to forest types to use from ipcc_forest
# get_factor_info() looks each forest land use category up here and averages
# the pivoted columns named by the listed forest types, so every string
# (including the embedded newline) must match the forest type label exactly.
dict_cats_lndu_to_type_forest = {
    "forests_mangroves": [
        "Primary",
        "Secondary\n>20 years",
        "Secondary\n≤20 years"
    ],
    "forests_primary": [
        "Primary"
    ],
    "forests_secondary": [
        "Secondary\n>20 years",
        "Secondary\n≤20 years"
    ]
}


def clean_dfm(
    dfm: pd.DataFrame,
    field_agg: str,
    field_count: str,
    field_kcc: str,
    field_luc_attr_name: str,
    field_type_forest: str,
) -> pd.DataFrame:
    """
    Collapse a means data frame to one aggregate per 
        (kcc, land use category, forest type) group
    
    Function Arguments
    ------------------
    - dfm: data frame to collapse
    - field_agg: field aggregated with "max"
    - field_count: field aggregated with "sum"
    - field_kcc: grouping field storing the kcc code
    - field_luc_attr_name: grouping field storing the land use category name
    - field_type_forest: grouping field storing the forest type
    
    Notes
    -----
    - Rows containing any NA are dropped within each group before the group is
        passed to sf.simple_df_agg()
    - The per-group results are concatenated and re-indexed from 0
    """
    
    fields_group = [field_kcc, field_luc_attr_name, field_type_forest]
    
    # each group gets its own field list and aggregation dictionary
    df_out = [
        sf.simple_df_agg(
            df_group.dropna(),
            list(fields_group),
            {
                field_agg: "max",
                field_count: "sum",
            }
        )
        for _, df_group in dfm.groupby(fields_group)
    ]
    
    return pd.concat(df_out, axis = 0).reset_index(drop = True)


def get_factor_info(
    df_means: pd.DataFrame,
    dict_cat_lndu_to_lucs: Dict[str, List[str]],
    dict_luc_to_cats_lndu: Dict[str, List[str]],
    iso: str, 
    default_c_per_biomass: float = 0.47,
    field_count: str = field_count,
    field_growth_forest: str = field_growth_natural,
    field_growth_forest_plantation: str = field_growth_plantation,
    field_iso: str = regions.field_iso,
    field_luc_name: str = field_luc_attr_name,
    field_storage_forest: str = field_storage_natural,
    field_storage_forest_plantation: str = field_storage_plantation,
    field_storage_grassland: str = field_storage_grassland,
    field_storage_grassland_total: str = field_storage_grassland_total,
    field_tmp: str = "tmp",
    field_type_factor: str = field_type_factor,
    field_type_forest: str = field_type_forest,
    field_type_forest_ipcc: str = field_type_forest_ipcc,
    lucs_zero: List[str] = lucs_with_no_biomass,
    model_afolu: mafl.AFOLU = model_afolu,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Map df_means in get_average_forest_factors_by_iso() to biomass estimates
        for conversion between Land Use Categories
        
    Function Arguments
    ------------------
    - df_means: input df_means data frame. The body reads the count field, the
        kcc field, the land use category name field, the forest type fields,
        the forest growth/storage fields, and the grassland storage fields 
        from it
    - dict_cat_lndu_to_lucs: dictionary mapping SISEPUEDE land use categories
        to lists of land use category names; used to identify which land use
        category names are forest
    - dict_luc_to_cats_lndu: dictionary mapping land use category names to 
        SISEPUEDE land use categories; applied to the non-forest rows
    - iso: iso code to attach
    
    Keyword Arguments
    -----------------
    - default_c_per_biomass: default c/biomass (see IPCC V4 Table 4.3 2006)
    - field_*: names of the fields read from df_means or written to the 
        outputs
    - lucs_zero: list of land use category names whose grassland storage is 
        multiplied by 0; anything that is not list-like is treated as []
    - model_afolu: AFOLU model supplying attribute tables, category maps, the
        C -> CO2 factor, and the output formatters
        
    Returns
    -------
    Tuple (df_growth_out, df_efs):
        * df_growth_out: the "nominal" growth factors by forest category,
            renamed with the variable list of model_afolu.modvar_frst_sq_co2
            and combined with the time period data frame
        * df_efs: conversion factors formatted by 
            model_afolu.format_transition_matrix_as_input_dataframe() for
            model_afolu.modvar_lndu_ef_co2_conv, with the iso field attached
            
    Notes
    -----
    - NOTE(review): the parameters `field_luc_name` and `field_tmp` are never 
        used. The body instead reads the module-level names 
        `field_luc_attr_name`, `field_kcc`, and 
        `dict_cats_lndu_to_type_forest`.
    - Debugging side effect: the function assigns the module-level globals
        df_means_forest_growth_0, df_means_forest_storage_0, dfnf, dfmfs, and
        df_conversions on every call.
    """
    # some key init
    attr_frst = model_afolu.model_attributes.get_attribute_table(
        model_afolu.model_attributes.subsec_name_frst
    )
    attr_lndu = model_afolu.model_attributes.get_attribute_table(
        model_afolu.model_attributes.subsec_name_lndu
    )
    count_total = int(df_means[field_count].sum())
    cats_forest = list(model_afolu.dict_cats_frst_to_cats_lndu.values())
    time_periods = sc.TimePeriods(model_afolu.model_attributes)
    
    ##  SET SOME LAND USE CATEGORY GROUPS
     
    lucs_zero = (
        lucs_zero
        if sf.islistlike(lucs_zero)
        else []
    )
    
    # forest types: union of the land use category names mapped to any forest
    # land use category
    # NOTE(review): .get() returns None for a category missing from
    # dict_cat_lndu_to_lucs, which would make the sum() below fail
    lucs_forest = set(
        sum(
            [
                dict_cat_lndu_to_lucs.get(x)
                for x in list(model_afolu.dict_cats_frst_to_cats_lndu.values())
            ], 
            []
        )
    )
    # debugging hooks: intermediate frames are exposed as globals so they can
    # be inspected from other cells
    global df_means_forest_growth_0
    global df_means_forest_storage_0
    # split means into forest/not-forest (grassland)
    df_means_forest = (
        df_means[
            [(x in lucs_forest) for x in df_means[field_luc_attr_name]]
        ]
        .drop([field_storage_grassland, field_storage_grassland_total], axis = 1)
        .reset_index(drop = True)
    )
    fields_ind = [field_kcc, field_count, field_luc_attr_name, field_type_forest]
    
    # aggregate growth and storage separately to one row per
    # (kcc, land use category name, forest type)
    df_means_forest_growth = df_means_forest[fields_ind + [field_growth_forest]]
    df_means_forest_growth = clean_dfm(
        df_means_forest_growth,
        field_growth_forest,
        field_count,
        field_kcc,
        field_luc_attr_name,
        field_type_forest,
    )
    
    df_means_forest_storage = df_means_forest[fields_ind + [field_storage_forest]]
    df_means_forest_storage = clean_dfm(
        df_means_forest_storage,
        field_storage_forest,
        field_count,
        field_kcc,
        field_luc_attr_name,
        field_type_forest,
    )
    
    # snapshot the un-pivoted frames into the debugging globals
    df_means_forest_growth_0 = df_means_forest_growth.copy()
    df_means_forest_storage_0 = df_means_forest_storage.copy()
    
    # pivot 
    # presumably yields one column per forest type (the loop below looks the
    # forest type labels up as column names) -- confirm against
    # sf.pivot_df_clean
    df_means_forest_growth = sf.pivot_df_clean(
        df_means_forest_growth,
        [field_type_forest, field_luc_attr_name],
        [field_growth_forest]
    )
    df_means_forest_storage = sf.pivot_df_clean(
        df_means_forest_storage,
        [field_type_forest, field_luc_attr_name],
        [field_storage_forest]
    )
    
    
   
    
    
    # get non-forest values: drop every forest-only field, then de-duplicate
    df_means_not_forest = (
        df_means[
            [(x not in lucs_forest) for x in df_means[field_luc_attr_name]]
        ]
        .drop(
            [
                field_growth_forest,
                field_growth_forest_plantation,
                field_storage_forest,
                field_storage_forest_plantation,
                field_type_forest,
                field_type_forest_ipcc,
                field_storage_grassland_total,
            ],
            axis = 1
        )
        .drop_duplicates()
        .reset_index(drop = True)
    )
    
    
    ##  GET FOREST FACTORS
    
    # NOTE(review): this grouped object is never used below
    df_means_forest_storage_grouped = df_means_forest_storage.groupby([field_kcc])

    for cat in cats_forest:
        
        # forest type labels (pivoted column names) that feed this category;
        # read from the module-level dict_cats_lndu_to_type_forest
        fields_retrieve = dict_cats_lndu_to_type_forest.get(cat)
        
        if set(fields_retrieve).issubset(set(df_means_forest_growth.columns)):
            # update growth
            df_means_forest_growth[cat] = df_means_forest_growth[fields_retrieve].mean(axis = 1)
            df_means_forest_growth[f"max_{cat}"] = df_means_forest_growth[fields_retrieve].max(axis = 1)
            df_means_forest_growth[f"min_{cat}"] = df_means_forest_growth[fields_retrieve].min(axis = 1)
            
        else:
            # if any required forest type column is missing, the whole
            # category is set to 0
            df_means_forest_growth[cat] = 0
            df_means_forest_growth[f"max_{cat}"] = 0
            df_means_forest_growth[f"min_{cat}"] = 0
            
            
        if set(fields_retrieve).issubset(set(df_means_forest_storage.columns)):
            #update storage
            vec_mean_storage = df_means_forest_storage[fields_retrieve].mean(axis = 1)
            df_means_forest_storage[cat] = vec_mean_storage
        else:
            df_means_forest_storage[cat] = 0.0
        
       
    
    # build growth: one row each for nominal/max/min, one column per forest
    # category
    arr_growth = np.zeros((3, len(cats_forest)))
    vec_type = ["nominal", "max", "min"]
    for i, fldp in enumerate(["", "max_", "min_"]):
        
        fields = [f"{fldp}{x}" for x in cats_forest]
        # get mean sequestration factors
        # presumably sf.do_array_mult scales each row by its count, making the
        # result a count-weighted mean -- confirm against the helper
        vec_count_forest = np.array(df_means_forest_growth[field_count]).astype(float)
        vec_growth = sf.do_array_mult(
            np.array(df_means_forest_growth[fields]),
            vec_count_forest
        )
        vec_growth /= vec_count_forest.sum()

        # convert 0s, then convert dry matter to CO2e
        # NOTE(review): the second positional argument of np.nan_to_num is
        # `copy`, not `nan`; NaNs still become 0.0 through the default, but
        # 0.0 is read as copy=False
        vec_growth = np.nan_to_num(vec_growth, 0.0).sum(axis = 0)
        vec_growth *= default_c_per_biomass
        vec_growth *= model_afolu.factor_c_to_co2
        
        # NOTE(review): the division by 1000 is presumably a unit conversion;
        # the target unit is not visible here -- confirm
        arr_growth[i, :] = vec_growth/1000
    
    # output growth
    df_growth = pd.DataFrame(arr_growth, columns = cats_forest)
    df_growth[field_type_factor] = vec_type
    df_growth[field_iso] = iso
    
    
    
    ##  NEXT, GET STORAGE FOR OTHER LU TYPES
    
    # mask that is False (0 when multiplied) for categories in lucs_zero
    vec_bin = np.array(
        [(x not in lucs_zero) for x in (df_means_not_forest[field_luc_attr_name])]
    )
    # NOTE(review): this vec_count is overwritten in the loop below before it
    # is read
    vec_count = np.array(df_means_not_forest[field_count])
    # NOTE(review): in-place replace on a column selection is chained
    # assignment; under pandas Copy-on-Write it may not update the frame --
    # confirm against the pandas version in use
    df_means_not_forest[field_luc_attr_name].replace(dict_luc_to_cats_lndu, inplace = True)
    df_means_not_forest[field_storage_grassland] *= vec_bin

    # normalize by climate
    df_mnf = []
    df_means_not_forest_grouped = (
        df_means_not_forest.
        dropna()
        .groupby([field_kcc, field_luc_attr_name])
    )

    # the loop variable named kcc is the (kcc, land use category) group key;
    # within each group, storage is weighted by the row's share of the group
    # count and then summed
    for kcc, df in df_means_not_forest_grouped:
        vec_count = np.array(df[field_count])
        vec_count = vec_count/vec_count.sum()
        df[field_storage_grassland] *= vec_count
        df = sf.simple_df_agg(
            df,
            [field_kcc, field_luc_attr_name],
            {
                field_count: "sum",
                field_storage_grassland: "sum"
            }
        )
        
        df_mnf.append(df)
    df_means_not_forest = pd.concat(df_mnf, axis = 0)
    # total non-forest count per kcc, stored under an "nf_"-prefixed field
    df_counts_means_not_forest = (
        sf.simple_df_agg(
            df_means_not_forest[[field_count, field_kcc]],
            [field_kcc],
            {field_count: "sum"}
        )
        .rename(columns = {field_count: f"nf_{field_count}"})
    )
    # pivot storage by land use category and re-attach the per-kcc counts
    df_means_not_forest = pd.merge(
        sf.pivot_df_clean(
            df_means_not_forest.drop([field_count], axis = 1),
            [field_luc_attr_name],
            [field_storage_grassland]
        ),
        df_counts_means_not_forest,
        how = "left"
    )
    
    
    
    ##  GET STORAGE AND CONVERSION
    
    # initialize a dictionary of weighted mean storage by land use category
    dict_storage_mu = {}
    
    # reduce the forest data frame: total forest count per kcc, stored under
    # an "f_"-prefixed field
    df_counts_means_forest = (
        sf.simple_df_agg(
            df_means_forest[[field_count, field_kcc]],
            [field_kcc],
            {field_count: "sum"}
        )
        .rename(columns = {field_count: f"f_{field_count}"})
    )
    df_means_forest_storage = pd.merge(
        df_means_forest_storage[[field_kcc] + cats_forest].drop_duplicates(),
        df_counts_means_forest,
        how = "left"
    )
    
    
    # get weighted means for FOREST (NaN entries are excluded from both the
    # values and the weights; 0 if every entry is NaN)
    vec_counts = np.array(df_means_forest_storage[f"f_{field_count}"])
    for field in [x for x in df_means_forest_storage.columns if x in attr_lndu.key_values]:
        val = np.array(df_means_forest_storage[field])
        w = np.where(~np.isnan(val))[0]
        val = (
            (val[w]*vec_counts[w]/vec_counts[w].sum()).sum()
            if len(w) > 0
            else 0
        )
        
        dict_storage_mu.update({field: val})
        
    # get weighted means for NOT FOREST (same treatment of NaN as above)
    vec_counts = np.array(df_means_not_forest[f"nf_{field_count}"])
    for field in [x for x in df_means_not_forest.columns if x in attr_lndu.key_values]:
        val = np.array(df_means_not_forest[field])
        w = np.where(~np.isnan(val))[0]
        val = (
            (val[w]*vec_counts[w]/vec_counts[w].sum()).sum()
            if len(w) > 0
            else 0
        )
        
        dict_storage_mu.update({field: val})
    
    # debugging hooks: expose the intermediate frames as globals
    global dfnf
    global dfmfs
    dfnf = df_means_not_forest
    dfmfs = df_means_forest_storage
    
    global df_conversions
    
    # get final data frame used to calculate conversions by climate
    df_conversions = (
        pd.merge(
            df_means_not_forest,
            df_means_forest_storage,
            how = "outer"
        )
    )
    # combine the forest and non-forest counts into a single count field
    fields_conv = [f"nf_{field_count}", f"f_{field_count}"]
    df_conversions[field_count] = df_conversions[fields_conv].sum(axis = 1)
    df_conversions.drop(fields_conv, axis = 1, inplace = True)
    # fill missing non-forest storage with the weighted means; forest
    # categories are deliberately left as NaN
    df_conversions.fillna(
        dict((k, v) for k, v in dict_storage_mu.items() if k not in cats_forest), 
        inplace = True
    )

    # iterate over rows to build weighted conversion factors
    num = 0.0
    denom = 0.0#np.zeros((attr_lndu.n_key_values, attr_lndu.n_key_values))

    for i, row in df_conversions.iterrows():
    
        # assumes every land use category key value is a column of
        # df_conversions -- TODO confirm
        vec = np.array(row[attr_lndu.key_values])
        count = int(row[field_count])
        
        # entry [i, j] is storage of category i minus storage of category j;
        # presumably sf.vec_bounds clips it to [0, inf) -- confirm against the
        # helper
        storage_rows = np.outer(vec, np.ones(attr_lndu.n_key_values))
        storage_cols = np.outer(np.ones(attr_lndu.n_key_values), vec)
        arr_conv = sf.vec_bounds(storage_rows - storage_cols, (0, np.inf))
        
        # weight by the row count; entries that are NaN get zero weight
        tot = count*np.ones((attr_lndu.n_key_values, attr_lndu.n_key_values))
        tot[np.isnan(arr_conv)] = 0
        #print(tot*arr_conv)
        # NOTE(review): as above, 0.0 is passed to `copy`, not `nan`
        num += np.nan_to_num(arr_conv, 0.0)*tot
        denom += tot
    
    # weighted mean; entries with zero total weight (NaN or inf after the
    # division) become 0. Then apply the same carbon/CO2 scaling and /1000 as
    # for growth, and round to 6 decimals.
    arr_conv = np.nan_to_num(num/denom, nan = 0, posinf = 0)
    arr_conv *= default_c_per_biomass
    arr_conv *= model_afolu.factor_c_to_co2
    arr_conv /= 1000
    arr_conv = np.round(arr_conv, decimals = 6)
    
    # convert to data frame; the same matrix is repeated for every time period
    df_efs = model_afolu.format_transition_matrix_as_input_dataframe(
        np.array([arr_conv for x in time_periods.all_time_periods]),
        modvar = model_afolu.modvar_lndu_ef_co2_conv,
    )
    df_efs[field_iso] = iso
    
    
    ##  CLEAN SEQUESTRATION DF
    
    # reduce growth out
    # re-order the forest categories to follow the LNDU attribute table, and
    # get the FRST categories that map to them
    cats_forest = [x for x in attr_lndu.key_values if x in cats_forest]
    cats_forest_frst = [x for x in attr_frst.key_values if model_afolu.dict_cats_frst_to_cats_lndu.get(x) in cats_forest]
    
    # assumes build_varlist returns variables in an order that lines up with
    # cats_forest -- TODO confirm
    dict_rnm = dict(
        zip(
            cats_forest,
            model_afolu.model_attributes.build_varlist(
                None,
                model_afolu.modvar_frst_sq_co2,
                restrict_to_category_values = cats_forest_frst,
            )
        )
    )

    # keep only the nominal factors and rename to model variable names
    df_growth_out = (
        df_growth[
            df_growth[field_type_factor].isin(["nominal"])
        ]
        .reset_index(drop = True)
        .drop([field_type_factor], axis = 1)
        .rename(columns = dict_rnm)
    )
    # presumably repeats the single nominal row for every time period --
    # confirm against sf.explode_merge
    df_growth_out = sf.explode_merge(
        time_periods.get_time_period_df(),
        df_growth_out
    )
    
    
    return df_growth_out, df_efs




def get_average_forest_factors_by_iso(
    df_climate_by_iso: pd.DataFrame,
    df_carbon_factors_forest: pd.DataFrame,
    df_carbon_factors_grassland: pd.DataFrame,
    dict_luc_to_cats_lndu: Dict[str, List[str]],
    attr_kcc: AttributeTable,
    regions: sc.Regions,
    field_continent: str = field_continent,
    field_count: str = "count",
    field_ecological_zone: str = "ecological_zone1",
    field_forest_cat: str = "ipcc_forest",
    field_type_forest: str = field_type_forest,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Generate average storage and sequestration rate factors by iso code.
        Returns a tuple of the form

        (df_out_conv, df_out_sequestration)

        i.e., conversion emission factors first, sequestration factors 
        second. ISO codes for which get_factor_info() raises are reported 
        and skipped; if every ISO is skipped, empty DataFrames are returned.

    NOTE: besides its arguments, this function reads the notebook-level 
        names get_factor_info, dict_cat_lndu_to_lucs, and 
        field_luc_attr_name.
    
    Function Arguments
    ------------------
    - df_climate_by_iso: data frame containing KCC climate counts by ISO
        code
    - df_carbon_factors_forest: data frame storing IPCC GHG default carbon 
        biomass factors for forest (IPCC table V4.4.12)
    - df_carbon_factors_grassland: data frame storing IPCC GHG default carbon 
        biomass factors for grassland (IPCC table V4.6.4)
    - dict_luc_to_cats_lndu: dictionary mapping land use category to SISEPUEDE
        LNDU categories
    - attr_kcc: attribute table characterizing Köppen Climate 
        Classification (only its key is used here)
    - regions: sc.Regions object; used for the ISO field name 
        (regions.field_iso) and to look up the UN region of each ISO code
        (regions.get_un_region)
        
    Keyword Arguments
    -----------------
    - field_continent: field storing continent
    - field_count: field giving # of cells by country associated with 
        each KCC category
    - field_ecological_zone: field in df_carbon_factors_forest containing IPCC
        forests
    - field_forest_cat: field in df_climate_by_iso containing the IPCC 
        forest category used to estimate factors
    - field_type_forest: field in df_carbon_factors_forest storing forest type
    """
    # NOTE(review): df_means is published to the notebook namespace (it 
    #   holds the merged frame for the last ISO processed). Kept for 
    #   backward compatibility; looks like a debugging aid.
    global df_means

    # UN region names that differ from the continent labels in the IPCC table
    dict_un_region_to_continent = {
        "Americas": "North and South America"
    }
    # continent labels (as they appear in the table) that are treated as 
    # global, i.e., not matched to a specific continent
    continents_global = [
        "Asia\nEurope\nNorth and South America",
        "Asia Europe North\nAmerica",
        "Asia\nEurope\nNorth\nAmerica"
    ]
    
    fields_keep = [
        regions.field_iso,
        attr_kcc.key,
        field_count,
        field_luc_attr_name,
        field_forest_cat
    ]
    # group on the scalar key (not a one-element list) so that iteration 
    # yields plain ISO strings rather than 1-tuples in newer pandas
    dfg_climate = (
        df_climate_by_iso[fields_keep]
        .groupby(regions.field_iso)
    )
    df_out_conv = []
    df_out_sequestration = []

    # clean carbon factors df: align the forest category field name with 
    # the one used in df_climate_by_iso so the two can be merged
    df_cf = (
        df_carbon_factors_forest
        .rename(
            columns = {
                field_ecological_zone: field_forest_cat
            }
        )
    )

    # split into continent-specific and global
    is_global = df_cf[field_continent].isin(continents_global)
    df_cf_by_continent = df_cf[~is_global].reset_index(drop = True)
    df_cf_global = df_cf[is_global].reset_index(drop = True)
    
    # get forest vals by split
    all_forests_by_continent = set(df_cf_by_continent[field_forest_cat])
    all_forests_global = set(df_cf_global[field_forest_cat])
    
  
    for iso, df in dfg_climate:
        
        # get un region and translate it to the table's continent label
        region_un = regions.get_un_region(iso)
        region_ipcc_forests = dict_un_region_to_continent.get(region_un, region_un)
        
        all_forests_cur = set(df[field_forest_cat])
        any_by_continent = len(all_forests_cur & all_forests_by_continent) > 0
        any_global = len(all_forests_cur & all_forests_global) > 0
        
        # initialize splits
        df_by_continent = None
        df_global = None
        
        # deal with splits
        if any_by_continent:
            # add the continent on a copy; the group itself is left untouched
            df_by_continent = (
                pd.merge(
                    df.assign(**{field_continent: region_ipcc_forests}),
                    df_cf_by_continent,
                )
                .drop(
                    [
                        field_continent, 
                        regions.field_iso
                    ], 
                    axis = 1
                )
            )
            
        if any_global:
            df_global = (
                pd.merge(
                    df,
                    df_cf_global.drop([field_continent], axis = 1),
                )
                .drop([regions.field_iso], axis = 1)
            )

        
        # check that at least one was successfully merged
        if (df_global is None) and (df_by_continent is None):
            continue
        
        # pd.concat silently drops a None entry, so one split may be missing
        df_means = pd.concat([df_by_continent, df_global], axis = 0)
        df_means = pd.merge(
            df_means,
            df_carbon_factors_grassland, 
            how = "left"
        )
        
        
        try:
            df_growth, df_efs = get_factor_info(
                df_means, 
                dict_cat_lndu_to_lucs, 
                dict_luc_to_cats_lndu,
                iso
            )

        except Exception as e:
            # best effort: report why the ISO was skipped and move on. 
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print(f"ISO {iso} failed: {type(e).__name__}: {e}")
            continue

        df_out_conv.append(df_efs)
        df_out_sequestration.append(df_growth)
        
        
    # pd.concat raises on an empty list; return empty frames instead
    df_out_conv = (
        pd.concat(df_out_conv, axis = 0)
        if len(df_out_conv) > 0
        else pd.DataFrame()
    )
    df_out_sequestration = (
        pd.concat(df_out_sequestration, axis = 0)
        if len(df_out_sequestration) > 0
        else pd.DataFrame()
    )
        
    return df_out_conv, df_out_sequestration
        


# build emission factors for every ISO in the climate counts table:
#   - rows without a land use category name are excluded
#   - the domain field is removed from the forest carbon factors
df_efs_conv, df_efs_sequestration = get_average_forest_factors_by_iso(
    df_climate_by_iso = df_lndu_climate_counts.dropna(subset = [field_luc_attr_name]),
    df_carbon_factors_forest = df_carbon_factors_forest.drop([field_domain], axis = 1),
    df_carbon_factors_grassland = df_carbon_factors_grassland,
    dict_luc_to_cats_lndu = dict_luc_to_cats_lndu,
    attr_kcc = attr_kcc,
    regions = regions,
);


               

ISO ABW failed
ISO AND failed
ISO ARM failed
ISO ASM failed
ISO ATG failed
ISO BLZ failed
ISO BMU failed
ISO BRB failed
ISO BRN failed
ISO COM failed
ISO CPV failed
ISO CYM failed
ISO DJI failed
ISO DMA failed
ISO FJI failed
ISO FRO failed
ISO FSM failed
ISO GAB failed
ISO GIB failed
ISO GNQ failed
ISO GRD failed
ISO GRL failed
ISO GUM failed
ISO GUY failed
ISO HKG failed
ISO HTI failed
ISO IMN failed
ISO KIR failed
ISO KNA failed
ISO KWT failed
ISO LBR failed
ISO LCA failed
ISO LIE failed
ISO LSO failed
ISO LUX failed
ISO MAC failed
ISO MAF failed
ISO MCO failed
ISO MDG failed
ISO MDV failed
ISO MHL failed
ISO MLT failed
ISO MNP failed
ISO MUS failed
ISO MYS failed
ISO NCL failed
ISO NRU failed
ISO PHL failed
ISO PLW failed
ISO PYF failed
ISO SGP failed
ISO SLB failed
ISO SLV failed
ISO SMR failed
ISO STP failed
ISO SUR failed
ISO SXM failed
ISO SYC failed
ISO TCA failed
ISO TJK failed
ISO TLS failed
ISO TON failed
ISO TTO failed
ISO TUV failed
ISO VCT failed
ISO VGB failed
ISO VUT fa

In [964]:
# display the forest sequestration factors (one row per time period and ISO code)
df_efs_sequestration

Unnamed: 0,time_period,ef_frst_sequestration_mangroves_kt_co2_ha,ef_frst_sequestration_primary_kt_co2_ha,ef_frst_sequestration_secondary_kt_co2_ha,iso_alpha_3
0,0,0.004724,0.001882,0.005430,AFG
1,1,0.004724,0.001882,0.005430,AFG
2,2,0.004724,0.001882,0.005430,AFG
3,3,0.004724,0.001882,0.005430,AFG
4,4,0.004724,0.001882,0.005430,AFG
...,...,...,...,...,...
31,31,0.004316,0.000025,0.004314,ZWE
32,32,0.004316,0.000025,0.004314,ZWE
33,33,0.004316,0.000025,0.004314,ZWE
34,34,0.004316,0.000025,0.004314,ZWE


##  Build full data frames of output and merge in

In [1001]:
df_out_conv = df_efs_conv.copy()
df_out_seq = df_efs_sequestration.copy()


# verify some checks
# NOTE: group on the scalar key rather than a one-element list; with a list,
# newer pandas yields 1-tuples when iterating, and the ISO codes collected in
# regions_drop would then never match the string values in the ISO column
df_out_conv_grouped = df_out_conv.groupby(regions.field_iso)
df_out_seq_grouped = df_out_seq.groupby(regions.field_iso)
regions_drop = []

# first, check conversions: a zero primary-forest-to-cropland factor marks
# the region as invalid
for iso, df in df_out_conv_grouped:
    arr = np.array(df["ef_lndu_conv_forests_primary_to_croplands_gg_co2_ha"])
    if arr.min() == 0:
        regions_drop.append(iso)


# then, check sequestrations: any zero among the ef_frst* fields marks the
# region as invalid
for iso, df in df_out_seq_grouped:
    arr = np.array(df[[x for x in df.columns if x.startswith("ef_frst")]]).min(axis = 1)
    if arr.min() == 0:
        regions_drop.append(iso)

# drop
df_out_conv = (
    df_out_conv[
        ~df_out_conv[regions.field_iso].isin(regions_drop)
    ]
    .reset_index(drop = True)
)
df_out_seq = (
    df_out_seq[
        ~df_out_seq[regions.field_iso].isin(regions_drop)
    ]
    .reset_index(drop = True)
)


regions_succeeded = sorted(list(set(df_efs_conv[regions.field_iso]) - set(regions_drop)))
regions_missing = [x for x in regions.all_isos if x not in regions_succeeded]


# fill every missing (or dropped) region with the factors of the closest
# region that succeeded
if len(regions_missing) > 0:
    
    df_out_conv = [df_out_conv]
    df_out_seq = [df_out_seq]

    for iso in regions_missing:

        iso_closest = regions.get_closest_region(
            iso,
            regions_valid = regions_succeeded,
            type_input = "iso",
            type_return = "iso",
        )

        # pull dfs  & overwrite iso code - start with emission factors
        df_ef_comp = df_efs_conv[df_efs_conv[regions.field_iso].isin([iso_closest])].copy()
        df_ef_comp[regions.field_iso] = iso
        df_out_conv.append(df_ef_comp)

        # add in sequestration
        df_ef_seq = df_efs_sequestration[df_efs_sequestration[regions.field_iso].isin([iso_closest])].copy()
        df_ef_seq[regions.field_iso] = iso
        df_out_seq.append(df_ef_seq)

    # reset the index so both branches leave a clean 0..n-1 index
    df_out_conv = pd.concat(df_out_conv, axis = 0).reset_index(drop = True)
    df_out_seq = pd.concat(df_out_seq, axis = 0).reset_index(drop = True)

# forest-to-forest conversion factors are forced to zero
fields_force_zero = [
    x for x in df_out_conv.columns
    if x.startswith("ef_lndu_conv_forests")
    and ("_to_forests" in x)
]

df_out_conv[fields_force_zero] = 0

In [1002]:
# export the conversion and sequestration factor tables to the CSV paths
# configured in setup_analysis
for df_write, fp_write in (
    (df_out_conv, sa.fp_csv_lndu_ef_conversion_co2),
    (df_out_seq, sa.fp_csv_lndu_ef_forest_sequestration_co2),
):
    df_write.to_csv(fp_write, index = None, encoding = "UTF-8")


In [1004]:
# years passed to the repository writer for both tables
years_hist = [*range(2015, 2020)]

# write conversion factors first, then sequestration factors; `tup` keeps
# the return value of the last call
for df_write in (df_out_conv, df_out_seq):
    tup = repo.write_from_df(
        df_write,
        years_hist,
        #periods_write = ["projected"],
        write_q = True
    )

DataFrame successfully written to '/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data/AFOLU/ef_lndu_conv_croplands_to_croplands_gg_co2_ha/input_to_sisepuede/historical/ef_lndu_conv_croplands_to_croplands_gg_co2_ha.csv'
DataFrame successfully written to '/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data/AFOLU/ef_lndu_conv_croplands_to_croplands_gg_co2_ha/input_to_sisepuede/projected/ef_lndu_conv_croplands_to_croplands_gg_co2_ha.csv'
DataFrame successfully written to '/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data/AFOLU/ef_lndu_conv_croplands_to_forests_mangroves_gg_co2_ha/input_to_sisepuede/historical/ef_lndu_conv_croplands_to_forests_mangroves_gg_co2_ha.csv'
DataFrame successfully written to '/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data/AFOLU/ef_lndu_conv_croplands_to_forests_mangroves_gg_co2_ha/input_to_sisepuede/projected/ef_lndu_conv_croplands_to_forests_mangroves_gg_co2_ha.csv'
DataFrame successfully written to '/Users/jsyme/Documents/Projects/git_jbu

In [None]:
# scratch cell (never executed): a bare attribute reference with no call
repo.read

In [1000]:
# list the forest-to-forest conversion fields present in df_out_conv
[
    x for x in df_out_conv.columns
    if x.startswith("ef_lndu_conv_forests") and "_to_forests" in x
]


['ef_lndu_conv_forests_mangroves_to_forests_mangroves_gg_co2_ha',
 'ef_lndu_conv_forests_mangroves_to_forests_primary_gg_co2_ha',
 'ef_lndu_conv_forests_mangroves_to_forests_secondary_gg_co2_ha',
 'ef_lndu_conv_forests_primary_to_forests_mangroves_gg_co2_ha',
 'ef_lndu_conv_forests_primary_to_forests_primary_gg_co2_ha',
 'ef_lndu_conv_forests_primary_to_forests_secondary_gg_co2_ha',
 'ef_lndu_conv_forests_secondary_to_forests_mangroves_gg_co2_ha',
 'ef_lndu_conv_forests_secondary_to_forests_primary_gg_co2_ha',
 'ef_lndu_conv_forests_secondary_to_forests_secondary_gg_co2_ha']

In [994]:
# re-import setup_analysis so that edits to the module on disk are picked up
# without restarting the kernel
importlib.reload(sa)

<module 'setup_analysis' from '/Users/jsyme/Documents/Projects/git_jbus/lac_decarbonization/python/setup_analysis.py'>

In [293]:
# read the land cover item classification table
# NOTE(review): absolute, machine-specific path; will not resolve on another machine
df_read = pd.read_csv("/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data/AFOLU/frac_lndu_initial_croplands/row_data/Land_cover_data/items_classification.csv")





# Build overlay of land use type by climate

In [8]:
import os, os.path
import numpy as np
import pandas as pd
import model_attributes as ma
from attribute_table import AttributeTable
import setup_analysis as sa
import support_classes as sc
import support_functions as sf
import importlib
import time
import warnings
import matplotlib.pyplot as plt
import geopandas as gpd
import rioxarray as rx
import itertools
import model_afolu as mafl

In [9]:
# root directory for the AFOLU input data
# NOTE(review): absolute, machine-specific path
dir_data = "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/AFOLU"

# file/layer names
fn_climates, fn_countries, fn_cw = (
    "kc_1984_2013.tif",
    "WB_countries_Admin0_10m",
    "values_info_with_cw_kc_1984_2013.csv",
)

# full paths (the country shapefile lives in a directory of the same name)
fp_climates = os.path.join(dir_data, fn_climates)
fp_lu = os.path.join(dir_data, "GlcShare_v10_Dominant", "glc_shv10_DOM.Tif")
fp_countries = os.path.join(dir_data, fn_countries, fn_countries + ".shp")
fp_cw = os.path.join(dir_data, fn_cw)

# shared objects built from the SISEPUEDE model attributes
model_afolu = mafl.AFOLU(sa.model_attributes)
regions = sc.Regions(sa.model_attributes)
time_periods = sc.TimePeriods(sa.model_attributes)

In [10]:
# convert geotiff to dataframe
rx_array = rx.open_rasterio(fp_climates)
df_climates = rx_array[0].to_pandas()
# retrieve climate categories
df_climate_cats = pd.read_csv(fp_cw, sep = ",")


# convert geotiff to dataframe
rx_array = rx.open_rasterio(fp_lu)
df_lu = rx_array[0].to_pandas()


In [14]:
def check_climate_and_lu_comparison(
    df_climates: pd.DataFrame,
    df_lu: pd.DataFrame,
    digits: int = 6,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Verify that two tiff-derived grids (climate and land use) share the 
        same row and column coordinates over their common extent, comparing
        coordinates after rounding to `digits` decimals.

    Returns None if the coordinates disagree. Otherwise returns the tuple

        (df_climates, df_lu)

        where both frames are cut to the common number of rows and columns
        and carry the rounded coordinates as index and columns.

    Function Arguments
    ------------------
    - df_climates: climate grid (rows and columns labeled by coordinate)
    - df_lu: land use grid (rows and columns labeled by coordinate)

    Keyword Arguments
    -----------------
    - digits: number of decimals used to round coordinates
    """

    def rounded_labels(labels, n_keep: int) -> np.ndarray:
        # first n_keep axis labels, as floats rounded to `digits` decimals
        return np.round(
            np.array(labels)[0:n_keep].astype(float),
            decimals = digits
        )

    # rows: compare over the shared number of rows
    n_rows = min(len(df_climates.index), len(df_lu.index))
    rows_climates = rounded_labels(df_climates.index, n_rows)
    rows_lu = rounded_labels(df_lu.index, n_rows)
    rows_match = np.abs(rows_climates - rows_lu).max() < 10**(-digits)

    # columns: compare over the shared number of columns
    n_cols = min(len(df_climates.columns), len(df_lu.columns))
    cols_climates = rounded_labels(df_climates.columns, n_cols)
    cols_lu = rounded_labels(df_lu.columns, n_cols)
    cols_match = np.abs(cols_climates - cols_lu).max() < 10**(-digits)

    if not (rows_match and cols_match):
        return None

    # cut both grids to the common extent and relabel with rounded coordinates
    df_climates = df_climates.iloc[0:n_rows, 0:n_cols]
    df_climates.index = rows_climates
    df_climates.columns = cols_climates

    df_lu = df_lu.iloc[0:n_rows, 0:n_cols]
    df_lu.index = rows_lu
    df_lu.columns = cols_lu

    return df_climates, df_lu


# align the two grids on their shared coordinates. The check returns None 
# when the rounded coordinates disagree, so fail with a clear message 
# instead of an opaque unpacking error.
grids_aligned = check_climate_and_lu_comparison(
    df_climates,
    df_lu,
)
if grids_aligned is None:
    raise RuntimeError(
        "Climate and land use grids do not share the same coordinates; cannot align them."
    )

df_climates, df_lu = grids_aligned

In [16]:
# peek at the first 100 rows of the coordinate index file
# NOTE(review): absolute, machine-specific path; also rebinds the generic name `df`
df = pd.read_csv("/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/AFOLU/kcc_cells_merged_to_country/kcc_coords_index.csv", nrows = 100)




In [None]:
# output directory for KCC/LUC data merged to country
# NOTE(review): absolute, machine-specific path
dir_out = "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/AFOLU/kcc_luc_merged_to_country"
# NOTE(review): an unfinished expression ("os.path.") followed here; it was a
# SyntaxError that made the whole cell unrunnable, so it has been removed.
# TODO: confirm what was intended for dir_out (existence check? creation?)

In [201]:

def get_grid_data_from_index(
    df_inds: pd.DataFrame,
    df_gridded_values: pd.DataFrame,
    field_value_new: str,
    digits: int = 6,
    field_x: str = field_lon,
    field_y: str = field_lat,
) -> pd.DataFrame:
    """
    For a given long data frame df_inds of lat/lon that all share a single
        latitude, retrieve associated values from a gridded data frame 
        indexed by lat (rows) and lon (columns).

    NOTE: the new column is written to df_inds in place and df_inds is 
        returned. Only the latitude of the first row is used, so callers
        must pass rows that share one latitude (e.g., one groupby group).
        A coordinate that is absent from df_gridded_values raises the 
        KeyError generated by the label lookup.
        
    Function Arguments
    ------------------
    - df_inds: input DataFrame containing values and indices by lat/lon 
        centroid, including x, y, and value
    - df_gridded_values: DataFrame containing lat as row index and lon as 
        column index. Values in df_inds[field_y] and df_inds[field_x] are 
        matched to row and column labels, respectively, after rounding to
        `digits` decimals.
    - field_value_new: field name to use for new value in df
    
    Keyword Arguments
    -----------------
    - digits: number of digits to use for rounding lat/lon
    - field_x: field name in df_inds containing x (lon) coordinates (column 
        names in df_gridded_values)
    - field_y: field name in df_inds containing y (lat) coordinates (row 
        index names in df_gridded_values)
    """

    # get latitude and vector of longitudes from the fields that were passed 
    # (previously the notebook-level field_lat/field_lon were read here, so
    # field_x/field_y were silently ignored)
    lat = np.round(df_inds[field_y].iloc[0], decimals = digits)
    lons = np.round(np.array(df_inds[field_x]), decimals = digits)

    # look up the grid row for this latitude at each longitude
    vec_x = np.array(df_gridded_values.loc[lat, lons])
    df_inds[field_value_new] = vec_x
    
    return df_inds



def get_gridded_data_by_index_data_frame(
    df_grid: pd.DataFrame,
    df_index_data: pd.DataFrame,
    digits_round_index: int = 6,
    field_lat: str = "y",
    field_lon: str = "x",
    field_value_index: str = "value",
    field_value_index_out: str = field_kcc,
    field_value_new: str = field_luc,
    missing_val: int = -999,
    return_df: bool = True,
) -> Union[pd.DataFrame, np.ndarray, None]:
    """
    Using broken (by country) KCC data frames, get land use categories by
        cell centroid (assumes KCC and LU grids have same centroids).

    Returns None if either df_grid or df_index_data is not a DataFrame. 
        Otherwise returns df_index_data, sorted by index, with 
        field_value_new added and field_value_index renamed to 
        field_value_index_out. An input with no rows to look up gives an
        empty DataFrame with the same columns.
        
    
    Function Arguments
    ------------------
    - df_grid: DataFrame containing lat as row index and lon as column
        index. Values in df_index_data[field_lat] and 
        df_index_data[field_lon] are matched to y and x indices, 
        respectively, rounding to match using digits_round_index.
    - df_index_data: input DataFrame containing KCC indices by lat/lon 
        centroid, including x, y, and value
        
    
    Keyword Arguments
    -----------------
    - digits_round_index: number of digits to use for rounding lat/lon to 
        look values in land use
    - field_lat: field containing latitude
    - field_lon: field containing longitude
    - field_value_index: field in df_index_data containing data values
    - field_value_index_out: new field name for data field stored in 
        df_index_data
    - field_value_new: new field name for data merged in from df_grid
    - missing_val: intended as the value to use if lat/lon not found in 
        df_grid. NOTE: currently unused; a coordinate that is not found 
        raises the error generated by the lookup.
    - return_df: intended to switch between returning a DataFrame and an 
        ordered column vector. NOTE: currently unused; a DataFrame is 
        always returned.
    """
    
    # validate inputs
    inputs_ok = isinstance(df_grid, pd.DataFrame) and isinstance(df_index_data, pd.DataFrame)
    #
    # check grid matching here
    #
    if not inputs_ok:
        return None
    
    
    # group the index data frame by latitude so that each lookup works on a
    # single row of the grid; the scalar key keeps group keys as plain values
    dfg = df_index_data.groupby(field_lat)

    df_out = [
        get_grid_data_from_index(
            df,
            df_grid,
            field_value_new,
            digits = digits_round_index,
            field_x = field_lon,
            field_y = field_lat,
        )
        for _, df in dfg
    ]

    # pd.concat raises on an empty list (no rows, or no non-missing 
    # latitudes): fall back to an empty frame with the expected columns
    if len(df_out) == 0:
        df_out = [df_index_data.iloc[0:0].assign(**{field_value_new: []})]
        
    df_out = (
        pd.concat(df_out, axis = 0)
        .sort_index()
        .rename(columns = {field_value_index: field_value_index_out})
    )

    return df_out



def merge_gridded_data_to_kcc_file(
    df_grid: pd.DataFrame,
    fp_coords: str,
    fp_kcc_data: str,
    field_data_value: str,
    field_index: str = field_index,
    field_x: str = "x",
    field_y: str = "y",
    header_coords: Union[List[str], None] = None,
    output_as_index: bool = True,
    **kwargs
) -> Union[pd.DataFrame, None]:
    """
    Retrieve coordinates associated with indices stored in KCC files and 
        attach the value of df_grid found at each coordinate.
    
    Function Arguments
    ------------------
    - df_grid: data frame containing gridded data
    - fp_coords: file path to coordinates
    - fp_kcc_data: file path to data containing KCC indexed by fp_coords
    - field_data_value: field to use for new data value 
    
    Keyword Arguments
    -----------------
    - field_index: field in fp_kcc_data storing the index into fp_coords
    - field_x: field in the coordinates storing x
    - field_y: field in the coordinates storing y
    - header_coords: list giving columns. If None, reads from fp_coords
    - output_as_index: output DataFrame indexes by field_index? If False,
        includes coordinates
    - **kwargs: passed to get_gridded_data_by_index_data_frame(). 
        field_value_new, field_lat, and field_lon are set here and must
        not be repeated in kwargs.
    """
    # get header
    if header_coords is None:
        header_coords = list(pd.read_csv(fp_coords, nrows = 0).columns)
    
    
    df_get = pd.read_csv(fp_kcc_data)
        
    # get indices; only the span [min_ind, max_ind] is read from the
    # coordinates file, so positions are re-based on min_ind
    vec_index = np.array(df_get[field_index]).astype(int)
    min_ind = vec_index.min()
    max_ind = vec_index.max()
    vec_extract = vec_index - min_ind
        
    # read array from file
    df_lat_lon = sf.read_array_from_file(
        fp_coords,
        len(header_coords),
        min_ind, 
        max_ind,
        skip_header = True,
    )
    df_lat_lon = df_lat_lon[vec_extract, :]
    df_lat_lon = pd.DataFrame(df_lat_lon, columns = header_coords)

    # a stale copy of the output field would collide in the join below
    if field_data_value in df_lat_lon.columns:
        df_lat_lon = df_lat_lon.drop([field_data_value], axis = 1)
    
    # get the gridded values associated with the rows in these indices--sorted by index
    df_grid_out = get_gridded_data_by_index_data_frame(
        df_grid,
        df_lat_lon,
        field_value_new = field_data_value,
        field_lat = field_y,
        field_lon = field_x,
        **kwargs
    )
    
    df_grid_out = df_lat_lon.join(df_grid_out[[field_data_value]])
    if output_as_index:
        # replace the coordinates with the original index field
        df_grid_out = pd.concat(
            [
                df_get[[field_index]],
                df_grid_out.drop([field_x, field_y], axis = 1)
            ], 
            axis = 1
        )
    
    return df_grid_out




In [436]:
dir_read = os.path.join(dir_data, "kcc_and_lndusecat_country")
# raw string: "\D" in a plain string literal is an invalid escape sequence
# (the compiled pattern is unchanged)
regex_match = re.compile(r"kcc_and_lndusecat_by_iso_(\D*).csv")
#dir_read = os.path.join(dir_data, "KCC_LUC_aggs_by_country")
#regex_match = re.compile("kcc_and_lndusecat_by_iso(\D*)_agg.csv")
fls_read = sorted([x for x in os.listdir(dir_read) if regex_match.match(x) is not None])
# NOTE(review): positional lookup; which file this is depends on the
# directory contents
fls_read[26]

'kcc_and_lndusecat_by_iso_BRA.csv'

In [437]:
# coordinates file passed to the merge as fp_coords
# NOTE(review): absolute, machine-specific path
fp_coords = "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/AFOLU/kcc_and_lndusecat_country/index_right_coords.csv"
 
# merge land use values into a single country file, keeping coordinates in
# the output (output_as_index = False); t0/t1 bracket the call for timing
t0 = time.time()
df_ord = merge_gridded_data_to_kcc_file(
    df_lu,
    fp_coords,
    os.path.join(dir_read, fls_read[26]),
    field_luc,
    output_as_index = False,
)
t1 = time.time()

In [440]:
# spot check: cells inside a small window (open bounds on both axes)
df_ord[
    df_ord["y"].lt(-6.1)
    & df_ord["y"].gt(-6.2)
    & df_ord["x"].lt(-64.4)
    & df_ord["x"].gt(-64.5)
]

Unnamed: 0,x,y,value,luc
9915951,-64.445833,-6.104167,110.0,4
9916015,-64.495833,-6.104167,110.0,4
9916028,-64.429167,-6.104167,110.0,4
9916041,-64.479167,-6.104167,110.0,4
9916054,-64.404167,-6.104167,110.0,4
...,...,...,...,...
9917580,-64.420833,-6.195833,130.0,11
9917582,-64.412500,-6.195833,130.0,4
9917590,-64.404167,-6.195833,130.0,4
9917591,-64.462500,-6.195833,110.0,4


In [229]:
"""
import re
importlib.reload(sf)

    
# get files 

#dir_read = os.path.join(dir_data, "kcc_cells_merged_to_country")
#regex_match = re.compile("kcc_and_lndusecat_by_iso_(\D*).csv")
dir_read = os.path.join(dir_data, "KCC_LUC_aggs_by_country")
regex_match = re.compile("kcc_and_lndusecat_by_iso(\D*)_agg.csv")
fls_read = sorted([x for x in os.listdir(dir_read) if regex_match.match(x) is not None])


for i, fl in enumerate(fls_read):
    
    fp_cur = os.path.join(dir_read, fl)
    fp_new = os.path.join(dir_read, fl.replace("kcc_by_country", "kcc_and_lndusecat_by_iso"))
    
    os.rename(fp_cur, fp_new)
""";

In [223]:
# MERGE IN LUC DATA TO KCC DATA
#
# WARNING: each input file in dir_read is overwritten with its merged 
# version, so this cell is not idempotent.

import re
importlib.reload(sf)

    
# NOTE(review): absolute, machine-specific path
fp_coords = "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/AFOLU/kcc_and_lndusecat_country/index_right_coords.csv"

# get files (raw string: "\D" in a plain string literal is an invalid 
# escape sequence; the compiled pattern is unchanged)
regex_match = re.compile(r"kcc_and_lndusecat_by_iso_(\D*).csv")
dir_read = os.path.join(dir_data, "kcc_cells_merged_to_country")
fls_read = sorted([x for x in os.listdir(dir_read) if regex_match.match(x) is not None])
isos_avail = [regex_match.match(x).groups()[0] for x in fls_read]
field_index = "index_right"

# initialize output location for aggregates (exist_ok makes a separate 
# existence check unnecessary)
dir_output_aggs = os.path.join(dir_data, "KCC_LUC_aggs_by_country")
os.makedirs(dir_output_aggs, exist_ok = True)

fls_complete = []

for i, fl in enumerate(fls_read):

    t0 = time.time()
    
    # the capture group of the file name pattern is used as the ISO code
    iso = regex_match.match(fl).groups()[0]
    print(f"Starting ISO {iso}...")
    
    fp_in_kcc = os.path.join(dir_read, fl)
    fp_out_kcc_agg = os.path.join(dir_output_aggs, fl.replace(".csv", "_agg.csv"))
    
    # get data and rename
    df_indexed_new = (
        merge_gridded_data_to_kcc_file(
            df_lu,
            fp_coords,
            fp_in_kcc,
            field_luc,
            output_as_index = True,
        )
        .rename(columns = {"value": field_kcc})
    )
    
    # count cells by (kcc, luc) pair, largest first
    df_indexed_agg = (
        sf.get_index_fields_count(
            df_indexed_new,
            fields_index = [field_kcc, field_luc]
        )
        .sort_values(by = ["count"], ascending = False)
    )

    # export files; the merged data replace the input file
    df_indexed_new.to_csv(
        fp_in_kcc,
        index = None,
        encoding = "UTF-8"
    )
    df_indexed_agg.to_csv(
        fp_out_kcc_agg,
        index = None,
        encoding = "UTF-8"
    )
    
    
    t_elapsed = sf.get_time_elapsed(t0)
    fls_complete.append(fl)
    print(f"Country '{iso}' complete in {t_elapsed} seconds.\n\n")
    


Starting ISO ABW...
Country 'ABW' complete in 26.36 seconds.


Starting ISO AFG...
Country 'AFG' complete in 64.4 seconds.


Starting ISO AGO...
Country 'AGO' complete in 63.34 seconds.


Starting ISO ALB...
Country 'ALB' complete in 31.23 seconds.


Starting ISO AND...
Country 'AND' complete in 14.18 seconds.


Starting ISO ARE...
Country 'ARE' complete in 31.54 seconds.


Starting ISO ARG...
Country 'ARG' complete in 76.61 seconds.


Starting ISO ARM...
Country 'ARM' complete in 26.69 seconds.


Starting ISO ASM...
Country 'ASM' complete in 30.52 seconds.


Starting ISO ATG...
Country 'ATG' complete in 21.72 seconds.


Starting ISO AUS...
Country 'AUS' complete in 122.35 seconds.


Starting ISO AUT...
Country 'AUT' complete in 27.24 seconds.


Starting ISO AZE...
Country 'AZE' complete in 30.02 seconds.


Starting ISO BDI...
Country 'BDI' complete in 27.48 seconds.


Starting ISO BEL...
Country 'BEL' complete in 22.6 seconds.


Starting ISO BEN...
Country 'BEN' complete in 36.62 seco

In [200]:
#pd.concat([df_get[["index_right"]], df_ord.drop(["x", "y"], axis = 1)], axis = 1)

In [222]:
# first capture group of regex_match applied to the first entry of fls_read
regex_match.match(fls_read[0]).group(1)

'ABW'

In [127]:
# display the `table` attribute of attr_kcc (the rendered output lists
# columns such as kcc, code, name, wet_dry_cat and ipcc_forest)
attr_kcc.table

Unnamed: 0,kcc,code,name,wet_dry_cat,temperate_tropical_cat,description,group,precipitation_type,level_of_heat,ipcc_forest
0,110,Af,"Tropical, rainforest",wet,tropical,Tropical rainforest climate,Tropical,Rainforest,,Tropical rainforest
1,120,Aw,"Tropical, savannah",wet,tropical,"Tropical savanna, wet",Tropical,"Savanna, Wet",,Tropical shrublands
2,130,Am,"Tropical, monsoon",wet,tropical,Tropical monsoon climate,Tropical,Monsoon,,Tropical moist deciduous forest
3,211,BWh,"Arid, desert, hot",dry,temperate,Hot deserts climate,Arid,Desert,Hot,Desert
4,212,BWk,"Arid, desert, cold",dry,temperate,Cold desert climate,Arid,Desert,Cold,Desert
5,221,BSh,"Arid, steppe, hot",dry,temperate,Hot semi-arid (steppe) climate,Arid,Steppe,Hot,Steppe
6,222,BSk,"Arid, steppe, cold",dry,temperate,Cold semi-arid (steppe) climate,Arid,Steppe,Cold,Steppe
7,311,Csa,"Temperate, dry summer, hot summer",dry,temperate,Hot-summer Mediterranean climate,Temperate,Dry summer,Hot summer,Sub-tropical dry forests
8,312,Csb,"Temperate, dry summer, warm summer",dry,temperate,Warm-summer Mediterranean climate,Temperate,Dry summer,Warm summer,Sub-tropical dry forests
9,313,Csc,"Temperate, dry summer, cold summer",dry,temperate,Cool-summer Mediterranean climate,Temperate,Dry summer,Cold summer,Sub-tropical dry forests


Unnamed: 0,count,value,luc
8,62775,120.0,2
10,34199,120.0,4
38,30992,221.0,2
39,24622,221.0,3
9,15554,120.0,3
11,15117,120.0,5
35,9318,211.0,9
40,8390,221.0,4
21,7772,130.0,4
42,7344,221.0,8


In [114]:
?get_gridded_data_by_index_data_frame

[0;31mSignature:[0m
[0mget_gridded_data_by_index_data_frame[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf_grid[0m[0;34m:[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdf_index_data[0m[0;34m:[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdigits_round_index[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m6[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfield_lat[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'y'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfield_lon[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'x'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfield_value_index[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'value'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfield_value_index_out[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'kcc'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m  

In [89]:
# display vec as-is (the rendered output shows columns x, y, value, luc)
vec

Unnamed: 0,x,y,value,luc
243588,-87.120833,12.420833,120.0,11
243589,-87.112500,12.420833,120.0,11
243590,-87.104167,12.420833,120.0,7
243591,-87.095833,12.420833,120.0,7
243592,-87.087500,12.420833,120.0,7
...,...,...,...,...
9269,125.054167,12.629167,110.0,11
9270,125.062500,12.629167,110.0,4
9271,125.070833,12.629167,110.0,2
9272,125.079167,12.629167,110.0,11


In [None]:
# round both coordinate columns of df_lat_lon to 6 decimals before using
# them as labels into df_lu
x, y = (
    np.round(np.array(df_lat_lon[field]), decimals = 6)
    for field in ("x", "y")
)

# time the label-based selection; t0/t1 bracket only the .loc call
# NOTE(review): if df_lu is a 2-d grid (rows = y, columns = x), .loc[y, x]
#   selects the cross product of the labels rather than pairwise points --
#   confirm that is intended
t0 = time.time()
vec = df_lu.loc[y, x]
t1 = time.time()

In [87]:
# display vec ordered by its index labels; the result is not assigned, so
# vec itself is left unchanged
vec.sort_index()

Unnamed: 0,x,y,value,luc
0,-70.054167,12.629167,221.0,11
1,-70.045833,12.629167,221.0,11
2,-61.412500,12.629167,110.0,11
3,-61.404167,12.629167,110.0,11
4,-61.395833,12.629167,110.0,11
...,...,...,...,...
244045,-71.479167,12.420833,211.0,11
244046,-71.470833,12.420833,211.0,11
244047,-69.904167,12.420833,221.0,11
244048,-69.895833,12.420833,221.0,11


In [480]:
# Take the first row of df_lat_lon as a sample row for the lookups below.
# iterrows() is lazy, so next() stops after a single row instead of walking
# the whole frame just to keep the first one; r is None if the frame is empty.
# Unlike the explicit loop, this does not leave loop variables (i, row) behind.
r = next((row for _, row in df_lat_lon.iterrows()), None)
    

In [493]:
# rounding precision and the fields of the sample row r that hold coordinates
digits_round_index = 6
field_lat, field_lon = "y", "x"

# round the sample row's coordinates to the same precision used for lookups
lat, lon = (
    np.round(float(r[field]), decimals = digits_round_index)
    for field in (field_lat, field_lon)
)

# value stored in df_lu at the label pair (lat, lon), cast to a plain int
int(df_lu.loc[lat, lon])


11

In [488]:
# same label lookup in df_lu at (lat, lon), shown without the int() cast
df_lu.loc[lat, lon]

11

In [458]:
# window of the coordinate file to read: skip the first min_ind + 1 lines
# (presumably a header line plus min_ind data rows -- confirm), then read
# max_ind - min_ind + 1 rows
n_rows_skip = min_ind + 1
n_rows_read = max_ind - min_ind + 1

t0 = time.time()
df_try = pd.read_csv(fp_coords, header = None, skiprows = n_rows_skip, nrows = n_rows_read)
t1 = time.time()

# wall-clock seconds spent in read_csv
t1 - t0

147.36334896087646

In [437]:

# read a slice of the coordinate file with the notebook's read_array helper
# NOTE(review): positional arguments presumably mirror sf.read_array_from_file
#   (fp, n_cols, min_ind, max_ind) -- confirm against read_array's definition
arr = read_array(fp_coords, 3, 10_000_000, 12_000_000)

10000001


In [444]:
?sf.read_array_from_file

[0;31mSignature:[0m
[0msf[0m[0;34m.[0m[0mread_array_from_file[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfp[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_cols[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_ind[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_ind[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdelim[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m','[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mskip_header[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mnumpy[0m[0;34m.[0m[0mndarray[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Read an array from a file. min_ind is first row, max_ind is last row + 1
    (python style indexing). Only works with numeric values.
    
Reads like data frame index, so 0 would be the first row of data (unless
    skip_header = False)
    
Keyword Argum

In [409]:
# close fi -- presumably a file handle left open from manual line reads;
# TODO confirm where fi is opened
fi.close()

In [408]:
?fi

'-33.945833333333326,83.6375,510.0\n'