In [633]:
import os, os.path
import numpy as np
import pandas as pd
import data_structures as ds
import setup_analysis as sa
import support_functions as sf
import importlib
import time
import warnings
import matplotlib.pyplot as plt
import model_afolu as ma



In [634]:
# build composite data frame for estimation
def build_df_est(
    df_exports: pd.DataFrame,
    df_imports: pd.DataFrame,  
    df_production: pd.DataFrame,
    df_stocks: pd.DataFrame,
    dict_items_repl: dict,
    fields_dat: list,
    fields_grp: list, 
    agg_func: str = "sum"
) -> pd.DataFrame:
    
    # collect and fill na
    df_est = pd.merge(df_imports, df_exports, how = "outer")
    df_est = pd.merge(df_est, df_production, how = "outer")
    df_est = pd.merge(df_est, df_stocks, how = "outer")
    df_est.fillna(0.0, inplace = True)
    
    # drop rows where exports exceed production (here, we have 1:1 connection for items)
    df_est = df_est[
        (df_est["exports"] <= df_est["production"])
        #(df_est["imports"] <= df_est["production"])
    ].reset_index(drop = True)
    
    # calculate demand
    E = np.array(df_est["exports"])
    I = np.array(df_est["imports"])
    P = np.array(df_est["production"])
    M = P + I - E
    df_est["demand"] = M
    fields_dat.append("demand")
    
    df_est_out = df_est.copy()
    df_est["Item"] = df_est["Item"].replace(dict_items_repl)
    dict_agg = dict([(x, "first") for x in fields_grp])
    dict_agg.update(dict([(x, agg_func) for x in fields_dat]))
    fields_drop = [x for x in df_est.columns if (x not in fields_dat + fields_grp)]
    
    df_agg = df_est.drop(fields_drop, axis = 1).groupby(fields_grp).agg(dict_agg).reset_index(drop = True)
    
    return df_agg, df_est_out



# function to clean country names
def clean_region_name(x: str, clean_type = "for_data") -> str:
    nm = x.split("(")[0].strip()
    
    if clean_type == "for_data":
        nm = nm.lower().replace(" ", "_")
    elif clean_type == "for_key":
        None
    
    return nm



# filter and order data frames
def filter_dfs(
    df: pd.DataFrame, 
    dict_subset: dict,
    fields_keep: list
) -> pd.DataFrame:
    
    df_out = sf.subset_df(df, dict_subset)
    fields_drop = [x for x in df_out.columns if x not in fields_keep]
    df_out.drop(fields_drop, axis = 1, inplace = True)
    df_out.reset_index(drop = True, inplace = True)
    
    return df_out



# read in FAO data
def get_faostat_data(
    dir_faostat: str, 
    nm: str,
    encode: str = "ISO-8859-1"
) -> pd.DataFrame:
    
    fp_read = os.path.join(dir_faostat, nm, f"{nm}.csv")
    sf.check_path(fp_read)
    
    df_ret = pd.read_csv(
        fp_read, 
        encoding = encode
    )
    
    return df_ret


# load data
dir_faostat = "/Users/jsyme/Documents/Projects/FY21/SWCHE131_1000/Data/ingestion/FAOSTAT/"
df_ag_production = get_faostat_data(dir_faostat, "Production_Crops_Livestock_E_All_Data_(Normalized)")
df_ag_trade = get_faostat_data(dir_faostat, "Trade_Crops_Livestock_E_All_Data_(Normalized)")
df_cw_fao = pd.read_csv(sa.fp_csv_cw_fao_crops)
attribute_cw_fao_cats = ds.AttributeTable(
    sa.fp_csv_cw_fao_product_demand_categories_for_ie,
    field_cat, 
    []
)

# set some attributes
attr_region = sa.model_attributes.dict_attributes.get("region")
attr_time_period = sa.model_attributes.dict_attributes.get("dim_time_period")
model_afolu = ma.AFOLU(sa.model_attributes)

# get region codes
region_codes_all = attr_region.key_values
dict_area_codes = attr_region.field_maps.get("region_to_fao_area_code")
region_codes_all = [dict_area_codes.get(x, None) for x in region_codes_all]

# set some fields for working with crosswalk
flag_none = "NONE"
field_cat = "fao_category"
field_subsec = "sisepuede_demand_subsector"
field_demand_cat = "sisepuede_demand_category"

# FAO Items to drop and keep
df_cw_product_cats = attribute_cw_fao_cats.table
items_drop = list(
    df_cw_product_cats[
        df_cw_product_cats[field_demand_cat].isin([flag_none])
    ][field_cat]
)
items_keep = list(
    df_cw_product_cats[
        ~df_cw_product_cats[field_demand_cat].isin([flag_none])
    ][field_cat]
)

# get years to keep
years_keep = list(range(2011, 2021))

# filtering dictionaries
dict_filt = {
    "Element": ["Production"],
    "Year": years_keep,
    "Area Code": region_codes_all,
    "Item": list(df_cw_fao["fao_crop"])
}
fields_keep = ["Area", "Item", "Item Code", "Year", "Unit", "Value"]


##  reduce

dict_repl_stocks_0 = {
    "Asses": "mules", 
    "Cattle": "Cattle_nondairy",
    "Chickens": "chickens",
    "Ducks": "chickens",
    "Goats": "goats",
    "Horses": "horses",
    "Mules": "mules",
    "Pigs": "pigs",
    "Sheep": "sheep",
    "Turkeys": "chickens",
    "Camelids, other": "horses",
    "Buffaloes": "buffalo"
}
# stocks/yield - two parts
df_yields = filter_dfs(
    df_ag_production,
    dict_filt,
    fields_keep
).rename(columns =  {"Value": "stocks"})
dict_filt.update({"Item": items_keep, "Element": ["Stocks"]})
df_stocks_0 = filter_dfs(
    df_ag_production,
    dict_filt,
    fields_keep
).rename(columns =  {"Value": "stocks"})
df_stocks = pd.concat([
    df_yields,
    df_stocks_0
], axis = 0).reset_index(drop = True)

# production
dict_filt.update({"Element": ["Production"]})
df_prod = filter_dfs(
    df_ag_production,
    dict_filt,
    fields_keep
).rename(columns =  {"Value": "production"})

# append stocks that are unaccounted for
df_prod_app = df_stocks[
    df_stocks["Item"].isin(set(df_stocks["Item"]) - set(df_prod["Item"]))
].copy().rename(columns = {"stocks": "production"})
df_prod = pd.concat([
    df_prod,
    df_prod_app
], axis = 0).reset_index(drop = True)

# imports
dict_filt.update({"Element": ["Import Quantity"]})
df_imports = filter_dfs(
    df_ag_trade,
    dict_filt,
    fields_keep
).rename(columns =  {"Value": "imports"})

# exports
dict_filt.update({"Element": ["Export Quantity"]})
df_exports = filter_dfs(
    df_ag_trade,
    dict_filt,
    fields_keep
).rename(columns =  {"Value": "exports"})


# convert Heads units to 1000 Heads where present and clean items
dict_dfs = {
    "exports": df_exports,
    "imports": df_imports,
    "production": df_prod,
    "stocks": df_stocks
}
for key in dict_dfs.keys():
    # 
    df = dict_dfs[key]
    vec_adj = np.ones(len(df))
    vec_unit = np.array(df["Unit"])
    vec_vals = np.array(df[key])
    
    w = np.where(vec_unit == "Head")[0]
    if len(w) > 0:
        np.put(vec_adj, w, 0.001)
        np.put(vec_unit, w, "1000 Head")
    df["Unit"] = vec_unit
    df[key] = vec_vals*vec_adj
    
    # clean items
    df["Item"] = [x.strip() for x in list(df["Item"])]
    
    dict_dfs.update({key: df})

    
    
# split out between items associated with stock and derivative goods 
items_stock = list(df_stocks["Item Code"])
dict_dfs_split = {"stocks": {}, "derivatives": {}}

for key in dict_dfs.keys():
    df = dict_dfs[key]
    df_st = df[df["Item Code"].isin(items_stock)].copy().reset_index(drop = True)
    df_dr = df[~df["Item Code"].isin(items_stock)].copy().reset_index(drop = True)
    
    dict_dfs_split["stocks"].update({key: df_st})
    dict_dfs_split["derivatives"].update({key: df_dr})
    

# get composite
dict_repl = attribute_cw_fao_cats.field_maps[f"{field_cat}_to_{field_demand_cat}"]





# get dfs - stock goods
df_agg_stock, df_est_stock = build_df_est(
    dict_dfs_split["stocks"]["exports"],
    dict_dfs_split["stocks"]["imports"], 
    dict_dfs_split["stocks"]["production"], 
    dict_dfs_split["stocks"]["stocks"],
    dict_repl,
    ["exports", "imports", "production", "stocks"],
    ["Area", "Item", "Year", "Unit"]
)

# get data for derivative products
df_agg_deriv, df_est_deriv_0 = build_df_est(
    dict_dfs_split["derivatives"]["exports"],
    dict_dfs_split["derivatives"]["imports"], 
    dict_dfs_split["derivatives"]["production"], 
    dict_dfs_split["derivatives"]["stocks"],
    dict_repl,
    ["exports", "imports", "production", "stocks"],
    ["Area", "Item", "Year", "Unit"]
)



In [756]:
# 1. calculate demands for stocks (done, available in df_agg_stock, df_est_stock)
# 2. caclculate production ratios for derivative products in dict_dfs_split["derivatives"]
#    - merge derivative production to df_agg_stock *after* replacing item names
#    - calculate demadns for each derivative good
#    - using this ratio, calculate demands in terms of 
# get data for derivative products

df_est_deriv_0["Item_merge"] = df_est_deriv_0["Item"].replace(dict_repl)
df_est_deriv_0.drop(["stocks"], axis = 1, inplace = True) if ("stocks" in df_est_deriv_0.columns) else None

# initialize the data frame for estimating imports/exports
df_est_ie = pd.merge(
    df_est_deriv_0, 
    df_agg_stock[["Area", "Year", "Item", "demand"]].rename(columns = {"demand": "stocks", "Item": "Item_merge"}), 
    how = "left"
)
# drop no production
df_est_ie = df_est_ie[df_est_ie["production"] > 0.0].reset_index(drop = True)



##  MAKE SURE ALL PRODUCTION/EXPORT/IMPORT/DEMANDS ARE IN SAME UNIT (tonnes)

# use "small" value as average, 43g/egg: https://www.dineachook.com.au/blog/how-to-get-bigger-eggs-from-your-chickens-egg-weight-and-size/
# 43g/egg = 0.043 tonne/1000 egg
factor_eggs = 0.043
vec_adj = np.ones(len(df_est_ie))
vec_unit = np.array(df_est_ie["Unit"])

for key in ["exports", "imports", "production", "demand"]:
    
    vec_vals = np.array(df_est_ie[key])
    w = np.where(vec_unit == "1000 No")[0]
    if len(w) > 0:
        np.put(vec_adj, w, factor_eggs)
        np.put(vec_unit, w, "tonnes")
    df_est_ie["Unit"] = vec_unit
    df_est_ie[key] = vec_vals*vec_adj


##  THEN, ADD THE PRODUCTION RATIO, GENERATE IMP/EXP ESTIMATES IN TERMS OF STOCKS, AND TAKE TONNE-WEIGHTED AVERAGE

# production ratio and import/exports as stock equiv
field_ratio = "domestic_production_ratio" # prod/stocks
df_est_ie[field_ratio] = np.nan_to_num(np.array(df_est_ie["production"])/np.array(df_est_ie["stocks"]), 0.0, posinf = 0.0)
for key in ["imports", "exports"]:
    field_new = f"{key}_stock_equivalent"
    df_est_ie[field_new] = np.nan_to_num(np.array(df_est_ie[key])/np.array(df_est_ie[field_ratio]), 0.0, posinf = 0.0)

# get production weights - start with total derivative production
fields_grp = ["Area", "Year", "Item_merge"]
fields_sum = ["production"]
dict_agg = dict(zip(fields_grp, ["first" for x in fields_grp]))
dict_agg.update(dict(zip(fields_sum, ["sum" for x in fields_sum])))
df_est_ie_total_deriv_prod = df_est_ie[
    ~df_est_ie["Item"].isin(list(df_stocks["Item"]))
][fields_grp + fields_sum].groupby(fields_grp).agg(dict_agg).reset_index(drop = True).rename(columns = {"production": "total_derivative_production"})


##  

# merge in to get production weights
df_est_ie = pd.merge(df_est_ie, df_est_ie_total_deriv_prod, how = "left")
df_est_ie["production_weight"] = np.array(df_est_ie["production"])/np.array(df_est_ie["total_derivative_production"])

# now, estimate weighted imports/exports
for key in ["imports", "exports"]:
    field_new = f"weighted_est_{key}_equiv"
    field_stock_equiv = f"{key}_stock_equivalent"
    df_est_ie[field_new] = np.array(df_est_ie[field_stock_equiv])*np.array(df_est_ie["production_weight"])

# aggregate 
fields_grp = ["Area", "Year", "Item_merge"]
fields_sum = ["weighted_est_imports_equiv", "weighted_est_exports_equiv"]
dict_agg = dict(zip(fields_grp, ["first" for x in fields_grp]))
dict_agg.update(dict(zip(fields_sum, ["sum" for x in fields_sum])))
df_est_ie_est_ie_equiv = df_est_ie[fields_grp + fields_sum].groupby(fields_grp).agg(dict_agg).reset_index(drop = True)
df_est_ie_est_ie_equiv.rename(
    columns = {
        "weighted_est_imports_equiv": "est_imports_equiv",
        "weighted_est_exports_equiv": "est_exports_equiv"
    }
)

df_out = pd.merge(
    df_est_ie_est_ie_equiv.rename(columns = {"Item_merge": "Item"}),
    df_agg_stock[["Area", "Item", "Year", "exports", "imports", "stocks"]], 
    how = "outer"
)

df_out.fillna(0, inplace = True)


##  CHECK FRACTIONS, OVERWRITE WHERE ESTIMATES ARE UNREASONABLE

# get some fractions
df_out["exports_fin_est"] = np.array(df_out["exports"]) + np.array(df_out["weighted_est_exports_equiv"])
df_out["imports_fin_est"] = np.array(df_out["imports"]) + np.array(df_out["weighted_est_imports_equiv"])
df_out["domestic_demand_fin_est"] = np.array(df_out["imports_fin_est"]) + np.array(df_out["stocks"]) - np.array(df_out["exports_fin_est"])
df_out["import_frac_of_demand"] = np.nan_to_num(np.array(df_out["imports_fin_est"])/np.array(df_out["domestic_demand_fin_est"]), 0.0)
df_out["export_frac_of_prod"] = np.nan_to_num(np.array(df_out["exports_fin_est"])/np.array(df_out["stocks"]), 0.0)

# set a threshold for acceptable exports; if exceeding the threshold, revert to stock exports
thresh = 0.75
vec_old = np.array(df_out["export_frac_of_prod"])
w = np.where(vec_old > thresh)[0]
if len(w) > 0:
    vec_new = np.nan_to_num(np.array(df_out["exports"])/np.array(df_out["stocks"]), 0, posinf = 0.0)
    vec_repl = vec_new[w]
    np.put(vec_old, w, vec_new[w])
    df_out["export_frac_of_prod"] = vec_old
    
df_out["exports_est"] = np.array(df_out["export_frac_of_prod"])*np.array(df_out["stocks"])

# set a threshold for acceptable imports;
thresh = 0.75
vec_old = np.array(df_out["import_frac_of_demand"])
w = np.where(vec_old > thresh)[0]
if len(w) > 0:
    vec_new = np.nan_to_num(np.array(df_out["imports"])/(np.array(df_out["stocks"]) + np.array(df_out["imports"]) - np.array(df_out["exports"])), 0, posinf = 0.0)
    vec_repl = vec_new[w]
    np.put(vec_old, w, vec_new[w])
    df_out["import_frac_of_demand"] = vec_old


    
# components used to build input fields
attr_agrc = sa.model_attributes.get_attribute_table(sa.model_attributes.subsec_name_agrc)
attr_lvst = sa.model_attributes.get_attribute_table(sa.model_attributes.subsec_name_lvst)
dict_subsec_to_subsec_abv = sa.model_attributes.dict_attributes.get("abbreviation_subsector").field_maps.get("subsector_to_abbreviation_subsector")
dict_subsec_abv_to_subsec = sa.model_attributes.dict_attributes.get("abbreviation_subsector").field_maps.get("abbreviation_subsector_to_subsector")

dict_repl_subsecs = dict([(x, dict_subsec_to_subsec_abv.get(sa.model_attributes.subsec_name_agrc)) for x in attr_agrc.key_values])
dict_repl_subsecs.update(dict([(x, dict_subsec_to_subsec_abv.get(sa.model_attributes.subsec_name_lvst)) for x in attr_lvst.key_values]))





  df_est_ie[field_ratio] = np.nan_to_num(np.array(df_est_ie["production"])/np.array(df_est_ie["stocks"]), 0.0, posinf = 0.0)
  df_est_ie[field_new] = np.nan_to_num(np.array(df_est_ie[key])/np.array(df_est_ie[field_ratio]), 0.0, posinf = 0.0)
  df_est_ie[field_new] = np.nan_to_num(np.array(df_est_ie[key])/np.array(df_est_ie[field_ratio]), 0.0, posinf = 0.0)
  df_out["import_frac_of_demand"] = np.nan_to_num(np.array(df_out["imports_fin_est"])/np.array(df_out["domestic_demand_fin_est"]), 0.0)
  df_out["export_frac_of_prod"] = np.nan_to_num(np.array(df_out["exports_fin_est"])/np.array(df_out["stocks"]), 0.0)
  df_out["export_frac_of_prod"] = np.nan_to_num(np.array(df_out["exports_fin_est"])/np.array(df_out["stocks"]), 0.0)
  vec_new = np.nan_to_num(np.array(df_out["exports"])/np.array(df_out["stocks"]), 0, posinf = 0.0)
  vec_new = np.nan_to_num(np.array(df_out["imports"])/(np.array(df_out["stocks"]) + np.array(df_out["imports"]) - np.array(df_out["exports"])), 0, posinf = 0.0)


In [757]:
# format items as fields
def build_field(item: str, field_type: str) -> str:
    
    subsec_abv = dict_repl_subsecs.get(item)
    subsec = dict_subsec_abv_to_subsec.get(subsec_abv)
    
    if subsec_abv == "agrc":
        modvar = model_afolu.modvar_agrc_frac_demand_imported if (field_type == "imports") else model_afolu.modvar_agrc_equivalent_exports
    elif subsec_abv == "lvst":
        modvar = model_afolu.modvar_lvst_frac_demand_imported if (field_type == "imports") else model_afolu.modvar_lvst_equivalent_exports
    
    out = sa.model_attributes.build_varlist(subsec, modvar, restrict_to_category_values = [item])
    
    return out[0]

df_out["field_imports"] = df_out["Item"].apply(build_field, field_type = "imports")
df_out["field_exports"] = df_out["Item"].apply(build_field, field_type = "exports")


In [766]:
df_out_wide = pd.concat([
    df_out[["Area", "Year", "import_frac_of_demand", "field_imports"]].rename(columns = {"import_frac_of_demand": "value", "field_imports": "field"}),
    df_out[["Area", "Year", "export_frac_of_prod", "field_exports"]].rename(columns = {"export_frac_of_prod": "value", "field_exports": "field"})
], axis = 0).reset_index(drop = True)

df_out_wide = pd.pivot(
    df_out_wide,
    ["Area", "Year"],
    ["field"],
    ["value"]
).reset_index()
df_out_wide.columns = df_out_wide.columns.to_flat_index()

# rename
cols_old = list(df_out_wide.columns)
cols = []

for c in cols_old:
    if c[0] == "value":
        cols.append(c[1])
    else:
        cols.append(c[0])
dict_rnm = dict(zip(cols_old, cols))
df_out_wide.rename(columns = dict_rnm, inplace = True)
df_out_wide.rename(columns = {"Year": "year", "Area": "Nation"}, inplace = True)


In [703]:
# read path to formatted input data
df_input_data = pd.read_csv("/Users/jsyme/Downloads/data_complete_future_2022_08_24_test1-4.csv")

In [769]:
df_out_wide.to_csv(sa.fp_csv_afolu_import_exports, index = None, encoding = "UTF-8")