In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from typing import Optional
from pathlib import Path
import pandas as pd
from pandas import Series, DataFrame
from dataclasses import dataclass

import pandas_indexing.accessors
from pandas_indexing import isin, ismatch, semijoin, extractlevel

In [34]:
from aneris.harmonize import Harmonizer
from aneris.downscaling import Downscaler

# Read model and historic data including overrides

In [3]:
import os

# Root folder of the RESCUE WP1 data. The default is the original author's local
# OneDrive path; set the RESCUE_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
base_path = Path(
    os.environ.get(
        "RESCUE_DATA_DIR",
        "/Users/coroa/Library/CloudStorage/OneDrive-SharedLibraries-IIASA/RESCUE - WP 1/data",
    )
)

## Variable definition files

The variable definition file is a CSV or YAML file that must contain, for each `variable` name, its `sector` and `gas` components and whether it is expected to be `global` (as opposed to regional).

Here we generate one from the CMIP6 historical data we have. It can serve as a starting point, but we would want to fine-tune it by hand.

### Create new variable definition files from historical data

In [4]:
# Collect the distinct variables found in the historical CMIP6 file, indexed by
# variable name, as raw material for the variable definitions.
variables = (
    pd.read_csv(
        base_path / "historical/cmip6/history.csv",
        usecols=[3, 4],  # positional selection; presumably the variable and unit columns -- TODO confirm
    )
    .rename(columns=str.lower)
    .drop_duplicates()
    .set_index("variable")
)

In [5]:
# Gases and sectors whose variables are flagged as global instead of regional
global_gases = ["F-Gases", "N2O", "C2F6", "HFC", "SF6", "CF4"]
global_sectors = [
    "Aggregate - Agriculture and LUC",
    "Aircraft",
    "International Shipping",
]

# Split the variable name into `gas` and `sector` levels; everything starts out
# as regional. (The "Aggregate - *" sectors could additionally be filtered out
# with `ismatch` if they are not wanted.)
variabledefs = extractlevel(
    variables, "CEDS+|9+ Sectors|Emissions|{gas}|{sector}|Unharmonized"
).assign(**{"global": False})

# Flip the flag for the global gases first, then for the global sectors
variabledefs.loc[isin(gas=global_gases), "global"] = True
variabledefs.loc[isin(sector=global_sectors), "global"] = True

In [6]:
# Write the generated definitions to a CSV in the current working directory
# (relative path), so the file can be fine-tuned by hand.
variabledefs.to_csv("variabledefs.csv")

### Load variable definitions

In [7]:
@dataclass
class VariableDefinitions:
    """Variable definitions used to validate and annotate emissions data.

    `data` is a DataFrame indexed by the first three columns of the definition
    file (this notebook writes them as ``variable``, ``gas``, ``sector``). It is
    expected to provide a boolean ``global`` column and, for unit checks on
    timeseries, a ``unit`` column.
    """

    data: DataFrame

    @classmethod
    def from_csv(cls, path):
        """Read variable definitions from the CSV file at `path`.

        The first three columns of the file become the index.
        """
        return cls(pd.read_csv(path, index_col=list(range(3))))

    @property
    def variable_index(self):
        """Full index of the definitions table."""
        return self.data.index

    @property
    def index_global(self):
        """(gas, sector) index of the variables flagged as global."""
        return self.data.index[self.data["global"]].idx.project(["gas", "sector"])

    @property
    def index_regional(self):
        """(gas, sector) index of the variables not flagged as global."""
        return self.data.index[~self.data["global"]].idx.project(["gas", "sector"])

    def load_data(
        self,
        df: DataFrame,
        levels: Optional[list[str]] = None,
        ignore_undefined: bool = True,
        ignore_missing: bool = False,
        timeseries: bool = True,
    ):
        """Load data from dataframe

        Assigns sector/gas and checks correct units.

        Parameters
        ----------
        df : DataFrame
            data
        levels : list of str, optional
            levels to keep, or all if None
        ignore_undefined : bool, default True
            whether to silently drop variables in `df` that are not defined;
            if False, such variables raise a ValueError
        ignore_missing : bool, default False
            whether to ignore defined variables missing from `df`;
            if False, such variables raise a ValueError
        timeseries : bool, default True
            whether data is a timeseries and columns should be cast to int

        Returns
        -------
        DataFrame
            data with sector/gas index levels

        Raises
        ------
        ValueError
            if defined variables are missing from `df` (unless `ignore_missing`),
            if `df` contains undefined variables (unless `ignore_undefined`),
            or if the units of a timeseries differ from the defined units

        Note
        ----
        Does not check regional availability yet! Also because it would have to
        understand about regionmappings and aggregations then.
        """

        df = df.rename_axis(index=str.lower)
        if timeseries:
            df = df.rename_axis(columns="year").rename(columns=int)
        else:
            df = df.rename(columns=str.lower)

        # Pick the join such that exactly those rows we are NOT allowed to
        # ignore show up as -1 entries in the indexers below:
        #   li == -1  -> defined variable without data   (missing)
        #   ri == -1  -> data without variable definition (undefined)
        if ignore_undefined and ignore_missing:
            how = "inner"
        elif ignore_undefined:
            how = "right"
        elif ignore_missing:
            how = "left"
        else:
            how = "outer"
        index, li, ri = df.index.join(
            self.variable_index, how=how, return_indexers=True
        )

        def unique_variable_str(index):
            return "\n  " + ",\n  ".join(index.unique("variable"))

        # pandas may return None instead of an indexer array when the joined
        # index already lines up with that side; None therefore means
        # "no reordering and nothing unmatched".
        if li is not None and (li == -1).any():
            raise ValueError(
                "Variables missing from data:" + unique_variable_str(index[li == -1])
            )
        if ri is not None and (ri == -1).any():
            raise ValueError(
                "Undefined variables exist in data:"
                + unique_variable_str(index[ri == -1])
            )

        values = df.values if li is None else df.values[li]
        df = pd.DataFrame(values, index=index, columns=df.columns).__finalize__(df)
        if timeseries:
            data_units = self.data["unit"].values
            if ri is not None:
                data_units = data_units[ri]
            non_matching_units = df.index.idx.project("unit") != data_units
            if non_matching_units.any():
                errors = (
                    df.index.to_frame(index=False)
                    .loc[non_matching_units, ["model", "scenario", "variable", "unit"]]
                    .assign(**{"expected unit": data_units[non_matching_units]})
                    .drop_duplicates()
                )
                raise ValueError(
                    "Some variables in the data do not have the correct units:\n"
                    + errors.to_string(index=False)
                )

        if levels is not None:
            return df.idx.project(levels)
        return df

In [8]:
# NOTE: rebinds `variabledefs` from the DataFrame generated above to a
# VariableDefinitions instance read back from the CSV file.
variabledefs = VariableDefinitions.from_csv("variabledefs.csv")

## RegionMapping helps reading in a region definition file

In [9]:
@dataclass
class RegionMapping:
    data: pd.Series

    @classmethod
    def concat(cls, rms):
        return cls(pd.concat(rm.data for rm in rms))

    @classmethod
    def from_regiondef(cls, path):
        path = Path(path)
        match path.suffix:
            case ".csv":
                df = pd.read_csv(path)
            case ".xlsx":
                df = pd.read_csv(path)
            case suffix:
                raise ValueError(f"Unknown file suffix: {suffix}")

        return cls(
            df.set_index("ISO Code")["Native Region Code"]
            .rename_axis("country")
            .rename(index=str.lower)
            .rename("region")
        )

    def prefix(self, s: str):
        return self.__class__(s + self.data)

    @property
    def index(self) -> pd.MultiIndex:
        return pd.MultiIndex.from_arrays(
            [self.data.index, self.data.values], names=["country", "region"]
        )

    def aggregate(self, df: DataFrame, level="country") -> DataFrame:
        if level != "country":
            df = df.rename_axis(index={level: "country"})
        return (
            df.idx.semijoin(self.index, how="right")
            .groupby(
                [n if n != "country" else "region" for n in df.index.names],
                dropna=False,
            )
            .sum()
        )

In [10]:
# Country -> region mapping read from the region definition CSV
regionmapping = RegionMapping.from_regiondef(
    base_path / "historical/cmip6/remind_region_mapping.csv"
)

## Model and historic data read in

The data can be read in and prepared using `read_iamc` or `variabledefs.load_data`.

In [11]:
# Historical emissions: the first five CSV columns form the index. `load_data`
# attaches the gas/sector levels, checks the units and keeps only the listed
# levels. With the default ignore_missing=False it raises if a defined variable
# has no historical data.
hist = pd.read_csv(
    base_path / "historical/cmip6/history.csv", index_col=list(range(5))
).pipe(variabledefs.load_data, levels=["region", "gas", "sector", "unit"])

In [12]:
# Model pathways from the "data" sheet; the first five columns form the index.
model = (
    pd.read_excel(
        base_path / "iam_files/cmip6/REMIND-MAGPIE_SSP5-34-OS/ssp5-34-os.xlsx",
        sheet_name="data",
        index_col=list(range(5)),
    )
    # align the unit spelling before the unit check in load_data
    # (presumably to match the units in the variable definitions -- TODO confirm)
    .rename(index={"Mt CO2-equiv/yr": "Mt CO2-eq/yr"}, level="Unit")
    .pipe(
        variabledefs.load_data,
        levels=["model", "scenario", "region", "gas", "sector", "unit"],
        # the model does not have to report every defined variable
        ignore_missing=True,
    )
)

In [13]:
# Harmonization method overrides from the "harmonization" sheet: of the first
# five columns, four form the index and the remaining one is the data column.
harm_overrides = (
    pd.read_excel(
        base_path / "iam_files/cmip6/REMIND-MAGPIE_SSP5-34-OS/ssp5-34-os.xlsx",
        sheet_name="harmonization",
        index_col=list(range(4)),
        usecols=list(range(5)),
    )
    .rename_axis(index=str.lower)
    .rename(columns=str.lower)
    .pipe(
        variabledefs.load_data,
        ignore_missing=True,
        levels=["region", "gas", "sector"],
        # not a timeseries: columns are not cast to int and units are not checked
        timeseries=False,
    )
    # keep only the `method` column as a Series
    .method
)

harm_overrides.head()

region  gas  sector        
CHN     BC   Energy Sector     reduce_offset_2150
AFR     BC   Forest Burning        constant_ratio
EUR     BC   Forest Burning        constant_ratio
ROW     BC   Forest Burning        constant_ratio
USA     BC   Forest Burning        constant_ratio
Name: method, dtype: object

In [14]:
# Preview the first rows of the loaded model data
model.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,year,2005,2010,2015,2020,2030,2040,2050,2060,2070,2080,2090,2100
model,scenario,region,gas,sector,unit,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
REMIND-MAGPIE,SSP5-34-OS-V27,AFR,BC,Agricultural Waste Burning,Mt BC/yr,0.0439,0.0633,0.0799,0.0966,0.1279,0.1568,0.1768,0.1603,0.1481,0.1306,0.1221,0.1088
REMIND-MAGPIE,SSP5-34-OS-V27,CHN,BC,Agricultural Waste Burning,Mt BC/yr,0.1055,0.1164,0.1243,0.1322,0.14,0.1509,0.1774,0.1593,0.1472,0.1362,0.1287,0.1208
REMIND-MAGPIE,SSP5-34-OS-V27,EUR,BC,Agricultural Waste Burning,Mt BC/yr,0.0387,0.0384,0.0383,0.0383,0.0369,0.0357,0.034,0.034,0.0381,0.0396,0.0391,0.0387
REMIND-MAGPIE,SSP5-34-OS-V27,IND,BC,Agricultural Waste Burning,Mt BC/yr,0.0817,0.0841,0.0892,0.0943,0.1054,0.1061,0.1028,0.1035,0.0923,0.0892,0.0736,0.0633
REMIND-MAGPIE,SSP5-34-OS-V27,JPN,BC,Agricultural Waste Burning,Mt BC/yr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Harmonization

## Preparation of input data

In [15]:
# Historical data prepared for harmonization:
# - regional variables are summed to model regions (the `region` level of
#   `hist` is treated as the country level by `regionmapping.aggregate`)
# - global variables are taken from the "World" rows
hist_agg = pd.concat(
    [
        hist.idx.semijoin(variabledefs.index_regional, how="inner").pipe(
            regionmapping.aggregate, level="region"
        ),
        hist.idx.semijoin(variabledefs.index_global, how="inner").loc[
            isin(region="World")
        ],
    ]
)

In [16]:
# Model data prepared for harmonization:
# - regional variables restricted to the regions that occur in the region mapping
# - global variables taken from the "World" rows
model_agg = pd.concat(
    [
        model.idx.semijoin(variabledefs.index_regional, how="inner").loc[
            isin(region=regionmapping.data.unique())
        ],
        model.idx.semijoin(variabledefs.index_global, how="inner").loc[
            isin(region="World")
        ],
    ]
)

## Harmonize all model/scenario combinations

In [27]:
# Year passed to `Harmonizer.harmonize` below and to the `Downscaler`
year = 2015

In [28]:
# Harmonize every (model, scenario) pathway on its own against the matching
# historical rows, then stitch the per-pathway results back together.
pathway_results = []
for model_name, scenario_name in model.index.idx.project(["model", "scenario"]).unique():
    pathway = model_agg.loc[
        isin(model=model_name, scenario=scenario_name)
    ].droplevel(["model", "scenario"])
    matching_hist = hist_agg.idx.semijoin(pathway.index, how="right")
    harmonizer = Harmonizer(pathway, matching_hist, harm_idx=pathway.index.names)
    pathway_harmonized = harmonizer.harmonize(year=year, overrides=harm_overrides)
    # re-attach the pathway identity that was dropped before harmonizing
    pathway_results.append(
        pathway_harmonized.idx.assign(model=model_name, scenario=scenario_name)
    )

# TODO harmonization casts columns to str!!
harmonized = pd.concat(pathway_results).rename(columns=int)

  return np.abs(np.std(x) / np.mean(x))
INFO:root:Harmonizing with reduce_offset_2150_cov
INFO:root:Harmonizing with model_zero
INFO:root:Harmonizing with hist_zero
INFO:root:Harmonizing with constant_ratio
INFO:root:Harmonizing with reduce_offset_2150
INFO:root:Harmonizing with reduce_ratio_2080
INFO:root:Harmonizing with constant_offset
INFO:root:Harmonizing with reduce_offset_2100


In [29]:
# Preview the first rows of the harmonized pathways
harmonized.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,year,2015,2020,2030,2040,2050,2060,2070,2080,2090,2100
gas,sector,region,unit,model,scenario,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BC,Agricultural Waste Burning,AFR,Mt BC/yr,REMIND-MAGPIE,SSP5-34-OS-V27,0.058867,0.076346,0.109204,0.139662,0.16122,0.146278,0.135636,0.119694,0.112752,0.10101
BC,Agricultural Waste Burning,CHN,Mt BC/yr,REMIND-MAGPIE,SSP5-34-OS-V27,0.013035,0.025056,0.041098,0.060239,0.094981,0.085123,0.081265,0.078507,0.079249,0.079591
BC,Agricultural Waste Burning,EUR,Mt BC/yr,REMIND-MAGPIE,SSP5-34-OS-V27,0.004719,0.005963,0.007051,0.008338,0.009125,0.011613,0.0182,0.022188,0.024175,0.026263
BC,Agricultural Waste Burning,IND,Mt BC/yr,REMIND-MAGPIE,SSP5-34-OS-V27,0.005197,0.013408,0.03073,0.037653,0.040575,0.047498,0.04252,0.045643,0.036265,0.032188
BC,Agricultural Waste Burning,JPN,Mt BC/yr,REMIND-MAGPIE,SSP5-34-OS-V27,0.000437,0.000437,0.000437,0.000437,0.000437,0.000437,0.000437,0.000437,0.000437,0.000437


# Downscaling

In [35]:
# GDP|PPP projections of the OECD Env-Growth model for SSP1..SSP5, reduced to
# the (ssp, country) levels with lower-cased country codes and integer years.
ssp_scenarios = [f"SSP{number}_v9_130325" for number in range(1, 6)]
gdp = (
    pd.read_csv(
        base_path / "historical" / "SspDb_country_data_2013-06-12.csv",
        index_col=list(range(5)),
    )
    .rename_axis(index=str.lower)
    .loc[isin(model="OECD Env-Growth", scenario=ssp_scenarios, variable="GDP|PPP")]
    # drop year columns that hold no data at all
    .dropna(how="all", axis=1)
    .rename_axis(index={"scenario": "ssp", "region": "country"})
    .rename(index=str.lower, level="country")
    .rename(columns=int)
    .idx.project(["ssp", "country"])
)
gdp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,2000,2005,2010,2015,2020,2025,2030,2035,2040,2045,...,2055,2060,2065,2070,2075,2080,2085,2090,2095,2100
ssp,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
SSP1_v9_130325,bhs,9.179,9.964,9.647,10.852,12.364,13.913,15.577,17.4,19.244,21.071,...,24.061,25.282,26.374,27.331,27.979,28.462,28.752,28.843,28.773,28.526
SSP1_v9_130325,bih,18.823,23.975,28.065,30.809,37.787,47.037,57.74,69.273,80.304,89.614,...,101.533,105.79,108.039,109.08,108.896,107.453,105.156,102.274,99.093,95.538
SSP1_v9_130325,blr,58.129,83.492,118.671,145.259,176.778,211.172,250.038,292.074,330.894,362.482,...,404.928,424.762,443.176,455.985,461.441,462.309,461.472,459.908,456.94,451.145
SSP1_v9_130325,blz,1.404,1.825,2.061,2.266,2.592,3.141,3.903,4.861,5.977,7.218,...,9.869,11.223,12.574,13.881,15.116,16.32,17.463,18.516,19.481,20.309
SSP1_v9_130325,twn,0.0,0.0,742.813,889.439,1073.988,1208.735,1321.155,1424.861,1513.084,1570.705,...,1587.286,1574.961,1608.617,1631.834,1638.589,1633.208,1610.471,1576.924,1534.288,1482.39


In [36]:
# Derive the SSP of each pathway from its scenario name; pathways whose name
# does not mention an SSP fall back to SSP2.
pathways = harmonized.index.idx.project(["model", "scenario"]).unique()
SSP_per_pathway = (
    pathways.to_frame()["scenario"].str.extract("(SSP[1-5])")[0].fillna("SSP2")
)

# NOTE: rebinds `gdp` from (ssp, country) to (model, scenario, country), so this
# cell cannot be re-run without re-running the GDP loading cell first.
ssp_level = SSP_per_pathway + "_v9_130325"
gdp = semijoin(
    gdp, SSP_per_pathway.index.idx.assign(ssp=ssp_level), how="right"
).idx.project(["model", "scenario", "country"])

In [37]:
# Keep only regional variables, restricted to the AFR region and the CH4 / CO2
# gases (a sector filter could be added to the `isin` selector as well).
# NOTE: rebinds `harmonized` to this subset.
harmonized = harmonized.idx.semijoin(
    variabledefs.index_regional, how="inner"
).loc[isin(region="AFR", gas=["CH4", "CO2"])]

In [38]:
# Keep only regional variables for CH4 / CO2 and expose the `region` level of
# the historical data under the name `country`.
# NOTE: rebinds `hist` to this subset.
hist = (
    hist.idx.semijoin(variabledefs.index_regional, how="inner")
    .rename_axis(index={"region": "country"})
    .loc[isin(gas=["CH4", "CO2"])]
)

In [39]:
# Set up the downscaler for the harmonization year defined above (`year`)
# instead of repeating the literal 2015.
downscaler = Downscaler(
    harmonized,
    hist,
    year,
    # two countries are removed from the mapping before downscaling
    # (presumably because they lack historical or GDP data -- TODO confirm)
    regionmapping.data.drop(["srb (kosovo)", "ssd"]),
    gdp=gdp,
)

In [42]:
# `Downscaler` is already imported in the import cell near the top of the
# notebook, so it is not re-imported here.
downscaler = Downscaler(
    # the inner semijoins keep regional variables only
    harmonized.idx.semijoin(variabledefs.index_regional, how="inner"),
    hist.idx.semijoin(variabledefs.index_regional, how="inner").rename_axis(
        index={"region": "country"}
    ),
    year,
    regionmapping.data.drop(["srb (kosovo)", "ssd"]),
    gdp=gdp,
)
results = downscaler.downscale()

  return np.abs(np.std(x) / np.mean(x))


In [43]:
# Display the downscaled country-level results
results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,year,2015,2020,2030,2040,2050,2060,2070,2080,2090,2100
gas,sector,region,unit,model,scenario,country,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
CH4,Agriculture,AFR,Mt CH4/yr,REMIND-MAGPIE,SSP5-34-OS-V27,ago,0.220823,0.279381,0.515045,0.876123,0.775596,0.397503,0.178150,0.156520,0.128956,0.098628
CH4,Agriculture,AFR,Mt CH4/yr,REMIND-MAGPIE,SSP5-34-OS-V27,ben,0.095516,0.120845,0.222780,0.378963,0.335480,0.171938,0.077058,0.067702,0.055779,0.042661
CH4,Agriculture,AFR,Mt CH4/yr,REMIND-MAGPIE,SSP5-34-OS-V27,bwa,0.105129,0.133007,0.245201,0.417102,0.369243,0.189242,0.084813,0.074516,0.061393,0.046954
CH4,Agriculture,AFR,Mt CH4/yr,REMIND-MAGPIE,SSP5-34-OS-V27,bfa,0.511754,0.647461,1.193611,2.030404,1.797433,0.921207,0.412861,0.362734,0.298854,0.228569
CH4,Agriculture,AFR,Mt CH4/yr,REMIND-MAGPIE,SSP5-34-OS-V27,bdi,0.036465,0.046135,0.085051,0.144676,0.128076,0.065641,0.029418,0.025847,0.021295,0.016287
CH4,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CH4,Solvents Production and Application,AFR,Mt CH4/yr,REMIND-MAGPIE,SSP5-34-OS-V27,zwe,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
CH4,Solvents Production and Application,AFR,Mt CH4/yr,REMIND-MAGPIE,SSP5-34-OS-V27,ago,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
CH4,Solvents Production and Application,AFR,Mt CH4/yr,REMIND-MAGPIE,SSP5-34-OS-V27,bdi,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
CH4,Solvents Production and Application,AFR,Mt CH4/yr,REMIND-MAGPIE,SSP5-34-OS-V27,ben,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
