# Getting started with raking

We start by importing NumPy, pandas, and `run_raking`, the function that performs the raking.

In [4]:
import numpy as np
import pandas as pd
from raking.run_raking import run_raking

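Modify the path below so that it points to the directory where you cloned the GitHub repository. Keep the trailing slash: the file paths used in this notebook are built by appending to this string.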

In [6]:
# Root directory of the local copy of the raking repository. The trailing
# slash matters: the example file paths below are formed with
# `YOUR_PATH + "tests/..."`.
YOUR_PATH = "/Users/ducela/Documents/Raking/ihmeuw-msca/raking/"

## Examples without uncertainty

In these examples, we only want the raked values. We do not provide draws for the observations or the margins, and we do not request the variances and covariances of the raked values (`cov_mat=False`).

### 1D example

In [9]:
# Load the 1D example inputs: the observations to be raked and the margin
# that their total has to match.
df_obs, df_margin = [
    pd.read_csv(YOUR_PATH + relative_path)
    for relative_path in (
        "tests/examples/example_1D/observations.csv",
        "tests/examples/example_1D/margin.csv",
    )
]

# Rake the observations in one dimension ("var1"). The first returned item
# is the data frame of raked values; the three remaining items are not used
# in this example and are kept under placeholder names.
df_raked, dummy1, dummy2, dummy3 = run_raking(
    1,
    df_obs,
    [df_margin],
    ["var1"],
    cov_mat=False,
)

In [10]:
# Sanity check: the total of the raked values should equal the margin, read
# here from the first row of the "value_agg_over_var1" column.
print(
    np.allclose(
        df_raked["raked_value"].sum(),
        df_margin["value_agg_over_var1"].iat[0],
    )
)

True


### 2D example

In [12]:
# Load the 2D example inputs: the observations plus one margins table per
# raking variable.
df_obs, df_margins_1, df_margins_2 = [
    pd.read_csv(YOUR_PATH + relative_path)
    for relative_path in (
        "tests/examples/example_2D/observations.csv",
        "tests/examples/example_2D/margins_1.csv",
        "tests/examples/example_2D/margins_2.csv",
    )
]

# Rake the observations in two dimensions ("var1" and "var2"). Only the
# first returned item (the raked values) is used in this example.
df_raked, dummy1, dummy2, dummy3 = run_raking(
    2,
    df_obs,
    [df_margins_1, df_margins_2],
    ["var1", "var2"],
    cov_mat=False,
)

In [13]:
# Sanity check: summing the raked values over one variable should reproduce
# the margins given for the other variable.

# Totals over var1 (one row per var2), joined to the first margins table.
sum_over_var1 = pd.merge(
    df_raked.groupby(["var2"], as_index=False)["raked_value"].sum(),
    df_margins_1,
    on="var2",
)
# Totals over var2 (one row per var1), joined to the second margins table.
sum_over_var2 = pd.merge(
    df_raked.groupby(["var1"], as_index=False)["raked_value"].sum(),
    df_margins_2,
    on="var1",
)

# One boolean per margins table, printed on a single line.
print(
    *(
        np.allclose(merged["raked_value"], merged[margin_column])
        for merged, margin_column in (
            (sum_over_var1, "value_agg_over_var1"),
            (sum_over_var2, "value_agg_over_var2"),
        )
    )
)

True True


### 3D example

In [15]:
# Load the 3D example inputs: the observations plus one margins table per
# raking variable.
df_obs, df_margins_1, df_margins_2, df_margins_3 = [
    pd.read_csv(YOUR_PATH + relative_path)
    for relative_path in (
        "tests/examples/example_3D/observations.csv",
        "tests/examples/example_3D/margins_1.csv",
        "tests/examples/example_3D/margins_2.csv",
        "tests/examples/example_3D/margins_3.csv",
    )
]

# Rake the observations in three dimensions. Only the first returned item
# (the raked values) is used in this example.
df_raked, dummy1, dummy2, dummy3 = run_raking(
    3,
    df_obs,
    [df_margins_1, df_margins_2, df_margins_3],
    ["var1", "var2", "var3"],
    cov_mat=False,
)

In [16]:
# Sanity check: summing the raked values over one variable should reproduce
# the margins indexed by the two remaining variables.

# Totals over var1, joined to the margins indexed by (var2, var3).
sum_over_var1 = pd.merge(
    df_raked.groupby(["var2", "var3"], as_index=False)["raked_value"].sum(),
    df_margins_1,
    on=["var2", "var3"],
)
# Totals over var2, joined to the margins indexed by (var1, var3).
sum_over_var2 = pd.merge(
    df_raked.groupby(["var1", "var3"], as_index=False)["raked_value"].sum(),
    df_margins_2,
    on=["var1", "var3"],
)
# Totals over var3, joined to the margins indexed by (var1, var2).
sum_over_var3 = pd.merge(
    df_raked.groupby(["var1", "var2"], as_index=False)["raked_value"].sum(),
    df_margins_3,
    on=["var1", "var2"],
)

# One boolean per margins table, printed on a single line.
print(
    *(
        np.allclose(merged["raked_value"], merged[margin_column])
        for merged, margin_column in (
            (sum_over_var1, "value_agg_over_var1"),
            (sum_over_var2, "value_agg_over_var2"),
            (sum_over_var3, "value_agg_over_var3"),
        )
    )
)

True True True


### USHD example

In [18]:
# Load the USHD example inputs: the observations and the margins table.
df_obs, df_margin = [
    pd.read_csv(YOUR_PATH + relative_path)
    for relative_path in (
        "tests/examples/example_USHD/observations.csv",
        "tests/examples/example_USHD/margins.csv",
    )
]

# Run the raking in "USHD" mode. No list of variable names is passed in this
# mode (None). Only the first returned item (the raked values) is used in
# this example.
df_raked, dummy1, dummy2, dummy3 = run_raking(
    "USHD",
    df_obs,
    [df_margin],
    None,
    cov_mat=False,
)

In [19]:
# Sanity checks on the USHD result. In the comparisons below, rows with
# cause "_all" are compared with the sum over the other causes, and rows
# with race 0 are compared with the sum over the other races.

# Sum over the individual causes vs. the "_all" cause, per (race, county).
sum_over_cause = pd.merge(
    (
        df_raked.loc[df_raked["cause"].ne("_all")]
        .groupby(["race", "county"], as_index=False)["raked_value"]
        .sum()
    ),
    df_raked.loc[df_raked["cause"].eq("_all")],
    on=["race", "county"],
)
# Sum over the individual races vs. race 0, per (cause, county).
sum_over_race = pd.merge(
    (
        df_raked.loc[df_raked["race"].ne(0)]
        .groupby(["cause", "county"], as_index=False)["raked_value"]
        .sum()
    ),
    df_raked.loc[df_raked["race"].eq(0)],
    on=["cause", "county"],
)
# Sum over individual causes and races vs. the ("_all", 0) row, per county.
sum_over_cause_race = pd.merge(
    (
        df_raked.loc[df_raked["cause"].ne("_all") & df_raked["race"].ne(0)]
        .groupby(["county"], as_index=False)["raked_value"]
        .sum()
    ),
    df_raked.loc[df_raked["cause"].eq("_all") & df_raked["race"].eq(0)],
    on=["county"],
)
# Sum over individual races and all counties vs. the margins, per cause.
sum_over_race_county = pd.merge(
    (
        df_raked.loc[df_raked["race"].ne(0)]
        .groupby(["cause"], as_index=False)["raked_value"]
        .sum()
    ),
    df_margin,
    on=["cause"],
)

# One boolean per check, printed on a single line. Each entry is
# (merged frame, left column, right column, absolute tolerance).
print(
    *(
        np.allclose(merged[left], merged[right], atol=tolerance)
        for merged, left, right, tolerance in (
            (sum_over_cause, "raked_value_x", "raked_value_y", 1.0e-4),
            (sum_over_race, "raked_value_x", "raked_value_y", 1.0e-4),
            (sum_over_cause_race, "raked_value_x", "raked_value_y", 1.0e-4),
            (
                sum_over_race_county,
                "raked_value",
                "value_agg_over_race_county",
                1.0e-5,
            ),
        )
    )
)

True True True True


## Examples with uncertainty

In these examples, we are interested in the uncertainty of the raked values. We provide draws for both the observations and the margins. The code will compute the mean and the covariance matrix of the observations and margins and return the mean and the covariance matrix of the raked values.

### 1D example

In [23]:
# Load the 1D example with draws: the observations and the margin.
df_obs, df_margin = [
    pd.read_csv(YOUR_PATH + relative_path)
    for relative_path in (
        "tests/examples/example_1D_draws/observations.csv",
        "tests/examples/example_1D_draws/margin.csv",
    )
]

# Rake in one dimension ("var1") with cov_mat=True. The fourth returned item
# is kept as `sigma` (presumably the covariance matrix of the raked values;
# confirm against the run_raking documentation). The second and third items
# are not used here.
df_raked, dummy1, dummy2, sigma = run_raking(
    1,
    df_obs,
    [df_margin],
    ["var1"],
    draws="draws",
    cov_mat=True,
)

In [24]:
# Sanity check: the total of the raked values should equal the mean of the
# "value_agg_over_var1" column of the margin table.
print(
    np.allclose(
        df_raked["raked_value"].sum(),
        df_margin["value_agg_over_var1"].mean(),
    )
)

True


### 2D example

In [26]:
# Load the 2D example with draws: the observations plus one margins table
# per raking variable.
df_obs, df_margins_1, df_margins_2 = [
    pd.read_csv(YOUR_PATH + relative_path)
    for relative_path in (
        "tests/examples/example_2D_draws/observations.csv",
        "tests/examples/example_2D_draws/margins_1.csv",
        "tests/examples/example_2D_draws/margins_2.csv",
    )
]

# Rake in two dimensions with cov_mat=True. The fourth returned item is kept
# as `sigma` (presumably the covariance matrix of the raked values; confirm
# against the run_raking documentation).
df_raked, dummy1, dummy2, sigma = run_raking(
    2,
    df_obs,
    [df_margins_1, df_margins_2],
    ["var1", "var2"],
    draws="draws",
    cov_mat=True,
)

In [27]:
# Sanity check: summing the raked values over one variable should reproduce
# the margins for the other variable. The margins tables are first averaged
# within each group, so that there is one value per group to compare with.

# Totals over var1 (per var2) vs. the per-var2 mean of the first margins.
sum_over_var1 = pd.merge(
    df_raked.groupby(["var2"], as_index=False)["raked_value"].sum(),
    (
        df_margins_1.groupby(["var2"], as_index=False)["value_agg_over_var1"]
        .mean()
    ),
    on="var2",
)
# Totals over var2 (per var1) vs. the per-var1 mean of the second margins.
sum_over_var2 = pd.merge(
    df_raked.groupby(["var1"], as_index=False)["raked_value"].sum(),
    (
        df_margins_2.groupby(["var1"], as_index=False)["value_agg_over_var2"]
        .mean()
    ),
    on="var1",
)

# One boolean per margins table, printed on a single line.
print(
    *(
        np.allclose(merged["raked_value"], merged[margin_column])
        for merged, margin_column in (
            (sum_over_var1, "value_agg_over_var1"),
            (sum_over_var2, "value_agg_over_var2"),
        )
    )
)

True True


### 3D example

In [29]:
# Load the 3D example with draws: the observations plus one margins table
# per raking variable.
df_obs, df_margins_1, df_margins_2, df_margins_3 = [
    pd.read_csv(YOUR_PATH + relative_path)
    for relative_path in (
        "tests/examples/example_3D_draws/observations.csv",
        "tests/examples/example_3D_draws/margins_1.csv",
        "tests/examples/example_3D_draws/margins_2.csv",
        "tests/examples/example_3D_draws/margins_3.csv",
    )
]

# Rake in three dimensions with cov_mat=True. The fourth returned item is
# kept as `sigma` (presumably the covariance matrix of the raked values;
# confirm against the run_raking documentation).
df_raked, dummy1, dummy2, sigma = run_raking(
    3,
    df_obs,
    [df_margins_1, df_margins_2, df_margins_3],
    ["var1", "var2", "var3"],
    draws="draws",
    cov_mat=True,
)

In [30]:
# Sanity check: summing the raked values over one variable should reproduce
# the margins indexed by the two remaining variables. The margins tables are
# first averaged within each group, so that there is one value per group to
# compare with.

# Totals over var1 vs. the mean margins per (var2, var3).
sum_over_var1 = pd.merge(
    df_raked.groupby(["var2", "var3"], as_index=False)["raked_value"].sum(),
    (
        df_margins_1.groupby(["var2", "var3"], as_index=False)[
            "value_agg_over_var1"
        ].mean()
    ),
    on=["var2", "var3"],
)
# Totals over var2 vs. the mean margins per (var1, var3).
sum_over_var2 = pd.merge(
    df_raked.groupby(["var1", "var3"], as_index=False)["raked_value"].sum(),
    (
        df_margins_2.groupby(["var1", "var3"], as_index=False)[
            "value_agg_over_var2"
        ].mean()
    ),
    on=["var1", "var3"],
)
# Totals over var3 vs. the mean margins per (var1, var2).
sum_over_var3 = pd.merge(
    df_raked.groupby(["var1", "var2"], as_index=False)["raked_value"].sum(),
    (
        df_margins_3.groupby(["var1", "var2"], as_index=False)[
            "value_agg_over_var3"
        ].mean()
    ),
    on=["var1", "var2"],
)

# One boolean per margins table, printed on a single line.
print(
    *(
        np.allclose(merged["raked_value"], merged[margin_column])
        for merged, margin_column in (
            (sum_over_var1, "value_agg_over_var1"),
            (sum_over_var2, "value_agg_over_var2"),
            (sum_over_var3, "value_agg_over_var3"),
        )
    )
)

True True True


### USHD example

In [32]:
# Load the USHD example with draws: the observations and the margins table.
df_obs, df_margin = [
    pd.read_csv(YOUR_PATH + relative_path)
    for relative_path in (
        "tests/examples/example_USHD_draws/observations.csv",
        "tests/examples/example_USHD_draws/margins.csv",
    )
]

# Run the raking in "USHD" mode with cov_mat=True. No list of variable names
# is passed in this mode (None). The fourth returned item is kept as `sigma`
# (presumably the covariance matrix of the raked values; confirm against the
# run_raking documentation).
df_raked, dummy1, dummy2, sigma = run_raking(
    "USHD",
    df_obs,
    [df_margin],
    None,
    draws="draws",
    cov_mat=True,
)

In [33]:
# Sanity checks on the USHD result. In the comparisons below, rows with
# cause "_all" are compared with the sum over the other causes, and rows
# with race 0 are compared with the sum over the other races.

# Sum over the individual causes vs. the "_all" cause, per (race, county).
sum_over_cause = pd.merge(
    (
        df_raked.loc[df_raked["cause"].ne("_all")]
        .groupby(["race", "county"], as_index=False)["raked_value"]
        .sum()
    ),
    df_raked.loc[df_raked["cause"].eq("_all")],
    on=["race", "county"],
)
# Sum over the individual races vs. race 0, per (cause, county).
sum_over_race = pd.merge(
    (
        df_raked.loc[df_raked["race"].ne(0)]
        .groupby(["cause", "county"], as_index=False)["raked_value"]
        .sum()
    ),
    df_raked.loc[df_raked["race"].eq(0)],
    on=["cause", "county"],
)
# Sum over individual causes and races vs. the ("_all", 0) row, per county.
sum_over_cause_race = pd.merge(
    (
        df_raked.loc[df_raked["cause"].ne("_all") & df_raked["race"].ne(0)]
        .groupby(["county"], as_index=False)["raked_value"]
        .sum()
    ),
    df_raked.loc[df_raked["cause"].eq("_all") & df_raked["race"].eq(0)],
    on=["county"],
)
# Sum over individual races and all counties vs. the margins, per cause.
# The margins table is averaged within each cause first, so that there is
# one value per cause to compare with.
sum_over_race_county = pd.merge(
    (
        df_raked.loc[df_raked["race"].ne(0)]
        .groupby(["cause"], as_index=False)["raked_value"]
        .sum()
    ),
    (
        df_margin.groupby(["cause"], as_index=False)[
            "value_agg_over_race_county"
        ].mean()
    ),
    on=["cause"],
)

# One boolean per check, printed on a single line. Each entry is
# (merged frame, left column, right column); all use the same tolerance.
print(
    *(
        np.allclose(merged[left], merged[right], atol=1.0e-4)
        for merged, left, right in (
            (sum_over_cause, "raked_value_x", "raked_value_y"),
            (sum_over_race, "raked_value_x", "raked_value_y"),
            (sum_over_cause_race, "raked_value_x", "raked_value_y"),
            (
                sum_over_race_county,
                "raked_value",
                "value_agg_over_race_county",
            ),
        )
    )
)

True True True True
