# Getting started with the experimental version of raking

Let us start by importing the classes needed to run the raking: `DataBuilder`, which prepares the input data, and `DualSolver`, which solves the raking problem.

In [1]:
import numpy as np
import pandas as pd
from raking.experimental import DataBuilder
from raking.experimental import DualSolver

Modify the path below so that it points to the directory where you cloned the GitHub repository.

In [2]:
# Root directory of the local clone of the raking repository. Keep the trailing
# slash: the example file paths below are built by plain string concatenation.
YOUR_PATH = "/Users/ducela/Documents/Raking/ihmeuw-msca/raking/"

## Examples

In these examples, we only want the raked values. The experimental version of the raking package does not support uncertainty propagation yet.

### 1D example

In [3]:
# Load the 1D example. Observations receive a unit weight; the margin row is
# tagged with var1 = -1, given an infinite weight, and its value column is
# renamed so that both tables share the same schema before being stacked.
df_obs = pd.read_csv(
    YOUR_PATH + "tests/examples/example_1D/observations.csv"
).assign(weights=1.0)
df_margin = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_1D/margin.csv")
    .assign(var1=-1, weights=np.inf)
    .rename(columns={"value_agg_over_var1": "value"})
)
df = pd.concat([df_obs, df_margin])

In [4]:
# Describe the problem to the builder: one dimension (var1, mapped to -1, the
# code the margin row carries in that column) plus the names of the value and
# weight columns, then assemble the raking inputs from the stacked table.
data_builder = DataBuilder(dim_specs={"var1": -1}, value="value", weights="weights")
data = data_builder.build(df)

In [5]:
# Run the dual solver with the entropic distance; solve() hands back the table
# whose `soln` column is checked against the margin further down.
solver = DualSolver(data=data, distance="entropic")
df_raked = solver.solve()

In [6]:
# Inspect the optimizer report stored on the solver: the fields displayed below
# include the convergence message, the success flag, the final objective value
# (fun), the solution vector (x) and the iteration / evaluation counts.
solver.result

  message: CONVERGENCE: RELATIVE REDUCTION OF F <= FACTR*EPSMCH
  success: True
   status: 0
      fun: 6.9408010589989875
        x: [-9.961e-04]
      nit: 3
      jac: [ 1.084e-11]
     nfev: 5
     njev: 5
 hess_inv: <1x1 LbfgsInvHessProduct with dtype=float64>

In [7]:
# Sanity check: the raked values must sum to the (single) margin value
raked_total = df_raked["soln"].sum()
margin_total = df_margin["value"].iloc[0]
print(np.allclose(raked_total, margin_total))

True


### 2D example

In [8]:
# Load the 2D example. Observations receive a unit weight; each margins table
# is tagged with -1 in the dimension it aggregates over, given an infinite
# weight, and has its value column renamed to match the observations.
df_obs = pd.read_csv(
    YOUR_PATH + "tests/examples/example_2D/observations.csv"
).assign(weights=1.0)
df_margins_1 = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_2D/margins_1.csv")
    .assign(var1=-1, weights=np.inf)
    .rename(columns={"value_agg_over_var1": "value"})
)
df_margins_2 = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_2D/margins_2.csv")
    .assign(var2=-1, weights=np.inf)
    .rename(columns={"value_agg_over_var2": "value"})
)
df = pd.concat([df_obs, df_margins_1, df_margins_2])

In [9]:
# Describe the problem to the builder: two dimensions, each mapped to -1 (the
# code the margin rows carry in the aggregated column), plus the names of the
# value and weight columns; then assemble the raking inputs.
data_builder = DataBuilder(
    dim_specs={"var1": -1, "var2": -1}, value="value", weights="weights"
)
data = data_builder.build(df)

In [10]:
# Run the dual solver with the entropic distance; solve() hands back the table
# whose `soln` column is checked against the margins further down.
solver = DualSolver(data=data, distance="entropic")
df_raked = solver.solve()

In [11]:
# Inspect the optimizer report stored on the solver: the fields displayed below
# include the convergence message, the success flag, the final objective value
# (fun), the solution vector (x) and the iteration / evaluation counts.
solver.result

  message: CONVERGENCE: RELATIVE REDUCTION OF F <= FACTR*EPSMCH
  success: True
   status: 0
      fun: 37.92727783633624
        x: [-4.717e-03 -1.792e-02 -1.830e-02 -5.281e-03  3.278e-02
             2.807e-02  1.769e-02]
      nit: 9
      jac: [-3.106e-07  3.427e-07  2.581e-07 -3.306e-07  1.547e-07
            -7.824e-07  1.385e-06]
     nfev: 12
     njev: 12
 hess_inv: <7x7 LbfgsInvHessProduct with dtype=float64>

In [12]:
# Sanity check: collapse the raked table along each dimension in turn, line the
# totals up with the corresponding margins, and compare them (one flag per margin)
sum_over_var1 = (
    df_raked.groupby(["var2"])["soln"]
    .sum()
    .reset_index()
    .merge(df_margins_1, on="var2")
)
sum_over_var2 = (
    df_raked.groupby(["var1"])["soln"]
    .sum()
    .reset_index()
    .merge(df_margins_2, on="var1")
)
print(
    *(
        np.allclose(totals["soln"], totals["value"])
        for totals in (sum_over_var1, sum_over_var2)
    )
)

True True


### 3D example

In [13]:
# Load the 3D example. Observations receive a unit weight; each margins table
# is tagged with -1 in the dimension it aggregates over, given an infinite
# weight, and has its value column renamed to match the observations.
df_obs = pd.read_csv(
    YOUR_PATH + "tests/examples/example_3D/observations.csv"
).assign(weights=1.0)
df_margins_1 = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_3D/margins_1.csv")
    .assign(var1=-1, weights=np.inf)
    .rename(columns={"value_agg_over_var1": "value"})
)
df_margins_2 = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_3D/margins_2.csv")
    .assign(var2=-1, weights=np.inf)
    .rename(columns={"value_agg_over_var2": "value"})
)
df_margins_3 = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_3D/margins_3.csv")
    .assign(var3=-1, weights=np.inf)
    .rename(columns={"value_agg_over_var3": "value"})
)
df = pd.concat([df_obs, df_margins_1, df_margins_2, df_margins_3])

In [14]:
# Describe the problem to the builder: three dimensions, each mapped to -1 (the
# code the margin rows carry in the aggregated column), plus the names of the
# value and weight columns; then assemble the raking inputs.
data_builder = DataBuilder(
    dim_specs={"var1": -1, "var2": -1, "var3": -1},
    value="value",
    weights="weights",
)
data = data_builder.build(df)

In [15]:
# Run the dual solver with the entropic distance; solve() hands back the table
# whose `soln` column is checked against the margins further down.
solver = DualSolver(data=data, distance="entropic")
df_raked = solver.solve()

In [16]:
# Inspect the optimizer report stored on the solver: the fields displayed below
# include the convergence message, the success flag, the final objective value
# (fun), the solution vector (x) and the iteration / evaluation counts.
solver.result

  message: CONVERGENCE: RELATIVE REDUCTION OF F <= FACTR*EPSMCH
  success: True
   status: 0
      fun: 150.7699855612442
        x: [-4.606e-02 -9.952e-03 ...  2.920e-02  3.608e-02]
      nit: 26
      jac: [-7.671e-07  3.663e-06 ...  2.683e-06  1.457e-05]
     nfev: 30
     njev: 30
 hess_inv: <36x36 LbfgsInvHessProduct with dtype=float64>

In [17]:
# Sanity check: collapse the raked table along each dimension in turn, line the
# totals up with the corresponding margins, and compare them (one flag per margin)
sum_over_var1 = (
    df_raked.groupby(["var2", "var3"])["soln"]
    .sum()
    .reset_index()
    .merge(df_margins_1, on=["var2", "var3"])
)
sum_over_var2 = (
    df_raked.groupby(["var1", "var3"])["soln"]
    .sum()
    .reset_index()
    .merge(df_margins_2, on=["var1", "var3"])
)
sum_over_var3 = (
    df_raked.groupby(["var1", "var2"])["soln"]
    .sum()
    .reset_index()
    .merge(df_margins_3, on=["var1", "var2"])
)
print(
    *(
        np.allclose(totals["soln"], totals["value"])
        for totals in (sum_over_var1, sum_over_var2, sum_over_var3)
    )
)

True True True


### USHD example

In [18]:
# Load the USHD example and bring both tables to a common integer coding.
# Observations: unit weight, cause '_all' and race 1 recoded to -1 (presumably
# the all-cause / all-race aggregates -- confirm against the data), the 'upper'
# column dropped, and the remaining cause labels mapped to 1..3.
df_obs = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_USHD/observations.csv")
    .assign(weights=1.0)
    .replace({"cause": "_all", "race": 1}, -1)
    .drop(columns=["upper"])
    .replace({"cause": {"_comm": 1, "_inj": 2, "_ncd": 3}})
)
# Margins: aggregated over race and county (both set to -1), infinite weight,
# value column renamed, and cause labels mapped with the same integer codes.
df_margin = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_USHD/margins.csv")
    .assign(race=-1, county=-1, weights=np.inf)
    .rename(columns={"value_agg_over_race_county": "value"})
    .replace({"cause": {"_all": -1, "_comm": 1, "_inj": 2, "_ncd": 3}})
)
# Stack both tables and force the cause column to a plain integer dtype
df = pd.concat([df_obs, df_margin]).astype({"cause": "int64"})

In [19]:
# Describe the problem to the builder: cause, race and county are the three
# dimensions, each mapped to -1 (the code used above for aggregated rows), plus
# the names of the value and weight columns; then assemble the raking inputs.
data_builder = DataBuilder(
    dim_specs={"cause": -1, "race": -1, "county": -1},
    value="value",
    weights="weights",
)
data = data_builder.build(df)

In [20]:
# Run the dual solver with the entropic distance; solve() hands back the table
# whose `soln` column is checked against the margins further down.
solver = DualSolver(data=data, distance="entropic")
df_raked = solver.solve()

In [21]:
# Inspect the optimizer report stored on the solver: the fields displayed below
# include the convergence message, the success flag, the final objective value
# (fun), the solution vector (x) and the iteration / evaluation counts.
solver.result

  message: CONVERGENCE: RELATIVE REDUCTION OF F <= FACTR*EPSMCH
  success: True
   status: 0
      fun: 159.6193588261129
        x: [ 5.457e-02  1.679e-02 ... -4.723e-01 -5.950e-01]
      nit: 125
      jac: [-5.167e-05 -2.049e-05 ... -1.178e-05  1.735e-06]
     nfev: 134
     njev: 134
 hess_inv: <30x30 LbfgsInvHessProduct with dtype=float64>

In [22]:
# Sanity check: total the raked values per cause and compare them with the
# cause-level margins, using an absolute tolerance of 1e-5
sum_over_race_county = (
    df_raked.groupby(["cause"], observed=True)["soln"]
    .sum()
    .reset_index()
    .merge(df_margin, on=["cause"])
)
raked_by_cause = sum_over_race_county["soln"]
margin_by_cause = sum_over_race_county["value"]
print(np.allclose(raked_by_cause, margin_by_cause, atol=1.0e-5))

True


### USHD lower example

In [23]:
# Load the USHD lower-level example and bring all tables to a common coding.
# The same label -> integer mapping is applied to observations and cause margins.
cause_codes = {"_intent": 1, "_unintent": 2, "inj_trans": 3}

# Observations: unit weight, race 1 recoded to -1 (presumably the all-race
# aggregate -- confirm against the data), 'upper' dropped, causes mapped to integers.
df_obs = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_USHD_lower/observations.csv")
    .assign(weights=1.0)
    .replace({"race": 1}, -1)
    .drop(columns=["upper"])
    .replace({"cause": cause_codes})
)
# Margins by cause: aggregated over race and county
df_margin_cause = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_USHD_lower/margins_cause.csv")
    .assign(race=-1, county=-1, weights=np.inf)
    .rename(columns={"value_agg_over_race_county": "value"})
    .replace({"cause": cause_codes})
)
# Margins by county: aggregated over cause and race
df_margin_county = (
    pd.read_csv(YOUR_PATH + "tests/examples/example_USHD_lower/margins_county.csv")
    .assign(cause=-1, race=-1, weights=np.inf)
    .rename(columns={"value_agg_over_cause_race": "value"})
)
# Margins aggregated over cause only
df_margin_all_causes = (
    pd.read_csv(
        YOUR_PATH + "tests/examples/example_USHD_lower/margins_all_causes.csv"
    )
    .assign(cause=-1, weights=np.inf)
    .rename(columns={"value_agg_over_cause": "value"})
)
# Stack everything and force the cause column to a plain integer dtype
df = pd.concat(
    [df_obs, df_margin_cause, df_margin_county, df_margin_all_causes]
).astype({"cause": "int64"})

In [24]:
# Describe the problem to the builder: cause, race and county are the three
# dimensions, each mapped to -1 (the code used above for aggregated rows), plus
# the names of the value and weight columns; then assemble the raking inputs.
data_builder = DataBuilder(
    dim_specs={"cause": -1, "race": -1, "county": -1},
    value="value",
    weights="weights",
)
data = data_builder.build(df)

In [25]:
# Run the dual solver with the entropic distance; solve() hands back the table
# whose `soln` column is checked against the margins further down.
solver = DualSolver(data=data, distance="entropic")
df_raked = solver.solve()

In [26]:
# Inspect the optimizer report stored on the solver: the fields displayed below
# include the convergence message, the success flag, the final objective value
# (fun), the solution vector (x) and the iteration / evaluation counts.
solver.result

  message: CONVERGENCE: RELATIVE REDUCTION OF F <= FACTR*EPSMCH
  success: True
   status: 0
      fun: 4.6554862448094285
        x: [ 1.410e-02  3.647e-02 ... -4.615e-01 -6.325e-01]
      nit: 133
      jac: [ 5.744e-07  5.623e-07 ... -2.453e-06 -1.261e-06]
     nfev: 140
     njev: 140
 hess_inv: <27x27 LbfgsInvHessProduct with dtype=float64>

In [27]:
# Sanity check: total the raked values at each aggregation level and compare
# them with the matching margins table (one flag per margin, in the order
# all-causes, county, cause), each with its own absolute tolerance
sum_over_cause = (
    df_raked.groupby(["race", "county"], observed=True)["soln"]
    .sum()
    .reset_index()
    .merge(df_margin_all_causes, on=["race", "county"])
)
sum_over_cause_race = (
    df_raked.groupby(["county"], observed=True)["soln"]
    .sum()
    .reset_index()
    .merge(df_margin_county, on=["county"])
)
sum_over_race_county = (
    df_raked.groupby(["cause"], observed=True)["soln"]
    .sum()
    .reset_index()
    .merge(df_margin_cause, on=["cause"])
)
print(
    *(
        np.allclose(totals["soln"], totals["value"], atol=tolerance)
        for totals, tolerance in (
            (sum_over_cause, 1.0e-4),
            (sum_over_cause_race, 1.0e-4),
            (sum_over_race_county, 1.0e-5),
        )
    )
)

True True True
