## Simulation Design

### 1. Data Generating Process

- **Subjects**:  
  $ i = 1, \dots, N $

- **Time points**:  
  $ t = 1, \dots, T $

- **Time-invariant confounder**:  
  $ w_i \;\sim\; \mathcal{N}(0,1) $

- **Treatment assignment** $z_i$:  
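  - **Confounded scenario**  
    $
      \text{logit}\bigl[P(z_i=1\mid w_i)\bigr]
      = \alpha_0 + \alpha_1\,w_i
      \quad\Longrightarrow\quad
      z_i \sim \mathrm{Bernoulli}\bigl(\sigma(\alpha_0 + \alpha_1w_i)\bigr)
    $,  
    where $\sigma(x) = 1/(1+e^{-x})$ is the logistic function ($\alpha_0$ and $\alpha_1$ are the `eta0` and `eta1` arguments in the code).
  - **Unconfounded scenario**  
    $ z_i \sim \mathrm{Bernoulli}(p) $, independent of $w_i$ ($p$ is the `p` argument in the code).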

- **Linear predictor**  
  $
    \eta_{it}
    = \beta_0 \;+\; \beta_1\,z_i \;+\; \gamma\,w_i
  $

- **Outcome**  
  $
    Y_{it} \;\sim\; \mathrm{Poisson}\bigl(\exp(\eta_{it})\bigr)
  $

In [None]:
import numpy as np
import pandas as pd


def simulate_panel_data(
    N, T, beta0, beta1, gamma,
    confounded, eta0=0.0, eta1=1.0, p=0.5, seed=None
):
    """Generate a balanced Poisson panel with one time-invariant confounder.

    Every subject receives a standard-normal confounder ``w`` and a binary
    treatment ``z``; both are constant across that subject's ``T`` rows.
    The outcome of each subject-time row is drawn independently as
    ``y ~ Poisson(exp(beta0 + beta1 * z + gamma * w))``.

    Parameters
    ----------
    N : int
        Number of subjects.
    T : int
        Number of time points per subject.
    beta0, beta1, gamma : float
        Intercept, treatment coefficient and confounder coefficient of the
        log-mean of the outcome.
    confounded : bool
        If true, treatment depends on ``w`` through
        ``P(z = 1 | w) = 1 / (1 + exp(-(eta0 + eta1 * w)))``.
        Otherwise ``z ~ Bernoulli(p)`` regardless of ``w``.
    eta0, eta1 : float
        Intercept and slope of the treatment logit; only used when
        ``confounded`` is true.
    p : float
        Treatment probability; only used when ``confounded`` is false.
    seed : int or None
        When not ``None`` it is passed to ``np.random.seed``, which reseeds
        NumPy's global random state before any draw is made.

    Returns
    -------
    pandas.DataFrame
        ``N * T`` rows in subject-major order with columns ``i`` (subject
        index, from 0), ``t`` (time index, from 0), ``w``, ``z`` and ``y``.
    """
    if seed is not None:
        np.random.seed(seed)

    # The draw order (w, then z, then y) is what makes a seed reproducible;
    # keep it fixed.
    w = np.random.normal(size=N)
    if confounded:
        treat_prob = 1 / (1 + np.exp(-(eta0 + eta1 * w)))
    else:
        treat_prob = p
    z = np.random.binomial(1, treat_prob, size=N)

    # Expand the subject-level draws to long format in one vectorised step
    # (row k belongs to subject k // T) instead of a per-row Python lookup.
    w_long = np.repeat(w, T)
    z_long = np.repeat(z, T)

    eta = beta0 + beta1 * z_long + gamma * w_long
    y = np.random.poisson(lam=np.exp(eta))

    return pd.DataFrame({
        'i': np.repeat(np.arange(N), T),
        't': np.tile(np.arange(T), N),
        'w': w_long,
        'z': z_long,
        'y': y,
    })

## Estimators

1. **Longitudinal GEE estimator**  
   We fit a Poisson‐family GEE with exchangeable correlation on the full panel $\{Y_{it}\}$.  The treatment coefficient $\beta_1$ is estimated as the solution to the estimating equations  
   $
     U_n(\beta_1) \;=\;\sum_{i=1}^N\sum_{t=1}^T D_{it}^\top\bigl[Y_{it} - \exp(\beta_0 + \beta_1 z_i + \gamma w_i)\bigr]
     \;=\;0,
   $
   where $ D_{it} = \partial\,E[Y_{it}\mid z_i,w_i]/\partial\beta_1 = z_i\exp(\beta_0 + \beta_1 z_i + \gamma w_i) $.

   We denote the resulting estimate by $\hat\beta_1^{\rm GEE}$.

2. **Poisson GLM estimator**  
   At the final time point $t=T$, we fit
   $
     Y_{iT} \sim \mathrm{Poisson}\bigl(\exp(\beta_0 + \beta_1 z_i + \gamma w_i)\bigr)
     \quad\Longrightarrow\quad
     \hat\beta_1^{\rm GLM}.
   $


   This uses only the cross‐section at $t=T$, ignoring longitudinal correlation.

In [None]:
import statsmodels.api as sm
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.families import Poisson

def fit_estimators(df):
    """Estimate the coefficient of ``z`` with a panel GEE and a last-wave GLM.

    Both models use the formula ``y ~ z + w`` with a Poisson family.

    * The GEE is fitted on all rows of ``df``, grouped by column ``i``, with
      an exchangeable working correlation.
    * The GLM is fitted only on the rows whose ``t`` equals the largest
      ``t`` present in ``df``.

    Returns
    -------
    tuple
        ``(beta1_gee, p_gee, beta1_glm, p_glm)``: the ``z`` coefficient and
        its p-value from the GEE fit, followed by the same pair from the GLM.
    """
    formula = 'y ~ z + w'

    # Longitudinal fit on the full panel.
    # NOTE: sm.cov_struct.Independence() is an alternative working structure.
    gee_fit = GEE.from_formula(
        formula,
        groups='i',
        data=df,
        family=Poisson(),
        cov_struct=sm.cov_struct.Exchangeable(),
    ).fit()

    # Cross-sectional fit on the last observed time point only.
    last_wave = df[df['t'] == df['t'].max()]
    glm_fit = sm.GLM.from_formula(
        formula, data=last_wave, family=Poisson()
    ).fit()

    return (
        gee_fit.params['z'],
        gee_fit.pvalues['z'],
        glm_fit.params['z'],
        glm_fit.pvalues['z'],
    )


In [None]:
def simulate(
    sample_sizes, T, beta0, betas, gamma,
    confounded, eta0, eta1, p,
    n_sims=1000, alphas=(0.05, 0.01)
):
    """Monte Carlo rejection rates of the GEE and GLM tests of the z effect.

    For every combination of a sample size in ``sample_sizes`` and a
    treatment coefficient in ``betas``, ``n_sims`` data sets are generated
    with ``simulate_panel_data`` (replicate ``k`` uses ``seed=k``, so the
    same seeds are reused in every combination) and analysed with
    ``fit_estimators``. A test rejects at level ``alpha`` when its p-value
    is strictly below ``alpha``.

    Returns
    -------
    pandas.DataFrame
        One row per (N, beta1, alpha, method) with columns ``N``, ``beta1``,
        ``method`` (``'GEE'`` or ``'GLM'``), ``alpha`` and ``rej_rate``
        (rejections divided by ``n_sims``). With ``beta1 == 0`` the rate is
        an empirical type I error; otherwise it is an empirical power.
    """
    methods = ('GEE', 'GLM')
    rows = []

    for n_subjects in sample_sizes:
        for effect in betas:
            # rejections[method][level] -> number of rejections so far
            rejections = {m: {level: 0 for level in alphas} for m in methods}

            for rep in range(n_sims):
                panel = simulate_panel_data(
                    n_subjects, T, beta0, effect, gamma,
                    confounded, eta0, eta1, p, seed=rep,
                )
                _, p_gee, _, p_glm = fit_estimators(panel)
                pvalues = {'GEE': p_gee, 'GLM': p_glm}
                for level in alphas:
                    for m in methods:
                        if pvalues[m] < level:
                            rejections[m][level] += 1

            # Turn the counts into proportions: GEE row first, then GLM,
            # for each level in turn.
            rows.extend(
                {
                    'N': n_subjects,
                    'beta1': effect,
                    'method': m,
                    'alpha': level,
                    'rej_rate': rejections[m][level] / n_sims,
                }
                for level in alphas
                for m in methods
            )

    return pd.DataFrame(rows)

In [None]:
import numpy as np
import pandas as pd

# Simulation grid for the confounded design:
#   beta1 = 0.0       -> rejection rate estimates the type I error
#   beta1 = 0.1, 0.2  -> rejection rate estimates the power at increasing
#                        treatment effect sizes
#   N = 50, 200       -> small- and large-sample settings
df_results = simulate(
    sample_sizes=[50, 200],
    T=5,
    beta0=1.0,
    betas=[0.0, 0.1, 0.2],
    gamma=0.5,
    confounded=True,
    eta0=0.0,
    eta1=1.0,
    p=0.5,  # only read by the unconfounded branch of simulate_panel_data
    n_sims=1000,
    alphas=(0.05, 0.01),  # two nominal significance levels
)


In [None]:
# Rows simulated under the null (beta1 == 0): their rej_rate is the empirical
# type I error. Only beta1 is filtered on, so both GEE and GLM rows are kept.
type1_gee = df_results.loc[df_results['beta1'].eq(0.0)]
type1_gee

In [None]:
# Rows simulated with a positive treatment effect (beta1 > 0): their rej_rate
# is the empirical power, reported for both GEE and GLM.
type2_gee_glm = df_results.loc[df_results['beta1'].gt(0)]
type2_gee_glm