In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import random
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm  # Import tqdm for the progress bar

In [2]:

def generate_staggered_did_data(
    n_units=200,
    num_x_covariates=5,
    num_pre_periods=5,
    num_post_periods=5,
    linearity_degree=1, # 1: fully linear, 2: half X non-linear, 3: treatment + all X non-linear
    pre_trend_bias_delta=0.2,
    epsilon_scale=1,
    seed=42
):
    """
    Generates panel data for Difference-in-Differences analysis with staggered adoption,
    controllable pre-trends, and non-linearity.

    Creates 4 groups:
    - Group 0: Never treated (Control)
    - Group 1: Treated starting at num_pre_periods (First Treatment Time)
    - Group 2: Treated starting at num_pre_periods + 1
    - Group 3: Treated starting at num_pre_periods + 2

    Args:
        n_units (int): Total number of units (e.g., individuals, firms). Should be divisible by 4 ideally.
        num_x_covariates (int): Number of control covariates (X) (not counting W1 and W2).
        num_pre_periods (int): Number of periods before the *earliest* treatment.
        num_post_periods (int): Number of periods after the *earliest* treatment.
        linearity_degree (int): Degree of linearity in the DGP:
            1: Fully linear.
            2: Half of X covariates have non-linear relationship with Y.
            3: Treatment and half of X covariates have non-linear relationship with Y.
            4: Treatment and all X covariates have non-linear relationship with Y.
        pre_trend_bias_delta (float): Bias parameter to induce pre-trends in eventually treated groups.
        epsilon_scale (float): Standard deviation of the error term.
        seed (int): Random seed for reproducibility.

    Returns:
        pd.DataFrame: Generated panel data in long format. Includes columns:
            'unit_id': Unique identifier for each unit.
            'time': Time period index.
            'treatment_group': Integer indicating the unit's group (0: Control, 1: T0, 2: T0+1, 3: T0+2).
            'first_treat_period': The period when treatment starts for the unit (np.inf for control).
            'eventually_treated': Binary indicator (1 if unit belongs to groups 1, 2, or 3, 0 otherwise).
            'D': Binary treatment indicator (1 if unit is treated *in the current period*, 0 otherwise).
            'X_1', 'X_2', ...: Covariates.
            'Y': Outcome variable.
            'CATE': True Conditional Average Treatment Effect for the unit-period.
            'epsilon': Error term component.
            'time_trend': Linear time trend index.
    """
    np.random.seed(seed)

    # --- Set treatment_effect_beta based on linearity_degree ---
    if linearity_degree == 1 or linearity_degree == 2:
        treatment_effect_beta = 0
    elif linearity_degree == 3:
        treatment_effect_beta = 0
    else:
        print(f"Warning: linearity_degree ({linearity_degree}) has an unexpected value. Setting treatment_effect_beta to NaN.")
        treatment_effect_beta = np.nan

    periods = num_pre_periods + num_post_periods
    unit_ids = np.arange(n_units)
    time_periods = np.arange(periods)

    # Create base data frame
    data = pd.DataFrame({
        'unit_id': np.repeat(unit_ids, periods),
        'time': np.tile(time_periods, n_units)
    })

    # --- Staggered Treatment Assignment ---
    # Divide units into 4 roughly equal groups
    shuffled_unit_ids = np.random.permutation(unit_ids)
    group_size = n_units // 4
    group_assignments = {}
    group_assignments[0] = shuffled_unit_ids[0 * group_size : 1 * group_size] # Control
    group_assignments[1] = shuffled_unit_ids[1 * group_size : 2 * group_size] # Treat at T0
    group_assignments[2] = shuffled_unit_ids[2 * group_size : 3 * group_size] # Treat at T0 + 1
    # Assign remaining units (if n_units % 4 != 0) to the last group
    group_assignments[3] = shuffled_unit_ids[3 * group_size :] # Treat at T0 + 2

    # Map unit_id to treatment group
    unit_to_group = {}
    for group_id, units_in_group in group_assignments.items():
        for unit in units_in_group:
            unit_to_group[unit] = group_id
    data['treatment_group'] = data['unit_id'].map(unit_to_group)

    # Determine the first treatment period for each unit
    earliest_treatment_period = num_pre_periods # Period when the *first* group gets treated (Group 1)
    conditions = [
        data['treatment_group'] == 0,
        data['treatment_group'] == 1,
        data['treatment_group'] == 2,
        data['treatment_group'] == 3
    ]
    choices = [
        np.inf, # Never treated
        earliest_treatment_period,
        earliest_treatment_period + 1,
        earliest_treatment_period + 2
    ]
    data['first_treat_period'] = np.select(conditions, choices, default=np.nan)

    # Indicator for being *eventually* treated (used for pre-trend bias)
    data['eventually_treated'] = (data['treatment_group'] > 0).astype(int)
    data['post_treatment'] = (data['time'] >= num_pre_periods).astype(int)

    # Dynamic treatment indicator 'D': 1 if treated in the current period, 0 otherwise
    data['D'] = (data['time'] >= data['first_treat_period']).astype(int)

    # --- Covariates ---
    data['time_trend'] = data['time'] # Simple linear time trend

    # Generate X covariates
    X_numeric = np.random.normal(0, 1, size=(len(data), num_x_covariates))
    bernoulli_values = np.random.binomial(n=1, p=0.66, size=len(data))
    categories = [1, 2, 3, 4]
    probabilities = [0.3, 0.1, 0.2, 0.4]
    categorical_values = np.random.choice(categories, size=len(data), p=probabilities)

    # Combine all covariates into a single matrix X for easier processing later
    X = np.column_stack((bernoulli_values, X_numeric, categorical_values))

    # Add covariates to the DataFrame with names X_1, X_2, ...
    total_covariates = num_x_covariates + 2
    for i in range(total_covariates):
        data[f'X_{i+1}'] = X[:, i]

    # --- Generate Outcome Variable (Y) ---
    data['epsilon'] = np.random.normal(scale=epsilon_scale, size=len(data))

    # DGP parameters
    beta_0 = -0.5 # Intercept
    beta_group_effect = 0.75 # Main effect of treated group (alpha_i)
    beta_time = 0.2 # Main effect of time trend (gamma_t)
    beta_interaction = treatment_effect_beta # Treatment effect magnitude
    # Ensure beta_x has the correct length
    beta_x = np.array([-0.75, 0.5, -0.5, -1.30, 1.8, 2.5, -1.0])[:total_covariates] # Adjust length if num_x_covariates changes

    # Baseline Y components (common across linearity degrees)
    Y_base = (beta_0 +
              beta_group_effect * data['eventually_treated'] + # Group fixed effect for those eventually treated
              beta_time * data['time_trend']) # Common time trend

    # Covariate effects
    if linearity_degree == 1:
        linear_x_contribution = np.sum([beta_x[i] * data[f'X_{i+1}'] for i in range(total_covariates)], axis=0)
        Y_covariates = linear_x_contribution
        Y_treatment = beta_interaction * data['D']
        data['CATE'] = beta_interaction * data['D']

    elif linearity_degree == 2:
        half = total_covariates // 2
        cov_effect = (np.sum(beta_x[:int(half/2)] * (X[:, :int(half/2)] ** 2),axis=1) +
                      np.sum(beta_x[int(half/2):half] * np.exp(X[:, int(half/2):half]),axis=1)+
                      np.sum(beta_x[half:] * X[:, half:],axis=1))
        Y_covariates = cov_effect
        Y_treatment = beta_interaction * data['D']
        data['CATE'] = beta_interaction * data['D']

    elif linearity_degree == 3:
        half = total_covariates // 2
        # Non-linear time trend and covariates
        Y_base = (beta_0 +
                  beta_group_effect * data['eventually_treated'] +
                  beta_time * data['time_trend']**2) # Non-linear time trend
        cov_effect = (np.sum(beta_x[:int(half/2)] * (X[:, :int(half/2)] ** 2),axis=1) +
                      np.sum(beta_x[int(half/2):half] * np.exp(X[:, int(half/2):half]),axis=1)+
                      np.sum(beta_x[half:half+int(half/2)] * np.abs(X[:, half:half+int(half/2)]),axis=1) +
                      np.sum(beta_x[half+int(half/2):] * np.sqrt(np.abs(X[:, half+int(half/2):])),axis=1))
        Y_covariates = cov_effect
        # Linear treatment effect (as per original code structure for degree 4)
        Y_treatment = beta_interaction * data['D']
        data['CATE'] = beta_interaction * data['D']
    else: # Handle unexpected linearity_degree
         Y_covariates = 0
         Y_treatment = 0
         data['CATE'] = 0

    data['Y'] = Y_base + Y_covariates + Y_treatment

    # --- Add pre-trend bias ---
    # Apply bias to *eventually treated* units during the pre-period *relative to the first treatment time*
    if pre_trend_bias_delta != 0:
        # Apply bias only before the *earliest* treatment period
        pre_period_mask = data['time'] < earliest_treatment_period
        # Apply bias only to units that will eventually be treated
        bias_mask = pre_period_mask & (data['eventually_treated'] == 1)

        if linearity_degree == 3: # Non-linear pre-trend (e.g., seasonal)
            seasonal_amplitude = 1.0
            seasonal_period = 4
            seasonal_effect = seasonal_amplitude * np.sin(2 * np.pi * data['time'] / seasonal_period)
            data.loc[bias_mask, 'Y'] += pre_trend_bias_delta * seasonal_effect[bias_mask]
        else: # Linear pre-trend bias
             # Difference relative to the earliest treatment time
            time_diff = data['time'] - earliest_treatment_period
            data.loc[bias_mask, 'Y'] += pre_trend_bias_delta * time_diff[bias_mask]

    # Add final error term
    data['Y'] += data['epsilon']

    # Remove intermediate columns if desired, or keep for clarity
    # data = data.drop(columns=['epsilon'])

    return data


In [3]:
def find_first_treatment_indexes_array(df, min_time=4, eventually_treated=1):
    """
    Finds the indexes of the first row for each treatment group (0, 1, 2, 3)
    after filtering the DataFrame by time and eventually_treated, and returns them as a NumPy array.

    Args:
        df: The pandas DataFrame.
        min_time: The minimum time value.
        eventually_treated: The desired eventually_treated value.

    Returns:
        A NumPy array containing the first row indexes for each treatment group (0, 1, 2, 3),
        or None if no rows meet the criteria. Returns -1 if a treatment group does not appear in the filtered data.
    """

    filtered_df = df[(df['time'] >= min_time) & (df['eventually_treated'] == eventually_treated)]

    if filtered_df.empty:
        return None  # Return None if no rows match the time and eventually_treated criteria.

    indexes = []
    for group in [1, 2, 3]:
        group_df = filtered_df[filtered_df['treatment_group'] == group]
        if not group_df.empty:
            indexes.append(group_df.index[0])  # Get the first index
        else:
            indexes.append(-1) #Return -1 if the treatment group does not appear in the filtered data.

    return np.array(indexes)

In [4]:
def calculate_error_metrics_grouped_hybrid( # Renamed slightly for clarity
    true_ATE,
    estimated_ATE,
    accumulated_p_values,
    suffix=""
    ):
    """
    Calculates both per-iteration and summary metrics (RMSE, MAE, MAPE).

    1. Saves per-iteration results to an Excel file named
       "BCF_GATE_and_PValues{suffix}.xlsx". The file has multiple sheets,
       with each row representing a simulation iteration:
       - 'Overall_Metrics': Contains overall RMSE, MAE, MAPE per iteration.
       - 'Group_X': One sheet per group, containing the group's RMSE, MAE, MAPE
                    per iteration, alongside the raw p-values for that group.

    2. Returns dictionaries containing summary statistics (mean and standard
       deviation of metrics aggregated across all iterations).

    Args:
        true_ATE: A numpy array of shape (num_iterations, num_post_periods, number_of_groups)
                  containing the true ATE values.
        estimated_ATE: A numpy array of shape (num_iterations, num_post_periods, number_of_groups)
                       containing the estimated ATE values.
        accumulated_p_values: A numpy array of shape (num_iterations, num_post_periods, number_of_groups)
                              containing the p-values.
        suffix (str): An optional suffix to append to the base filename
                      "BCF_GATE_and_PValues". Defaults to "".

    Returns:
        A tuple containing:
            - overall_metrics: Dictionary with overall summary statistics
              (mean RMSE, mean MAE, mean MAPE, std RMSE, std MAE, std MAPE).
            - per_group_metrics: Dictionary where keys are group indices and
              values are dictionaries with summary statistics for that group.
    """

    # Input validation
    if not (true_ATE.shape == estimated_ATE.shape == accumulated_p_values.shape):
        raise ValueError("Shapes of true_ATE, estimated_ATE, and accumulated_p_values must match.")
    if true_ATE.ndim != 3:
         raise ValueError("Input arrays must have 3 dimensions: (iterations, time, groups).")

    num_iterations, num_post_periods, number_of_groups = true_ATE.shape
    iteration_index = pd.RangeIndex(num_iterations, name='Iteration')

    # Construct filename using suffix
    filename = f"OLS_GATE_and_PValues{suffix}.xlsx"

    # Calculate element-wise errors
    errors = estimated_ATE - true_ATE # Shape: (iterations, time, groups)
    abs_errors = np.abs(errors)       # Shape: (iterations, time, groups)

    # --- Calculate PER-ITERATION Metrics (used for both Excel and summaries) ---

    # Overall per-iteration metrics
    overall_rmse_per_iteration = np.sqrt(np.mean(errors**2, axis=(1, 2)))
    overall_mae_per_iteration = np.mean(abs_errors, axis=(1, 2))
    overall_mape_per_iteration = np.zeros(num_iterations) * np.nan
    for i in range(num_iterations):
         true_ate_i = true_ATE[i, :, :]
         errors_i = errors[i, :, :]
         valid_mask_i = true_ate_i != 0
         if np.any(valid_mask_i):
             abs_perc_errors_i = np.abs(errors_i[valid_mask_i] / true_ate_i[valid_mask_i])
             overall_mape_per_iteration[i] = np.mean(abs_perc_errors_i) * 100

    # Create Overall DataFrame for Excel
    df_overall = pd.DataFrame({
        'Overall_RMSE': overall_rmse_per_iteration,
        'Overall_MAE': overall_mae_per_iteration,
        'Overall_MAPE': overall_mape_per_iteration
    }, index=iteration_index)

    # --- Calculate SUMMARY Overall Metrics (for return value) ---
    overall_rmse = np.sqrt(np.mean(errors**2))
    overall_mae = np.mean(abs_errors)
    valid_mape_mask = true_ATE != 0
    abs_perc_errors = np.full_like(errors, fill_value=np.nan)
    abs_perc_errors[valid_mape_mask] = np.abs(errors[valid_mape_mask] / true_ATE[valid_mape_mask])
    overall_mape = np.nanmean(abs_perc_errors) * 100

    summary_overall_std_rmse = np.nanstd(overall_rmse_per_iteration)
    summary_overall_std_mae = np.nanstd(overall_mae_per_iteration)
    summary_overall_std_mape = np.nanstd(overall_mape_per_iteration)

    overall_metrics = { # Dictionary for return value
        "Overall_RMSE": overall_rmse,
        "Overall_MAE": overall_mae,
        "Overall_MAPE": overall_mape,
        "Overall_Std_RMSE": summary_overall_std_rmse,
        "Overall_Std_MAE": summary_overall_std_mae,
        "Overall_Std_MAPE": summary_overall_std_mape,
    }

    # --- Process Per-Group Data (for both Excel and summaries) ---
    group_combined_dfs = {} # For Excel sheets
    per_group_metrics = {}  # For return value summaries

    for g in range(number_of_groups):
        # Slice data for the current group
        true_ATE_g = true_ATE[:, :, g]  # Shape: (iterations, time)
        errors_g = errors[:, :, g]      # Shape: (iterations, time)
        abs_errors_g = abs_errors[:, :, g]# Shape: (iterations, time)
        p_values_g = accumulated_p_values[:, :, g] # Shape: (iterations, time)

        # Calculate per-iteration metrics for group g
        group_rmse_per_iter = np.sqrt(np.mean(errors_g**2, axis=1))
        group_mae_per_iter = np.mean(abs_errors_g, axis=1)
        group_mape_per_iter = np.zeros(num_iterations) * np.nan
        for i in range(num_iterations):
            true_ate_gi = true_ATE_g[i, :]
            errors_gi = errors_g[i, :]
            valid_mask_gi = true_ate_gi != 0
            if np.any(valid_mask_gi):
                abs_perc_errors_gi = np.abs(errors_gi[valid_mask_gi] / true_ate_gi[valid_mask_gi])
                group_mape_per_iter[i] = np.mean(abs_perc_errors_gi) * 100

        # --- Calculate SUMMARY Stats for Group g (for return dict) ---
        group_rmse = np.sqrt(np.mean(errors_g**2))
        group_mae = np.mean(abs_errors_g)
        valid_mape_mask_g = true_ATE_g != 0
        abs_perc_errors_g = np.full_like(errors_g, fill_value=np.nan)
        abs_perc_errors_g[valid_mape_mask_g] = np.abs(errors_g[valid_mape_mask_g] / true_ATE_g[valid_mape_mask_g])
        group_mape = np.nanmean(abs_perc_errors_g) * 100

        summary_group_std_rmse = np.nanstd(group_rmse_per_iter)
        summary_group_std_mae = np.nanstd(group_mae_per_iter)
        summary_group_std_mape = np.nanstd(group_mape_per_iter)

        per_group_metrics[g] = { # Populate return dictionary for group g
             f"Group_{g}_RMSE": group_rmse,
             f"Group_{g}_MAE": group_mae,
             f"Group_{g}_MAPE": group_mape,
             f"Group_{g}_Std_RMSE": summary_group_std_rmse,
             f"Group_{g}_Std_MAE": summary_group_std_mae,
             f"Group_{g}_Std_MAPE": summary_group_std_mape,
        }

        # --- Create DataFrames for Excel Sheet for Group g ---
        df_metrics_g = pd.DataFrame({
            f'Group_{g}_RMSE': group_rmse_per_iter, # Use the per-iter arrays
            f'Group_{g}_MAE': group_mae_per_iter,
            f'Group_{g}_MAPE': group_mape_per_iter
        }, index=iteration_index)

        p_value_columns = [f'PValue_Time_{t}' for t in range(num_post_periods)]
        df_pvals_g = pd.DataFrame(p_values_g,
                                  index=iteration_index,
                                  columns=p_value_columns)

        # Combine metrics and p-values for the group's Excel sheet
        group_combined_dfs[g] = pd.concat([df_metrics_g, df_pvals_g], axis=1)


    # --- Write Per-Iteration Data to Excel ---
    try:
        with pd.ExcelWriter(filename) as writer:
            # Write Overall Metrics Sheet
            df_overall.to_excel(writer, sheet_name='Overall_Metrics', header=True, index=True)
            # Write Per-Group Sheets
            for g in range(number_of_groups):
                sheet_name_g = f'Group_{g}'
                group_combined_dfs[g].to_excel(writer, sheet_name=sheet_name_g, header=True, index=True)

        print(f"Per-iteration metrics and p-values successfully saved to '{filename}'")

    except Exception as e:
        print(f"Error saving per-iteration metrics to Excel file '{filename}': {e}")

    # --- Return Summary Dictionaries ---
    return overall_metrics, per_group_metrics

In [5]:
linearity_degree=1
num_x_covariates = 5

num_iterations = 100

num_pre_periods=4

num_post_periods=4

number_of_groups=3
true_ATE=np.zeros([num_iterations,num_post_periods,number_of_groups])
estimated_ATE_subset=np.zeros([num_iterations,num_post_periods,number_of_groups])
accumulated_p_values=np.zeros([num_iterations,num_post_periods,number_of_groups])

epsilon_scale=1

for i in tqdm(range(num_iterations), desc="Progress", unit="iteration"):
    # Generate a random seed for each iteration.
    seed_val = i

    # Generate data with specified hyperparameters.
    data_linear = generate_staggered_did_data(
        n_units=200,
        linearity_degree=linearity_degree,
        num_pre_periods=num_pre_periods,
        num_post_periods=num_post_periods,
        pre_trend_bias_delta=0,
        num_x_covariates=num_x_covariates,
        epsilon_scale=epsilon_scale,
        seed=seed_val
    )
    indexes = find_first_treatment_indexes_array(data_linear)

    # Define the complex regression formula.
    formula_complex = "Y ~ eventually_treated + C(time) + C(time):eventually_treated + X_1 + X_2+X_3+X_4+ X_5 + X_6+X_7"
    
    # Fit the model.
    model_complex = smf.ols(formula=formula_complex, data=data_linear).fit()

    for j in range(len(indexes)):
      true_ATE[i,:,j]=np.array(data_linear[(data_linear['time'] >= num_pre_periods) & data_linear['eventually_treated'] == 1]["CATE"].loc[indexes[j]:indexes[j]+num_post_periods])
      estimated_ATE_subset[i,:,j]=np.array(model_complex.params[[f'C(time)[T.{t}]:eventually_treated' for t in range(num_pre_periods, num_pre_periods + num_post_periods)]])
      accumulated_p_values[i,:,j]=np.array(model_complex.pvalues[[f'C(time)[T.{t}]:eventually_treated' for t in range(num_pre_periods, num_pre_periods + num_post_periods)]])


simulation_suffix = "_linearity=1" # Example suffix
overall_metrics, per_group_metrics = calculate_error_metrics_grouped_hybrid(
    true_ATE,
    estimated_ATE_subset,
    accumulated_p_values,
    suffix=simulation_suffix # Pass the suffix here
)

print("\n--- Overall Metrics (Dictionary) ---")
print(overall_metrics)

print("\n--- Per Group Metrics (Dictionary) ---")
for group_idx, metrics in per_group_metrics.items():
    print(f"Group {group_idx}:")
    print(metrics)

Progress: 100%|██████████| 100/100 [00:08<00:00, 12.11iteration/s]
  overall_mape = np.nanmean(abs_perc_errors) * 100
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  group_mape = np.nanmean(abs_perc_errors_g) * 100
  group_mape = np.nanmean(abs_perc_errors_g) * 100
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  group_mape = np.nanmean(abs_perc_errors_g) * 100
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


Per-iteration metrics and p-values successfully saved to 'OLS_GATE_and_PValues_linearity=1.xlsx'

--- Overall Metrics (Dictionary) ---
{'Overall_RMSE': np.float64(0.220670341731245), 'Overall_MAE': np.float64(0.17710817753886587), 'Overall_MAPE': np.float64(nan), 'Overall_Std_RMSE': np.float64(0.08421684782444672), 'Overall_Std_MAE': np.float64(0.0818431966517216), 'Overall_Std_MAPE': np.float64(nan)}

--- Per Group Metrics (Dictionary) ---
Group 0:
{'Group_0_RMSE': np.float64(0.220670341731245), 'Group_0_MAE': np.float64(0.17710817753886587), 'Group_0_MAPE': np.float64(nan), 'Group_0_Std_RMSE': np.float64(0.08421684782444672), 'Group_0_Std_MAE': np.float64(0.08184319665172159), 'Group_0_Std_MAPE': np.float64(nan)}
Group 1:
{'Group_1_RMSE': np.float64(0.220670341731245), 'Group_1_MAE': np.float64(0.17710817753886587), 'Group_1_MAPE': np.float64(nan), 'Group_1_Std_RMSE': np.float64(0.08421684782444672), 'Group_1_Std_MAE': np.float64(0.08184319665172159), 'Group_1_Std_MAPE': np.float64(n

In [6]:
linearity_degree=2
num_x_covariates = 5

num_iterations = 100

num_pre_periods=4

num_post_periods=4

number_of_groups=3
true_ATE=np.zeros([num_iterations,num_post_periods,number_of_groups])
estimated_ATE_subset=np.zeros([num_iterations,num_post_periods,number_of_groups])
accumulated_p_values=np.zeros([num_iterations,num_post_periods,number_of_groups])

epsilon_scale=1

for i in tqdm(range(num_iterations), desc="Progress", unit="iteration"):
    # Generate a random seed for each iteration.
    seed_val = i

    # Generate data with specified hyperparameters.
    data_linear = generate_staggered_did_data(
        n_units=200,
        linearity_degree=linearity_degree,
        num_pre_periods=num_pre_periods,
        num_post_periods=num_post_periods,
        pre_trend_bias_delta=0,
        num_x_covariates=num_x_covariates,
        epsilon_scale=epsilon_scale,
        seed=seed_val
    )
    indexes = find_first_treatment_indexes_array(data_linear)

    # Define the complex regression formula.
    formula_complex = "Y ~ eventually_treated + C(time) + C(time):eventually_treated + X_1 + X_2+X_3+X_4+ X_5 + X_6+X_7"
    
    # Fit the model.
    model_complex = smf.ols(formula=formula_complex, data=data_linear).fit()

    for j in range(len(indexes)):
      true_ATE[i,:,j]=np.array(data_linear[(data_linear['time'] >= num_pre_periods) & data_linear['eventually_treated'] == 1]["CATE"].loc[indexes[j]:indexes[j]+num_post_periods])
      estimated_ATE_subset[i,:,j]=np.array(model_complex.params[[f'C(time)[T.{t}]:eventually_treated' for t in range(num_pre_periods, num_pre_periods + num_post_periods)]])
      accumulated_p_values[i,:,j]=np.array(model_complex.pvalues[[f'C(time)[T.{t}]:eventually_treated' for t in range(num_pre_periods, num_pre_periods + num_post_periods)]])


simulation_suffix = "_linearity=2" # Example suffix
overall_metrics, per_group_metrics = calculate_error_metrics_grouped_hybrid(
    true_ATE,
    estimated_ATE_subset,
    accumulated_p_values,
    suffix=simulation_suffix # Pass the suffix here
)

print("\n--- Overall Metrics (Dictionary) ---")
print(overall_metrics)

print("\n--- Per Group Metrics (Dictionary) ---")
for group_idx, metrics in per_group_metrics.items():
    print(f"Group {group_idx}:")
    print(metrics)

Progress: 100%|██████████| 100/100 [00:07<00:00, 12.93iteration/s]
  overall_mape = np.nanmean(abs_perc_errors) * 100
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  group_mape = np.nanmean(abs_perc_errors_g) * 100
  group_mape = np.nanmean(abs_perc_errors_g) * 100
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  group_mape = np.nanmean(abs_perc_errors_g) * 100
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


Per-iteration metrics and p-values successfully saved to 'OLS_GATE_and_PValues_linearity=2.xlsx'

--- Overall Metrics (Dictionary) ---
{'Overall_RMSE': np.float64(0.31158569965636795), 'Overall_MAE': np.float64(0.24527226958490586), 'Overall_MAPE': np.float64(nan), 'Overall_Std_RMSE': np.float64(0.12193785080026887), 'Overall_Std_MAE': np.float64(0.11218357558120047), 'Overall_Std_MAPE': np.float64(nan)}

--- Per Group Metrics (Dictionary) ---
Group 0:
{'Group_0_RMSE': np.float64(0.31158569965636795), 'Group_0_MAE': np.float64(0.24527226958490586), 'Group_0_MAPE': np.float64(nan), 'Group_0_Std_RMSE': np.float64(0.12193785080026888), 'Group_0_Std_MAE': np.float64(0.11218357558120047), 'Group_0_Std_MAPE': np.float64(nan)}
Group 1:
{'Group_1_RMSE': np.float64(0.31158569965636795), 'Group_1_MAE': np.float64(0.24527226958490586), 'Group_1_MAPE': np.float64(nan), 'Group_1_Std_RMSE': np.float64(0.12193785080026888), 'Group_1_Std_MAE': np.float64(0.11218357558120047), 'Group_1_Std_MAPE': np.fl

In [7]:
linearity_degree=3
num_x_covariates = 5

num_iterations = 100

num_pre_periods=4

num_post_periods=4

number_of_groups=3
true_ATE=np.zeros([num_iterations,num_post_periods,number_of_groups])
estimated_ATE_subset=np.zeros([num_iterations,num_post_periods,number_of_groups])
accumulated_p_values=np.zeros([num_iterations,num_post_periods,number_of_groups])

epsilon_scale=1

for i in tqdm(range(num_iterations), desc="Progress", unit="iteration"):
    # Generate a random seed for each iteration.
    seed_val = i

    # Generate data with specified hyperparameters.
    data_linear = generate_staggered_did_data(
        n_units=200,
        linearity_degree=linearity_degree,
        num_pre_periods=num_pre_periods,
        num_post_periods=num_post_periods,
        pre_trend_bias_delta=0,
        num_x_covariates=num_x_covariates,
        epsilon_scale=epsilon_scale,
        seed=seed_val
    )
    indexes = find_first_treatment_indexes_array(data_linear)

    # Define the complex regression formula.
    formula_complex = "Y ~ eventually_treated + C(time) + C(time):eventually_treated + X_1 + X_2+X_3+X_4+ X_5 + X_6+X_7"
    
    # Fit the model.
    model_complex = smf.ols(formula=formula_complex, data=data_linear).fit()

    for j in range(len(indexes)):
      true_ATE[i,:,j]=np.array(data_linear[(data_linear['time'] >= num_pre_periods) & data_linear['eventually_treated'] == 1]["CATE"].loc[indexes[j]:indexes[j]+num_post_periods])
      estimated_ATE_subset[i,:,j]=np.array(model_complex.params[[f'C(time)[T.{t}]:eventually_treated' for t in range(num_pre_periods, num_pre_periods + num_post_periods)]])
      accumulated_p_values[i,:,j]=np.array(model_complex.pvalues[[f'C(time)[T.{t}]:eventually_treated' for t in range(num_pre_periods, num_pre_periods + num_post_periods)]])


simulation_suffix = "_linearity=3" # Example suffix
overall_metrics, per_group_metrics = calculate_error_metrics_grouped_hybrid(
    true_ATE,
    estimated_ATE_subset,
    accumulated_p_values,
    suffix=simulation_suffix # Pass the suffix here
)

print("\n--- Overall Metrics (Dictionary) ---")
print(overall_metrics)

print("\n--- Per Group Metrics (Dictionary) ---")
for group_idx, metrics in per_group_metrics.items():
    print(f"Group {group_idx}:")
    print(metrics)

Progress: 100%|██████████| 100/100 [00:07<00:00, 12.65iteration/s]
  overall_mape = np.nanmean(abs_perc_errors) * 100
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  group_mape = np.nanmean(abs_perc_errors_g) * 100
  group_mape = np.nanmean(abs_perc_errors_g) * 100
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  group_mape = np.nanmean(abs_perc_errors_g) * 100
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


Per-iteration metrics and p-values successfully saved to 'OLS_GATE_and_PValues_linearity=3.xlsx'

--- Overall Metrics (Dictionary) ---
{'Overall_RMSE': np.float64(0.4122548413548967), 'Overall_MAE': np.float64(0.32337641509665166), 'Overall_MAPE': np.float64(nan), 'Overall_Std_RMSE': np.float64(0.15786963983787594), 'Overall_Std_MAE': np.float64(0.1452984562657175), 'Overall_Std_MAPE': np.float64(nan)}

--- Per Group Metrics (Dictionary) ---
Group 0:
{'Group_0_RMSE': np.float64(0.4122548413548967), 'Group_0_MAE': np.float64(0.3233764150966516), 'Group_0_MAPE': np.float64(nan), 'Group_0_Std_RMSE': np.float64(0.15786963983787594), 'Group_0_Std_MAE': np.float64(0.1452984562657175), 'Group_0_Std_MAPE': np.float64(nan)}
Group 1:
{'Group_1_RMSE': np.float64(0.4122548413548967), 'Group_1_MAE': np.float64(0.3233764150966516), 'Group_1_MAPE': np.float64(nan), 'Group_1_Std_RMSE': np.float64(0.15786963983787594), 'Group_1_Std_MAE': np.float64(0.1452984562657175), 'Group_1_Std_MAPE': np.float64(na