In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import random
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm  # Import tqdm for the progress bar

In [2]:

def generate_did_data(
    n_units=200,
    num_x_covariates=5,
    num_pre_periods=5,
    num_post_periods=5,
    linearity_degree=1, # 1: fully linear, 2: half X non-linear, 3: treatment + all X non-linear
    pre_trend_bias_delta=0.2,
    epsilon_scale=1,
    seed=42
):
    """
    Generates panel data for Difference-in-Differences analysis with controllable pre-trends and non-linearity.

    Args:
        n_units (int): Number of units (e.g., individuals, firms).
        num_x_covariates (int): Number of control covariates (X) (not counting the two covariates W1 and W2 where W1 ~ Bernoulli(0.66)
        and W2 takes the values 1,2,3,4 with the following probabilities 0.3, 0.1, 0.2, 0.4 respectively.
        num_pre_periods (int): Number of periods before treatment.
        num_post_periods (int): Number of periods after treatment.
        treatment_effect_beta (float): True treatment effect size.
        linearity_degree (int): Degree of linearity in the DGP:
            1: Fully linear.
            2: Half of X covariates have non-linear relationship with Y.
            3: Treatment and all X covariates have non-linear relationship with Y.
        pre_trend_bias_delta (float): Bias parameter to induce pre-trends in the treated group.
        seed (int): Random seed for reproducibility.

    Returns:
        pd.DataFrame: Generated panel data in long format.
    """
    np.random.seed(seed)

    # --- Set treatment_effect_beta based on linearity_degree (like R code) ---
    if linearity_degree == 1 or linearity_degree == 2:
        treatment_effect_beta = 3
    elif linearity_degree == 3:
        treatment_effect_beta = 5
    else:
        # Handle cases where linearity_degree is not 1, 2, or 3
        print(f"Warning: linearity_degree ({linearity_degree}) has an unexpected value. Setting treatment_effect_beta to NaN.")
        treatment_effect_beta = np.nan

    periods = num_pre_periods + num_post_periods
    unit_ids = range(n_units)
    time_periods = range(periods)

    # Create base data frame
    data = pd.DataFrame({
        'unit_id': np.repeat(unit_ids, periods),
        'time': np.tile(time_periods, n_units)
    })

    # Treatment assignment (randomly assign half to treatment)
    treated_units = np.random.choice(unit_ids, size=n_units // 2, replace=False) #TO DO: add more complex propensity score
    data['treated_group'] = np.where(data['unit_id'].isin(treated_units), 1, 0)

    # Time indicators
    treatment_period = num_pre_periods # Period when treatment starts
    data['post_treatment'] = np.where(data['time'] >= treatment_period, 1, 0)
    data['time_trend'] = data['time'] # Simple linear time trend

    X = np.random.normal(0, 1, size=(len(data), num_x_covariates))


    # Add Bernoulli random variable with p=0.66
    bernoulli_values = np.random.binomial(n=1, p=0.66, size=len(data))
    # Add to both X matrix and dataframe
    X = np.column_stack((bernoulli_values,X))

   # Add categorical variable with values 1,2,3,4 with probabilities 0.3, 0.1, 0.2, 0.4
    categories = [1, 2, 3, 4]
    probabilities = [0.3, 0.1, 0.2, 0.4]
    categorical_values = np.random.choice(categories, size=len(data), p=probabilities)
    # Add to both X matrix and dataframe
    X = np.column_stack((X, categorical_values))

    for i in range(num_x_covariates+2):
        data[f'X_{i+1}'] = X[:,i]

    # Generate error term
    data['epsilon'] = np.random.normal(scale=epsilon_scale,size=len(data))

    # DGP parameters (can be adjusted for more complex DGPs)
    beta_0 = -0.5 # Intercept
    beta_treated = 0.75 # Main effect of treated group (alpha_i)
    beta_time = 0.2 # Main effect of time trend (gamma_t)
    beta_interaction = treatment_effect_beta # Treatment effect
    beta_x = np.array([-0.75, 0.5, -0.5, -1.30, 1.8, 2.5, -1.0])


    # Non-linear components based on linearity_degree
    if linearity_degree == 1: # Half covariates non-linear
        linear_x_contribution = np.sum([beta_x[i] * data[f'X_{i+1}'] for i in range(num_x_covariates+2)], axis=0)
        data['Y'] = beta_0 + beta_treated * data['treated_group'] + beta_time * data['time_trend']+linear_x_contribution+ beta_interaction * data['treated_group'] * data['post_treatment']
        data['CATE'] = beta_interaction * data['treated_group'] * data['post_treatment']

    elif linearity_degree == 2: # Half covariates non-linear
        half = num_x_covariates+2 // 2
        cov_effect = (np.sum(beta_x[:int(half/2)] * (X[:, :int(half/2)] ** 2),axis=1) + np.sum(beta_x[int(half/2):half] * np.exp(X[:, int(half/2):half]),axis=1)+
                              np.sum(beta_x[half:] * X[:, half:],axis=1))
        data['Y'] = beta_0 + beta_treated * data['treated_group'] + beta_time * data['time_trend']+cov_effect+beta_interaction * data['treated_group'] * data['post_treatment']
        data['CATE'] = beta_interaction * data['treated_group'] * data['post_treatment']

    elif linearity_degree == 3: 
        half = num_x_covariates+2 // 2
        cov_effect = (np.sum(beta_x[:int(half/2)] * (X[:, :int(half/2)] ** 2),axis=1) + np.sum(beta_x[int(half/2):half] * np.exp(X[:, int(half/2):half]),axis=1)+
                              np.sum(beta_x[half:half+int(half/2)] * np.abs(X[:, half:half+int(half/2)]),axis=1) + np.sum(beta_x[half+int(half/2):] * np.sqrt(np.abs(X[:, half+int(half/2):])),axis=1))
        data['Y'] = beta_0 + beta_treated * data['treated_group'] + beta_time * data['time_trend']**2+cov_effect+beta_interaction * data['treated_group'] * data['post_treatment']
        data['CATE'] = beta_interaction * data['treated_group'] * data['post_treatment']


    # Add pre-trend bias (differential trend for treated group in pre-treatment)
    if pre_trend_bias_delta != 0:
        if linearity_degree == 3:
            # Example parameters for seasonality
            seasonal_amplitude = 1.0  # Amplitude of the seasonal effect
            seasonal_period = 4      # Period of the seasonal effect (e.g., 12 for monthly data)

            # Calculate the seasonal effect
            seasonal_effect = seasonal_amplitude * np.sin(2 * np.pi * data['time'] / seasonal_period)
            data['Y'] += pre_trend_bias_delta * data['treated_group'] * seasonal_effect
        else:
            data['Y'] += pre_trend_bias_delta * data['treated_group'] * (data['time'] - treatment_period)
        # (data['time'] - treatment_period) will be negative in pre-treatment, 0 at treatment period, and positive in post-treatment.
        # (1 - data['post_treatment']) ensures this bias only applies in pre-treatment periods.


    # Add error term
    data['Y'] += data['epsilon']

    return data


In [3]:
def analyze_p_values(model_complex, num_pre_periods):
    """
    Extracts p-values for interaction terms in a statsmodels model,
    counts the number of non-significant p-values, and returns them.

    Args:
        model_complex:  A statsmodels model results object.
        num_pre_periods: The number of pre-treatment periods in the model.
                       Determines the range of the loop.

    Returns:
        A tuple containing:
            - A list of extracted p-values.
            - The count of p-values greater than 0.05.
    """

    p_values = []
    for i in range(1, num_pre_periods):  # Loop from 1 to num_pre_periods - 1
        parameter_name = f"C(time)[T.{i}]:treated_group"  # Construct the parameter name dynamically
        pval = model_complex.pvalues.get(parameter_name, np.nan)
        p_values.append(pval)

    count_non_significant = sum(pval > 0.05 for pval in p_values)  # Use a generator expression for efficiency

    return count_non_significant
    

In [4]:
def accumulate_ate_parameters(model_complex, num_pre_periods, num_post_periods, estimated_ATE,iteration):
    """
    Accumulates parameter estimates from a statsmodels model into an existing numpy array.

    Args:
        model_complex: A statsmodels model results object.
        num_pre_periods: The number of pre-treatment periods.
        num_post_periods: The number of post-treatment periods.
        estimated_ATE: A numpy array of shape (num_post_periods, 100) to accumulate parameters into.
                       Assumes this array has already been initialized with zeros.  This is crucial!
    """

    for j in range(num_pre_periods, num_pre_periods + num_post_periods):  # Loop from num_pre to num_pre + num_post - 1
        parameter_name = f"C(time)[T.{j}]:treated_group"
        try:
            parameter_value = model_complex.params[parameter_name]
        except KeyError:
            parameter_value = np.nan  # Handle missing parameters robustly

        post_period_index = j - num_pre_periods  # Calculate the correct row index for estimated_ATE

        # Check for nan before assignment. If it's nan, just move on to the next parameter.
        if np.isnan(parameter_value):
            print(f"Warning: Parameter {parameter_name} is NaN. Skipping.")
            continue

        # Assign parameter value to all columns of the corresponding row
        estimated_ATE[post_period_index, iteration] = parameter_value

In [5]:
def accumulate_ate_pvalues(model_complex, num_pre_periods, num_post_periods, accumulated_p_values, iteration):
    """
    Accumulates p-values for ATE parameters from a statsmodels model into an existing numpy array.

    Args:
        model_complex: A statsmodels model results object.
        num_pre_periods: The number of pre-treatment periods.
        num_post_periods: The number of post-treatment periods.
        accumulated_p_values: A numpy array of shape (num_post_periods, 100) to accumulate p-values into.
                              Assumes this array has already been initialized with zeros or NaNs.
        iteration: The current iteration number (column index in accumulated_p_values).
    """
    for j in range(num_pre_periods, num_pre_periods + num_post_periods):  # Loop from num_pre to num_pre + num_post - 1
        parameter_name = f"C(time)[T.{j}]:treated_group"
        try:
            # Access p-values instead of parameters
            p_value = model_complex.pvalues[parameter_name]
        except KeyError:
            p_value = np.nan  # Handle missing parameters robustly
        post_period_index = j - num_pre_periods  # Calculate the correct row index for accumulated_p_values

        # Check for nan before assignment. If it's nan, just move on to the next parameter.
        if np.isnan(p_value):
            print(f"Warning: P-value for parameter {parameter_name} is NaN. Skipping.")
            continue

        # Assign p-value to the specified column (iteration) and row (post-period)
        accumulated_p_values[post_period_index, iteration] = p_value

In [13]:
def calculate_error_metrics(estimated_ATE, accumulated_p_values, true_ATE=0.5, suffix=""):
    """
    Calculates RMSE, MAE, and MAPE for the estimated ATE values,
    both overall and per time period (row).  Adds columns for each simulation's p-values
    directly to the output CSV.

    Args:
        estimated_ATE: A numpy array of shape (num_post_periods, num_simulations)
                       containing the estimated ATE values.
        accumulated_p_values: A numpy array of shape (num_post_periods, num_simulations)
                              containing the accumulated p-values.
        true_ATE: The true ATE value (default: 0.5).
        suffix: A suffix to add to the output CSV filename.

    Returns:
        A tuple containing:
            - overall_metrics: A dictionary containing overall RMSE, MAE, and MAPE for ATE.
            - per_time_period_metrics: A dictionary where keys are time period indices
              and values are dictionaries containing RMSE, MAE, and MAPE for ATE for that time period.
    """

    # --- Calculate Metrics for Estimated ATE ---
    # Overall metrics for ATE
    overall_rmse_ate = np.sqrt(mean_squared_error(true_ATE * np.ones(estimated_ATE.size), estimated_ATE.flatten()))
    overall_mae_ate = mean_absolute_error(true_ATE * np.ones(estimated_ATE.size), estimated_ATE.flatten())
    overall_mape_ate = np.mean(np.abs((estimated_ATE.flatten() - true_ATE) / true_ATE)) * 100 if true_ATE != 0 else np.nan
    std_rmse_ate = np.std(np.sqrt(np.mean((estimated_ATE-true_ATE)**2, axis=0)).flatten())
    std_mae_ate = np.std(np.mean(np.abs(estimated_ATE-true_ATE), axis=0).flatten())
    std_mape_ate = np.std(np.mean(np.abs((estimated_ATE - true_ATE) / true_ATE), axis=0).flatten() * 100) if true_ATE != 0 else np.nan


    overall_metrics_ate = {
        "ATE_rmse": overall_rmse_ate,
        "ATE_mae": overall_mae_ate,
        "ATE_mape": overall_mape_ate,
        "ATE_std_rmse": std_rmse_ate,
        "ATE_std_mae": std_mae_ate,
        "ATE_std_mape": std_mape_ate
    }

    # Per-time-period metrics for ATE
    per_time_period_metrics_ate = {}
    for i in range(estimated_ATE.shape[0]):  # Iterate over rows (time periods)
        rmse_ate = np.sqrt(mean_squared_error(true_ATE * np.ones(estimated_ATE[i,:].size), estimated_ATE[i,:]))
        mae_ate = mean_absolute_error(true_ATE * np.ones(estimated_ATE[i,:].size), estimated_ATE[i,:])
        mape_ate = np.mean(np.abs((estimated_ATE[i,:] - true_ATE) / true_ATE)) * 100 if true_ATE != 0 else np.nan
        std_mse_ate = np.std((estimated_ATE[i,:]-true_ATE)**2)
        std_mae_ate = np.std(np.abs(estimated_ATE[i,:]-true_ATE).flatten())
        std_mape_ate = np.std(np.abs((estimated_ATE[i,:] - true_ATE) / true_ATE).flatten() * 100) if true_ATE != 0 else np.nan


        per_time_period_metrics_ate[i] = {
            "ATE_rmse": rmse_ate,
            "ATE_mae": mae_ate,
            "ATE_mape": mape_ate,
            "ATE_std_mse": std_mse_ate,
            "ATE_std_mae": std_mae_ate,
            "ATE_std_mape": std_mape_ate
        }


    # Calculate the three specific metrics to save to CSV for ATE
    rmse_values_ate = np.sqrt(np.mean((estimated_ATE-true_ATE)**2, axis=0)).flatten()
    mae_values_ate = np.mean(np.abs(estimated_ATE-true_ATE), axis=0).flatten()
    mape_values_ate = np.mean(np.abs((estimated_ATE - true_ATE) / true_ATE), axis=0).flatten() * 100 if true_ATE != 0 else np.nan


    # Create DataFrame with the three rows for ATE metrics
    df_ate_metrics = pd.DataFrame({
        'ATE_RMSE': rmse_values_ate,
        'ATE_MAE': mae_values_ate,
        'ATE_MAPE': mape_values_ate,
    }).T  # Transpose to have metrics as rows

    # Create DataFrame for accumulated p-values, using column names like 'PValue_0', 'PValue_1', ...
    p_value_columns = [f'PValue_{i}' for i in range(accumulated_p_values.shape[0])] # column names for p-values
    df_p_values = pd.DataFrame(accumulated_p_values.T, columns=p_value_columns).T # Transpose so rows are periods, columns are iterations

    df = pd.concat([df_ate_metrics, df_p_values], axis=0)

    # Save to CSV with the specified filename format
    filename = f"OLS_ATE_and_PValues{suffix}.csv" # Changed filename
    df.to_csv(filename, header=False)

    # Return only ATE metrics
    return overall_metrics_ate, per_time_period_metrics_ate

In [14]:
# Set the number of covariates as specified.
num_x_covariates = 5
linearity_degree=1

# Set the number of iterations and initialize the counter.
num_iterations = 100
count_at_least_two_non_significant = 0

num_pre_periods=4

num_post_periods=4

estimated_ATE=np.zeros([num_post_periods,num_iterations])
accumulated_p_values=np.zeros([num_post_periods,num_iterations])

epsilon_scale=1

iterations_PTA=[]
# Run the loop 100 times.
for i in tqdm(range(num_iterations), desc="Progress", unit="iteration"):
    # Generate a random seed for each iteration.
    seed_val = i
    
    # Generate data with specified hyperparameters.
    data_linear = generate_did_data(
        linearity_degree=linearity_degree,
        num_pre_periods=num_pre_periods,
        num_post_periods=num_post_periods,
        pre_trend_bias_delta=0,
        num_x_covariates=num_x_covariates,
        epsilon_scale=epsilon_scale,
        seed=seed_val
    )
    
    # Define the complex regression formula.
    formula_complex = "Y ~ treated_group + C(time) + C(time):treated_group + X_1 + X_2+X_3+X_4+ X_5 + X_6+X_7"
    
    # Fit the model.
    model_complex = smf.ols(formula=formula_complex, data=data_linear).fit()

    
    # Count how many of these p-values are above 0.05.
    count_non_significant = analyze_p_values(model_complex, num_pre_periods)

    accumulate_ate_parameters(model_complex, num_pre_periods, num_post_periods, estimated_ATE, i)
    accumulate_ate_pvalues(model_complex, num_pre_periods, num_post_periods, accumulated_p_values, i)
    
    # If at least two p-values are above 0.05, increment the counter.
    if count_non_significant >= 2:
        count_at_least_two_non_significant += 1
        iterations_PTA.append(i)

if linearity_degree == 1 or linearity_degree == 2:
    true_ATE = 3.0
elif linearity_degree == 3:
    true_ATE = 5.0

overall_metrics, per_time_period_metrics = calculate_error_metrics(estimated_ATE,accumulated_p_values, true_ATE,"_ATE_linearity=1")

print("Number of iterations (out of 100) with at least two p-values above 0.05 (i.e., we assume that the Parallel Trend Assumption holds):", 
      count_at_least_two_non_significant)

print("Overall Metrics:")
for metric, value in overall_metrics.items():
    print(f"  {metric}: {value:.4f}")

print("\nPer Time Period Metrics:")
for time_period, metrics in per_time_period_metrics.items():
    print(f"  Time Period {time_period+num_pre_periods}:")
    for metric, value in metrics.items():
        print(f"    {metric}: {value:.4f}")

Progress:   0%|          | 0/100 [00:00<?, ?iteration/s]

Progress: 100%|██████████| 100/100 [00:07<00:00, 14.02iteration/s]

Number of iterations (out of 100) with at least two p-values above 0.05 (i.e., we assume that the Parallel Trend Assumption holds): 98
Overall Metrics:
  ATE_rmse: 0.5596
  ATE_mae: 0.5275
  ATE_mape: 17.5838
  ATE_std_rmse: 0.1355
  ATE_std_mae: 0.1374
  ATE_std_mape: 4.5811

Per Time Period Metrics:
  Time Period 4:
    ATE_rmse: 0.5549
    ATE_mae: 0.5241
    ATE_mape: 17.4703
    ATE_std_mse: 0.1976
    ATE_std_mae: 0.1822
    ATE_std_mape: 6.0723
  Time Period 5:
    ATE_rmse: 0.5667
    ATE_mae: 0.5359
    ATE_mape: 17.8623
    ATE_std_mse: 0.2148
    ATE_std_mae: 0.1845
    ATE_std_mape: 6.1498
  Time Period 6:
    ATE_rmse: 0.5675
    ATE_mae: 0.5313
    ATE_mape: 17.7105
    ATE_std_mse: 0.2143
    ATE_std_mae: 0.1994
    ATE_std_mape: 6.6454
  Time Period 7:
    ATE_rmse: 0.5491
    ATE_mae: 0.5188
    ATE_mape: 17.2921
    ATE_std_mse: 0.2033
    ATE_std_mae: 0.1801
    ATE_std_mape: 6.0028





In [15]:
# Set the number of covariates as specified.
num_x_covariates = 5
linearity_degree=2

# Set the number of iterations and initialize the counter.
num_iterations = 100
count_at_least_two_non_significant = 0

num_pre_periods=4

num_post_periods=4

estimated_ATE=np.zeros([num_post_periods,num_iterations])
accumulated_p_values=np.zeros([num_post_periods,num_iterations])

epsilon_scale=1

iterations_PTA=[]
# Run the loop 100 times.
for i in tqdm(range(num_iterations), desc="Progress", unit="iteration"):
    # Generate a random seed for each iteration.
    seed_val = i
    
    # Generate data with specified hyperparameters.
    data_linear = generate_did_data(
        linearity_degree=linearity_degree,
        num_pre_periods=num_pre_periods,
        num_post_periods=num_post_periods,
        pre_trend_bias_delta=0,
        num_x_covariates=num_x_covariates,
        epsilon_scale=epsilon_scale,
        seed=seed_val
    )
    
    # Define the complex regression formula.
    formula_complex = "Y ~ treated_group + C(time) + C(time):treated_group + X_1 + X_2+X_3+X_4+ X_5 + X_6+X_7"
    
    # Fit the model.
    model_complex = smf.ols(formula=formula_complex, data=data_linear).fit()

    
    # Count how many of these p-values are above 0.05.
    count_non_significant = analyze_p_values(model_complex, num_pre_periods)

    accumulate_ate_parameters(model_complex, num_pre_periods, num_post_periods, estimated_ATE, i)
    accumulate_ate_pvalues(model_complex, num_pre_periods, num_post_periods, accumulated_p_values, i)
    
    # If at least two p-values are above 0.05, increment the counter.
    if count_non_significant >= 2:
        count_at_least_two_non_significant += 1
        iterations_PTA.append(i)

if linearity_degree == 1 or linearity_degree == 2:
    true_ATE = 3.0
elif linearity_degree == 3:
    true_ATE = 5.0

overall_metrics, per_time_period_metrics = calculate_error_metrics(estimated_ATE,accumulated_p_values, true_ATE,"_ATE_linearity=2")

print("Number of iterations (out of 100) with at least two p-values above 0.05 (i.e., we assume that the Parallel Trend Assumption holds):", 
      count_at_least_two_non_significant)

print("Overall Metrics:")
for metric, value in overall_metrics.items():
    print(f"  {metric}: {value:.4f}")

print("\nPer Time Period Metrics:")
for time_period, metrics in per_time_period_metrics.items():
    print(f"  Time Period {time_period+num_pre_periods}:")
    for metric, value in metrics.items():
        print(f"    {metric}: {value:.4f}")

Progress:   0%|          | 0/100 [00:00<?, ?iteration/s]

Progress: 100%|██████████| 100/100 [00:05<00:00, 17.06iteration/s]

Number of iterations (out of 100) with at least two p-values above 0.05 (i.e., we assume that the Parallel Trend Assumption holds): 98
Overall Metrics:
  ATE_rmse: 1.0243
  ATE_mae: 0.8269
  ATE_mape: 27.5623
  ATE_std_rmse: 0.4133
  ATE_std_mae: 0.3979
  ATE_std_mape: 13.2642

Per Time Period Metrics:
  Time Period 4:
    ATE_rmse: 1.0449
    ATE_mae: 0.8087
    ATE_mape: 26.9568
    ATE_std_mse: 1.6951
    ATE_std_mae: 0.6616
    ATE_std_mape: 22.0544
  Time Period 5:
    ATE_rmse: 0.9834
    ATE_mae: 0.7978
    ATE_mape: 26.5933
    ATE_std_mse: 1.2301
    ATE_std_mae: 0.5749
    ATE_std_mape: 19.1641
  Time Period 6:
    ATE_rmse: 1.1059
    ATE_mae: 0.9135
    ATE_mape: 30.4490
    ATE_std_mse: 1.4799
    ATE_std_mae: 0.6233
    ATE_std_mape: 20.7767
  Time Period 7:
    ATE_rmse: 0.9565
    ATE_mae: 0.7875
    ATE_mape: 26.2503
    ATE_std_mse: 1.1633
    ATE_std_mae: 0.5428
    ATE_std_mape: 18.0946





In [16]:
# Set the number of covariates as specified.
num_x_covariates = 5
linearity_degree=3

# Set the number of iterations and initialize the counter.
num_iterations = 100
count_at_least_two_non_significant = 0

num_pre_periods=4

num_post_periods=4

estimated_ATE=np.zeros([num_post_periods,num_iterations])
accumulated_p_values=np.zeros([num_post_periods,num_iterations])

epsilon_scale=1

iterations_PTA=[]
# Run the loop 100 times.
for i in tqdm(range(num_iterations), desc="Progress", unit="iteration"):
    # Generate a random seed for each iteration.
    seed_val = i
    
    # Generate data with specified hyperparameters.
    data_linear = generate_did_data(
        linearity_degree=linearity_degree,
        num_pre_periods=num_pre_periods,
        num_post_periods=num_post_periods,
        pre_trend_bias_delta=0,
        num_x_covariates=num_x_covariates,
        epsilon_scale=epsilon_scale,
        seed=seed_val
    )
    
    # Define the complex regression formula.
    formula_complex = "Y ~ treated_group + C(time) + C(time):treated_group + X_1 + X_2+X_3+X_4+ X_5 + X_6+X_7"
    
    # Fit the model.
    model_complex = smf.ols(formula=formula_complex, data=data_linear).fit()

    
    # Count how many of these p-values are above 0.05.
    count_non_significant = analyze_p_values(model_complex, num_pre_periods)

    accumulate_ate_parameters(model_complex, num_pre_periods, num_post_periods, estimated_ATE, i)
    accumulate_ate_pvalues(model_complex, num_pre_periods, num_post_periods, accumulated_p_values, i)
    
    # If at least two p-values are above 0.05, increment the counter.
    if count_non_significant >= 2:
        count_at_least_two_non_significant += 1
        iterations_PTA.append(i)

if linearity_degree == 1 or linearity_degree == 2:
    true_ATE = 3.0
elif linearity_degree == 3:
    true_ATE = 5.0

overall_metrics, per_time_period_metrics = calculate_error_metrics(estimated_ATE,accumulated_p_values, true_ATE,"_ATE_linearity=3")

print("Number of iterations (out of 100) with at least two p-values above 0.05 (i.e., we assume that the Parallel Trend Assumption holds):", 
      count_at_least_two_non_significant)

print("Overall Metrics:")
for metric, value in overall_metrics.items():
    print(f"  {metric}: {value:.4f}")

print("\nPer Time Period Metrics:")
for time_period, metrics in per_time_period_metrics.items():
    print(f"  Time Period {time_period+num_pre_periods}:")
    for metric, value in metrics.items():
        print(f"    {metric}: {value:.4f}")

Progress:   0%|          | 0/100 [00:00<?, ?iteration/s]

Progress: 100%|██████████| 100/100 [00:06<00:00, 15.10iteration/s]

Number of iterations (out of 100) with at least two p-values above 0.05 (i.e., we assume that the Parallel Trend Assumption holds): 98
Overall Metrics:
  ATE_rmse: 2.6140
  ATE_mae: 2.4520
  ATE_mape: 49.0397
  ATE_std_rmse: 0.6926
  ATE_std_mae: 0.7001
  ATE_std_mape: 14.0028

Per Time Period Metrics:
  Time Period 4:
    ATE_rmse: 2.6189
    ATE_mae: 2.4417
    ATE_mape: 48.8337
    ATE_std_mse: 5.1814
    ATE_std_mae: 0.9469
    ATE_std_mape: 18.9386
  Time Period 5:
    ATE_rmse: 2.6427
    ATE_mae: 2.5042
    ATE_mape: 50.0841
    ATE_std_mse: 4.2558
    ATE_std_mae: 0.8443
    ATE_std_mape: 16.8854
  Time Period 6:
    ATE_rmse: 2.6605
    ATE_mae: 2.4805
    ATE_mape: 49.6102
    ATE_std_mse: 4.7376
    ATE_std_mae: 0.9619
    ATE_std_mape: 19.2378
  Time Period 7:
    ATE_rmse: 2.5319
    ATE_mae: 2.3816
    ATE_mape: 47.6310
    ATE_std_mse: 4.1585
    ATE_std_mae: 0.8596
    ATE_std_mape: 17.1914



