# Analysis 2: How Corporate Lobbying is related to Firm Size.

In [8]:
import numpy as np
import pandas as pd
import scipy.special as sp
import statsmodels.api as sm
from scipy.stats import norm
from scipy.optimize import minimize
from joblib import Parallel, delayed
from joblib.externals.loky.process_executor import TerminatedWorkerError
import time
from utils import open_csv

  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0


##### Load bill position data with firm size

In [2]:
client_year_df = open_csv('analysis_input/analysis2_firm_size_bill_position_df.csv')

## Table 18. Data Statistics for Firm Size Analysis (Appendix)

In [3]:
# Compute aggregated statistics by year
agg_stats = (
    client_year_df
    .groupby('year')
    .agg(
        Obs = ('firm_id', 'count'),
        Lobby_mean = ('lobbied', 'mean'),
        Lobby_std = ('lobbied', 'std'),
        Support_mean = ('ratio_label1', 'mean'),
        Support_std = ('ratio_label1', 'std'),
        Oppose_mean = ('ratio_label2', 'mean'),
        Oppose_std = ('ratio_label2', 'std'),
        Amend_mean = ('ratio_label3', 'mean'),
        Amend_std = ('ratio_label3', 'std'),
        Monitor_mean = ('ratio_label4', 'mean'),
        Monitor_std = ('ratio_label4', 'std'),
        LogEmployment_mean = ('log_emp', 'mean'),
        LogEmployment_std = ('log_emp', 'std')
    )
    .reset_index()
)

# Format statistics with mean and standard deviation in parentheses
for col, mean_col, std_col in [
    ('Lobby', 'Lobby_mean', 'Lobby_std'),
    ('Support', 'Support_mean', 'Support_std'),
    ('Oppose', 'Oppose_mean', 'Oppose_std'),
    ('Amend', 'Amend_mean', 'Amend_std'),
    ('Monitor', 'Monitor_mean', 'Monitor_std'),
    ('Log Employment', 'LogEmployment_mean', 'LogEmployment_std')
]:
    agg_stats[col] = agg_stats.apply(
        lambda row: f"{row[mean_col]:.3f} ({row[std_col]:.3f})", 
        axis=1
    )

# Create final table with selected columns
final_table = agg_stats[['year', 'Obs', 'Lobby', 'Support', 'Oppose', 'Amend', 'Monitor', 'Log Employment']]
final_table = final_table.rename(columns={'year': 'Year', 'Obs': 'Obs.'})

# Export to LaTeX
table_latex = final_table.to_latex(
    index=False,
    caption='Data Statistics for Firm Size Analysis',
    label='tab:df_cleaned_stats_nested',
    column_format='cccccccc'
)

footnote = "\\\\ \\multicolumn{8}{l}{\\footnotesize Note: Values represent sample means; standard deviations are reported in parentheses.}"
table_latex = table_latex.replace("\\bottomrule", "\\bottomrule\n" + footnote)

# Save table to file
with open("analysis_output/analysis2_table18_appendix.tex", "w") as f:
    f.write(table_latex)

print(f"Table successfully generated and saved as 'appendix_table_18.tex'")
print(table_latex)


Table successfully generated and saved as 'appendix_table_18.tex'
\begin{table}
\caption{Data Statistics for Firm Size Analysis}
\label{tab:df_cleaned_stats_nested}
\begin{tabular}{cccccccc}
\toprule
Year & Obs. & Lobby & Support & Oppose & Amend & Monitor & Log Employment \\
\midrule
2009 & 7014 & 0.086 (0.280) & 0.198 (0.343) & 0.036 (0.159) & 0.246 (0.362) & 0.519 (0.434) & 6.315 (2.618) \\
2010 & 6943 & 0.086 (0.280) & 0.207 (0.343) & 0.042 (0.174) & 0.236 (0.365) & 0.515 (0.435) & 6.309 (2.654) \\
2011 & 6931 & 0.074 (0.261) & 0.252 (0.348) & 0.037 (0.147) & 0.201 (0.326) & 0.510 (0.419) & 6.300 (2.698) \\
2012 & 7000 & 0.077 (0.267) & 0.271 (0.381) & 0.038 (0.165) & 0.162 (0.312) & 0.529 (0.435) & 6.245 (2.748) \\
2013 & 6912 & 0.071 (0.258) & 0.245 (0.370) & 0.041 (0.160) & 0.154 (0.310) & 0.560 (0.433) & 6.262 (2.766) \\
2014 & 6748 & 0.077 (0.267) & 0.219 (0.357) & 0.028 (0.118) & 0.115 (0.272) & 0.637 (0.422) & 6.300 (2.772) \\
2015 & 6575 & 0.073 (0.261) & 0.241 (0.353) & 0.

## Table 19. Dirichlet Regression Results: Effect of Firm Size on Predicted Probability of Lobbying Position (Appendix)

In [4]:
# -------------------------------
# Helper Functions
# -------------------------------

def create_design_matrix(df, predictors, vary_var, numeric_predictors=None):
    """
    Create a design matrix from df using the list of predictors.
    Predictors in numeric_predictors are left as numeric;
    all others are converted to dummy variables (drop_first=True).
    An intercept is added.
    Returns (X_df, column_names).
    """
    if numeric_predictors is None:
        numeric_predictors = [vary_var]
    else:
        if vary_var not in numeric_predictors:
            numeric_predictors = numeric_predictors + [vary_var]
    X_parts = []
    col_names = []
    for col in predictors:
        if col in numeric_predictors:
            X_parts.append(df[[col]])
            col_names.append(col)
        else:
            dummies = pd.get_dummies(df[col].astype('category'), prefix=col, drop_first=True)
            X_parts.append(dummies)
            col_names.extend(dummies.columns.tolist())
    X_df = pd.concat(X_parts, axis=1)
    X_df.insert(0, 'Intercept', 1)
    col_names = ['Intercept'] + col_names
    return X_df, col_names

def dirichlet_neg_log_likelihood(theta, X, Y, k, p, lambda_reg=0.0):
    """Compute negative log-likelihood for Dirichlet regression"""
    beta = theta.reshape((k, p))
    alpha = np.exp(np.dot(X, beta.T))
    if np.any(~np.isfinite(alpha)):
        return np.inf
    sum_alpha = np.sum(alpha, axis=1)
    ll = np.sum(sp.gammaln(sum_alpha)) - np.sum(sp.gammaln(alpha)) + np.sum((alpha - 1) * np.log(Y))
    return -ll

def significance_stars(p):
    """Return significance stars based on p-value"""
    if p < 0.01:
        return '***'
    elif p < 0.05:
        return '**'
    elif p < 0.1:
        return '*'
    else:
        return ''
    
def standardize_data(df, numeric_predictors):
    """
    Returns a copy of df with the columns in numeric_predictors standardized
    (subtract mean, divide by std) and a dictionary with (mean, std) for each.
    """
    df_std = df.copy()
    scaling = {}
    for col in numeric_predictors:
        mean_val = df_std[col].mean()
        std_val = df_std[col].std()
        scaling[col] = (mean_val, std_val)
        if std_val > 0:
            df_std[col] = (df_std[col] - mean_val) / std_val
    return df_std, scaling

# -------------------------------
# Bootstrap Functions for Dirichlet Regression
# -------------------------------

def bootstrap_iteration(args):
    """Run one bootstrap replicate for Dirichlet QOI"""
    np.random.seed()
    (df_dir, predictors, response_cols, cluster_var, initial_theta, k, p, num_clusters,
     numeric_predictors, lambda_reg, pct_10, pct_90, design_cols, vary_var) = args

    # Block bootstrap by cluster
    sampled_clusters = np.random.choice(df_dir[cluster_var].unique(), size=num_clusters, replace=True)
    boot_sample = pd.concat([df_dir[df_dir[cluster_var] == clust] for clust in sampled_clusters])
    
    # Create design matrix
    X_boot_df, _ = create_design_matrix(boot_sample, predictors, vary_var, numeric_predictors=numeric_predictors)
    X_boot_df = X_boot_df.reindex(columns=design_cols, fill_value=0).astype(float)
    X_boot = X_boot_df.values
    Y_boot = boot_sample[response_cols].values
    
    try:
        res_b = minimize(dirichlet_neg_log_likelihood, initial_theta,
                         args=(X_boot, Y_boot, k, p, lambda_reg),
                         method='L-BFGS-B', options={'maxfun':50000, 'maxiter':30000})
        beta_boot = res_b.x.reshape((k, p))
    except Exception as e:
        print(f"Bootstrap error: {e}")
        return np.full(k, np.nan)
    
    if np.isnan(beta_boot).any():
        return np.full(k, np.nan)
    
    # Create counterfactual data
    idx = design_cols.index(vary_var)
    X_boot_10 = X_boot.copy()
    X_boot_90 = X_boot.copy()
    X_boot_10[:, idx] = pct_10
    X_boot_90[:, idx] = pct_90
    
    # Calculate predictions
    A10 = np.exp(np.dot(X_boot_10, beta_boot.T))
    props_10 = A10 / A10.sum(axis=1, keepdims=True)
    avg_props_10 = props_10.mean(axis=0)
    
    A90 = np.exp(np.dot(X_boot_90, beta_boot.T))
    props_90 = A90 / A90.sum(axis=1, keepdims=True)
    avg_props_90 = props_90.mean(axis=0)
    
    return avg_props_90 - avg_props_10

def bootstrap_with_retry(args, max_retries=3):
    """Retry bootstrap iteration if it fails"""
    for attempt in range(max_retries):
        try:
            return bootstrap_iteration(args)
        except (TerminatedWorkerError, MemoryError, FloatingPointError, ValueError) as e:
            print(f"Error '{e}' on attempt {attempt+1}; retrying...")
            time.sleep(2)
    return np.full(args[5], np.nan)  # k is the 6th argument

# -------------------------------
# Main QOI Function
# -------------------------------

def compute_dirichlet_qoi(spec, data, vary_var, response_cols, cluster_var, B=5000, standardize=True):
    """
    For a given specification, fit a Dirichlet regression and compute the QOI
    (difference in average predicted proportions when vary_var is set to 90th vs 10th percentile).
    
    Parameters:
    -----------
    spec : dict with keys 'numeric' and 'categorical'
        Specification of variables to include in the model
    data : pandas DataFrame
        Dataset containing all required variables
    vary_var : str
        Name of the variable to vary (from 10th to 90th percentile)
    response_cols : list of str
        Names of columns containing outcome proportions
    cluster_var : str
        Name of variable defining clusters for block bootstrap
    lambda_reg : float
        Regularization parameter
    B : int
        Number of bootstrap replicates
    standardize : bool
        Whether to standardize numeric predictors
        
    Returns:
    --------
    list of str
        Formatted results for each outcome in the form "estimate (SE)***"
    """
    predictors = spec['numeric'] + spec['categorical']
    cols = [cluster_var] + predictors + response_cols
    df = data[cols].dropna().copy()
    
    if standardize:
        df, scaling = standardize_data(df, spec['numeric'])
    
    pct_10 = np.percentile(df[vary_var].dropna(), 10)
    pct_90 = np.percentile(df[vary_var].dropna(), 90)
    
    # Add epsilon to ensure proportions are strictly positive
    epsilon = 1e-6
    df[response_cols] = df[response_cols] + epsilon
    df[response_cols] = df[response_cols].div(df[response_cols].sum(axis=1), axis=0)
    
    # Create design matrix
    X_df, design_cols = create_design_matrix(df, predictors, vary_var, numeric_predictors=spec['numeric'])
    X_df = X_df.astype(float)
    X = X_df.values
    n, p_design = X.shape
    Y = df[response_cols].values
    k = Y.shape[1]
    
    # Initialize parameters
    np.random.seed(42)
    initial_theta = np.random.normal(0, 0.1, k * p_design)
    
    # Fit Dirichlet regression
    res = minimize(dirichlet_neg_log_likelihood, initial_theta,
                   args=(X, Y, k, p_design, 0),
                   method='L-BFGS-B', options={'maxfun':50000, 'maxiter':30000})
    
    if not res.success:
        print(f"Warning: Dirichlet regression did not converge for spec: {spec}")
    
    beta_hat = res.x.reshape((k, p_design))
    
    # Calculate QOI: effect of varying the variable from 10th to 90th percentile
    idx = design_cols.index(vary_var)
    X_10 = X.copy()
    X_90 = X.copy()
    X_10[:, idx] = pct_10
    X_90[:, idx] = pct_90
    
    A10 = np.exp(np.dot(X_10, beta_hat.T))
    props_10 = A10 / A10.sum(axis=1, keepdims=True)
    avg_props_10 = props_10.mean(axis=0)
    
    A90 = np.exp(np.dot(X_90, beta_hat.T))
    props_90 = A90 / A90.sum(axis=1, keepdims=True)
    avg_props_90 = props_90.mean(axis=0)
    
    qoi_full = avg_props_90 - avg_props_10
    
    # Bootstrap standard errors
    clusters = df[cluster_var].unique()
    num_clusters = len(clusters)
    
    bs_args = (df, predictors, response_cols, cluster_var, initial_theta, k, p_design,
               num_clusters, spec['numeric'], 0, pct_10, pct_90, design_cols, vary_var)
    
    print(f"Running {B} bootstrap iterations...")
    boot_results = Parallel(n_jobs=-1, verbose=5)(
        delayed(bootstrap_with_retry)(bs_args) for _ in range(B)
    )
    
    boot_results = np.array(boot_results)
    se_boot = np.nanstd(boot_results, axis=0, ddof=1)
    
    z = qoi_full / se_boot
    p_vals = 2 * (1 - norm.cdf(np.abs(z)))
    
    results = []
    for j in range(k):
        star = significance_stars(p_vals[j])
        results.append(f"{qoi_full[j]:.3f} ({se_boot[j]:.3f}){star}")
    
    return results

# -------------------------------
# Specifications Dictionary
# -------------------------------
specs = {
    'No Fixed Effects': {
        'numeric': ['log_emp'],
        'categorical': []
    },
    'Year FE': {
        'numeric': ['log_emp'],
        'categorical': ['year']
    },
    'Year + NAICS FE': {
        'numeric': ['log_emp'],
        'categorical': ['year', 'naics_simple']
    },
    'Year FE + Log PPENT + Log COGS': {
        'numeric': ['log_emp', 'log_ppent', 'log_cogs'],
        'categorical': ['year']
    },
    'Year + NAICS FE + Log PPENT + Log COGS': {
        'numeric': ['log_emp', 'log_ppent', 'log_cogs'],
        'categorical': ['year', 'naics_simple']
    }
}

# -------------------------------
# Main Execution
# -------------------------------
def generate_dirichlet_table(data_file='client_year_level_data.csv'):
    """Generate LaTeX table for Dirichlet regression results"""
    # Read data
    print(f"Reading data from {data_file}...")
    client_year_df = pd.read_csv(data_file)
    
    response_cols = ['ratio_label1', 'ratio_label2', 'ratio_label3', 'ratio_label4']
    outcome_labels = ['Support', 'Oppose', 'Amend', 'Monitor']
    
    results = {}
    for spec_name, spec in specs.items():
        print(f"\nFitting Dirichlet regression for: {spec_name}")
        results[spec_name] = compute_dirichlet_qoi(
            spec, client_year_df, 'log_emp', response_cols, 'firm_id', 
            B=5000, standardize=True
        )
    
    results_df = pd.DataFrame(results, index=outcome_labels)
    print("\nDirichlet Regression Results:")
    print(results_df)
    
    # Generate LaTeX table
    latex_header = r"\begin{table*}" + "\n"
    latex_header += r"\centering" + "\n"
    latex_header += r"\caption{Dirichlet Regression Results: Effect of Firm Size on Predicted Probability of Lobbying Position}" + "\n"
    latex_header += r"\resizebox{\textwidth}{!}{%" + "\n"
    latex_header += r"\begin{tabular}{lccccc}" + "\n"
    
    col_headers = [""] + list(specs.keys())
    header_row = " & ".join(col_headers) + r" \\" + "\n"
    
    body = r"\midrule" + "\n"
    for outcome in outcome_labels:
        row = [outcome]
        for spec in specs.keys():
            row.append(results[spec][outcome_labels.index(outcome)])
        body += " & ".join(row) + r" \\" + "\n"
    
    footer = r"\midrule" + "\n"
    footer += r"\multicolumn{6}{l}{\footnotesize Note: Clustered standard errors are in parentheses. *p$<$0.1; **p$<$0.05; ***p$<$0.01.} \\" + "\n"
    footer += r"\end{tabular}%" + "\n"
    footer += r"}" + "\n"
    footer += r"\label{tab:SI_dirichlet_qoi}" + "\n"
    footer += r"\end{table*}"
    
    latex_table = latex_header + header_row + body + footer
    
    # Save LaTeX table to file
    with open("analysis_output/analysis2_table19_appendix.tex", "w") as f:
        f.write(latex_table)
    return latex_table



latex_table = generate_dirichlet_table()
print("\nGenerated LaTeX Table:")
print(latex_table)

Reading data from client_year_level_data.csv...

Fitting Dirichlet regression for: No Fixed Effects
Running 5000 bootstrap iterations...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 128 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 626 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 896 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 1202 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 1544 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done 1922 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 2336 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 2786 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done 3272 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done 3794 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 4352 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:  1.2min finished



Fitting Dirichlet regression for: Year FE
Running 5000 bootstrap iterations...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 128 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 626 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 896 tasks      | elapsed:   51.6s
[Parallel(n_jobs=-1)]: Done 1202 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1544 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1922 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2336 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2786 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 3272 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 3794 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 4352 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:  4.5min finished



Fitting Dirichlet regression for: Year + NAICS FE
Running 5000 bootstrap iterations...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 128 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   55.5s
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 626 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 896 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 1202 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 1544 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 1922 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 2336 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 2786 tasks      | elapsed: 29.8min
[Parallel(n_jobs=-1)]: Done 3272 tasks      | elapsed: 34.9min
[Parallel(n_jobs=-1)]: Done 3794 tasks      | elapsed: 40.3min
[Parallel(n_jobs=-1)]: Done 4352 tasks      | elapsed: 45.9min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed: 52.5min finished



Fitting Dirichlet regression for: Year FE + Log PPENT + Log COGS
Running 5000 bootstrap iterations...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 128 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 626 tasks      | elapsed:   52.6s
[Parallel(n_jobs=-1)]: Done 896 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1202 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1544 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1922 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2336 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 2786 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 3272 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 3794 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 4352 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:  6.4min finished



Fitting Dirichlet regression for: Year + NAICS FE + Log PPENT + Log COGS
Running 5000 bootstrap iterations...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 128 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 626 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 896 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 1202 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 1544 tasks      | elapsed: 20.3min
[Parallel(n_jobs=-1)]: Done 1922 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 2336 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 2786 tasks      | elapsed: 35.7min
[Parallel(n_jobs=-1)]: Done 3272 tasks      | elapsed: 41.9min
[Parallel(n_jobs=-1)]: Done 3794 tasks      | elapsed: 48.4min
[Parallel(n_jobs=-1)]: Done 4352 tasks      | elapsed: 55.5min



Dirichlet Regression Results:
          No Fixed Effects            Year FE    Year + NAICS FE  \
Support     -0.014 (0.011)     -0.015 (0.012)    -0.024 (0.013)*   
Oppose   -0.017 (0.005)***  -0.018 (0.005)***  -0.024 (0.007)***   
Amend      -0.014 (0.007)*    -0.013 (0.007)*     -0.011 (0.009)   
Monitor    0.045 (0.020)**    0.045 (0.020)**    0.060 (0.024)**   

        Year FE + Log PPENT + Log COGS Year + NAICS FE + Log PPENT + Log COGS  
Support                 -0.027 (0.020)                        -0.043 (0.023)*  
Oppose                  -0.009 (0.009)                        -0.020 (0.011)*  
Amend                -0.042 (0.014)***                         -0.014 (0.016)  
Monitor                0.078 (0.033)**                         0.077 (0.042)*  

Generated LaTeX Table:
\begin{table*}
\centering
\caption{Dirichlet Regression Results: Effect of Firm Size on Predicted Probability of Lobbying Position}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lccccc}
 & No Fixed Effects 

[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed: 63.3min finished


## Table 20. Logistic Regression Results: Effect of Firm Size on Predicted Probability of Lobbying (Appendix)

In [9]:
def create_design_matrix(df, predictors, vary_var, numeric_predictors=None):

    if numeric_predictors is None:
        numeric_predictors = []
    
    categorical_predictors = [p for p in predictors if p not in numeric_predictors]
    
    dummies_list = []
    dummy_names = []
    X_numeric = df[numeric_predictors].copy()
    
    if categorical_predictors:
        for cat_var in categorical_predictors:
            dummies = pd.get_dummies(df[cat_var], prefix=cat_var, drop_first=True)
            dummies_list.append(dummies)
            dummy_names.extend(dummies.columns.tolist())
    
    if dummies_list:
        X_categorical = pd.concat(dummies_list, axis=1)
        X_df = pd.concat([X_numeric, X_categorical], axis=1)
        all_columns = numeric_predictors + dummy_names
    else:
        X_df = X_numeric
        all_columns = numeric_predictors
    
    X_df = sm.add_constant(X_df)
    all_columns = ['const'] + all_columns
    
    return X_df, all_columns

def significance_stars(p):
    """Return significance stars based on p-value."""
    if p < 0.01:
        return '***'
    elif p < 0.05:
        return '**'
    elif p < 0.1:
        return '*'
    else:
        return ''

def compute_logistic_qoi_delta(spec, data, vary_var, logistic_response, cluster_var, standardize=False):
    
    predictors = spec['numeric'] + spec['categorical']
    numeric_predictors = spec['numeric'].copy()
    if vary_var not in numeric_predictors:
        numeric_predictors.append(vary_var)
    
    cols = [cluster_var] + predictors + [logistic_response]
    df_logit = data[cols].dropna().copy()
    
    if standardize:
        for col in numeric_predictors:
            mean_val = df_logit[col].mean()
            std_val = df_logit[col].std()
            if std_val > 0:
                df_logit[col] = (df_logit[col] - mean_val) / std_val
            print(f"Standardized {col} in logistic subset: mean={mean_val:.4f}, std={std_val:.4f}")
    
    X_df, design_cols = create_design_matrix(df_logit, predictors, vary_var, numeric_predictors=numeric_predictors)
    X_df = X_df.astype(float)
    X = X_df.values
    
    print(f"Design matrix dimensions: {X.shape[0]} rows x {X.shape[1]} columns")
    
    # Compute 10th and 90th percentiles for the key predictor
    p10 = np.percentile(df_logit[vary_var].dropna(), 10)
    p90 = np.percentile(df_logit[vary_var].dropna(), 90)
    print(f"10th percentile of {vary_var}: {p10:.4f}")
    print(f"90th percentile of {vary_var}: {p90:.4f}")
    
    # Fit logistic regression using clustered standard errors
    model = sm.Logit(df_logit[logistic_response], X_df)
    res = model.fit(cov_type='cluster', cov_kwds={'groups': df_logit[cluster_var]}, disp=0)
    
    idx = design_cols.index(vary_var)
    X_p10 = X.copy()
    X_p90 = X.copy()
    X_p10[:, idx] = p10
    X_p90[:, idx] = p90
    preds_p10 = res.predict(X_p10)
    preds_p90 = res.predict(X_p90)
    avg_p10 = preds_p10.mean()
    avg_p90 = preds_p90.mean()
    qoi = avg_p90 - avg_p10
    
    grads_p10 = []
    for i in range(X_p10.shape[0]):
        x_i = X_p10[i, :]
        p_i = preds_p10[i]
        grad_i = p_i * (1 - p_i) * x_i
        grads_p10.append(grad_i)
    
    grads_p90 = []
    for i in range(X_p90.shape[0]):
        x_i = X_p90[i, :]
        p_i = preds_p90[i]
        grad_i = p_i * (1 - p_i) * x_i
        grads_p90.append(grad_i)
    
    avg_grad_p10 = np.mean(grads_p10, axis=0)
    avg_grad_p90 = np.mean(grads_p90, axis=0)
    delta_grad = avg_grad_p90 - avg_grad_p10
    
    cov_beta = res.cov_params().values
    var_delta = np.dot(delta_grad, np.dot(cov_beta, delta_grad))
    se_delta = np.sqrt(var_delta)
    
    z = qoi / se_delta
    p_val = 2 * (1 - norm.cdf(np.abs(z)))
    
    return qoi, se_delta, p_val

def results_to_latex(results_df, footnote, caption="Logistic Regression Results: Effect of Firm Size on Predicted Probability of Lobbying", label="tab:logistic_qoi"):
    """
    Convert results DataFrame to LaTeX table format
    """
    latex_code = "\\begin{table*}\n"
    latex_code += "\\centering\n"
    latex_code += f"\\caption{{{caption}}}\n"
    latex_code += "\\resizebox{\\textwidth}{!}{%\n"
    latex_code += "\\begin{tabular}{lccccc}\n"
    header = " & " + " & ".join(results_df.columns) + " \\\\\n"
    midrule = "\\midrule\n"
    
    rows = []
    for idx, row in results_df.iterrows():
        row_str = idx + " & " + " & ".join(row.values) + " \\\\\n"
        rows.append(row_str)
    footer_midrule = "\\midrule\n"
    footer = f"\\multicolumn{{{len(results_df.columns) + 1}}}{{l}}{{\\footnotesize {footnote}}} \\\\\\\\\n"
    
    latex_code += header + midrule + "".join(rows) + footer_midrule + footer
    
    latex_code += "\\end{tabular}%\n"
    latex_code += "}\n"
    latex_code += f"\\label{{{label}}}\n"
    latex_code += "\\end{table*}"
    
    return latex_code


In [10]:
client_year_df = open_csv('analysis_input/analysis2_firm_size_bill_position_df.csv')

specs = {
    'No Fixed Effects': {
        'numeric': ['log_emp'],
        'categorical': []
    },
    'Year FE': {
        'numeric': ['log_emp'],
        'categorical': ['year']
    },
    'Year + NAICS FE': {
        'numeric': ['log_emp'],
        'categorical': ['year', 'naics_simple']
    },
    'Year FE + Log PPENT + Log COGS': {
        'numeric': ['log_emp', 'log_ppent', 'log_cogs'],
        'categorical': ['year']
    },
    'Year + NAICS FE + Log PPENT + Log COGS': {
        'numeric': ['log_emp', 'log_ppent', 'log_cogs'],
        'categorical': ['year', 'naics_simple']
    }
}

# Compute results for each specification
results = {}
for spec_name, spec in specs.items():
    print(f"\nFitting logistic regression for specification: {spec_name}")
    qoi, se, p_val = compute_logistic_qoi_delta(
        spec, data=client_year_df, vary_var='log_emp',
        logistic_response='lobbied', cluster_var='firm_id', standardize=True
    )
    
    star = significance_stars(p_val)
    formatted_result = f"{qoi:.3f} ({se:.3f}){star}"
    results[spec_name] = formatted_result

results_df = pd.DataFrame(results, index=['Lobby'])
footnote = "Note: Clustered standard errors are in parentheses. *p<0.1; **p<0.05; ***p<0.01."

# Generate LaTeX table and save to file
latex_table = results_to_latex(results_df, footnote)
print("\nLaTeX code for Appendix Table 20:")
print(latex_table)

output_file = "analysis_output/analysis2_table20_appendix.tex"
with open(output_file, "w") as f:
    f.write(latex_table)
print(f"\nLaTeX table saved to {output_file}")

  df = fun(x) - f0



Fitting logistic regression for specification: No Fixed Effects
Standardized log_emp in logistic subset: mean=6.3576, std=2.7409
Design matrix dimensions: 90905 rows x 2 columns
10th percentile of log_emp: -1.3567
90th percentile of log_emp: 1.3115


  df = fun(x) - f0
  df = fun(x) - f0



Fitting logistic regression for specification: Year FE
Standardized log_emp in logistic subset: mean=6.3576, std=2.7409
Design matrix dimensions: 90905 rows x 15 columns
10th percentile of log_emp: -1.3567
90th percentile of log_emp: 1.3115


  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0



Fitting logistic regression for specification: Year + NAICS FE
Standardized log_emp in logistic subset: mean=6.3595, std=2.7395
Design matrix dimensions: 90872 rows x 37 columns
10th percentile of log_emp: -1.3581
90th percentile of log_emp: 1.3115


  df = fun(x) - f0



Fitting logistic regression for specification: Year FE + Log PPENT + Log COGS
Standardized log_emp in logistic subset: mean=6.3796, std=2.7527
Standardized log_ppent in logistic subset: mean=10.2608, std=3.9305
Standardized log_cogs in logistic subset: mean=10.9274, std=4.1649
Design matrix dimensions: 89052 rows x 17 columns
10th percentile of log_emp: -1.3589
90th percentile of log_emp: 1.3050


  df = fun(x) - f0



Fitting logistic regression for specification: Year + NAICS FE + Log PPENT + Log COGS
Standardized log_emp in logistic subset: mean=6.3815, std=2.7513
Standardized log_ppent in logistic subset: mean=10.2624, std=3.9295
Standardized log_cogs in logistic subset: mean=10.9307, std=4.1616
Design matrix dimensions: 89020 rows x 39 columns
10th percentile of log_emp: -1.3603
90th percentile of log_emp: 1.3056


  df = fun(x) - f0



LaTeX code for Appendix Table 20:
\begin{table*}
\centering
\caption{Logistic Regression Results: Effect of Firm Size on Predicted Probability of Lobbying}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lccccc}
 & No Fixed Effects & Year FE & Year + NAICS FE & Year FE + Log PPENT + Log COGS & Year + NAICS FE + Log PPENT + Log COGS \\
\midrule
Lobby & 0.193 (0.008)*** & 0.195 (0.008)*** & 0.205 (0.009)*** & 0.053 (0.012)*** & 0.055 (0.015)*** \\
\midrule
\multicolumn{6}{l}{\footnotesize Note: Clustered standard errors are in parentheses. *p<0.1; **p<0.05; ***p<0.01.} \\\\
\end{tabular}%
}
\label{tab:logistic_qoi}
\end{table*}

LaTeX table saved to analysis_output/analysis2_table20_appendix.tex


  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
  df = fun(x) - f0
