In [2]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS

def simulate_nested_logit_varying_nest_sizes(
    I=100,           # Number of markets (or people)
    beta=2.0,        # True slope parameter
    rho=0.5,         # Nest correlation parameter in (0,1)
    sigma_x=1.0,     # Std dev of x_{j i}
    sigma_xi=1.0,    # Std dev of xi_{j i}
    seed=42
):
    """
    Simulate nested-logit market shares with 10 nests of unequal size.

    Nest g (g = 1..10) holds exactly g products, so there are
    1 + 2 + ... + 10 = 55 inside goods plus one outside good.

    For each market i:
      1. Draw x_{j i} ~ N(0, sigma_x) and xi_{j i} ~ N(0, sigma_xi).
      2. Set delta_{j i} = beta*x_{j i} + xi_{j i}.
      3. Compute nested-logit shares with D_g = sum_{k in g} exp(delta_k/rho):
           s_{j|g} = exp(delta_j/rho) / D_g
           s_g     = D_g**rho / (1 + sum_h D_h**rho)
           s_0     = 1 / (1 + sum_h D_h**rho)
           s_j     = s_g * s_{j|g}

    The shares are evaluated in max-shifted (log-sum-exp) form, so a very
    small or very large delta/rho neither underflows a nest sum to zero nor
    overflows it to inf. Within-nest shares therefore always sum to one.

    Calling this function reseeds NumPy's global random generator with `seed`.

    Returns a DataFrame with one row per (market, product), ordered by market
    and then product, with columns:
      ['market', 'product', 'nest', 'nest_size', 'x_ji', 'delta_ji',
       's_ji', 's_gi', 's_0i'].
    If I <= 0 the frame has these columns and no rows.

    Raises ValueError if rho is not strictly positive.
    """
    if not rho > 0:
        raise ValueError(f"rho must be strictly positive, got {rho!r}")

    columns = ['market', 'product', 'nest', 'nest_size', 'x_ji', 'delta_ji',
               's_ji', 's_gi', 's_0i']

    np.random.seed(seed)

    # 10 nests with sizes 1..10 => total of 55 inside products
    nest_sizes = np.arange(1, 11)  # [1, 2, ..., 10]
    G = len(nest_sizes)
    J = int(nest_sizes.sum())      # 1+2+...+10 = 55

    if I <= 0:
        return pd.DataFrame(columns=columns)

    # Products are numbered consecutively nest by nest, so each nest is a
    # contiguous run of product ids. This layout does not depend on the
    # market, so it is computed once.
    nest_index = np.repeat(np.arange(G), nest_sizes)                # 0-based nest of product j
    nest_start = np.concatenate(([0], np.cumsum(nest_sizes)[:-1]))  # first product id of each nest

    # 1. Draw x_{j i} and xi_{j i}. The draws stay in a per-market loop
    #    (x first, then xi) so that a given seed yields the same x_ji and
    #    delta_ji as drawing market by market.
    x = np.empty((I, J))
    xi = np.empty((I, J))
    for i in range(I):
        x[i] = np.random.normal(0.0, sigma_x, size=J)
        xi[i] = np.random.normal(0.0, sigma_xi, size=J)
    delta = beta * x + xi

    # 2. Within-nest shares, shifted by each nest's maximum so the largest
    #    term is exp(0) = 1 and every nest sum is >= 1.
    scaled = delta / rho                                            # (I, J)
    nest_max = np.maximum.reduceat(scaled, nest_start, axis=1)      # (I, G)
    shifted_exp = np.exp(scaled - nest_max[:, nest_index])          # (I, J), values in (0, 1]
    shifted_sum = np.add.reduceat(shifted_exp, nest_start, axis=1)  # (I, G)
    within_share = shifted_exp / shifted_sum[:, nest_index]         # s_{j|g}

    # 3. Inclusive values: log(D_g**rho) = rho * (max_g + log(shifted sum)).
    inclusive = rho * (nest_max + np.log(shifted_sum))              # (I, G)

    # 4. Nest and outside shares. The outside good has inclusive value 0;
    #    shifting by the largest of all of them keeps every exp() <= 1.
    shift = np.maximum(inclusive.max(axis=1), 0.0)                  # (I,)
    nest_weight = np.exp(inclusive - shift[:, None])                # (I, G)
    outside_weight = np.exp(-shift)                                 # (I,)
    denom = outside_weight + nest_weight.sum(axis=1)                # (I,)
    s_0 = outside_weight / denom
    s_g = nest_weight / denom[:, None]

    # 5. Product-level shares s_j = s_g * s_{j|g}
    s_g_of_product = s_g[:, nest_index]                             # (I, J)
    s_j = s_g_of_product * within_share

    # 6. Long format: market-major, product-minor row order.
    df = pd.DataFrame({
        'market'    : np.repeat(np.arange(I), J),
        'product'   : np.tile(np.arange(J), I),
        'nest'      : np.tile(nest_index + 1, I),       # nest label in {1..10}
        'nest_size' : np.tile(nest_sizes[nest_index], I),
        'x_ji'      : x.ravel(),
        'delta_ji'  : delta.ravel(),
        's_ji'      : s_j.ravel(),
        's_gi'      : s_g_of_product.ravel(),
        's_0i'      : np.repeat(s_0, J),
    })
    return df[columns]


def run_nested_logit_regressions(df):
    """
    Estimate the nested-logit share regression by OLS and by 2SLS.

    Rows with s_ji <= 0 or s_gi <= 0 are dropped before taking logs. On a
    copy of the remaining rows two columns are added:
      y         = log(s_ji) - log(s_0i)
      log_s_j_g = log(s_ji) - log(s_gi)   (treated as endogenous in the 2SLS)

    Models fitted:
      OLS :  y ~ log_s_j_g + x_ji
      2SLS:  y ~ 1 + x_ji + [log_s_j_g ~ nest_size], robust covariance

    The input frame itself is not modified.
    Returns the tuple (fitted OLS results, fitted IV results).
    """
    keep = (df['s_ji'] > 0) & (df['s_gi'] > 0)
    sample = df.loc[keep].copy()

    # log(s_ji) appears in both constructed variables, so compute it once.
    log_product_share = np.log(sample['s_ji'])
    sample['y'] = log_product_share - np.log(sample['s_0i'])
    sample['log_s_j_g'] = log_product_share - np.log(sample['s_gi'])

    # OLS treats the within-nest log share as an ordinary regressor.
    ols_fit = smf.ols('y ~ log_s_j_g + x_ji', data=sample).fit()

    # 2SLS: the bracketed term names log_s_j_g as the endogenous regressor
    # and nest_size as its instrument.
    iv_spec = IV2SLS.from_formula(
        'y ~ 1 + x_ji + [log_s_j_g ~ nest_size]',
        data=sample
    )
    iv_fit = iv_spec.fit(cov_type='robust')

    return ols_fit, iv_fit


if __name__ == "__main__":
    # Simulation settings for this run, collected in one place.
    sim_settings = dict(
        I=100,
        beta=2.0,
        rho=0.5,
        sigma_x=1.0,
        sigma_xi=1.0,
        seed=42,
    )

    # Step 1: generate the simulated market-share data.
    df_data = simulate_nested_logit_varying_nest_sizes(**sim_settings)

    # Step 2: fit the OLS and 2SLS specifications on it.
    ols_res, iv_res = run_nested_logit_regressions(df_data)

    # Step 3: emit both sets of results as LaTeX.
    print("\n--- OLS results (LaTeX) ---")
    ols_latex = ols_res.summary().as_latex()
    print(ols_latex)

    print("\n--- 2SLS/IV results (LaTeX) ---")
    # The linearmodels summary is made up of several tables; each one is
    # printed as its own LaTeX tabular, followed by a blank line.
    for tab in iv_res.summary.tables:
        print(tab.as_latex_tabular(), end="\n\n")



--- OLS results (LaTeX) ---
\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}    &        y         & \textbf{  R-squared:         } &     0.960   \\
\textbf{Model:}            &       OLS        & \textbf{  Adj. R-squared:    } &     0.960   \\
\textbf{Method:}           &  Least Squares   & \textbf{  F-statistic:       } & 6.640e+04   \\
\textbf{Date:}             & Wed, 29 Jan 2025 & \textbf{  Prob (F-statistic):} &     0.00    \\
\textbf{Time:}             &     15:06:56     & \textbf{  Log-Likelihood:    } &   -7056.5   \\
\textbf{No. Observations:} &        5500      & \textbf{  AIC:               } & 1.412e+04   \\
\textbf{Df Residuals:}     &        5497      & \textbf{  BIC:               } & 1.414e+04   \\
\textbf{Df Model:}         &           2      & \textbf{                     } &             \\
\textbf{Covariance Type:}  &    nonrobust     & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{lcccccc}
          