In [32]:
import numpy as np
import pandas as pd

# import matplotlib.pyplot as plt
# import scipy.stats as stats

# Set random seed for reproducibility
np.random.seed(42)

# Generate data for 10 individuals
n = 1000

# Simulate market income (y)
income = np.random.lognormal(mean=10, sigma=0.8, size=n)

# Simulate household size 
household_size = np.random.randint(1, 6, size=n)

# Simulate number of dependents (a)
dependents = np.random.binomial(household_size - 1, 0.5, size=n)
dependents = np.clip(dependents, 0, household_size - 1)

# Households with size 1 have 0 dependents
dependents[household_size == 1] = 0

# Calculate working adults (A)
working = household_size - dependents

# Simulate weights
weights = np.random.uniform(0.5, 1.5, size=n)

# Simulate taxes (T)
tau = 0.6
theta = 0.8
ell = 100
sigma = 0.5
net_income = (
    pow(household_size, theta) / working * (
        ell * pow(income, 1 - tau) * (
            np.exp(np.random.normal(scale = sigma, size = n))
            )
        )
)

# Create DataFrame
df = pd.DataFrame({
    'income': income,  # Market income
    'net_income': net_income,  # Net income after taxes
    'working': working,  # Household size
    'household_size': household_size,  # Non-dependents in household
    'weights': weights,  # Survey weights
})


# Filter the data
df = df[(df['income'] > 0) & (df['net_income'] > 0)] 


# Display the data
df

Unnamed: 0,income,net_income,working,household_size,weights
0,32773.361742,5828.091824,4,4,1.041990
1,19719.998637,2817.568018,1,1,1.439652
2,36980.677685,3270.696039,3,3,0.720484
3,74490.276951,7152.389958,4,5,1.225726
4,18263.819407,4230.106960,3,3,1.117249
...,...,...,...,...,...
995,17590.596803,5334.819696,2,3,0.751080
996,92795.109820,3542.866326,4,4,1.150425
997,36778.705030,4846.601972,1,1,0.712334
998,13947.517413,9672.465028,2,3,0.622007


In [5]:
help(np.exp)

Help on ufunc in module numpy:

exp = <ufunc 'exp'>
    exp(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True[, signature])

    Calculate the exponential of all elements in the input array.

    Parameters
    ----------
    x : array_like
        Input values.
    out : ndarray, None, or tuple of ndarray and None, optional
        A location into which the result is stored. If provided, it must have
        a shape that the inputs broadcast to. If not provided or None,
        a freshly-allocated array is returned. A tuple (possible only as a
        keyword argument) must have length equal to the number of outputs.
    where : array_like, optional
        This condition is broadcast over the input. At locations where the
        condition is True, the `out` array will be set to the ufunc result.
        Elsewhere, the `out` array will retain its original value.
        Note that if an uninitialized `out` array is created via the default
        ``

## OLS

In [33]:
import statsmodels.api as sm

X = np.log(df[['income', 'household_size']])
X = sm.add_constant(X)
# y = np.log(df['net_income'] / df['working'])
y = np.log(df['net_income']) + np.log(df['working'])

model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.569
Model:,OLS,Adj. R-squared:,0.568
Method:,Least Squares,F-statistic:,658.8
Date:,"Mon, 17 Mar 2025",Prob (F-statistic):,4.5200000000000005e-183
Time:,16:41:28,Log-Likelihood:,-713.07
No. Observations:,1000,AIC:,1432.0
Df Residuals:,997,BIC:,1447.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.4717,0.203,22.022,0.000,4.073,4.870
income,0.4099,0.020,20.524,0.000,0.371,0.449
household_size,0.8379,0.028,30.409,0.000,0.784,0.892

0,1,2,3
Omnibus:,0.013,Durbin-Watson:,2.049
Prob(Omnibus):,0.993,Jarque-Bera (JB):,0.036
Skew:,-0.008,Prob(JB):,0.982
Kurtosis:,2.975,Cond. No.,132.0


In [None]:
# Residual estimate
pow(np.var(model.resid), 1/2)

0.49367958452886423

## Maximum Likelihood

In [57]:
# Split dataframe

# // is the floor division operator
half_index = df.shape[0] // 2

df_2 = pd.concat([
    # .iloc[:x] selects all rows from the beginning up to, but not including, the row at position x
        df.iloc[:half_index], 
        df.iloc[half_index:].reset_index(drop = True)
    ], axis=1)

df_2

Unnamed: 0,income,net_income,working,household_size,weights,income.1,net_income.1,working.1,household_size.1,weights.1
0,32773.361742,5828.091824,4,4,1.041990,46209.562586,4311.830482,1,1,1.389884
1,19719.998637,2817.568018,1,1,1.439652,101471.506344,6126.206407,4,5,1.317856
2,36980.677685,3270.696039,3,3,0.720484,7195.031094,3758.679432,3,4,1.010148
3,74490.276951,7152.389958,4,5,1.225726,34557.344734,7040.802305,2,3,1.236240
4,18263.819407,4230.106960,3,3,1.117249,13088.456594,2185.187297,3,3,1.012897
...,...,...,...,...,...,...,...,...,...,...
495,33898.567470,17719.624860,1,2,0.623233,17590.596803,5334.819696,2,3,0.751080
496,9606.575362,4721.649505,1,1,1.198531,92795.109820,3542.866326,4,4,1.150425
497,18915.350318,8504.885876,3,5,0.916145,36778.705030,4846.601972,1,1,0.712334
498,10932.610572,6038.047093,1,1,1.013851,13947.517413,9672.465028,2,3,0.622007


In [58]:
column_names = df.columns

new_column_names = []
for k in column_names:
    new_name = str(k) + "_i"
    new_column_names.append(new_name)
for k in column_names:
    new_name = str(k) + "_j"
    new_column_names.append(new_name)

df_2.columns = new_column_names

df_2

Unnamed: 0,income_i,net_income_i,working_i,household_size_i,weights_i,income_j,net_income_j,working_j,household_size_j,weights_j
0,32773.361742,5828.091824,4,4,1.041990,46209.562586,4311.830482,1,1,1.389884
1,19719.998637,2817.568018,1,1,1.439652,101471.506344,6126.206407,4,5,1.317856
2,36980.677685,3270.696039,3,3,0.720484,7195.031094,3758.679432,3,4,1.010148
3,74490.276951,7152.389958,4,5,1.225726,34557.344734,7040.802305,2,3,1.236240
4,18263.819407,4230.106960,3,3,1.117249,13088.456594,2185.187297,3,3,1.012897
...,...,...,...,...,...,...,...,...,...,...
495,33898.567470,17719.624860,1,2,0.623233,17590.596803,5334.819696,2,3,0.751080
496,9606.575362,4721.649505,1,1,1.198531,92795.109820,3542.866326,4,4,1.150425
497,18915.350318,8504.885876,3,5,0.916145,36778.705030,4846.601972,1,1,0.712334
498,10932.610572,6038.047093,1,1,1.013851,13947.517413,9672.465028,2,3,0.622007


In [59]:
# Add an indicator variable

df_2 = df_2.assign(
    atr_i = df_2['net_income_i'] / df_2['income_i'],
    atr_j = df_2['net_income_j'] / df_2['income_j'],
)

df_2 = df_2.assign(
    rank_binary = np.where(df_2['atr_i'] > df_2['atr_j'], 1, 0)
)

# df_2["rank_binary"] = np.where(
#     (df_2['net_income_i'] / df_2['income_i'] 
#      > df_2['net_income_j'] / df_2['income_j']), 
#      1, 0
# )

df_2

Unnamed: 0,income_i,net_income_i,working_i,household_size_i,weights_i,income_j,net_income_j,working_j,household_size_j,weights_j,atr_i,atr_j,rank_binary
0,32773.361742,5828.091824,4,4,1.041990,46209.562586,4311.830482,1,1,1.389884,0.177830,0.093310,1
1,19719.998637,2817.568018,1,1,1.439652,101471.506344,6126.206407,4,5,1.317856,0.142879,0.060374,1
2,36980.677685,3270.696039,3,3,0.720484,7195.031094,3758.679432,3,4,1.010148,0.088443,0.522399,0
3,74490.276951,7152.389958,4,5,1.225726,34557.344734,7040.802305,2,3,1.236240,0.096018,0.203743,0
4,18263.819407,4230.106960,3,3,1.117249,13088.456594,2185.187297,3,3,1.012897,0.231611,0.166955,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,33898.567470,17719.624860,1,2,0.623233,17590.596803,5334.819696,2,3,0.751080,0.522725,0.303277,1
496,9606.575362,4721.649505,1,1,1.198531,92795.109820,3542.866326,4,4,1.150425,0.491502,0.038179,1
497,18915.350318,8504.885876,3,5,0.916145,36778.705030,4846.601972,1,1,0.712334,0.449629,0.131777,1
498,10932.610572,6038.047093,1,1,1.013851,13947.517413,9672.465028,2,3,0.622007,0.552297,0.693490,0


In [60]:
from scipy.stats import norm
import scipy.optimize as optim



# Specify a likelihood function
def neg_log_likelihood(params, data):
      tau, theta, sigma = params
      
      # Specify random variable
      randvar = (
            tau * np.log(data['net_income_i'] / data['net_income_j'])
            + np.log(data['working_i'] / data['working_j'])
            - theta * (
                  np.log(data['household_size_i'] / data['household_size_j'])
            )
      )
      # Specify CDF
      cdf = norm.cdf(randvar, scale = sigma)
      # Specify log-likelihood function
      log_likelihood = np.sum(
        data['rank_binary'] * np.log(cdf) 
        + (1 - data['rank_binary']) * np.log(1 - cdf)
      )
      return -log_likelihood

initial_params = [0.6, 0.8, 0.5]

model = optim.minimize(neg_log_likelihood, initial_params, args = (df_2,))

model.x

array([ 3562.81609163, -1533.52468076,  8098.91460477])

In [50]:
help(norm.cdf)

Help on method cdf in module scipy.stats._distn_infrastructure:

cdf(x, *args, **kwds) method of scipy.stats._continuous_distns.norm_gen instance
    Cumulative distribution function of the given RV.

    Parameters
    ----------
    x : array_like
        quantiles
    arg1, arg2, arg3,... : array_like
        The shape parameter(s) for the distribution (see docstring of the
        instance object for more information)
    loc : array_like, optional
        location parameter (default=0)
    scale : array_like, optional
        scale parameter (default=1)

    Returns
    -------
    cdf : ndarray
        Cumulative distribution function evaluated at `x`

