biased sampling scheme of:
- expit(logit(sample_probability_centering) + \
                               sample_probability_bias_factor * (y / std_y))

# Imports:

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import chi2

from tqdm import tqdm, trange
import random

import matplotlib.pyplot as plt
import pickle
import statsmodels.api as sm
from scipy.special import expit, logit

# Hyperparameters:

In [2]:
pop_size = 100 * 1_000
sample_size = 30
agg_iters = 100_000

beta = 1

## for biased sampling mechanism:
sample_probability_centering = 0.8
sample_probability_bias_factor = 1

In [3]:
## for rng generator:
pop_number = 1

# Run:

In [4]:
rand_generator = np.random.default_rng(seed=333 * pop_number)

In [5]:
assert pop_size % 100 == 0
base_x = [i for i in range(-50, 0)] + [i for i in range(1, 51)]
x = pd.Series(base_x * (pop_size // 100))
normal_stdev = 30

In [6]:
fitted_betas = []
fitted_FI = []
fitted_correlations = []
realized_sample_sizes = []

Create the simulated data (x, y):

In [7]:
y = x * beta + rand_generator.normal(size=pop_size) * normal_stdev

In [8]:
std_y = np.sqrt(np.var(x) + normal_stdev ** 2)

In [9]:
np.std(y)

np.float64(42.04298479595959)

In [10]:
data = pd.concat([y, x], axis=1)

data.columns = ['y', 'x']

fitted_beta = sm.OLS(data['y'], data['x']).fit().params.iloc[0]

sm.OLS(data['y'], data['x']).fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.486
Model:,OLS,Adj. R-squared (uncentered):,0.486
Method:,Least Squares,F-statistic:,94600.0
Date:,"Thu, 28 Aug 2025",Prob (F-statistic):,0.0
Time:,10:35:22,Log-Likelihood:,-482470.0
No. Observations:,100000,AIC:,964900.0
Df Residuals:,99999,BIC:,965000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x,1.0005,0.003,307.572,0.000,0.994,1.007

0,1,2,3
Omnibus:,6.615,Durbin-Watson:,2.0
Prob(Omnibus):,0.037,Jarque-Bera (JB):,6.474
Skew:,0.008,Prob(JB):,0.0393
Kurtosis:,2.964,Cond. No.,1.0


In [11]:
data['g_star'] = data['x'] * (data['y'] - data['x'] * fitted_beta)

In [12]:
marginal_probabilities = expit(logit(sample_probability_centering) + \
                               sample_probability_bias_factor * (y / std_y))

In [13]:
np.mean(marginal_probabilities)

np.float64(0.7603001094704808)

# run:

In [14]:
for _ in tqdm(range(agg_iters)):
    srs_sample_indices = pd.Series(random.sample(range(pop_size), sample_size))
    srs_sample_indices = srs_sample_indices.sort_values().reset_index(drop=True)

    # reset the sample
    data['r0'] = 0
    data['r'] = 0
    data['r1'] = 0
    data['r2'] = 0
    
    data.loc[srs_sample_indices, 'r0'] = 1
    full_sampled_data = data[data['r0'] == 1]

    biased_sample_indices = marginal_probabilities.index[rand_generator.binomial(n=1, p = marginal_probabilities) == 1]

    data.loc[biased_sample_indices, 'r'] = 1

    biased_sample_indices_size = len(biased_sample_indices)
    realized_sample_sizes.append(biased_sample_indices_size)


    srs_sample = data[data['r0'] == 1].copy()
    biased_sample = data[data['r'] == 1].copy()

    srs_fitted_beta = sm.OLS(srs_sample['y'], srs_sample['x']).fit().params.iloc[0]
    biased_fitted_beta = sm.OLS(biased_sample['y'], biased_sample['x']).fit().params.iloc[0]

    fitted_betas.append(pd.Series([fitted_beta, srs_fitted_beta, biased_fitted_beta]))

    fitted_FI.append(pd.Series([np.mean(srs_sample['x'] ** 2), np.mean(biased_sample['x'] ** 2)]))

    fitted_correlations.append(data[['g_star', 'r0', 'r']].corr().iloc[0, 1:])

100%|██████████████████████████████████████████████████████████████████████████| 100000/100000 [24:55<00:00, 66.88it/s]


Clean up the data:

In [15]:
fitted_betas = pd.concat(fitted_betas, axis=1).T
fitted_FI = pd.concat(fitted_FI, axis=1).T
fitted_correlations = pd.concat(fitted_correlations, axis=1).T.reset_index(drop=True)

In [16]:
fitted_FI.columns = ['srs', 'biased']
fitted_betas.columns = ['population', 'srs', 'biased']
fitted_correlations.columns = ['srs', 'biased']

In [17]:
realized_sample_sizes = pd.Series(realized_sample_sizes)

In [18]:
fitted_FI.head()

Unnamed: 0,srs,biased
0,688.566667,845.643528
1,931.466667,843.379366
2,972.1,843.463103
3,830.6,841.92556
4,841.733333,843.166152


In [19]:
fitted_FI.tail()

Unnamed: 0,srs,biased
99995,708.033333,845.905249
99996,979.333333,845.083118
99997,764.9,843.619438
99998,972.766667,844.446755
99999,1149.3,845.026032


# Save:

In [20]:
fitted_FI_and_corr = fitted_FI.join(fitted_correlations, lsuffix = '_fi', rsuffix = '_corr')

fitted_FI_and_corr.head()

Unnamed: 0,srs_fi,biased_fi,srs_corr,biased_corr
0,688.566667,845.643528,0.001519,-0.079726
1,931.466667,843.379366,-0.001842,-0.076616
2,972.1,843.463103,0.003345,-0.078689
3,830.6,841.92556,-0.000359,-0.081295
4,841.733333,843.166152,-0.001097,-0.076856


In [21]:
Jeee = np.mean(data['x'] ** 2)

fitted_FI_and_corr['srs_samp_elasticity'] = fitted_FI_and_corr.eval(f'srs_fi / {Jeee}')
fitted_FI_and_corr['biased_samp_elasticity'] = fitted_FI_and_corr.eval(f'biased_fi / {Jeee}')

In [22]:
fitted_FI_and_corr.to_pickle('sample_by_y_bias.pkl')