Biased sampling scheme: each unit is kept with probability

`expit(logit(sample_probability_centering) + 0.1 * (data['x'].abs() - 25.5))`

# Imports:

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import chi2

from tqdm import tqdm, trange
import random

In [2]:
import matplotlib.pyplot as plt
import pickle
import statsmodels.api as sm
from scipy.special import expit, logit

# Hyperparameters:

In [6]:
# Population and simulation sizes.
pop_size = 100_000   # units in the simulated population
sample_size = 30     # units drawn in each simple random sample
agg_iters = 1_000    # number of repeated sampling iterations

# Slope used to generate y from x.
beta = 1

# Parameters of the biased sampling mechanism.
sample_probability_centering = 0.8
sample_probability_bias_factor = 0.1

In [7]:
## for rng generator: the seed passed to the generator is a multiple of this value.
pop_number = 1

# Run:

In [8]:
# Seeded NumPy generator; the seed is derived from pop_number.
rand_generator = np.random.default_rng(333 * pop_number)

In [9]:
# The population tiles a 100-point grid, so its size must be a multiple of 100.
assert pop_size % 100 == 0
# Integer grid from -50 to 50 with zero excluded.
base_x = [*range(-50, 0), *range(1, 51)]
x = pd.Series(base_x * (pop_size // 100))
# Scale applied to the standard-normal noise added to y.
normal_stdev = 30

In [11]:
# Per-iteration accumulators, filled inside the simulation loop.
fitted_betas, fitted_FI = [], []
fitted_correlations, realized_sample_sizes = [], []

Create the simulated data (x, y):

In [12]:
# Outcome: linear signal in x plus standard-normal noise scaled by normal_stdev.
y = beta * x + normal_stdev * rand_generator.normal(size=pop_size)

# Population frame with the outcome first and the covariate second.
data = pd.DataFrame({'y': y, 'x': x})

# Slope of the OLS fit of y on x over the whole population (x is the only regressor).
fitted_beta = sm.OLS(endog=data['y'], exog=data['x']).fit().params.iloc[0]

# x multiplied by the residual of the population fit.
data['g_star'] = (data['y'] - fitted_beta * data['x']) * data['x']

In [13]:
# Inclusion probability of every population unit under the biased scheme:
# the log-odds start at logit(sample_probability_centering) and shift with |x| - 25.5.
# The same name is reassigned inside the simulation loop.
marginal_probabilities = expit(
    logit(sample_probability_centering)
    + sample_probability_bias_factor * (data['x'].abs() - 25.5)
)

# Run the repeated-sampling simulation:

In [14]:
# Repeat the sampling experiment agg_iters times. Each iteration draws a simple
# random sample (SRS) of the population, thins it with the |x|-dependent biased
# mechanism, fits the OLS slope of y on x in both samples and records the results.
for _ in tqdm(range(agg_iters)):
    # Draw the SRS from the seeded generator (not the unseeded stdlib `random`
    # module) so that the whole run is reproducible from `pop_number`.
    srs_sample_indices = pd.Series(
        rand_generator.choice(pop_size, size=sample_size, replace=False)
    )
    srs_sample_indices = srs_sample_indices.sort_values().reset_index(drop=True)

    # Reset the sample indicators: r0 marks the SRS, r the biased subsample.
    # r1 and r2 are reset as well but are not otherwise used in this loop.
    data['r0'] = 0
    data['r'] = 0
    data['r1'] = 0
    data['r2'] = 0

    data.loc[srs_sample_indices, 'r0'] = 1
    full_sampled_data = data[data['r0'] == 1]

    # Retention probability of each SRS unit; the log-odds shift with |x| - 25.5.
    marginal_probabilities = expit(
        logit(sample_probability_centering)
        + sample_probability_bias_factor * (full_sampled_data['x'].abs() - 25.5)
    )
    # One Bernoulli draw per SRS unit decides whether it stays in the biased sample.
    biased_sample_indices = marginal_probabilities.index[
        rand_generator.binomial(n=1, p=marginal_probabilities) == 1
    ]

    data.loc[biased_sample_indices, 'r'] = 1

    biased_sample_indices_size = len(biased_sample_indices)
    realized_sample_sizes.append(biased_sample_indices_size)

    srs_sample = data[data['r0'] == 1].copy()
    biased_sample = data[data['r'] == 1].copy()

    # Same slope fit as for the population, on each sample.
    srs_fitted_beta = sm.OLS(srs_sample['y'], srs_sample['x']).fit().params.iloc[0]
    biased_fitted_beta = sm.OLS(biased_sample['y'], biased_sample['x']).fit().params.iloc[0]

    fitted_betas.append(pd.Series([fitted_beta, srs_fitted_beta, biased_fitted_beta]))

    # Mean of x**2 within each sample.
    fitted_FI.append(pd.Series([np.mean(srs_sample['x'] ** 2), np.mean(biased_sample['x'] ** 2)]))

    # Population-wide correlation of g_star with each sample indicator.
    fitted_correlations.append(data[['g_star', 'r0', 'r']].corr().iloc[0, 1:])

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 203.63it/s]


Clean up the data:

In [12]:
# Each accumulator is a list with one Series per iteration: place the Series side
# by side as columns, then transpose so that every row is one iteration.
# NOTE: this cell rebinds the list names to DataFrames, so it cannot be re-run
# without first re-running the simulation.
fitted_betas = pd.concat(fitted_betas, axis='columns').transpose()
fitted_FI = pd.concat(fitted_FI, axis='columns').transpose()
fitted_correlations = (
    pd.concat(fitted_correlations, axis='columns')
    .transpose()
    .reset_index(drop=True)  # replace the transposed labels with 0..n-1
)

In [13]:
# Label the columns: 'population' is the full-population fit, 'srs' the simple
# random sample and 'biased' the biased subsample.
fitted_betas = fitted_betas.set_axis(['population', 'srs', 'biased'], axis='columns')
fitted_FI = fitted_FI.set_axis(['srs', 'biased'], axis='columns')
fitted_correlations = fitted_correlations.set_axis(['srs', 'biased'], axis='columns')

In [14]:
# Convert the per-iteration biased-sample sizes from a list to a Series.
realized_sample_sizes = pd.Series(realized_sample_sizes)

In [15]:
# Preview the first rows of the per-iteration mean of x**2 in each sample.
fitted_FI.head()

Unnamed: 0,srs,biased
0,752.566667,1069.70687
1,929.166667,1069.3012
2,888.5,1068.272182
3,933.833333,1068.335264
4,824.333333,1067.513695


In [16]:
# Preview the last rows of the per-iteration mean of x**2 in each sample.
# NOTE(review): the stored output shows row labels up to 99999 although agg_iters
# is 1000 in this notebook, so the saved output looks stale; re-run to confirm.
fitted_FI.tail()

Unnamed: 0,srs,biased
99995,826.666667,1068.562033
99996,836.033333,1068.791067
99997,891.5,1068.082733
99998,1031.433333,1068.256316
99999,992.566667,1068.13227


In [17]:
# Both tables use the column names 'srs' and 'biased'; tag them with '_fi' and
# '_corr' respectively and combine them on their shared index.
fitted_FI_and_corr = fitted_FI.add_suffix('_fi').join(
    fitted_correlations.add_suffix('_corr')
)

fitted_FI_and_corr.head()

Unnamed: 0,srs_fi,biased_fi,srs_corr,biased_corr
0,752.566667,1069.70687,0.001939,-0.003245
1,929.166667,1069.3012,0.003219,0.000446
2,888.5,1068.272182,-0.00578,0.002344
3,933.833333,1068.335264,0.004207,-3.1e-05
4,824.333333,1067.513695,0.000749,0.002485


In [18]:
# Population mean of x**2, for comparison with the per-sample values.
(data['x'] ** 2).mean()

np.float64(858.5)

# Save:

In [19]:
# Population mean of x**2, used as the denominator below.
Jeee = (data['x'] ** 2).mean()

# Ratio of each sample's mean x**2 to the population value.
fitted_FI_and_corr['srs_samp_elasticity'] = fitted_FI_and_corr['srs_fi'] / Jeee
fitted_FI_and_corr['biased_samp_elasticity'] = fitted_FI_and_corr['biased_fi'] / Jeee

# Persist the combined table next to the notebook.
fitted_FI_and_corr.to_pickle('sample_by_x_bias.pkl')