biased sampling scheme of:
- expit(logit(sample_probability_centering) + \
                               sample_probability_bias_factor * (data['x'].abs() - 25.5))
  (with sample_probability_bias_factor = 1 in this run)

creates file:
- sample_by_x_bias.pkl

# Imports:

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import statsmodels.api as sm
from scipy.special import expit, logit

# Hyperparameters:

In [2]:
# Population size, size of each simple random sample, and number of
# simulation iterations:
pop_size = 100_000
sample_size = 1_000
agg_iters = 100_000

# Slope used to generate y from x (y = x * beta + noise):
beta = 1

## for biased sampling mechanism:
# inclusion probability at the point where |x| - 25.5 would be zero
sample_probability_centering = 0.8
# multiplier on (|x| - 25.5) inside the logit of the inclusion probability
sample_probability_bias_factor = 1

In [3]:
## for rng generator:
# Multiplied into the base seed (333) when the random generators are seeded,
# so a different pop_number gives a different, reproducible random stream.
pop_number = 1

In [4]:
# Seed both sources of randomness from the same value (333 * pop_number):
#   - the stdlib `random` module, which draws the simple random samples;
#   - a NumPy Generator, which draws the noise and the biased-inclusion
#     indicators.
random.seed(333 * pop_number)
rand_generator = np.random.default_rng(seed=333 * pop_number)

In [5]:
# The population must hold a whole number of copies of the 100 base x values,
# so that len(x) == pop_size.
assert pop_size % 100 == 0
# Every non-zero integer from -50 to 50 (zero is deliberately skipped).
base_x = [*range(-50, 0), *range(1, 51)]
x = pd.Series(base_x * (pop_size // 100))
# Standard deviation of the Gaussian noise added to y.
normal_stdev = 30

# Run:

In [6]:
# Per-iteration accumulators filled by the simulation loop:
#   fitted_betas          - slope estimates (population, SRS, biased sample)
#   fitted_Jn             - mean of x**2 in the SRS and in the biased sample
#   fitted_correlations   - correlation of g_star with each sample indicator
#   realized_sample_sizes - number of units in each biased sample
fitted_betas, fitted_Jn, fitted_correlations, realized_sample_sizes = [], [], [], []

Create the simulated data (x, y):

In [7]:
# Outcome: linear in x with slope `beta`, plus Gaussian noise whose standard
# deviation is `normal_stdev`.
y = x * beta + rand_generator.normal(size=pop_size) * normal_stdev

# Population frame with the outcome first and the covariate second.
data = pd.DataFrame({'y': y, 'x': x})

# Slope of the no-intercept OLS fit of y on x over the whole population.
fitted_beta = (
    sm.OLS(data['y'], data['x'])
    .fit()
    .params
    .iloc[0]
)

# x times the residual from the population fit.
data['g_star'] = data['x'] * (data['y'] - data['x'] * fitted_beta)

In [8]:
# Inclusion probability of each unit in the biased sample, given that it was
# drawn in the simple random sample. On the logit scale it is linear in |x|,
# centred at 25.5 (the mean of |x| over the base grid 1..50):
#   expit(logit(centering) + bias_factor * (|x| - 25.5))
data["marginal_probs"] = expit(
    data["x"]
    .abs()
    .sub(25.5)
    .mul(sample_probability_bias_factor)
    .add(logit(sample_probability_centering))
)

In [9]:
# Average inclusion probability over the population (displayed as cell output).
data['marginal_probs'].mean()

np.float64(0.5277258867808664)

simulate per iter:

In [10]:
# Monte Carlo loop. Each iteration draws a simple random sample (SRS), thins it
# into a biased sample using the per-unit `marginal_probs`, and records slope
# estimates, mean x**2, and the correlation of g_star with each indicator.
# NOTE: results depend on the order in which the two random sources are
# consumed (`random.sample` first, then `rand_generator.binomial`), so
# reordering statements changes the simulated output.
for _ in tqdm(range(agg_iters)):
    # SRS: `sample_size` distinct row labels drawn without replacement.
    srs_sample_indices = pd.Series(random.sample(range(pop_size), sample_size))

    # reset the sample
    # r1 = indicator of being in the SRS; r = indicator of being in the biased sample.
    data["r1"] = 0
    data["r"] = 0

    # sample both the intended and biased sample:
    data.loc[srs_sample_indices, "r1"] = 1
    # Mask filtering returns the sampled rows in population (index) order,
    # not in the order they were drawn.
    full_sampled_data = data[data["r1"] == 1]

    # One Bernoulli draw (binomial with n=1) per SRS unit, with success
    # probability `marginal_probs`; only SRS units can enter the biased sample.
    biased_sample_indices = full_sampled_data.index[
        rand_generator.binomial(n=1, p=full_sampled_data["marginal_probs"]) == 1
    ]
    data.loc[biased_sample_indices, "r"] = 1

    # compute results:
    biased_sample_indices_size = len(biased_sample_indices)
    realized_sample_sizes.append(biased_sample_indices_size)

    # Re-select the SRS rows so the frame also carries the updated "r" column.
    srs_sample = data[data["r1"] == 1]
    biased_sample = data[data["r"] == 1]
    # No-intercept OLS of y on x (no constant column is added); params.iloc[0]
    # is the slope on x.
    srs_fitted_beta = sm.OLS(srs_sample["y"], srs_sample["x"]).fit().params.iloc[0]
    biased_fitted_beta = (
        sm.OLS(biased_sample["y"], biased_sample["x"]).fit().params.iloc[0]
    )

    # `fitted_beta` is the population slope computed before the loop; it is the
    # same in every iteration and is stored alongside the per-sample estimates.
    fitted_betas.append(pd.Series([fitted_beta, srs_fitted_beta, biased_fitted_beta]))
    # Mean of x**2 within each sample.
    fitted_Jn.append(
        pd.Series([np.mean(srs_sample["x"] ** 2), np.mean(biased_sample["x"] ** 2)])
    )
    # Row 0 of the 3x3 correlation matrix, columns 1 onward: the correlation of
    # g_star with r1 and with r, computed over the whole population.
    # NOTE(review): this full-population corr() and the column resets above run
    # on every iteration and are likely the dominant cost of the loop.
    fitted_correlations.append(data[["g_star", "r1", "r"]].corr().iloc[0, 1:])

100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [09:11<00:00, 181.36it/s]


## Clean up the data:

In [11]:
# Each accumulator is a list of per-iteration Series. Concatenating them as
# columns and transposing gives one row per iteration.
fitted_betas = pd.concat(fitted_betas, axis=1).transpose()
fitted_Jn = pd.concat(fitted_Jn, axis=1).transpose()
# The correlation Series all share one label, which becomes a repeated row
# label after transposing; replace it with a fresh 0..n-1 index.
fitted_correlations = (
    pd.concat(fitted_correlations, axis=1)
    .transpose()
    .reset_index(drop=True)
)

In [12]:
# Name the columns after the sample each value was computed on.
fitted_betas.columns = ['population', 'srs', 'biased']
fitted_Jn.columns = fitted_correlations.columns = ['srs', 'biased']

In [13]:
# Convert the per-iteration biased-sample sizes from a list to a Series.
realized_sample_sizes = pd.Series(realized_sample_sizes)

In [14]:
# Put the mean-x**2 and correlation results side by side, matched on the
# iteration index. Both frames use the column names 'srs' and 'biased', so the
# suffixes disambiguate them: '_fi' for fitted_Jn, '_corr' for the correlations.
fitted_Jn_and_corr = fitted_Jn.join(
    fitted_correlations,
    lsuffix='_fi',
    rsuffix='_corr',
)
fitted_Jn_and_corr.head()

Unnamed: 0,srs_fi,biased_fi,srs_corr,biased_corr
0,820.383,1437.43487,-0.000672,0.000277
1,867.127,1451.103383,-0.001645,-0.003564
2,856.042,1457.330784,0.000319,0.001169
3,870.083,1471.434866,0.003066,0.003978
4,859.271,1474.044402,0.003522,0.004132


# Save:

In [15]:
# Summary statistics across all iterations (displayed as cell output).
fitted_Jn_and_corr.describe()

Unnamed: 0,srs_fi,biased_fi,srs_corr,biased_corr
count,100000.0,100000.0,100000.0,100000.0
mean,858.641428,1446.768983,-3e-06,-0.000102
std,23.947606,24.904229,0.003165,0.004106
min,750.12,1342.132692,-0.014226,-0.01712
25%,842.45275,1429.969134,-0.002143,-0.002863
50%,858.6025,1446.743278,2e-06,-9.3e-05
75%,874.838,1463.503612,0.002141,0.002667
max,982.528,1552.019194,0.014403,0.017901


In [16]:
# Persist the combined mean-x**2 / correlation table to the working directory.
# NOTE(review): fitted_betas and realized_sample_sizes are not saved by any
# cell shown here - confirm that is intended.
fitted_Jn_and_corr.to_pickle('sample_by_x_bias.pkl')