Biased sampling scheme: each unit drawn in the simple random sample is kept with probability
- expit(logit(sample_probability_centering) +
        sample_probability_bias_factor * (y / std_y))

creates file:
- sample_by_y_bias.pkl

# Imports:

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import statsmodels.api as sm
from scipy.special import expit, logit

# Hyperparameters:

In [2]:
# Population size, size of each simple random sample, and number of
# Monte Carlo repetitions of the sampling experiment.
pop_size = 100_000
sample_size = 1_000
agg_iters = 100_000

# Slope used when simulating y from x.
beta = 1

# Biased sampling mechanism: inclusion probability at y == 0 (given on the
# probability scale, moved to the logit scale when used) and the weight
# applied to standardized y inside the logit.
sample_probability_centering = 0.8
sample_probability_bias_factor = 1

In [3]:
## for rng generator:
# Identifier of the simulated population; the seeds of both random number
# generators are derived from it.
pop_number = 1

In [4]:
# Seed both sources of randomness from one value derived from pop_number:
# the stdlib `random` module and NumPy's Generator.
seed = 333 * pop_number
random.seed(seed)
rand_generator = np.random.default_rng(seed=seed)

In [5]:
# The population is built from whole copies of a 100-value pattern, so its
# size must be a multiple of 100.
assert pop_size % 100 == 0

# One copy of the pattern: every integer from -50 to 50 except 0.
base_x = list(range(-50, 0)) + list(range(1, 51))
x = pd.Series(base_x * (pop_size // 100))

# Multiplier applied to the standard-normal noise when y is simulated.
normal_stdev = 30

# Run:

In [6]:
# Per-iteration accumulators; each one receives one entry per pass of the
# simulation loop.
fitted_betas, fitted_Jn, fitted_correlations = [], [], []
realized_sample_sizes = []

Create the simulated data (x, y):

In [7]:
# Linear signal in x plus Gaussian noise scaled by normal_stdev.
y = beta * x + normal_stdev * rand_generator.normal(size=pop_size)

In [8]:
# Population frame with one row per unit: outcome y and covariate x.
data = pd.DataFrame({"y": y, "x": x})

# Slope from regressing y on x alone (no constant column is supplied),
# fitted on the entire population.
fitted_beta = sm.OLS(endog=data["y"], exog=data["x"]).fit().params.iloc[0]

# g_star: x multiplied by the residual from the population-level fit.
data = data.assign(g_star=lambda df: df["x"] * (df["y"] - df["x"] * fitted_beta))

In [9]:
# Standard deviation of y over the whole population, used to standardize y.
std_y = np.std(y)

# Inclusion probability per unit: the centering probability is moved to the
# logit scale, shifted by the bias factor times standardized y, and mapped
# back to a probability with expit.
baseline_logit = logit(sample_probability_centering)
data["marginal_probs"] = expit(
    baseline_logit + sample_probability_bias_factor * (y / std_y)
)

In [10]:
# Display the average inclusion probability across the whole population.
data["marginal_probs"].mean()

np.float64(0.7604675827575927)

# run:

In [11]:
# Monte Carlo loop. Each pass draws a simple random sample (SRS), thins it
# with the y-dependent inclusion probabilities to obtain the biased sample,
# and records for both samples: the fitted slope, the mean of x**2, and the
# correlation between g_star and the sample-inclusion indicator.
for _ in tqdm(range(agg_iters)):
    # Intended sample: row labels drawn without replacement.
    srs_sample_indices = pd.Series(random.sample(range(pop_size), sample_size))

    # reset the inclusion indicators left over from the previous pass
    data["r1"] = 0
    data["r"] = 0

    # r1 flags the intended (SRS) sample. The boolean filter returns its rows
    # in ascending index order, which is the order the binomial draws follow.
    data.loc[srs_sample_indices, "r1"] = 1
    full_sampled_data = data[data["r1"] == 1]

    # r flags the biased sample: every SRS unit is kept independently with
    # its own probability marginal_probs.
    biased_sample_indices = full_sampled_data.index[
        rand_generator.binomial(n=1, p=full_sampled_data["marginal_probs"]) == 1
    ]
    data.loc[biased_sample_indices, "r"] = 1

    # compute results:
    biased_sample_indices_size = len(biased_sample_indices)
    realized_sample_sizes.append(biased_sample_indices_size)

    # Reuse the rows already extracted above instead of filtering the whole
    # population two more times. Only the x and y columns of these frames
    # are read below, and those columns never change inside the loop.
    srs_sample = full_sampled_data
    biased_sample = full_sampled_data.loc[biased_sample_indices]

    # Slope of y regressed on x alone, fitted on each sample.
    srs_fitted_beta = sm.OLS(srs_sample["y"], srs_sample["x"]).fit().params.iloc[0]
    biased_fitted_beta = (
        sm.OLS(biased_sample["y"], biased_sample["x"]).fit().params.iloc[0]
    )

    fitted_betas.append(pd.Series([fitted_beta, srs_fitted_beta, biased_fitted_beta]))
    # Mean of x**2 within each sample.
    fitted_Jn.append(
        pd.Series([np.mean(srs_sample["x"] ** 2), np.mean(biased_sample["x"] ** 2)])
    )
    # Population-wide correlation of g_star with each inclusion indicator
    # (row g_star of the correlation matrix, columns r1 and r).
    fitted_correlations.append(data[["g_star", "r1", "r"]].corr().iloc[0, 1:])

100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [09:16<00:00, 179.85it/s]


Clean up the data:

In [12]:
# Turn each list of per-iteration Series into a frame with one row per
# iteration: place the Series side by side as columns, then transpose.
fitted_betas = pd.concat(fitted_betas, axis="columns").transpose()
fitted_Jn = pd.concat(fitted_Jn, axis="columns").transpose()
# The row labels of the correlation frame are dropped in favour of a fresh
# 0..n-1 index.
fitted_correlations = (
    pd.concat(fitted_correlations, axis="columns")
    .transpose()
    .reset_index(drop=True)
)

In [13]:
# Column labels: one column per sample type; fitted_betas additionally
# carries the population-level slope as its first column.
sample_labels = ['srs', 'biased']
fitted_betas.columns = ['population', *sample_labels]
fitted_Jn.columns = list(sample_labels)
fitted_correlations.columns = list(sample_labels)

In [14]:
# Convert the list of per-iteration biased-sample sizes into a Series.
realized_sample_sizes = pd.Series(realized_sample_sizes)

In [15]:
# Combine the mean-of-x**2 columns (suffix _fi) with the correlation columns
# (suffix _corr), aligned on the per-iteration row index.
jn_part = fitted_Jn.add_suffix('_fi')
corr_part = fitted_correlations.add_suffix('_corr')
fitted_Jn_and_corr = jn_part.join(corr_part)
fitted_Jn_and_corr.head()

Unnamed: 0,srs_fi,biased_fi,srs_corr,biased_corr
0,820.383,816.399481,-0.000672,-0.002976
1,867.127,847.071429,-0.001645,-0.006582
2,856.042,836.248391,0.000319,-0.003973
3,870.083,837.937738,0.003066,-0.000788
4,859.271,842.957474,0.003522,-0.001406


In [16]:
# Display summary statistics of each column across all iterations.
fitted_Jn_and_corr.describe()

Unnamed: 0,srs_fi,biased_fi,srs_corr,biased_corr
count,100000.0,100000.0,100000.0,100000.0
mean,858.641428,843.940512,-3e-06,-0.003821
std,23.947606,27.302935,0.003165,0.003092
min,750.12,735.557441,-0.014226,-0.016726
25%,842.45275,825.411701,-0.002143,-0.005908
50%,858.6025,843.8331,2e-06,-0.003823
75%,874.838,862.445649,0.002141,-0.001739
max,982.528,994.464286,0.014403,0.008751


# Save:

In [17]:
# Display the summary statistics of the frame that is about to be saved.
fitted_Jn_and_corr.describe()

Unnamed: 0,srs_fi,biased_fi,srs_corr,biased_corr
count,100000.0,100000.0,100000.0,100000.0
mean,858.641428,843.940512,-3e-06,-0.003821
std,23.947606,27.302935,0.003165,0.003092
min,750.12,735.557441,-0.014226,-0.016726
25%,842.45275,825.411701,-0.002143,-0.005908
50%,858.6025,843.8331,2e-06,-0.003823
75%,874.838,862.445649,0.002141,-0.001739
max,982.528,994.464286,0.014403,0.008751


In [18]:
# Persist the combined frame to a pickle file in the working directory.
# NOTE(review): fitted_betas and realized_sample_sizes are not written out
# anywhere in the visible code - confirm that is intended.
fitted_Jn_and_corr.to_pickle('sample_by_y_bias.pkl')