Biased sampling scheme: each unit of the simple random sample is kept with probability
- expit(logit(0.8) + 10 * (np.abs(y) - np.mean(abs(y))) / std(abs(y)))

creates file:
- sample_by_abs_y_bias.pkl
        

# Imports:

In [28]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import statsmodels.api as sm
from scipy.special import expit, logit

# Hyperparameters:

In [29]:
# Simulation sizes.
pop_size = 100_000  # number of units in the simulated population
sample_size = 1000  # units drawn in each simple random sample
agg_iters = 10**5  # number of repeated sampling iterations

# Slope used when generating y from x.
beta = 1

## for biased sampling mechanism:
# `centering` enters through logit() as the intercept of the inclusion model;
# `bias_factor` multiplies the standardized |y| term.
sample_probability_centering = 0.8
sample_probability_bias_factor = 10

In [30]:
## for rng generator:
## pop_number multiplies the base seed (333) used for both the stdlib `random`
## module and the NumPy generator, so changing it changes every random draw.
pop_number = 1

In [31]:
# Seed both sources of randomness with the same pop_number-derived value:
# the stdlib `random` module (used for the simple random samples) and the
# NumPy generator (used for the noise and the Bernoulli retention draws).
random.seed(pop_number * 333)
rand_generator = np.random.default_rng(pop_number * 333)

In [32]:
# Design points: the integers -50..-1 and 1..50 (zero excluded), tiled so that
# every value occurs equally often in the population.
base_x = list(range(-50, 0)) + list(range(1, 51))
# Validate against the actual number of design points rather than a literal
# 100, and raise explicitly so the check survives `python -O`.
if pop_size % len(base_x) != 0:
    raise ValueError(
        f"pop_size ({pop_size}) must be a multiple of {len(base_x)} "
        "so the design points tile the population evenly"
    )
x = pd.Series(base_x * (pop_size // len(base_x)))
# Multiplier applied to the standard-normal noise when y is generated.
normal_stdev = 30

# Run:

In [33]:
# Per-iteration accumulators, appended to by the simulation loop:
#   fitted_betas          - OLS slopes (population, SRS, biased sample)
#   fitted_Jn             - mean of x**2 in the SRS and in the biased sample
#   fitted_correlations   - correlation of g_star with each inclusion indicator
#   realized_sample_sizes - number of units kept in the biased sample
fitted_betas, fitted_Jn, fitted_correlations, realized_sample_sizes = [], [], [], []

Create the simulated data (x, y):

In [34]:
# Linear signal plus scaled standard-normal noise, one draw per population unit.
y = beta * x + normal_stdev * rand_generator.normal(size=pop_size)

In [35]:
# Population frame: outcome column first, then the covariate.
data = pd.DataFrame({'y': y, 'x': x})
# OLS of y on the single regressor x (no constant column is supplied) over the
# whole population; keep the one fitted coefficient.
fitted_beta = sm.OLS(endog=data['y'], exog=data['x']).fit().params.iloc[0]
# g_star: x times the residual from the population fit.
data['g_star'] = (data['y'] - data['x'] * fitted_beta) * data['x']

In [36]:
# Despite the name, std_y is the standard deviation of |y|, used to
# standardize |y| below.
std_y = np.abs(y).std()
# Inclusion model on the logit scale: the bias factor times the standardized
# |y|, shifted by logit(centering); expit maps the result back to a probability.
data["marginal_probs"] = expit(
    sample_probability_bias_factor * (y.abs() - y.abs().mean()) / std_y
    + logit(sample_probability_centering)
)

In [37]:
# Display the average of the per-unit retention probabilities over the population.
data['marginal_probs'].mean()

np.float64(0.4878752684434134)

In [38]:
# Monte Carlo loop. Each iteration draws a simple random sample (SRS), thins it
# with the marginal_probs Bernoulli mechanism to obtain the biased sample, and
# records the fitted slopes, mean x**2 and indicator correlations for both.
for _ in tqdm(range(agg_iters)):
    srs_sample_indices = pd.Series(random.sample(range(pop_size), sample_size))

    # reset the inclusion indicators: r1 marks the SRS, r the biased sample
    data["r1"] = 0
    data["r"] = 0

    # intended sample: flag the SRS rows and extract them once; the boolean
    # mask returns them in population order, which fixes the order in which
    # the Bernoulli draws below are consumed
    data.loc[srs_sample_indices, "r1"] = 1
    full_sampled_data = data[data["r1"] == 1]

    # biased sample: keep each SRS unit independently with its marginal_probs
    biased_sample_indices = full_sampled_data.index[
        rand_generator.binomial(n=1, p=full_sampled_data["marginal_probs"]) == 1
    ]
    data.loc[biased_sample_indices, "r"] = 1

    # compute results:
    biased_sample_indices_size = len(biased_sample_indices)
    realized_sample_sizes.append(biased_sample_indices_size)

    # Reuse the rows already extracted instead of re-masking the whole
    # population two more times. Same rows, same order; only the x and y
    # columns of these subsets are read below (their "r" column is not
    # refreshed).
    srs_sample = full_sampled_data
    biased_sample = full_sampled_data.loc[biased_sample_indices]
    srs_fitted_beta = sm.OLS(srs_sample["y"], srs_sample["x"]).fit().params.iloc[0]
    biased_fitted_beta = (
        sm.OLS(biased_sample["y"], biased_sample["x"]).fit().params.iloc[0]
    )

    # the population slope is constant across iterations; it is stored
    # alongside the two sample slopes so every row is self-contained
    fitted_betas.append(pd.Series([fitted_beta, srs_fitted_beta, biased_fitted_beta]))
    fitted_Jn.append(
        pd.Series([np.mean(srs_sample["x"] ** 2), np.mean(biased_sample["x"] ** 2)])
    )
    # first row of the 3x3 correlation matrix without its diagonal entry:
    # corr(g_star, r1) and corr(g_star, r) over the full population
    fitted_correlations.append(data[["g_star", "r1", "r"]].corr().iloc[0, 1:])

100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [08:08<00:00, 204.85it/s]


## Clean up the data:

In [39]:
# Each accumulator holds one Series per iteration: place them side by side as
# columns, then transpose so that every row is one iteration.
fitted_betas = pd.concat(fitted_betas, axis="columns").transpose()
fitted_Jn = pd.concat(fitted_Jn, axis="columns").transpose()
# For the correlations, additionally discard the row labels left by the
# transpose and renumber the rows from zero.
fitted_correlations = (
    pd.concat(fitted_correlations, axis="columns").transpose().reset_index(drop=True)
)

In [40]:
# Name the columns after the sample each value was computed on, in the order
# the values were appended inside the loop.
fitted_betas.columns = ["population", "srs", "biased"]
fitted_correlations.columns = ["srs", "biased"]
fitted_Jn.columns = ["srs", "biased"]

In [41]:
# Convert the per-iteration biased-sample sizes from a list to a Series.
realized_sample_sizes = pd.Series(realized_sample_sizes)

In [42]:
# Align the mean-x**2 table and the correlation table row by row on their
# index; both share the column names, so the suffixes tell them apart
# ('_fi' for fitted_Jn, '_corr' for the correlations).
fitted_Jn_and_corr = fitted_Jn.merge(
    fitted_correlations,
    how="left",
    left_index=True,
    right_index=True,
    suffixes=("_fi", "_corr"),
)
# Preview the first five iterations.
fitted_Jn_and_corr.head(5)

Unnamed: 0,srs_fi,biased_fi,srs_corr,biased_corr
0,820.383,1049.095335,-0.000672,0.033525
1,867.127,1062.175847,-0.001645,0.039859
2,856.042,1092.287234,0.000319,0.04212
3,870.083,1059.778865,0.003066,0.040517
4,859.271,1074.781746,0.003522,0.04316


# Summarize and save:

In [43]:
# Summary statistics of each column across all iterations.
fitted_Jn_and_corr.describe()

Unnamed: 0,srs_fi,biased_fi,srs_corr,biased_corr
count,100000.0,100000.0,100000.0,100000.0
mean,858.641428,1069.482471,-3e-06,0.038741
std,23.947606,35.691209,0.003165,0.003115
min,750.12,904.186296,-0.014226,0.025078
25%,842.45275,1045.245314,-0.002143,0.036656
50%,858.6025,1069.35959,2e-06,0.038755
75%,874.838,1093.588246,0.002141,0.040846
max,982.528,1232.791322,0.014403,0.052924


In [44]:
# Pairwise correlations, across iterations, between the mean-x**2 columns and
# the g_star/indicator correlation columns.
fitted_Jn_and_corr.corr()

Unnamed: 0,srs_fi,biased_fi,srs_corr,biased_corr
srs_fi,1.0,0.725042,0.001127,0.202122
biased_fi,0.725042,1.0,0.139577,0.203984
srs_corr,0.001127,0.139577,1.0,0.769759
biased_corr,0.202122,0.203984,0.769759,1.0


In [45]:
# Write the combined per-iteration table to disk; the path is relative to the
# current working directory.
fitted_Jn_and_corr.to_pickle('sample_by_abs_y_bias.pkl')