How the simulation is done:
- First, a base simulation is done per sample size.
- Then, as needed, additional simulations are run by incrementing `iter_val`, which changes both the random seed and the name of the output file.

# Imports:

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import statsmodels.api as sm
from scipy.special import expit, logit
from joblib import Parallel, delayed

import sys
sys.path.append('../')

from ddc_utils import compute_average_jn, is_binomial_data_seperable
from jzhou_utils import save_obj_pickle

# Hyperparams:

In [3]:
# Presumably identifies which finite population is being simulated; it is
# one of the two inputs to the random seed.
pop_index = 1
# Run counter: it enters the random seed and the name of every result file,
# so a different value yields a separate set of result files.
iter_val = 10

In [4]:
# NumPy Generator seeded with 333 * pop_index + iter_val, so the seed changes
# whenever either of those two settings changes.
rand_generator = np.random.default_rng(seed=333 * pop_index + iter_val)

In [5]:
# Number of units in the finite population (the expected row count of the
# loaded population data).
population_size = 100_000
# Number of feature columns, named x_0 ... x_{k-1}.
number_of_coefficients = 1

# Replications per sample size: the first value applies to sample sizes
# strictly below `small_large_sample_co`, the second to all other sizes.
# num_iters_per_population_for_small_samples = 25_000
num_iters_per_population_for_small_samples = 50_000
num_iters_per_population_for_large_samples = 10_000
small_large_sample_co = 100

# biased sampling scheme params:
# `sample_probability_centering` is the retention probability of a sampled
# unit when the bias term is zero; `sample_probability_bias_factor` scales
# the (2 * y - 1) * x_0 term that is added on the logit scale.
sample_probability_centering = 0.77
sample_probability_bias_factor = 1

In [9]:
# Intended (SRS) sample sizes simulated by this run. The commented-out groups
# are further size ranges that are not part of this run.
ALL_SAMPLE_SIZES = (
    # [3, 4, 5] +
    [6, 7, 9, 11, 13, 16]
    # + [20, 25]
    # + [i for i in range(30, 45)]
    # + [50, 70, 100, 150, 250, 400, 600, 1000, 1400]
    # + [2000, 3000, 5000, 7500, 10_000, 15_000, 20_000]
)

In [10]:
# Passed to joblib.Parallel as `n_jobs` (number of concurrent workers).
njobs = 3
# Passed to joblib.Parallel as `backend`.
multiprocess_backend = "loky"

# Run:

## Load Finite Population Data:

In [11]:
# The file name is fixed; the trailing "1" is hard-coded.
# NOTE(review): it is unclear whether that "1" is meant to follow `pop_index`
# or `number_of_coefficients` (both are currently 1) -- confirm before
# changing either setting.
pickle_filename = 'base_population_data_Logit_1.pickle'
# Unpickling can execute arbitrary code: only load population files that
# were produced by a trusted source.
pop_data = pd.read_pickle(pickle_filename)

## get population-level statistics:

In [12]:
# Feature columns x_0 ... x_{k-1} and the binary response of the population.
feature_cols = [f'x_{i}' for i in range(number_of_coefficients)]
pop_x = pop_data[feature_cols]
pop_y = pop_data["y"]

# Population-level logistic fit. The design matrix is exactly the feature
# columns; no constant column is added here.
pop_model = sm.Logit(endog=pop_y, exog=pop_x).fit(disp=0)
pop_beta = np.array(pop_model.params)

# Per-unit values x_i * (y_i - fitted probability_i). The residual column
# takes its length from the data itself rather than from `population_size`,
# so a population file with a different row count no longer breaks the
# reshape.
pop_residuals = (
    np.asarray(pop_y) - np.asarray(pop_model.predict())
).reshape(-1, 1)
pop_gs = pop_x * pop_residuals

## Function to parallelize:

In [13]:
def fn_to_parallel(pop_data, temp_sample_size, seed=None):
    """Run one simulation replicate: draw an SRS, thin it, and fit both samples.

    A simple random sample (the "intended" sample) of ``temp_sample_size``
    rows is drawn without replacement from ``pop_data``. Every sampled row is
    then kept independently with probability::

        expit(logit(sample_probability_centering)
              + sample_probability_bias_factor * (2 * y - 1) * x_0)

    which yields the biased sample. If the biased sample has fewer than 1,000
    rows and ``is_binomial_data_seperable`` flags it, both samples are
    discarded and drawn again.

    ``pop_data`` is only read, never modified. Besides its arguments the
    function reads the module-level names ``feature_cols``, ``pop_gs``,
    ``pop_beta``, ``sample_probability_centering`` and
    ``sample_probability_bias_factor``.

    Args:
        pop_data: Population DataFrame with the feature columns and a ``y``
            column.
        temp_sample_size: Number of rows in the intended (SRS) sample.
        seed: Optional seed for ``np.random.default_rng``. With the default
            ``None`` the generator is seeded from fresh OS entropy, so calls
            are independent but not reproducible; pass a distinct seed per
            call (e.g. spawned from a ``SeedSequence``) for reproducibility.

    Returns:
        An 8-tuple, in this order:
        biased-sample coefficients (``pd.Series``),
        ``pop_gs.corrwith`` the biased inclusion indicator, restricted to
        ``x_0``,
        ``compute_average_jn`` result for the biased sample,
        realised size of the biased sample,
        intended-sample coefficients (``pd.Series``),
        ``pop_gs.corrwith`` the intended inclusion indicator, restricted to
        ``x_0``,
        ``compute_average_jn`` result for the intended sample,
        number of separable samples that were rejected before a valid one
        was obtained.
    """
    # A generator is created per call on purpose. A module-level Generator
    # referenced from here is serialised together with the function when
    # tasks are shipped to loky workers, so every dispatched batch would
    # restart from the same state and reuse the same random stream. Using one
    # generator for both draws also keeps a single source of randomness.
    rng = np.random.default_rng(seed)
    n_population = len(pop_data)

    rejected_separable_count = 0
    while True:
        # intended sample: SRS by row position; sorting keeps the rows in
        # population order.
        srs_positions = np.sort(
            rng.choice(n_population, size=temp_sample_size, replace=False)
        )
        full_sampled_data = pop_data.iloc[srs_positions]

        # biased sample: independent Bernoulli retention of each SRS row.
        marginal_probabilities = expit(
            logit(sample_probability_centering)
            + sample_probability_bias_factor
            * (2 * full_sampled_data["y"] - 1)
            * full_sampled_data["x_0"]
        )
        kept = rng.binomial(n=1, p=marginal_probabilities) == 1
        biased_positions = srs_positions[kept]

        # sample_data here means the biased sample data.
        sample_data = full_sampled_data[kept]
        realised_sample_size = len(sample_data)

        # Small samples are checked for separability and redrawn when the
        # check flags them; larger samples are accepted as they are.
        if realised_sample_size < 1_000 and is_binomial_data_seperable(
            sample_data, "y", "x_0"
        ):
            rejected_separable_count += 1
            continue
        break

    # 0/1 inclusion indicators over the whole population, aligned on the
    # population index so they can be correlated with `pop_gs`.
    intended_flags = np.zeros(n_population, dtype=int)
    intended_flags[srs_positions] = 1
    sample_r_full = pd.Series(intended_flags, index=pop_data.index)

    biased_flags = np.zeros(n_population, dtype=int)
    biased_flags[biased_positions] = 1
    sample_r = pd.Series(biased_flags, index=pop_data.index)

    # compute biased x, y, model, beta
    sample_x, sample_y = sample_data[feature_cols], sample_data["y"]
    sample_beta = np.array(
        sm.Logit(endog=sample_y, exog=sample_x).fit(disp=0, maxiter=500).params
    )

    # compute full x, y, model, beta
    sample_x_full = full_sampled_data[feature_cols]
    sample_y_full = full_sampled_data["y"]
    sample_beta_full = np.array(
        sm.Logit(endog=sample_y_full, exog=sample_x_full)
        .fit(disp=0, maxiter=500)
        .params
    )

    # ret: sample beta, sample ddc, sample Jn, sample size;
    #    intended beta, intended ddc, intended Jn, rejected separable count
    return (
        pd.Series(sample_beta),
        pop_gs.corrwith(sample_r)[["x_0"]],
        compute_average_jn(
            pop_beta, sample_beta, sample_x, sample_y, link_fn="Logit"
        ),
        realised_sample_size,
        pd.Series(sample_beta_full),
        pop_gs.corrwith(sample_r_full)[["x_0"]],
        compute_average_jn(
            pop_beta,
            sample_beta_full,
            sample_x_full,
            sample_y_full,
            link_fn="Logit",
        ),
        rejected_separable_count,
    )

## iterations:

In [None]:
# Maps each sample size to the summed value of the last element returned by
# `fn_to_parallel` over all replications; that element is incremented each
# time the separability check flags a sample and it is redrawn.
sample_specific_non_separable_count = {}

In [None]:
for temp_sample_size in tqdm(ALL_SAMPLE_SIZES):
    # The number of replications depends on which side of the small/large
    # cut-off this sample size falls.
    num_iters_per_population = (
        num_iters_per_population_for_small_samples
        if temp_sample_size < small_large_sample_co
        else num_iters_per_population_for_large_samples
    )

    # Run all replications in parallel. `total` is given explicitly because
    # tqdm cannot infer a length from the generator joblib returns; without
    # it the inner bar shows neither percentage nor ETA.
    agg_results = list(
        tqdm(
            Parallel(n_jobs=njobs, backend=multiprocess_backend, return_as="generator")(
                delayed(fn_to_parallel)(pop_data, temp_sample_size)
                for _ in range(num_iters_per_population)
            ),
            total=num_iters_per_population,
            mininterval=10,
        )
    )

    # Element 7 of each result counts the samples that were rejected because
    # the separability check flagged them, so the message reports separable
    # (rejected) samples.
    sample_specific_non_separable_count[temp_sample_size] = sum(
        temp_res[7] for temp_res in agg_results
    )
    print(
        f"# of separable samples rejected for sample size {temp_sample_size}: {sample_specific_non_separable_count[temp_sample_size]}"
    )

    # Collect one value per replication for the biased sample ...
    temp_samp_beta_biased = pd.Series([temp_res[0][0] for temp_res in agg_results])
    temp_ddc_biased = pd.Series([temp_res[1].iloc[0] for temp_res in agg_results])
    temp_jn_biased = pd.Series([temp_res[2][0].iloc[0] for temp_res in agg_results])
    realised_sizes = pd.Series([temp_res[3] for temp_res in agg_results])

    # ... and for the intended (SRS) sample.
    temp_samp_beta_full = pd.Series([temp_res[4][0] for temp_res in agg_results])
    temp_ddc_full = pd.Series([temp_res[5].iloc[0] for temp_res in agg_results])
    temp_jn_full = pd.Series([temp_res[6][0].iloc[0] for temp_res in agg_results])

    # One row per replication; `keys` names the columns at concat time.
    temp_ss_data = pd.concat(
        [
            temp_samp_beta_biased,
            temp_ddc_biased,
            temp_jn_biased,
            realised_sizes,
            temp_samp_beta_full,
            temp_ddc_full,
            temp_jn_full,
        ],
        axis=1,
        keys=[
            "samp_biased",
            "ddc_biased",
            "jn_biased",
            "realized_size_biased",
            "samp_intended",
            "ddc_intended",
            "jn_intended",
        ],
    )

    temp_ss_data["sample_size"] = temp_sample_size
    temp_ss_data["pop_beta"] = pop_beta[0]

    # Per-replication squared error of each estimate against the population
    # coefficient (averaging these columns gives the MSE).
    temp_ss_data["mse_biased"] = (
        temp_ss_data["pop_beta"] - temp_ss_data["samp_biased"]
    ) ** 2
    temp_ss_data["mse_intended"] = (
        temp_ss_data["pop_beta"] - temp_ss_data["samp_intended"]
    ) ** 2

    # Save the data: one file per (sample size, iter_val) pair.
    save_obj_pickle(
        f"sim_results/sim_{temp_sample_size}_iter_{iter_val}.pickle", temp_ss_data
    )

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]
[A [00:00, ?it/s]
[A2it [00:10, 186.69it/s]
[A0it [00:20, 196.64it/s]
[A7it [00:30, 198.41it/s]
[A2it [00:40, 197.34it/s]
[A0it [00:50, 195.50it/s]
[A53it [01:01, 190.40it/s]
[A63it [01:11, 186.76it/s]
[A78it [01:21, 187.92it/s]
[A98it [01:31, 188.92it/s]
[A10it [01:41, 187.16it/s]
[A90it [01:52, 187.36it/s]
[A69it [02:02, 187.42it/s]
[A14it [02:12, 189.35it/s]
[A30it [02:22, 191.87it/s]
[A07it [02:33, 188.18it/s]
[A86it [02:43, 187.92it/s]
[A59it [02:53, 185.53it/s]
[A98it [03:03, 184.92it/s]
[A82it [03:13, 188.43it/s]
[A50it [03:25, 183.77it/s]
[A90it [03:35, 178.32it/s]
[A22it [03:45, 176.74it/s]
[A51it [03:55, 174.69it/s]
[A18it [04:06, 179.90it/s]
[A40it [04:16, 179.32it/s]
50000it [04:30, 184.93it/s]


# of non-seperable samples for sample size 6: 65808


 17%|█████████████▊                                                                     | 1/6 [04:33<22:49, 273.94s/it]
[A [00:00, ?it/s]
[A5it [00:10, 197.60it/s]
[A5it [00:20, 202.94it/s]
[A1it [00:30, 208.45it/s]
[A1it [00:40, 210.15it/s]
[A29it [00:51, 201.29it/s]
[A03it [01:03, 188.15it/s]
[A31it [01:13, 188.83it/s]
[A91it [01:23, 196.53it/s]