How the simulation is done:
- First, a base simulation is run for each sample size.
- Then, as needed, additional batches of simulations are run by incrementing the iteration value (`iter_val`), which changes the random seed and the output file name.

# Imports:

In [1]:
import numpy as np
import pandas as pd
from collections import Counter

from itertools import permutations
from tqdm import tqdm, trange

In [2]:
import pickle
import statsmodels.api as sm
from collections import defaultdict
from scipy.special import expit, logit

In [3]:
from joblib import Parallel, delayed

# Helper Functions:

In [4]:
import sys

sys.path.append('../')

from ddc_utils import *
from data_generating_utils import *

In [5]:
def to_pickle_obj(file_path, raw_data):
    """Serialize ``raw_data`` to ``file_path`` with :mod:`pickle`.

    Missing parent directories are created first, so a long simulation does
    not fail at the very end just because the output folder is absent.

    Parameters
    ----------
    file_path : str, bytes or os.PathLike
        Destination file; it is overwritten if it already exists.
    raw_data : object
        Any picklable object.
    """
    import os  # local import keeps this helper self-contained

    # Only path-like targets have a parent directory to create; anything else
    # accepted by ``open`` (e.g. a file descriptor) is passed through untouched.
    if isinstance(file_path, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, "wb") as handle:
        pickle.dump(raw_data, handle)

def read_pickle_obj(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(file_path, mode="rb") as stream:
        loaded_obj = pickle.load(stream)
    return loaded_obj

# Hyperparams:

In [6]:
# Identifiers for this run. Both feed the RNG seed; iter_val is also embedded
# in the output file names, so bumping it produces a new batch of replicates
# under a different file name.
pop_index = 1
iter_val = 9

In [7]:
# Seeded generator for the run; the seed is 333 * pop_index + iter_val.
# NOTE(review): distinct (pop_index, iter_val) pairs share a seed once iter_val
# reaches 333 (e.g. (1, 333) and (2, 0)) — confirm iter_val stays below that.
rand_generator = np.random.default_rng(seed=333 * pop_index + iter_val)

In [8]:
population_size = 100_000  # row count used when reshaping the population residuals
number_of_coefficients = 1  # number of x_i feature columns used in the model

# Replicates per sample size: sizes below `small_large_sample_co` use the
# "small" count, all other sizes use the "large" count.
num_iters_per_population_for_small_samples = 25_000
num_iters_per_population_for_large_samples = 10_000
small_large_sample_co = 100

# biased sampling scheme params:
# a unit is kept with probability
# expit(logit(centering) + bias_factor * (2 * y - 1) * x_0)
sample_probability_centering = 0.77
sample_probability_bias_factor = 1

In [9]:
# Intended sample sizes simulated in this run. The commented-out ranges are
# other size groups that are currently switched off.
ALL_SAMPLE_SIZES = (
    # [3, 4, 5] + 
    [6, 7, 9, 11, 13, 16, 20, 25]
    # + [i for i in range(30, 45)]
    # + [50, 70, 100, 150, 250, 400, 600, 1000, 1400]
    # + [2000, 3000, 5000, 7500, 10_000, 15_000, 20_000]
)

In [10]:
# Settings forwarded to joblib.Parallel: number of workers and backend name.
njobs = 3
multiprocess_backend = "loky"

# Load Finite Population Data:

In [11]:
# The population file is selected by pop_index (the original f-string had no
# placeholder and always loaded population 1, regardless of pop_index).
# NOTE(review): assumes the files are named base_population_data_Logit_<pop_index>.pickle — confirm.
# Only load pickles you trust: unpickling can execute arbitrary code.
pickle_filename = f'base_population_data_Logit_{pop_index}.pickle'
pop_data = pd.read_pickle(pickle_filename)

# Run:

In [12]:
# Names of the regressor columns (x_0, x_1, ...) selected from the data frames.
feature_cols = [f'x_{i}' for i in range(number_of_coefficients)]

In [13]:
# Maps each sample size to the summed count found in slot 7 of that size's
# replicate results (filled in by the simulation loop).
sample_specific_non_separable_count = {}

#### get population-level statistics:

In [14]:
# Population-level reference fit: logistic regression of y on the feature
# columns over the whole finite population.
pop_x = pop_data[feature_cols]
pop_y = pop_data['y']

pop_model = sm.Logit(endog=pop_y, exog=pop_x).fit(disp=0)
pop_beta = np.array(pop_model.params)

# Per-unit x_i * (y_i - fitted value_i), i.e. each unit's contribution to the
# logistic score at pop_beta. The column shape is taken from the data itself
# (reshape(-1, 1)) rather than from the hand-maintained `population_size`
# constant, so a population file with a different row count still works.
pop_residuals = (np.asarray(pop_y) - np.asarray(pop_model.predict())).reshape(-1, 1)
pop_gs = pop_x * pop_residuals

#### actually run:

In [15]:
def fn_to_parallel(pop_data, temp_sample_size, rng=None):
    """Run one simulation replicate: draw an intended and a biased sample, fit both.

    A sample of ``temp_sample_size`` units is drawn without replacement (the
    "intended" sample). Each intended unit is then kept with probability
    ``expit(logit(sample_probability_centering)
    + sample_probability_bias_factor * (2 * y - 1) * x_0)``, giving the
    "biased" sample. A draw whose biased sample has fewer than 1,000 units and
    is flagged by ``is_binomial_data_seperable`` is discarded and redrawn.

    Parameters
    ----------
    pop_data : pandas.DataFrame
        Finite population; must contain ``y``, ``x_0`` and the ``feature_cols``
        columns. The indicator columns ``r0`` / ``r`` are written to a shallow
        copy, so the caller's frame does not gain them.
    temp_sample_size : int
        Size of the intended sample.
    rng : None, int, numpy.random.SeedSequence or numpy.random.Generator, optional
        Source of randomness, passed through ``numpy.random.default_rng``.
        With the default ``None`` a fresh, independently seeded generator is
        created for this call. Pass a seed (e.g. a spawned ``SeedSequence``)
        to make a replicate reproducible.

    Returns
    -------
    tuple
        ``(biased beta, biased ddc, biased Jn, realised biased sample size,
        intended beta, intended ddc, intended Jn, number of discarded draws)``.
        The betas are ``pandas.Series``; the ddc entries are the ``x_0``
        correlation between ``pop_gs`` and the respective inclusion indicator.
    """
    # One generator drives BOTH random steps. The previous mix of the global
    # ``np.random.choice`` and the notebook-level ``rand_generator`` was not
    # reproducible, and a generator captured from the notebook namespace is
    # serialised by value to process-based workers, so dispatched batches
    # would restart from the same state and replay the same binomial stream.
    rng = np.random.default_rng(rng)

    # Shallow copy: cheap, and keeps the indicator columns off the caller's frame.
    frame = pop_data.copy(deep=False)
    population_index = frame.index.to_numpy()

    # Counts draws thrown away because the biased sample was flagged as
    # separable (returned in the last slot of the result tuple).
    rejected_draws = 0
    while True:
        # intended sample:
        frame["r0"] = 0
        intended_idx = rng.choice(population_index, size=temp_sample_size, replace=False)
        frame.loc[intended_idx, "r0"] = 1
        full_sampled_data = frame[frame["r0"] == 1]

        # biased sample: thin the intended sample with outcome-dependent
        # inclusion probabilities.
        frame["r"] = 0
        marginal_probabilities = expit(
            logit(sample_probability_centering)
            + sample_probability_bias_factor
            * (2 * full_sampled_data["y"] - 1)
            * full_sampled_data["x_0"]
        )
        kept = rng.binomial(n=1, p=marginal_probabilities) == 1
        other_sample_indices = marginal_probabilities.index[kept]
        frame.loc[other_sample_indices, "r"] = 1

        # sample_data here means the biased sample data.
        sample_data = frame[frame["r"] == 1]

        # Separability is only checked for biased samples below 1,000 units.
        realised_sample_size = len(other_sample_indices)
        if realised_sample_size < 1_000 and is_binomial_data_seperable(
            sample_data, "y", "x_0"
        ):
            rejected_draws += 1
            continue
        break

    # Then, compute the logistic betas, ddc, Jns.

    # biased sample: x, y, fitted beta, inclusion indicator
    sample_x, sample_y = sample_data[feature_cols], sample_data["y"]
    sample_beta = np.array(
        sm.Logit(endog=sample_y, exog=sample_x).fit(disp=0, maxiter=500).params
    )
    sample_r = frame["r"]

    # intended sample: x, y, fitted beta, inclusion indicator
    sample_x_full, sample_y_full = (
        full_sampled_data[feature_cols],
        full_sampled_data["y"],
    )
    sample_beta_full = np.array(
        sm.Logit(endog=sample_y_full, exog=sample_x_full)
        .fit(disp=0, maxiter=500)
        .params
    )
    sample_r_full = frame["r0"]

    # ret: sample beta, sample ddc, sample Jn, sample size;
    #      intended beta, intended ddc, intended Jn, discarded-draw count
    return (
        pd.Series(sample_beta),
        pop_gs.corrwith(sample_r)[["x_0"]],
        compute_average_jn(
            pop_beta, sample_beta, sample_x, sample_y, model_type="Logit"
        ),
        realised_sample_size,
        pd.Series(sample_beta_full),
        pop_gs.corrwith(sample_r_full)[["x_0"]],
        compute_average_jn(
            pop_beta,
            sample_beta_full,
            sample_x_full,
            sample_y_full,
            model_type="Logit",
        ),
        rejected_draws,
    )

In [16]:
# For every intended sample size: run the replicates in parallel, collect the
# per-replicate results into one table, and pickle that table.
for temp_sample_size in tqdm(ALL_SAMPLE_SIZES):
    # set up how much to sample for this population:
    if temp_sample_size < small_large_sample_co:
        num_iters_per_population = num_iters_per_population_for_small_samples
    else:
        num_iters_per_population = num_iters_per_population_for_large_samples

    # run all the results, with the function to parallel above!
    # Results come back as a generator so the inner bar ticks per finished
    # replicate; `total` is required for tqdm to show a percentage and an ETA,
    # since a generator has no length.
    agg_results = list(
        tqdm(
            Parallel(n_jobs=njobs, backend=multiprocess_backend, return_as="generator")(
                delayed(fn_to_parallel)(pop_data, temp_sample_size)
                for _ in range(num_iters_per_population)
            ),
            total=num_iters_per_population,
            mininterval=10,
        )
    )

    # Slot 7 of each result is the number of draws fn_to_parallel discarded.
    # NOTE(review): the label below says "non-seperable", but the counter is
    # incremented when is_binomial_data_seperable returns truthy — confirm the wording.
    sample_specific_non_separable_count[temp_sample_size] = np.sum(pd.Series([temp_res[7] for temp_res in agg_results]))
    print(f'# of non-seperable samples for sample size {temp_sample_size}: {sample_specific_non_separable_count[temp_sample_size]}')

    # Save the data!
    # concatenate the biased versions:
    temp_samp_beta_biased = pd.Series([temp_res[0][0] for temp_res in agg_results])
    temp_ddc_biased = pd.Series([temp_res[1].iloc[0] for temp_res in agg_results])
    temp_jn_biased = pd.Series([temp_res[2][0].iloc[0] for temp_res in agg_results])
    realised_sizes = pd.Series([temp_res[3] for temp_res in agg_results])

    # concat the SRS versions:
    temp_samp_beta_full = pd.Series([temp_res[4][0] for temp_res in agg_results])
    temp_ddc_full = pd.Series([temp_res[5].iloc[0] for temp_res in agg_results])
    temp_jn_full = pd.Series([temp_res[6][0].iloc[0] for temp_res in agg_results])

    # One row per replicate, one column per collected quantity.
    temp_ss_data = pd.concat(
        [
            temp_samp_beta_biased,
            temp_ddc_biased,
            temp_jn_biased,
            realised_sizes,
            temp_samp_beta_full,
            temp_ddc_full,
            temp_jn_full,
        ],
        axis=1,
    )
    temp_ss_data.columns = [
        "samp_biased",
        "ddc_biased",
        "jn_biased",
        "realized_size_biased",
        "samp_intended",
        "ddc_intended",
        "jn_intended",
    ]

    temp_ss_data["sample_size"] = temp_sample_size
    temp_ss_data["pop_beta"] = pop_beta[0]

    # Per-replicate squared errors of the fitted coefficient against the
    # population coefficient (average over rows to get the MSE).
    temp_ss_data["mse_biased"] = (
        temp_ss_data["pop_beta"] - temp_ss_data["samp_biased"]
    ) ** 2

    temp_ss_data["mse_intended"] = (
        temp_ss_data["pop_beta"] - temp_ss_data["samp_intended"]
    ) ** 2

    to_pickle_obj(f"sim_results/sim_{temp_sample_size}_iter_{iter_val}.pickle", temp_ss_data)

  0%|                                                                                            | 0/8 [00:00<?, ?it/s]
[A [00:00, ?it/s]
[A0it [00:10, 166.83it/s]
[A4it [00:20, 175.64it/s]
[A8it [00:30, 181.05it/s]
[A0it [00:40, 186.00it/s]
[A8it [00:50, 189.23it/s]
[A88it [01:01, 187.13it/s]
[A21it [01:11, 182.79it/s]
[A38it [01:21, 182.43it/s]
[A74it [01:31, 185.63it/s]
[A03it [01:42, 185.97it/s]
[A71it [01:52, 183.65it/s]
[A22it [02:02, 184.03it/s]
25000it [02:15, 183.96it/s]


# of non-seperable samples for sample size 6: 34800


 12%|██████████▍                                                                        | 1/8 [02:17<16:02, 137.52s/it]
[A [00:00, ?it/s]
[A9it [00:10, 213.02it/s]
[A0it [00:20, 208.64it/s]
[A3it [00:30, 211.74it/s]
[A8it [00:40, 212.85it/s]
[A47it [00:50, 213.46it/s]
[A93it [01:01, 208.76it/s]
[A51it [01:11, 210.47it/s]
[A59it [01:21, 213.20it/s]
[A83it [01:31, 215.82it/s]
[A83it [01:42, 215.82it/s]
[A11it [01:42, 206.41it/s]
25000it [01:58, 211.53it/s]


# of non-seperable samples for sample size 7: 23259


 25%|████████████████████▊                                                              | 2/8 [04:17<12:44, 127.34s/it]
[A [00:00, ?it/s]
[A3it [00:10, 235.45it/s]
[A7it [00:20, 240.98it/s]
[A6it [00:30, 241.95it/s]
[A7it [00:41, 231.54it/s]
[A27it [00:51, 231.55it/s]
[A43it [01:01, 231.55it/s]
[A59it [01:11, 230.63it/s]
[A47it [01:22, 223.08it/s]
[A27it [01:32, 224.27it/s]
25000it [01:48, 230.36it/s]


# of non-seperable samples for sample size 9: 12786


 38%|███████████████████████████████▏                                                   | 3/8 [06:08<09:58, 119.60s/it]
[A [00:00, ?it/s]
[A5it [00:10, 238.54it/s]
[A1it [00:20, 237.39it/s]
[A5it [00:30, 242.32it/s]
[A7it [00:40, 244.07it/s]
[A31it [00:50, 248.80it/s]
[A04it [01:01, 243.77it/s]
[A19it [01:11, 246.23it/s]
[A34it [01:21, 247.74it/s]
[A45it [01:31, 245.29it/s]
25000it [01:43, 242.23it/s]


# of non-seperable samples for sample size 11: 7329


 50%|█████████████████████████████████████████▌                                         | 4/8 [07:52<07:34, 113.63s/it]
[A [00:00, ?it/s]
[A5it [00:10, 237.11it/s]
[A1it [00:20, 248.68it/s]
[A0it [00:30, 245.96it/s]
[A39it [00:40, 250.19it/s]
[A15it [00:50, 252.24it/s]
[A75it [01:01, 245.22it/s]
[A35it [01:11, 248.17it/s]
[A80it [01:22, 248.62it/s]
25000it [01:40, 249.44it/s]


# of non-seperable samples for sample size 13: 4490


 62%|███████████████████████████████████████████████████▉                               | 5/8 [09:34<05:28, 109.43s/it]
[A [00:00, ?it/s]
[A3it [00:10, 244.12it/s]
[A9it [00:20, 256.44it/s]
[A9it [00:30, 263.43it/s]
[A37it [00:40, 257.34it/s]
[A43it [00:50, 261.76it/s]
[A42it [01:01, 263.89it/s]
[A23it [01:11, 257.85it/s]
[A55it [01:22, 261.82it/s]
25000it [01:35, 262.20it/s]


# of non-seperable samples for sample size 16: 2339


 75%|██████████████████████████████████████████████████████████████▎                    | 6/8 [11:11<03:30, 105.30s/it]
[A [00:00, ?it/s]
[A7it [00:10, 249.83it/s]
[A3it [00:20, 262.43it/s]
[A5it [00:30, 264.21it/s]
[A19it [00:42, 247.91it/s]
[A07it [00:52, 254.94it/s]
[A59it [01:02, 261.40it/s]
[A05it [01:13, 254.32it/s]
[A67it [01:23, 257.20it/s]
25000it [01:36, 258.04it/s]


# of non-seperable samples for sample size 20: 939


 88%|████████████████████████████████████████████████████████████████████████▋          | 7/8 [12:50<01:43, 103.18s/it]
[A [00:00, ?it/s]
[A5it [00:10, 263.36it/s]
[A9it [00:20, 255.93it/s]
[A5it [00:30, 262.63it/s]
[A5it [00:40, 262.63it/s]
[A99it [00:40, 263.63it/s]
[A67it [00:50, 253.78it/s]
[A67it [00:50, 253.78it/s]
[A67it [01:00, 253.78it/s]
[A59it [01:00, 255.40it/s]
[A59it [01:10, 255.40it/s]
[A67it [01:10, 243.45it/s]
[A56it [01:21, 234.37it/s]
25000it [01:41, 245.73it/s]


# of non-seperable samples for sample size 25: 303


100%|███████████████████████████████████████████████████████████████████████████████████| 8/8 [14:33<00:00, 109.21s/it]
