How the simulation is done:
- First, a base simulation is done per sample size.
- Then, as needed, more simulations are done by incrementing the iteration value (`iter_val`), which changes both the random seed and the name of the output file.

Here, we only compute the MSEs for the intended sample, which makes the simulation faster than before.

# Imports:

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import statsmodels.api as sm

import sys
sys.path.append('../')
from ddc_utils import is_binomial_data_seperable
from jzhou_utils import save_obj_pickle

# Hyperparams:

In [4]:
# Run identifiers. Both feed the RNG seed (333 * pop_index + iter_val);
# iter_val is also embedded in the name of the results file written at the
# end of the notebook.
# NOTE(review): pop_index presumably identifies which finite population is
# being used -- confirm against the population-generation notebook.
pop_index = 1
iter_val = 11

In [5]:
# Seeded NumPy Generator for this run. The seed is a deterministic function of
# (pop_index, iter_val), so each iteration batch gets its own random stream.
# Any draw that should be reproducible has to go through this generator rather
# than through the global `np.random` state.
rand_generator = np.random.default_rng(333 * pop_index + iter_val)

In [6]:
# Number of rows the finite population is expected to have; used to reshape
# the population response/prediction vectors into column vectors.
population_size = 100_000
# Number of feature columns x_0 ... x_{k-1} used in the logistic model.
number_of_coefficients = 1

# Monte Carlo replications per sample size. Sample sizes strictly below
# `small_large_sample_co` use the "small" count, all others the "large" count.
num_iters_per_population_for_small_samples = 50_000
num_iters_per_population_for_large_samples = 1_000
small_large_sample_co = 100

# biased sampling scheme params:
# NOTE(review): these two values are not referenced anywhere in the visible
# part of this notebook (the sampling here is simple random sampling).
sample_probability_centering = 0.77
sample_probability_bias_factor = 1

In [7]:
# Sample sizes to simulate: every integer from 3 up to and including 14.
ALL_SAMPLE_SIZES = list(range(3, 15))

In [8]:
# Sanity check: number of distinct sample sizes that will be simulated.
len(ALL_SAMPLE_SIZES)

12

# Run:

## Load Finite Population Data:

In [9]:
# Read the pre-generated finite population (a pickled pandas object) from the
# current working directory.
# NOTE(review): the trailing "1" in the file name is hard-coded; if it is meant
# to track `pop_index`, the name should be built from that variable -- confirm.
# SECURITY: unpickling executes arbitrary code; only load files you produced.
pickle_filename = "base_population_data_Logit_1.pickle"
pop_data = pd.read_pickle(pickle_filename)

## get population-level statistics:

In [None]:
# Feature columns are named x_0, x_1, ..., one per model coefficient.
feature_cols = [f"x_{coef_idx}" for coef_idx in range(number_of_coefficients)]
pop_x = pop_data[feature_cols]
pop_y = pop_data["y"]

# Logistic regression fitted on the ENTIRE finite population; its coefficients
# are the target that the sample-based fits are compared against.
pop_model = sm.Logit(endog=pop_y, exog=pop_x).fit(disp=0)
pop_beta = np.array(pop_model.params)

# Per-unit residuals (observed y minus in-sample prediction) as a column
# vector. The explicit shape doubles as a check that the population really has
# `population_size` rows.
column_shape = (population_size, 1)
pop_residuals = np.array(pop_y).reshape(column_shape) - np.array(
    pop_model.predict()
).reshape(column_shape)

# Features scaled row-wise by the residual, i.e. x_i * (y_i - p_i): the
# per-unit score contributions of the logistic log-likelihood.
pop_gs = pop_x * pop_residuals

## iterate:

In [13]:
# Result accumulators, keyed by sample size and filled by the simulation loop:
#   mse_means[n]  -> mean squared error of the fitted coefficient
#   mse_stdevs[n] -> std of the squared errors divided by sqrt(#replications),
#                    i.e. the standard error of that mean
mse_means = dict()
mse_stdevs = dict()

In [14]:
# Monte Carlo loop. For every sample size: repeatedly draw a simple random
# sample (without replacement) from the finite population, refit the logistic
# regression on that sample, and record the squared error of the fitted
# coefficient relative to the population-level fit `pop_beta`.
for temp_sample_size in tqdm(ALL_SAMPLE_SIZES):
    print(temp_sample_size)
    # where the data will be saved (one fitted-coefficient Series per replication):
    all_sample_beta_full = []
    # never filled in this cell; kept only so the name stays defined:
    all_realized_sample_sizes = []

    # NOTE: despite its name, this counts the draws that were REJECTED because
    # the sampled data turned out to be separable.
    non_separable_count = 0

    # sample sizes below the cutoff get the "small sample" replication count:
    if temp_sample_size < small_large_sample_co:
        num_iters_per_population = num_iters_per_population_for_small_samples
    else:
        num_iters_per_population = num_iters_per_population_for_large_samples

    for _ in trange(num_iters_per_population, mininterval=10):
        # First, sample a valid dataset (one that is not separable); redraw
        # until such a sample is obtained.
        obtained_valid_sample = False

        while not obtained_valid_sample:
            # intended sample: flag `temp_sample_size` units with r0 = 1.
            # The draw goes through the seeded `rand_generator` (not the
            # global `np.random` state) so that a run is reproducible and
            # different `iter_val` batches use different random streams.
            pop_data["r0"] = 0
            pop_data.loc[
                rand_generator.choice(
                    pop_data.index, size=temp_sample_size, replace=False
                ),
                "r0",
            ] = 1

            full_sampled_data = pop_data[pop_data["r0"] == 1]

            # only small samples are screened for separability; a separable
            # sample is discarded and a fresh one is drawn:
            if temp_sample_size < 1_000 and is_binomial_data_seperable(
                full_sampled_data, "y", "x_0"
            ):
                non_separable_count += 1
                continue

            obtained_valid_sample = True

        # Then, compute the logistic fit on the accepted sample.
        sample_x_full = full_sampled_data[feature_cols]
        sample_y_full = full_sampled_data["y"]
        # NOTE(review): convergence of the fit is not checked, so estimates
        # from non-converged fits are included in the MSE.
        sample_beta_full = np.array(
            sm.Logit(endog=sample_y_full, exog=sample_x_full)
            .fit(disp=0, maxiter=500)
            .params
        )

        all_sample_beta_full.append(pd.Series(sample_beta_full))

    # squared error of every fitted coefficient against the population value:
    all_mses = (
        pd.concat(all_sample_beta_full).reset_index(drop=True) - pop_beta
    ) ** 2
    mse_means[temp_sample_size] = np.mean(all_mses)
    # standard error of the mean squared error (std with ddof=0 over sqrt(n)):
    mse_stdevs[temp_sample_size] = np.std(all_mses) / np.sqrt(
        num_iters_per_population
    )

  0%|                                                                                           | 0/12 [00:00<?, ?it/s]

3



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|█▏                                                                            | 783/50000 [00:10<10:28, 78.25it/s]
[A%|█▏                                                                            | 783/50000 [00:20<10:28, 78.25it/s]
[A%|██▍                                                                          | 1587/50000 [00:20<10:09, 79.39it/s]
[A%|███▋                                                                         | 2389/50000 [00:30<10:08, 78.26it/s]
[A%|████▉                                                                        | 3168/50000 [00:40<09:59, 78.10it/s]
[A%|██████                                                                       | 3947/50000 [00:50<09:51, 77.81it/s]
[A%|███████▎                                                                     | 4720/50000 [01:00<09:51, 76.55it/s]
[A%|████████▌                         

4



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|██▏                                                                         | 1444/50000 [00:10<05:36, 144.31it/s]
[A%|████▍                                                                       | 2899/50000 [00:20<05:24, 144.97it/s]
[A%|██████▋                                                                     | 4372/50000 [00:30<05:12, 146.01it/s]
[A%|████████▉                                                                   | 5846/50000 [00:40<05:01, 146.54it/s]
[A%|███████████▏                                                                | 7333/50000 [00:50<04:49, 147.31it/s]
[A%|█████████████▍                                                              | 8820/50000 [01:00<04:45, 144.43it/s]
[A%|███████████████▍                                                           | 10286/50000 [01:10<04:33, 145.12it/s]
[A%|█████████████████▋                

5



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|██▊                                                                         | 1822/50000 [00:10<04:24, 182.11it/s]
[A%|█████▋                                                                      | 3744/50000 [00:20<04:06, 188.02it/s]
[A%|████████▋                                                                   | 5724/50000 [00:30<03:49, 192.57it/s]
[A%|███████████▋                                                                | 7695/50000 [00:40<03:38, 193.35it/s]
[A%|███████████▋                                                                | 7695/50000 [00:40<03:38, 193.35it/s]
[A%|██████████████▋                                                             | 9671/50000 [00:50<03:26, 194.84it/s]
[A%|██████████████▋                                                             | 9671/50000 [01:00<03:26, 194.84it/s]
[A%|█████████████████▍                

6



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|███▌                                                                        | 2304/50000 [00:10<03:27, 230.39it/s]
[A%|███████                                                                     | 4608/50000 [00:20<03:17, 230.28it/s]
[A%|██████████▌                                                                 | 6912/50000 [00:30<03:07, 230.32it/s]
[A%|██████████████                                                              | 9216/50000 [00:40<02:58, 228.62it/s]
[A%|█████████████████▏                                                         | 11477/50000 [00:50<02:50, 225.74it/s]
[A%|████████████████████▌                                                      | 13738/50000 [01:00<02:40, 225.84it/s]
[A%|████████████████████████                                                   | 16017/50000 [01:10<02:30, 226.49it/s]
[A%|███████████████████████████▍      

7



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|███▉                                                                        | 2553/50000 [00:10<03:05, 255.30it/s]
[A%|███████▊                                                                    | 5106/50000 [00:20<02:57, 252.30it/s]
[A%|███████████▌                                                                | 7625/50000 [00:30<02:48, 252.08it/s]
[A%|███████████████▎                                                           | 10222/50000 [00:40<02:35, 255.05it/s]
[A%|███████████████████▏                                                       | 12819/50000 [00:50<02:26, 254.37it/s]
[A%|███████████████████████                                                    | 15378/50000 [01:00<02:15, 254.85it/s]
[A%|██████████████████████████▉                                                | 17936/50000 [01:10<02:06, 253.25it/s]
[A%|██████████████████████████████▋   

8



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|███▉                                                                        | 2586/50000 [00:10<03:03, 258.55it/s]
[A%|████████                                                                    | 5289/50000 [00:20<02:48, 265.46it/s]
[A%|████████████▏                                                               | 8028/50000 [00:30<02:35, 269.29it/s]
[A%|████████████████▏                                                          | 10769/50000 [00:40<02:24, 271.19it/s]
[A%|████████████████████▎                                                      | 13510/50000 [00:50<02:15, 268.88it/s]
[A%|████████████████████████▏                                                  | 16166/50000 [01:00<02:06, 267.74it/s]
[A%|████████████████████████████▎                                              | 18840/50000 [01:10<01:56, 267.60it/s]
[A%|████████████████████████████████▎ 

9



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|████▍                                                                       | 2908/50000 [00:10<02:41, 290.76it/s]
[A%|████████▊                                                                   | 5832/50000 [00:20<02:31, 291.68it/s]
[A%|█████████████▎                                                              | 8756/50000 [00:30<02:21, 291.44it/s]
[A%|█████████████████▌                                                         | 11668/50000 [00:40<02:11, 290.86it/s]
[A%|█████████████████████▊                                                     | 14583/50000 [00:50<02:01, 291.07it/s]
[A%|██████████████████████████▏                                                | 17498/50000 [01:00<01:52, 289.45it/s]
[A%|██████████████████████████████▌                                            | 20372/50000 [01:10<01:42, 288.71it/s]
[A%|██████████████████████████████████

10



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|████▍                                                                       | 2912/50000 [00:10<02:41, 291.18it/s]
[A%|████████▉                                                                   | 5903/50000 [00:20<02:29, 295.81it/s]
[A%|█████████████▌                                                              | 8894/50000 [00:30<02:18, 297.09it/s]
[A%|█████████████████▊                                                         | 11881/50000 [00:40<02:08, 297.61it/s]
[A%|██████████████████████▎                                                    | 14865/50000 [00:50<01:58, 296.78it/s]
[A%|██████████████████████████▋                                                | 17819/50000 [01:00<01:48, 295.32it/s]
[A%|███████████████████████████████                                            | 20745/50000 [01:10<01:39, 293.71it/s]
[A%|██████████████████████████████████

11



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|████▌                                                                       | 3022/50000 [00:10<02:35, 302.09it/s]
[A%|█████████▏                                                                  | 6049/50000 [00:20<02:25, 302.42it/s]
[A%|█████████████▊                                                              | 9078/50000 [00:30<02:15, 302.57it/s]
[A%|██████████████████▏                                                        | 12106/50000 [00:40<02:05, 301.84it/s]
[A%|██████████████████████▋                                                    | 15114/50000 [00:50<01:56, 298.83it/s]
[A%|███████████████████████████                                                | 18051/50000 [01:00<01:48, 294.44it/s]
[A%|███████████████████████████████▍                                           | 20978/50000 [01:10<01:38, 293.82it/s]
[A%|██████████████████████████████████

12



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|████▋                                                                       | 3090/50000 [00:10<02:31, 309.00it/s]
[A%|█████████▍                                                                  | 6180/50000 [00:20<02:24, 304.03it/s]
[A%|█████████████▉                                                              | 9187/50000 [00:30<02:17, 296.49it/s]
[A%|██████████████████▏                                                        | 12098/50000 [00:40<02:08, 294.38it/s]
[A%|██████████████████████▌                                                    | 15032/50000 [00:50<01:58, 294.00it/s]
[A%|██████████████████████████▉                                                | 17976/50000 [01:00<01:48, 294.11it/s]
[A%|███████████████████████████████▍                                           | 20981/50000 [01:10<01:37, 296.15it/s]
[A%|██████████████████████████████████

13



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|████▋                                                                       | 3092/50000 [00:10<02:31, 309.18it/s]
[A%|█████████▍                                                                  | 6193/50000 [00:20<02:21, 309.69it/s]
[A%|██████████████▏                                                             | 9294/50000 [00:30<02:11, 309.19it/s]
[A%|██████████████████▌                                                        | 12410/50000 [00:40<02:01, 310.13it/s]
[A%|███████████████████████▎                                                   | 15526/50000 [00:50<01:51, 310.05it/s]
[A%|████████████████████████████                                               | 18672/50000 [01:00<01:40, 311.55it/s]
[A%|████████████████████████████████▋                                          | 21817/50000 [01:10<01:30, 311.91it/s]
[A%|██████████████████████████████████

14



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|████▊                                                                       | 3170/50000 [00:10<02:27, 316.95it/s]
[A%|█████████▋                                                                  | 6340/50000 [00:20<02:18, 315.45it/s]
[A%|██████████████▍                                                             | 9496/50000 [00:30<02:08, 315.50it/s]
[A%|██████████████████▉                                                        | 12652/50000 [00:40<01:58, 314.17it/s]
[A%|███████████████████████▋                                                   | 15774/50000 [00:50<01:49, 311.74it/s]
[A%|████████████████████████████▎                                              | 18863/50000 [01:00<01:40, 310.78it/s]
[A%|████████████████████████████████▉                                          | 21952/50000 [01:10<01:30, 308.57it/s]
[A%|██████████████████████████████████

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 25%|███████████████████▎                                                         | 2502/10000 [03:25<10:46, 11.60it/s][A
  result = getattr(ufunc, method)(*inputs, **kwargs)

 27%|█████████████████████                                                        | 2731/10000 [03:47<10:57, 11.05it/s][A
 28%|█████████████████████▊                                                       | 2839/10000 [03:57<10:52, 10.97it/s][A
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 32%|████████████████████████▌                                                    | 3188/10000 [04:27<09:58, 11.38it/s][A
  result = getattr(ufunc,

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 47%|████████████████████████████████████▎                                        | 4717/10000 [06:41<07:53, 11.15it/s][A
  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 54%|█████████████████████████████████████████▍                                   | 5376/10000 [07:43<07:09, 10.75it/s][A
 55%|██████████████████████████████████████████▏                                  | 5482/10000 [07:53<07:02, 10.69it/s][A
  result = getattr(ufunc, method)(*inputs,


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 76%|██████████████████████████████████████████████████████████▋                  | 7625/10000 [11:10<03:30, 11.30it/s][A
 76%|██████████████████████████████████████████████████████████▋                  | 7625/10000 [11:21<03:30, 11.30it/s][A
 77%|███████████████████████████████████████████████████████████▌                 | 7737/10000 [11:21<03:24, 11.07it/s][A
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 83%|███████████████████████████████████████████████████████████████▋             | 8277/10000 

## merge data, save:

In [None]:
# Turn the two {sample_size: value} dicts into single-column frames indexed by
# sample size. Note that this rebinds `mse_means` / `mse_stdevs` from dicts to
# DataFrames, so the cell is not meant to be run twice in a row.
mse_means = pd.DataFrame.from_dict(mse_means, orient="index").set_axis(
    ["mean"], axis=1
)
mse_stdevs = pd.DataFrame.from_dict(mse_stdevs, orient="index").set_axis(
    ["stdev"], axis=1
)

# One row per sample size: mean MSE, its standard error, and the number of
# replications (re-derived from the same cutoff rule the simulation used).
mse_data = mse_means.join(mse_stdevs)
is_small_sample = mse_data.index < small_large_sample_co
mse_data["count"] = np.where(
    is_small_sample,
    num_iters_per_population_for_small_samples,
    num_iters_per_population_for_large_samples,
)

# Persist the results; the file name carries the iteration batch number.
save_obj_pickle(f"srs_sim_results/sim_srs_iter_{iter_val}.pickle", mse_data)