How the simulation is done:
- First, a base simulation is run for each sample size.
- Then, as needed, additional simulations are run by incrementing the iteration value (`iter_val`), which changes the random seed and the name of the output file.

# Imports:

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import statsmodels.api as sm
import pickle

# Helper Functions:

In [2]:
import sys

sys.path.append('../')

from ddc_utils import *
from data_generating_utils import *

In [3]:
def is_binomial_data_seperable(
    df: pd.DataFrame, binary_col: str, cont_col: str
) -> bool:
    """
    Report whether the continuous column perfectly separates the binary column.

    The data counts as separable when fewer than two classes of `binary_col`
    are present, or when every `cont_col` value of one class lies strictly
    above every value of the other class (in either direction). Ties at the
    boundary are not treated as separation. Only the first two classes found
    by the group-by are compared.
    """
    # one row per class, with the [min, max] of the continuous column
    class_ranges = (
        df[[binary_col, cont_col]].groupby(binary_col)[cont_col].agg(["min", "max"])
    )

    # a single class (or no data at all) cannot be fitted: treat as separable
    if len(class_ranges) < 2:
        return True

    first_min, first_max = class_ranges.iloc[0, 0], class_ranges.iloc[0, 1]
    second_min, second_max = class_ranges.iloc[1, 0], class_ranges.iloc[1, 1]

    # separable exactly when the two classes' value ranges do not overlap
    return (first_min > second_max) or (second_min > first_max)

def to_pickle_obj(file_path, raw_data):
    """
    Serialize `raw_data` with pickle and write it to `file_path`.

    Missing parent directories of `file_path` are created first, so a long
    run does not fail at the final save step just because the output folder
    does not exist yet. An existing file at `file_path` is overwritten.
    """
    import os  # local import: `os` is not among the notebook-level imports

    # only path-like targets have a parent directory (open() also accepts
    # integer file descriptors, which are passed through untouched)
    if isinstance(file_path, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, "wb") as handle:
        pickle.dump(raw_data, handle)

def read_pickle_obj(file_path):
    """
    Load and return the object that was pickled into `file_path`.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source (e.g. ones written by `to_pickle_obj`).
    """
    with open(file_path, mode="rb") as pickle_file:
        loaded_obj = pickle.Unpickler(pickle_file).load()
    return loaded_obj

# Hyperparams:

In [4]:
# Run identifiers: together they determine the seed of `rand_generator`, and
# `iter_val` also names the results file written at the end of the notebook.
pop_index = 1
iter_val = 11

In [5]:
# NumPy random Generator seeded deterministically from the run identifiers
# (`pop_index`, `iter_val`).
rand_generator = np.random.default_rng(seed=333 * pop_index + iter_val)

In [6]:
# number of rows expected in the finite population (used to reshape the
# population-level arrays into column vectors)
population_size = 100_000
# number of feature columns (x_0, x_1, ...) used in the logistic model
number_of_coefficients = 1

# Monte Carlo repetitions per sample size: sample sizes below
# `small_large_sample_co` use the "small samples" count, all others the
# "large samples" count
num_iters_per_population_for_small_samples = 50_000
num_iters_per_population_for_large_samples = 1_000
small_large_sample_co = 100

# biased sampling scheme params:
# NOTE(review): these two are not referenced anywhere else in the visible
# notebook — confirm whether they are still needed here
sample_probability_centering = 0.77
sample_probability_bias_factor = 1

In [7]:
# Sample sizes to simulate: every integer from 3 through 14 (inclusive).
ALL_SAMPLE_SIZES = list(range(3, 15))

In [8]:
# display how many sample sizes will be simulated
len(ALL_SAMPLE_SIZES)

12

# Load Finite Population Data:

In [9]:
# Load the pre-generated finite population. The frame is expected to provide
# the feature columns (`x_0`, ...) and the binary outcome column `y` that the
# cells below read from it.
# NOTE(review): the trailing "1" in the file name presumably corresponds to
# `pop_index` — confirm before pointing this notebook at another population.
# NOTE: unpickling can execute code stored in the file; only load trusted files.
pickle_filename = 'base_population_data_Logit_1.pickle'
pop_data = pd.read_pickle(pickle_filename)

# Run:

In [10]:
# names of the feature columns: x_0, ..., x_{number_of_coefficients - 1}
feature_cols = [f'x_{i}' for i in range(number_of_coefficients)]

In [11]:
# container for per-sample-size counts of rejected draws
# (presumably keyed by sample size — TODO confirm)
sample_specific_non_separable_count = {}

#### get population-level statistics:

In [12]:
# population design matrix and binary outcome
pop_x = pop_data[feature_cols]
pop_y = pop_data["y"]

# logistic regression fitted on the whole finite population (optimizer output silenced)
pop_model = sm.Logit(endog=pop_y, exog=pop_x).fit(disp=0)
pop_beta = np.array(pop_model.params)

# per-unit residuals (observed y minus the model's prediction) as a column vector,
# then each row of x scaled by its residual
_pop_residuals = (
    np.array(pop_y).reshape((population_size, 1))
    - np.array(pop_model.predict()).reshape((population_size, 1))
)
pop_gs = pop_x * _pop_residuals

#### actually run:

In [13]:
# per-sample-size results, keyed by sample size:
#   mse_means  -> Monte Carlo mean of the squared error of the estimated coefficient
#   mse_stdevs -> standard error of that mean
mse_means = {}
mse_stdevs = {}

In [14]:
# Monte Carlo loop: for every sample size, repeatedly draw a simple random
# sample from the finite population, fit a logistic regression on it, and
# summarize the squared error of the estimated coefficient against the
# population-level coefficient.
for temp_sample_size in tqdm(ALL_SAMPLE_SIZES):
    print(temp_sample_size)
    # where the data will be saved:
    all_sample_beta_full = []

    # never appended to in this loop
    all_realized_sample_sizes = []

    # number of draws rejected for this sample size because they were separable
    # (despite its name, this counts the *separable* draws)
    non_separable_count = 0

    # set up how much to sample for this population:
    if temp_sample_size < small_large_sample_co:
        num_iters_per_population = num_iters_per_population_for_small_samples
    else:
        num_iters_per_population = num_iters_per_population_for_large_samples

    for _ in trange(num_iters_per_population, mininterval=10):
        # First, sample a valid dataset (one which is not separable):
        obtained_valid_sample = False

        while not obtained_valid_sample:
            # intended sample: simple random sample without replacement.
            # Drawn from the seeded `rand_generator` (not the global NumPy
            # state) so that the seed derived from pop_index / iter_val
            # actually controls the draws and a run is reproducible.
            pop_data["r0"] = 0
            pop_data.loc[
                rand_generator.choice(
                    pop_data.index, size=temp_sample_size, replace=False
                ),
                "r0",
            ] = 1

            full_sampled_data = pop_data[pop_data["r0"] == 1]

            # if the sample size is too small, check for separability and
            # redraw when the sample cannot be fitted.
            # NOTE(review): only "x_0" is checked, which matches the single
            # feature used here — revisit if more coefficients are added.
            if temp_sample_size < 1_000:
                if is_binomial_data_seperable(full_sampled_data, "y", "x_0"):
                    non_separable_count = non_separable_count + 1
                    continue

            obtained_valid_sample = True

        # Then, fit the logistic regression on the accepted sample:
        sample_x_full, sample_y_full = (
            full_sampled_data[feature_cols],
            full_sampled_data["y"],
        )
        sample_beta_full = np.array(
            sm.Logit(endog=sample_y_full, exog=sample_x_full)
            .fit(disp=0, maxiter=500)
            .params
        )

        # keep the estimated coefficient(s) of this replication:
        all_sample_beta_full.append(pd.Series(sample_beta_full))

    # record how many separable draws had to be rejected for this sample size
    sample_specific_non_separable_count[temp_sample_size] = non_separable_count

    # squared error of every replication's estimate against the population beta
    # NOTE(review): pooling all estimates into one series assumes a single
    # coefficient (number_of_coefficients == 1) — confirm before adding features.
    all_mses = (pd.concat(all_sample_beta_full).reset_index(drop=True) - pop_beta) ** 2
    mse_means[temp_sample_size] = np.mean(all_mses)
    # standard error of the Monte Carlo mean
    mse_stdevs[temp_sample_size] = np.std(all_mses) / np.sqrt(num_iters_per_population)

  0%|                                                                                           | 0/12 [00:00<?, ?it/s]

3



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|█▏                                                                            | 736/50000 [00:10<11:09, 73.57it/s]
[A%|██▎                                                                          | 1514/50000 [00:20<10:37, 76.06it/s]
[A%|██▎                                                                          | 1514/50000 [00:30<10:37, 76.06it/s]
[A%|███▍                                                                         | 2246/50000 [00:30<10:39, 74.64it/s]
[A%|███▍                                                                         | 2246/50000 [00:40<10:39, 74.64it/s]
[A%|████▋                                                                        | 3016/50000 [00:40<10:21, 75.54it/s]
[A%|████▋                                                                        | 3016/50000 [00:50<10:21, 75.54it/s]
[A%|█████▊                            

4



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|██▏                                                                         | 1436/50000 [00:10<05:38, 143.56it/s]
[A%|████▍                                                                       | 2902/50000 [00:20<05:24, 145.33it/s]
[A%|██████▋                                                                     | 4368/50000 [00:30<05:22, 141.61it/s]
[A%|████████▊                                                                   | 5808/50000 [00:40<05:10, 142.52it/s]
[A%|███████████                                                                 | 7248/50000 [00:50<05:00, 142.48it/s]
[A%|█████████████▏                                                              | 8674/50000 [01:00<04:49, 142.51it/s]
[A%|███████████████▏                                                           | 10131/50000 [01:10<04:37, 143.50it/s]
[A%|█████████████████▍                

5



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|██▉                                                                         | 1970/50000 [00:10<04:03, 196.98it/s]
[A%|██████                                                                      | 3995/50000 [00:20<03:49, 200.21it/s]
[A%|█████████▏                                                                  | 6020/50000 [00:30<03:41, 198.33it/s]
[A%|████████████▏                                                               | 7982/50000 [00:40<03:36, 194.31it/s]
[A%|███████████████                                                             | 9879/50000 [00:50<03:28, 192.66it/s]
[A%|█████████████████▋                                                         | 11776/50000 [01:00<03:20, 190.67it/s]
[A%|████████████████████▋                                                      | 13755/50000 [01:10<03:07, 193.00it/s]
[A%|███████████████████████▌          

6



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|███▌                                                                        | 2328/50000 [00:10<03:24, 232.68it/s]
[A%|███████                                                                     | 4655/50000 [00:20<03:20, 226.21it/s]
[A%|██████████▌                                                                 | 6915/50000 [00:30<03:10, 226.09it/s]
[A%|█████████████▉                                                              | 9200/50000 [00:40<02:59, 227.01it/s]
[A%|█████████████████▏                                                         | 11485/50000 [00:50<02:50, 225.97it/s]
[A%|████████████████████▋                                                      | 13806/50000 [01:00<02:38, 228.03it/s]
[A%|████████████████████▋                                                      | 13806/50000 [01:10<02:38, 228.03it/s]
[A%|████████████████████████▏         

7



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|███▉                                                                        | 2554/50000 [00:10<03:05, 255.32it/s]
[A%|███████▊                                                                    | 5108/50000 [00:20<02:58, 251.57it/s]
[A%|███████████▌                                                                | 7610/50000 [00:30<02:48, 250.94it/s]
[A%|███████████▌                                                                | 7610/50000 [00:40<02:48, 250.94it/s]
[A%|███████████████                                                             | 9891/50000 [00:40<02:46, 240.97it/s]
[A%|███████████████                                                             | 9891/50000 [00:50<02:46, 240.97it/s]
[A%|██████████████████▏                                                        | 12094/50000 [00:50<02:42, 233.36it/s]
[A%|█████████████████████▍            

8



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|███▊                                                                        | 2528/50000 [00:10<03:07, 252.74it/s]
[A%|███████▌                                                                    | 4983/50000 [00:20<03:02, 247.09it/s]
[A%|███████▌                                                                    | 4983/50000 [00:20<03:02, 247.09it/s]
[A%|███████████▎                                                                | 7455/50000 [00:30<02:52, 247.11it/s]
[A%|███████████████▏                                                            | 9957/50000 [00:40<02:41, 248.29it/s]
[A%|███████████████▏                                                            | 9957/50000 [00:50<02:41, 248.29it/s]
[A%|██████████████████▌                                                        | 12364/50000 [00:50<02:33, 245.37it/s]
[A%|██████████████████████            

9



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|███▋                                                                        | 2449/50000 [00:10<03:14, 244.76it/s]
[A%|███████▍                                                                    | 4929/50000 [00:20<03:02, 246.65it/s]
[A%|███████████▎                                                                | 7479/50000 [00:30<02:49, 250.45it/s]
[A%|███████████████                                                            | 10080/50000 [00:40<02:37, 254.23it/s]
[A%|███████████████████                                                        | 12681/50000 [00:50<02:26, 254.25it/s]
[A%|██████████████████████▊                                                    | 15224/50000 [01:00<02:19, 248.93it/s]
[A%|██████████████████████████▌                                                | 17716/50000 [01:10<02:09, 249.01it/s]
[A%|██████████████████████████████▍   

10



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|███▉                                                                        | 2601/50000 [00:10<03:02, 260.05it/s]
[A%|███████▉                                                                    | 5202/50000 [00:20<02:53, 258.01it/s]
[A%|███████████▊                                                                | 7775/50000 [00:30<02:43, 257.67it/s]
[A%|███████████████▌                                                           | 10348/50000 [00:40<02:34, 256.59it/s]
[A%|███████████████████▍                                                       | 12976/50000 [00:50<02:23, 258.82it/s]
[A%|███████████████████████▍                                                   | 15604/50000 [01:00<02:12, 259.38it/s]
[A%|███████████████████████████▎                                               | 18229/50000 [01:10<02:02, 260.38it/s]
[A%|███████████████████████████████▎  

11



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|███▉                                                                        | 2593/50000 [00:10<03:02, 259.15it/s]
[A%|███████▉                                                                    | 5185/50000 [00:20<02:54, 256.62it/s]
[A%|███████████▊                                                                | 7734/50000 [00:30<02:46, 253.43it/s]
[A%|███████████████▍                                                           | 10331/50000 [00:40<02:35, 255.85it/s]
[A%|███████████████████▍                                                       | 12928/50000 [00:52<02:35, 238.67it/s]
[A%|██████████████████████▌                                                    | 15062/50000 [01:02<02:32, 229.32it/s]
[A%|██████████████████████████▎                                                | 17546/50000 [01:12<02:17, 235.28it/s]
[A%|██████████████████████████████    

12



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|████▍                                                                       | 2953/50000 [00:10<02:39, 295.21it/s]
[A%|█████████                                                                   | 5948/50000 [00:20<02:27, 297.66it/s]
[A%|█████████████▋                                                              | 8979/50000 [00:30<02:16, 300.11it/s]
[A%|██████████████████                                                         | 12010/50000 [00:40<02:07, 298.51it/s]
[A%|██████████████████████▌                                                    | 15025/50000 [00:50<01:56, 299.57it/s]
[A%|███████████████████████████                                                | 18059/50000 [01:00<01:46, 300.85it/s]
[A%|███████████████████████████████▋                                           | 21093/50000 [01:10<01:35, 301.30it/s]
[A%|██████████████████████████████████

13



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|████▌                                                                       | 3007/50000 [00:10<02:36, 300.64it/s]
[A%|█████████▏                                                                  | 6056/50000 [00:20<02:24, 303.12it/s]
[A%|█████████████▊                                                              | 9120/50000 [00:30<02:14, 304.61it/s]
[A%|██████████████████▎                                                        | 12184/50000 [00:40<02:04, 302.96it/s]
[A%|██████████████████████▊                                                    | 15211/50000 [00:50<01:54, 302.84it/s]
[A%|███████████████████████████▎                                               | 18238/50000 [01:00<01:45, 301.38it/s]
[A%|███████████████████████████████▊                                           | 21224/50000 [01:10<01:35, 300.30it/s]
[A%|██████████████████████████████████

14



[A%|                                                                                        | 0/50000 [00:00<?, ?it/s]
[A%|████▋                                                                       | 3053/50000 [00:10<02:33, 305.23it/s]
[A%|█████████▎                                                                  | 6106/50000 [00:20<02:25, 301.18it/s]
[A%|█████████████▊                                                              | 9106/50000 [00:30<02:16, 300.64it/s]
[A%|██████████████████▏                                                        | 12106/50000 [00:40<02:06, 300.37it/s]
[A%|██████████████████████▊                                                    | 15179/50000 [00:50<01:54, 302.84it/s]
[A%|███████████████████████████▍                                               | 18257/50000 [01:00<01:44, 304.52it/s]
[A%|████████████████████████████████                                           | 21376/50000 [01:10<01:33, 306.93it/s]
[A%|██████████████████████████████████

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 25%|███████████████████▎                                                         | 2502/10000 [03:25<10:46, 11.60it/s][A
  result = getattr(ufunc, method)(*inputs, **kwargs)

 27%|█████████████████████                                                        | 2731/10000 [03:47<10:57, 11.05it/s][A
 28%|█████████████████████▊                                                       | 2839/10000 [03:57<10:52, 10.97it/s][A
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 32%|████████████████████████▌                                                    | 3188/10000 [04:27<09:58, 11.38it/s][A
  result = getattr(ufunc,

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 47%|████████████████████████████████████▎                                        | 4717/10000 [06:41<07:53, 11.15it/s][A
  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 54%|█████████████████████████████████████████▍                                   | 5376/10000 [07:43<07:09, 10.75it/s][A
 55%|██████████████████████████████████████████▏                                  | 5482/10000 [07:53<07:02, 10.69it/s][A
  result = getattr(ufunc, method)(*inputs,


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 76%|██████████████████████████████████████████████████████████▋                  | 7625/10000 [11:10<03:30, 11.30it/s][A
 76%|██████████████████████████████████████████████████████████▋                  | 7625/10000 [11:21<03:30, 11.30it/s][A
 77%|███████████████████████████████████████████████████████████▌                 | 7737/10000 [11:21<03:24, 11.07it/s][A
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)

 83%|███████████████████████████████████████████████████████████████▋             | 8277/10000 

In [15]:
# Turn the per-sample-size result dicts into single-column DataFrames indexed
# by sample size. The column is named directly via `columns=`, so each frame
# is built in one step and an empty result dict no longer trips a
# length-mismatch error when the column label is assigned afterwards.
mse_means = pd.DataFrame.from_dict(mse_means, orient="index", columns=["mean"])

mse_stdevs = pd.DataFrame.from_dict(mse_stdevs, orient="index", columns=["stdev"])

In [16]:
# combine the mean and its standard error into one table (joined on the shared index)
mse_data = mse_means.join(mse_stdevs)

In [17]:
# Attach the number of Monte Carlo iterations behind each row: index values at
# or above the small/large cut-off get the "large samples" count, all others
# the "small samples" count.
mse_data["count"] = np.where(
    mse_data.index >= small_large_sample_co,
    num_iters_per_population_for_large_samples,
    num_iters_per_population_for_small_samples,
)

In [18]:
# persist the results table; the file name is keyed by `iter_val`
to_pickle_obj(f"srs_sim_results/sim_srs_iter_{iter_val}.pickle", mse_data)