can also check the linear independence of rho and Jn here as well!

# Imports:

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression

from scipy.stats import chi2
from collections import Counter

from itertools import permutations
from tqdm import tqdm, trange

In [2]:
import matplotlib.pyplot as plt

import pickle

import statsmodels.api as sm

from collections import defaultdict

from scipy.linalg import sqrtm

from scipy.special import expit, logit

# Helper Functions:

In [3]:
from scipy.stats import norm

note that for the cloglog link $g(\mu) = \log(-\log(1-\mu))$, its derivative is $-1/(\log(1-x)(1-x))$, while for the probit link, it is simply $1/\phi(\Phi^{-1}(\mu))$.

In [4]:
def compute_mu_i(X: np.array, beta: np.array, model_type = 'Logit'):
    # X can be 2d or 1d
    
    if np.sum(~(beta == 0)) == 0:
        return 1/2
    
    if model_type == 'Logit':
        #print('a')
        return 1/(1 + np.exp(- X @ beta))
    elif model_type == 'Probit':
        #print('b')
        return norm.cdf(X @ beta)
    elif model_type == 'CLogLog':
        #print('c')
        return 1 - np.exp(-np.exp(X @ beta))
    else:
        return None


def compute_sum_G_i(x, y, beta, model_type = 'Logit'):
    # here we assume x is 2d.
    # print(compute_mu_i(x, beta, model_type = model_type))

    if model_type == 'Logit':
        return x.T @ (y - compute_mu_i(x, beta, model_type = model_type))
    else:
        temp_mu_is = compute_mu_i(x, beta, model_type = model_type)
        variance_denominator = 1/((temp_mu_is) * (1 - temp_mu_is))
        
        if model_type == 'Probit':
            # dg_dmu = 1/np.pdf(x @ beta)
            dmu_dg = norm.pdf(x @ beta)
        elif model_type == 'CLogLog':
#             if (temp_mu_is == 1).sum() > 0:
#                 global bad_temp_mu_is
#                 global bad_x
#                 global bad_y
#                 global bad_beta
#                 print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
#                 print((temp_mu_is == 1).sum())
#                 print(temp_mu_is)
#                 print(x)
#                 bad_temp_mu_is = temp_mu_is
#                 bad_x = x
#                 bad_y = y
#                 bad_beta = beta
#                 print(y)
#                 print(beta)
#             global bad_dmudg
            dmu_dg = np.log(1 - temp_mu_is) * (temp_mu_is - 1) # ).fillna(0)
            # bad_dmudg = dmu_dg 
        else:
            return None
        
        return np.sum(x.mul(((y - temp_mu_is) * variance_denominator * dmu_dg), axis=0))

In [5]:
def generate_all_paths(number_of_coefficients):
    # simply returns a list of tuples of paths from a to b
    return list(permutations(range(number_of_coefficients)))

def generate_all_beta_pairs_per_row(path, pop_beta, sample_beta):
    ### given a path (tuple), retuns a list of pairs of betas [(0, a, b), (1, c, d), ...] 
    ###### where the first pair is for the first column, ...
    
    assert len(path) == len(pop_beta)
    assert len(pop_beta) == len(sample_beta)
    
    prev_beta = pop_beta.copy()
    
    list_of_betas = []
    
    for ele in path:
        new_beta = prev_beta.copy()
        new_beta[ele] = sample_beta[ele]
        list_of_betas.append([ele, prev_beta, new_beta])
        
        prev_beta = new_beta.copy()
    
    list_of_betas.sort()
    
    return list_of_betas

def compute_path_specific_jn(path, pop_beta, sample_beta, x, y, model_type = 'Logit'):
    # given a specific path, computes the Jn matrix.
    
    path_betas = generate_all_beta_pairs_per_row(path, pop_beta, sample_beta)
    # print(path_betas)
    num_coef = len(pop_beta)
    num_obs = len(x)
    # print(compute_sum_G_i(x, y, path_betas[0][2], model_type = model_type) )
    unnormalized_jn = pd.concat([(compute_sum_G_i(x, y, path_betas[temp_col][2], model_type = model_type) \
                                     - compute_sum_G_i(x, y, path_betas[temp_col][1], model_type = model_type)) / \
                                    (sample_beta[temp_col] - pop_beta[temp_col]) \
                                 for temp_col in range(num_coef)], axis=1)
    
    return -unnormalized_jn / num_obs

def compute_average_jn(pop_beta, sample_beta, x, y, model_type = 'Logit'):
    num_obs = len(pop_beta)
    all_paths = generate_all_paths(num_obs)
    
    all_path_specific_jns = [compute_path_specific_jn(temp_path, pop_beta, \
                                                      sample_beta, x, y, model_type = model_type) for temp_path in all_paths]
    
    return pd.concat(all_path_specific_jns).reset_index().groupby('index').mean()

### data generator:

In [6]:
# generate population data:
def generate_population_data(population_size, number_of_coefficients, feature_cols, \
                            true_beta = 0):
    population_x = rand_generator.multivariate_normal(mean=np.zeros(number_of_coefficients), 
                                       cov=np.eye(number_of_coefficients), 
                                       size=population_size)
    if true_beta != 0:
        population_y = rand_generator.binomial(n=1, p=expit(logit(1/2) + true_beta * population_x)).flatten()
    else:
        population_y = rand_generator.binomial(n=1, p=1/2, size=population_size)


    population_data = pd.concat([pd.Series(population_y), pd.DataFrame(population_x)], axis=1)

    population_data.columns = ['y'] + feature_cols

    return population_data

# Hyperparams:

In [7]:
pop_index = 2
iter_val = 7

In [8]:
rand_generator = np.random.default_rng(seed=333 * pop_index + iter_val)

In [9]:
population_size = 100_000

number_of_coefficients = 1

num_iters_per_population = 25_000
true_beta = 1

In [10]:
ALL_SAMPLE_SIZES = [6, 7, 9, 11, 13, 16, 20, 25] + [30, 50, 70, 100, 150, 250, 400, 600, 1000, 1400] + \
                    [2000, 3000, 5000, 7500, 10_000, 15_000, 20_000] # sub-1500 stash doing NOW

In [17]:
ALL_SAMPLE_SIZES = [6, 7, 9, 11, 13, 16, 20, 25] + [i for i in range(30, 45)]

In [12]:
ALL_SAMPLE_SIZES = [3, 4, 5, 6, 7, 9, 11, 13, 16, 20, 25, 30, 40, 50]

In [13]:
# ALL_SAMPLE_SIZES

# Run:

In [14]:
feature_cols = [f'x_{i}' for i in range(number_of_coefficients)]

In [15]:
# hypervariabes where things will be saved as key of sample size -> list.
all_jns_per_sample_size_biased = defaultdict(list)
all_ddc_per_sample_size_biased = defaultdict(list)
all_sample_beta_per_sample_size_biased = defaultdict(list)

all_jns_per_sample_size_full = defaultdict(list)
all_ddc_per_sample_size_full = defaultdict(list)
all_sample_beta_per_sample_size_full = defaultdict(list)



all_realized_sample_sizes_per_sample_size = defaultdict(list)
all_pop_beta_per_sample_size = defaultdict(list)

sample_specific_non_separable_count = {}

In [16]:
sample_probability_centering = 0.77
sample_probability_bias_factor = 1

In [18]:
pop_data = generate_population_data(population_size, number_of_coefficients, 
                                    feature_cols = feature_cols, true_beta = true_beta)

In [19]:
# pickle_filename = f'base_population_data4.pickle'
# with open(pickle_filename, 'wb') as handle:
#     pickle.dump(pop_data, handle)

In [20]:
pickle_filename = f'base_population_data{pop_index}.pickle'
pop_data = pd.read_pickle(pickle_filename)

In [21]:
pop_data.head()

Unnamed: 0,y,x_0
0,1,-0.952528
1,1,2.164991
2,0,-0.714784
3,1,1.079437
4,1,0.512145


In [22]:
pop_x = pop_data[feature_cols]
pop_y = pop_data['y']

pop_model = sm.Logit(endog = pop_y, exog = pop_x).fit(disp=0)
pop_beta = np.array(pop_model.params)
pop_gs = pop_x * (np.array(pop_y).reshape((population_size, 1)) - \
              np.array(pop_model.predict()).reshape((population_size, 1)))

#### actually run:

In [23]:
for temp_sample_size in tqdm(ALL_SAMPLE_SIZES):
    non_separable_count = 0
    if temp_sample_size < 15:
        num_iters_per_population = 25_000
    else:
        num_iters_per_population = 25_000
        
    for _ in trange(num_iters_per_population, mininterval=10):
        # use sampling scheme to sample data:
        obtained_valid_sample = False
        
        while not obtained_valid_sample:
            pop_data['r0'] = 0
            pop_data.loc[np.random.choice(pop_data.index, size = temp_sample_size, replace=False), 'r0'] = 1

            pop_data['r'] = 0
            
            full_sampled_data = pop_data[pop_data['r0'] == 1]

            marginal_probabilities = expit(logit(sample_probability_centering) + \
                                           sample_probability_bias_factor * (2* full_sampled_data['y'] - 1) * \
                                           full_sampled_data['x_0'])

            other_sample_indices = marginal_probabilities.index[rand_generator.binomial(n=1, p = marginal_probabilities) == 1]

            pop_data.loc[other_sample_indices, 'r'] = 1
            # sample_data here means the biased sample data.
            sample_data = pop_data[pop_data['r'] == 1]
            realised_sample_size = len(other_sample_indices)
            
            if realised_sample_size < 1_000:
                separability_check_df = sample_data[['x_0', 'y']].groupby('y')['x_0'].agg(['min', 'max'])
                is_not_separable = (len(separability_check_df) < 2) or \
                                    (separability_check_df.iloc[0, 0] > separability_check_df.iloc[1, 1]) or \
                                    (separability_check_df.iloc[1, 0] > separability_check_df.iloc[0, 1])

                if is_not_separable:
                    non_separable_count = non_separable_count + 1
                    continue

            else:
                ...
            obtained_valid_sample = True
            
            # compute biased x, y, model, beta
            sample_x = sample_data[feature_cols]
            sample_y = sample_data['y']
            sample_model = sm.Logit(endog = sample_y, exog = sample_x).fit(disp=0, maxiter=5_00)
            sample_beta = np.array(sample_model.params)
            sample_r = pop_data['r']
        
        # compute full x, y, model, beta
        sample_x_full = full_sampled_data[feature_cols]
        sample_y_full = full_sampled_data['y']
        sample_model_full = sm.Logit(endog = sample_y_full, exog = sample_x_full).fit(disp=0, maxiter=5_00)
        sample_beta_full = np.array(sample_model_full.params)
        sample_r_full = pop_data['r0']
        
        
        
        # compute biased versions of things:
        all_sample_beta_per_sample_size_biased[temp_sample_size].append(pd.Series(sample_beta))        
        all_ddc_per_sample_size_biased[temp_sample_size].append(pop_gs.corrwith(sample_r)[['x_0']])
        all_jns_per_sample_size_biased[temp_sample_size].append(compute_average_jn(pop_beta, sample_beta, sample_x, sample_y,
                                                                model_type = 'Logit'))
            
        all_realized_sample_sizes_per_sample_size[temp_sample_size].append(realised_sample_size)
        
        # compute full versions of things:
        all_sample_beta_per_sample_size_full[temp_sample_size].append(pd.Series(sample_beta_full))        
        all_ddc_per_sample_size_full[temp_sample_size].append(pop_gs.corrwith(sample_r_full)[['x_0']])
        all_jns_per_sample_size_full[temp_sample_size].append(compute_average_jn(pop_beta, sample_beta_full, 
                                                                                 sample_x_full, sample_y_full,
                                                                                 model_type = 'Logit'))

        
    sample_specific_non_separable_count[temp_sample_size] = non_separable_count
    

  0%|                                                                                           | 0/23 [00:00<?, ?it/s]
  0%|                                                                                        | 0/25000 [00:00<?, ?it/s][A
  2%|█▎                                                                            | 409/25000 [00:10<10:02, 40.82it/s][A
  3%|██▌                                                                           | 818/25000 [00:20<10:04, 40.03it/s][A
  5%|███▊                                                                         | 1230/25000 [00:30<09:46, 40.51it/s][A
  7%|█████                                                                        | 1641/25000 [00:40<09:41, 40.18it/s][A
  8%|██████▎                                                                      | 2038/25000 [00:51<09:39, 39.60it/s][A
 10%|███████▍                                                                     | 2435/25000 [01:01<09:29, 39.60it/s][A
 11%|████████▋     

  0%|                                                                                        | 0/25000 [00:00<?, ?it/s][A
  2%|█▎                                                                            | 432/25000 [00:10<09:29, 43.10it/s][A
  3%|██▋                                                                           | 871/25000 [00:20<09:13, 43.57it/s][A
  5%|████                                                                         | 1310/25000 [00:30<09:02, 43.66it/s][A
  7%|█████▍                                                                       | 1751/25000 [00:40<08:50, 43.83it/s][A
  9%|██████▊                                                                      | 2192/25000 [00:50<08:45, 43.44it/s][A
 10%|████████                                                                     | 2620/25000 [01:00<08:40, 43.03it/s][A
 12%|█████████▍                                                                   | 3051/25000 [01:10<08:30, 43.04it/s][A
 14%|██████████▋

 12%|████████▉                                                                    | 2895/25000 [01:00<07:36, 48.46it/s][A
 14%|██████████▍                                                                  | 3390/25000 [01:10<07:24, 48.60it/s][A
 16%|███████████▉                                                                 | 3879/25000 [01:20<07:16, 48.43it/s][A
 18%|█████████████▍                                                               | 4380/25000 [01:30<07:01, 48.92it/s][A
 20%|███████████████                                                              | 4880/25000 [01:40<06:52, 48.77it/s][A
 22%|████████████████▌                                                            | 5376/25000 [01:50<06:40, 49.01it/s][A
 22%|████████████████▌                                                            | 5376/25000 [02:03<06:40, 49.01it/s][A
 23%|█████████████████▉                                                           | 5819/25000 [02:03<07:11, 44.45it/s][A
 25%|███████████

 29%|██████████████████████▏                                                      | 7191/25000 [02:21<05:46, 51.36it/s][A
 31%|███████████████████████▋                                                     | 7700/25000 [02:31<05:38, 51.06it/s][A
 33%|█████████████████████████▎                                                   | 8210/25000 [02:41<05:29, 51.01it/s][A
 35%|██████████████████████████▉                                                  | 8726/25000 [02:51<05:17, 51.18it/s][A
 37%|████████████████████████████▍                                                | 9242/25000 [03:01<05:07, 51.21it/s][A
 39%|██████████████████████████████                                               | 9760/25000 [03:11<04:56, 51.37it/s][A
 41%|███████████████████████████████▏                                            | 10278/25000 [03:21<04:46, 51.42it/s][A
 41%|███████████████████████████████▏                                            | 10278/25000 [03:34<04:46, 51.42it/s][A
 43%|███████████

 58%|████████████████████████████████████████████▏                               | 14549/25000 [04:39<03:15, 53.50it/s][A
 60%|█████████████████████████████████████████████▉                              | 15093/25000 [04:49<03:05, 53.42it/s][A
 63%|███████████████████████████████████████████████▌                            | 15639/25000 [04:59<02:54, 53.75it/s][A
 65%|█████████████████████████████████████████████████▏                          | 16185/25000 [05:09<02:43, 53.99it/s][A
 67%|██████████████████████████████████████████████████▊                         | 16734/25000 [05:19<02:32, 54.22it/s][A
 69%|████████████████████████████████████████████████████▌                       | 17282/25000 [05:29<02:23, 53.77it/s][A
 71%|██████████████████████████████████████████████████████▏                     | 17818/25000 [05:39<02:13, 53.71it/s][A
 73%|███████████████████████████████████████████████████████▊                    | 18354/25000 [05:49<02:04, 53.47it/s][A
 76%|███████████

 93%|██████████████████████████████████████████████████████████████████████▋     | 23263/25000 [07:09<00:30, 56.16it/s][A
 95%|████████████████████████████████████████████████████████████████████████▍   | 23836/25000 [07:19<00:20, 56.22it/s][A
 98%|██████████████████████████████████████████████████████████████████████████▏ | 24400/25000 [07:29<00:10, 55.80it/s][A
100%|████████████████████████████████████████████████████████████████████████████| 25000/25000 [07:40<00:00, 54.29it/s][A
 26%|████████████████████▊                                                           | 6/23 [52:49<2:20:40, 496.53s/it]
  0%|                                                                                        | 0/25000 [00:00<?, ?it/s][A
  2%|█▊                                                                            | 561/25000 [00:10<07:15, 56.07it/s][A
  5%|███▍                                                                         | 1136/25000 [00:20<06:59, 56.90it/s][A
  7%|█████▎        

 25%|███████████████████▎                                                         | 6270/25000 [01:50<05:30, 56.73it/s][A
 27%|█████████████████████                                                        | 6847/25000 [02:00<05:19, 56.81it/s][A
 30%|██████████████████████▊                                                      | 7417/25000 [02:10<05:09, 56.76it/s][A
 32%|████████████████████████▌                                                    | 7984/25000 [02:21<05:01, 56.52it/s][A
 34%|██████████████████████████▎                                                  | 8544/25000 [02:31<04:52, 56.27it/s][A
 36%|████████████████████████████                                                 | 9104/25000 [02:41<04:43, 56.15it/s][A
 39%|█████████████████████████████▊                                               | 9680/25000 [02:51<04:30, 56.58it/s][A
 41%|███████████████████████████████▏                                            | 10256/25000 [03:01<04:19, 56.78it/s][A
 43%|███████████

 68%|███████████████████████████████████████████████████▋                        | 17010/25000 [05:02<02:22, 56.18it/s][A
 70%|█████████████████████████████████████████████████████▍                      | 17576/25000 [05:12<02:12, 56.07it/s][A
 73%|███████████████████████████████████████████████████████▏                    | 18141/25000 [05:22<02:02, 56.18it/s][A
 75%|████████████████████████████████████████████████████████▊                   | 18706/25000 [05:32<01:52, 55.81it/s][A
 77%|██████████████████████████████████████████████████████████▌                 | 19256/25000 [05:43<01:45, 54.29it/s][A
 79%|████████████████████████████████████████████████████████████▎               | 19820/25000 [05:53<01:34, 54.88it/s][A
 82%|█████████████████████████████████████████████████████████████▉              | 20384/25000 [06:03<01:23, 55.33it/s][A
 84%|███████████████████████████████████████████████████████████████▋            | 20948/25000 [06:13<01:12, 55.63it/s][A
 86%|███████████

 43%|█████████████████████████████████▍                                           | 10/23 [1:23:04<1:40:26, 463.57s/it]
  0%|                                                                                        | 0/25000 [00:00<?, ?it/s][A
  2%|█▋                                                                            | 557/25000 [00:10<07:18, 55.69it/s][A
  4%|███▍                                                                         | 1114/25000 [00:20<07:33, 52.73it/s][A
  7%|█████▏                                                                       | 1672/25000 [00:30<07:11, 54.07it/s][A
  9%|██████▉                                                                      | 2233/25000 [00:40<06:55, 54.82it/s][A
 11%|████████▋                                                                    | 2801/25000 [00:50<06:39, 55.52it/s][A
 13%|██████████▍                                                                  | 3369/25000 [01:01<06:32, 55.13it/s][A
 13%|██████████▍   

 41%|███████████████████████████████▍                                            | 10349/25000 [03:02<04:19, 56.50it/s][A
 44%|█████████████████████████████████▏                                          | 10909/25000 [03:12<04:10, 56.34it/s][A
 46%|██████████████████████████████████▊                                         | 11469/25000 [03:22<04:00, 56.17it/s][A
 48%|████████████████████████████████████▌                                       | 12027/25000 [03:32<03:52, 55.81it/s][A
 50%|██████████████████████████████████████▎                                     | 12595/25000 [03:42<03:41, 56.09it/s][A
 53%|████████████████████████████████████████                                    | 13163/25000 [03:52<03:30, 56.17it/s][A
 55%|█████████████████████████████████████████▋                                  | 13727/25000 [04:02<03:20, 56.15it/s][A
 57%|███████████████████████████████████████████▍                                | 14297/25000 [04:12<03:09, 56.38it/s][A
 59%|███████████

 85%|████████████████████████████████████████████████████████████████▋           | 21269/25000 [06:50<01:09, 53.90it/s][A
 87%|██████████████████████████████████████████████████████████████████▏         | 21783/25000 [07:01<01:02, 51.77it/s][A
 89%|███████████████████████████████████████████████████████████████████▉        | 22333/25000 [07:11<00:50, 52.69it/s][A
 92%|█████████████████████████████████████████████████████████████████████▌      | 22883/25000 [07:21<00:39, 53.29it/s][A
 94%|███████████████████████████████████████████████████████████████████████▏    | 23437/25000 [07:31<00:28, 53.90it/s][A
 96%|████████████████████████████████████████████████████████████████████████▉   | 23999/25000 [07:41<00:18, 54.57it/s][A
100%|████████████████████████████████████████████████████████████████████████████| 25000/25000 [07:59<00:00, 52.13it/s][A
 57%|███████████████████████████████████████████▌                                 | 13/23 [1:45:50<1:16:49, 460.96s/it]
  0%|              

 26%|███████████████████▊                                                         | 6429/25000 [02:14<06:43, 46.07it/s][A
 27%|████████████████████▊                                                        | 6772/25000 [02:14<07:09, 42.47it/s][A
 29%|██████████████████████▍                                                      | 7302/25000 [02:24<06:28, 45.55it/s][A
 31%|████████████████████████▏                                                    | 7862/25000 [02:34<05:52, 48.63it/s][A
 34%|█████████████████████████▉                                                   | 8430/25000 [02:44<05:24, 51.02it/s][A
 34%|█████████████████████████▉                                                   | 8430/25000 [02:54<05:24, 51.02it/s][A
 36%|███████████████████████████▋                                                 | 8984/25000 [02:54<05:06, 52.30it/s][A
 36%|███████████████████████████▋                                                 | 8984/25000 [03:04<05:06, 52.30it/s][A
 38%|███████████

 59%|████████████████████████████████████████████▌                               | 14665/25000 [04:21<03:04, 56.06it/s][A
 61%|██████████████████████████████████████████████▎                             | 15227/25000 [04:32<02:54, 56.09it/s][A
 63%|███████████████████████████████████████████████▉                            | 15789/25000 [04:42<02:44, 55.96it/s][A
 65%|█████████████████████████████████████████████████▋                          | 16355/25000 [04:52<02:34, 56.13it/s][A
 68%|███████████████████████████████████████████████████▍                        | 16921/25000 [05:02<02:23, 56.27it/s][A
 70%|█████████████████████████████████████████████████████▏                      | 17487/25000 [05:12<02:13, 56.09it/s][A
 72%|██████████████████████████████████████████████████████▉                     | 18054/25000 [05:22<02:03, 56.25it/s][A
 74%|████████████████████████████████████████████████████████▌                   | 18621/25000 [05:32<01:53, 56.34it/s][A
 77%|███████████

  0%|                                                                                        | 0/25000 [00:00<?, ?it/s][A
  2%|█▊                                                                            | 563/25000 [00:10<07:14, 56.24it/s][A
  5%|███▍                                                                         | 1126/25000 [00:20<07:07, 55.90it/s][A
  7%|█████▏                                                                       | 1683/25000 [00:30<07:00, 55.49it/s][A
  9%|██████▉                                                                      | 2241/25000 [00:40<06:49, 55.60it/s][A
 11%|████████▋                                                                    | 2801/25000 [00:50<06:38, 55.72it/s][A
 13%|██████████▎                                                                  | 3361/25000 [01:00<06:30, 55.48it/s][A
 16%|████████████                                                                 | 3920/25000 [01:10<06:19, 55.59it/s][A
 18%|███████████

 41%|██████████████████████████████▉                                             | 10183/25000 [03:01<04:27, 55.49it/s][A
 43%|████████████████████████████████▌                                           | 10723/25000 [03:11<04:19, 55.02it/s][A
 45%|██████████████████████████████████▎                                         | 11276/25000 [03:21<04:09, 55.08it/s][A
 47%|███████████████████████████████████▉                                        | 11840/25000 [03:31<03:57, 55.44it/s][A
 47%|███████████████████████████████████▉                                        | 11840/25000 [04:14<03:57, 55.44it/s][A
 48%|████████████████████████████████████▊                                       | 12099/25000 [04:14<09:03, 23.72it/s][A
 51%|██████████████████████████████████████▍                                     | 12653/25000 [04:24<07:00, 29.36it/s][A
 53%|████████████████████████████████████████▏                                   | 13207/25000 [04:34<05:41, 34.57it/s][A
 55%|███████████

 67%|███████████████████████████████████████████████████▏                        | 16819/25000 [05:17<02:26, 55.94it/s][A
 70%|████████████████████████████████████████████████████▊                       | 17380/25000 [05:27<02:16, 55.79it/s][A
 72%|██████████████████████████████████████████████████████▌                     | 17939/25000 [05:37<02:06, 55.82it/s][A
 74%|████████████████████████████████████████████████████████▏                   | 18498/25000 [05:47<01:56, 55.80it/s][A
 76%|█████████████████████████████████████████████████████████▉                  | 19056/25000 [05:57<01:46, 55.60it/s][A
 78%|███████████████████████████████████████████████████████████▋                | 19625/25000 [06:07<01:36, 55.98it/s][A
 81%|█████████████████████████████████████████████████████████████▍              | 20194/25000 [06:17<01:26, 55.51it/s][A
 83%|███████████████████████████████████████████████████████████████             | 20759/25000 [06:27<01:16, 55.80it/s][A
 85%|███████████

  9%|██████▉                                                                      | 2237/25000 [00:40<06:48, 55.67it/s][A
 11%|████████▌                                                                    | 2796/25000 [00:50<06:38, 55.74it/s][A
 13%|██████████▎                                                                  | 3356/25000 [01:00<06:27, 55.82it/s][A
 16%|████████████                                                                 | 3918/25000 [01:10<06:16, 55.94it/s][A
 18%|█████████████▊                                                               | 4480/25000 [01:20<06:06, 56.01it/s][A
 20%|███████████████▌                                                             | 5042/25000 [01:30<05:56, 55.99it/s][A
 22%|█████████████████▎                                                           | 5602/25000 [01:40<05:48, 55.69it/s][A
 25%|██████████████████▉                                                          | 6158/25000 [01:50<05:38, 55.64it/s][A
 27%|███████████

 47%|███████████████████████████████████▊                                        | 11794/25000 [03:32<03:55, 55.96it/s][A
 49%|█████████████████████████████████████▌                                      | 12361/25000 [03:42<03:46, 55.75it/s][A
 52%|███████████████████████████████████████▎                                    | 12921/25000 [03:52<03:36, 55.81it/s][A
 54%|████████████████████████████████████████▉                                   | 13481/25000 [04:03<03:26, 55.76it/s][A
 56%|██████████████████████████████████████████▋                                 | 14040/25000 [04:13<03:16, 55.79it/s][A
 58%|████████████████████████████████████████████▍                               | 14606/25000 [04:23<03:05, 56.03it/s][A
 61%|██████████████████████████████████████████████                              | 15172/25000 [04:33<02:54, 56.18it/s][A
 63%|███████████████████████████████████████████████▊                            | 15738/25000 [04:43<02:45, 55.83it/s][A
 65%|███████████

#### combine data for each sample size:

In [24]:
sample_specific_non_separable_count

{6: 34846,
 7: 23967,
 9: 12981,
 11: 7704,
 13: 4795,
 16: 2492,
 20: 994,
 25: 304,
 30: 120,
 31: 83,
 32: 67,
 33: 55,
 34: 40,
 35: 33,
 36: 26,
 37: 21,
 38: 10,
 39: 20,
 40: 22,
 41: 6,
 42: 6,
 43: 11,
 44: 4}

In [20]:
# all_realized_sample_sizes_per_sample_size

In [25]:
all_data_per_ss = []

In [26]:
for temp_sample_size in tqdm(ALL_SAMPLE_SIZES):
#     temp_pop_beta = pd.concat(all_pop_beta_per_sample_size[temp_sample_size], axis=1).T
#     temp_pop_beta.columns = ['pop']

    # compute the biased versions:
    temp_samp_beta_biased = pd.concat(all_sample_beta_per_sample_size_biased[temp_sample_size], axis=1).T
    temp_samp_beta_biased.columns = ['samp_biased']
    temp_ddc_biased =  pd.concat(all_ddc_per_sample_size_biased[temp_sample_size], axis=1).T
    temp_ddc_biased.columns = ['ddc_biased']
    temp_jn_biased =  pd.concat(all_jns_per_sample_size_biased[temp_sample_size], axis=1).T
    temp_jn_biased.columns = ['jn_biased']
    temp_jn_biased = temp_jn_biased.reset_index(drop=True)
    realised_sizes = pd.DataFrame(all_realized_sample_sizes_per_sample_size[temp_sample_size])
    realised_sizes.columns = ['realized_size_biased']
    
    # compute the SRS versions:
    temp_samp_beta_full = pd.concat(all_sample_beta_per_sample_size_full[temp_sample_size], axis=1).T
    temp_samp_beta_full.columns = ['samp_intended']
    temp_ddc_full = pd.concat(all_ddc_per_sample_size_full[temp_sample_size], axis=1).T
    temp_ddc_full.columns = ['ddc_intended']
    temp_jn_full =  pd.concat(all_jns_per_sample_size_full[temp_sample_size], axis=1).T
    temp_jn_full.columns = ['jn_intended']
    temp_jn_full = temp_jn_full.reset_index(drop=True)
    
    temp_ss_data = pd.concat([temp_samp_beta_biased, temp_ddc_biased, temp_jn_biased, realised_sizes, \
                             temp_samp_beta_full, temp_ddc_full, temp_jn_full], axis=1)
    temp_ss_data['sample_size'] = temp_sample_size
    temp_ss_data['pop_beta'] = pop_beta[0]
    all_data_per_ss.append(temp_ss_data)

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [06:28<00:00, 16.88s/it]


In [27]:
all_data_per_ss = pd.concat(all_data_per_ss, axis=0).reset_index(drop=True)

In [28]:
all_data_per_ss['mse_biased'] = (all_data_per_ss['pop_beta']- all_data_per_ss['samp_biased']) ** 2

In [29]:
all_data_per_ss['mse_intended'] = (all_data_per_ss['pop_beta']- all_data_per_ss['samp_intended']) ** 2

In [30]:
# all_data_per_ss['r_s'] = all_data_per_ss['realized_size'] / all_data_per_ss['sample_size']

In [31]:
# all_data_per_ss.groupby('sample_size')['r_s'].mean()[:3]

In [32]:
(all_data_per_ss.groupby(by='sample_size')['mse_biased'].std() / np.sqrt(10_000)).head()

sample_size
6     0.091333
7     0.110241
9     0.148153
11    0.229773
13    0.136406
Name: mse_biased, dtype: float64

In [33]:
all_data_per_ss.groupby(by='sample_size')['mse_biased'].describe().head()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sample_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6,25000.0,1.965191,9.133266,3.558472e-09,0.081429,0.378857,1.325874,679.748302
7,25000.0,2.123358,11.024106,1.643762e-09,0.080775,0.375182,1.363531,842.153515
9,25000.0,2.511414,14.815274,7.110244e-09,0.078905,0.398012,1.477903,947.587493
11,25000.0,2.591623,22.977277,1.502466e-08,0.079427,0.389869,1.560604,3027.560912
13,25000.0,2.619823,13.640578,1.381341e-08,0.079901,0.403498,1.656551,1017.923537


In [34]:
# number_key = 1500

# number_key = 20000.14
number_key = 40.72

# number_key = 90_000

# number_key = 100_000

In [35]:
pickle_filename = f'ess_curve_biased_2_stage_sub{number_key}_non_separable_counts.pickle'
with open(pickle_filename, 'wb') as handle:
    pickle.dump(sample_specific_non_separable_count, handle)

In [36]:
pickle_filename = f'ess_curve_biased_sub{number_key}.pickle'
with open(pickle_filename, 'wb') as handle:
    pickle.dump(all_data_per_ss, handle)