In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from scipy import stats as sps
from IPython.display import display, Markdown

In [2]:
from datetime import datetime

## Example data from Chapter 2 of "What If?"

In [3]:
# Example data from Chapter 2 of "What If?": twenty individuals, each with
# three binary values.  Later cells use L as the covariate, A as the
# intervention and Y as the outcome.  Values are laid out four per row so
# they line up with the names below.
ch2names = [
    'Rheia', 'Kronos', 'Demeter', 'Hades',
    'Hestia', 'Poseidon', 'Hera', 'Zeus',
    'Artemis', 'Apollo', 'Leto', 'Ares',
    'Athena', 'Hephaestus', 'Aphrodite', 'Cyclope',
    'Persephone', 'Hermes', 'Hebe', 'Dionysus',
]
ch2data = {
    'L': [0, 0, 0, 0,
          0, 0, 0, 0,
          1, 1, 1, 1,
          1, 1, 1, 1,
          1, 1, 1, 1],
    'A': [0, 0, 0, 0,
          1, 1, 1, 1,
          0, 0, 0, 1,
          1, 1, 1, 1,
          1, 1, 1, 1],
    'Y': [0, 1, 0, 0,
          0, 0, 0, 1,
          1, 1, 0, 1,
          1, 1, 1, 1,
          1, 0, 0, 0],
}
pd.DataFrame(ch2data, index=ch2names)

Unnamed: 0,L,A,Y
Rheia,0,0,0
Kronos,0,0,1
Demeter,0,0,0
Hades,0,0,0
Hestia,0,1,0
Poseidon,0,1,0
Hera,0,1,0
Zeus,0,1,1
Artemis,1,0,1
Apollo,1,0,1


## Basic function definitions

A utility function for generating all binary vectors of a given length.

In [4]:
def all_binary_vectors(length):
    """Return every list of booleans with `length` entries.

    The vectors come back in counting order (all-False first, all-True
    last, with the final position varying fastest).  A `length` of zero or
    less yields a single empty vector.
    """
    vectors = [[]]
    size = 0
    while size < length:
        grown = []
        for prefix in vectors:
            # Each existing prefix spawns two children, False before True.
            grown.append(prefix + [False])
            grown.append(prefix + [True])
        vectors = grown
        size += 1
    return vectors

In [5]:
# Tabulate the eight vectors of length three, one vector per row.
pd.DataFrame(all_binary_vectors(length=3))

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,True
2,False,True,False
3,False,True,True
4,True,False,False
5,True,False,True
6,True,True,False
7,True,True,True


Calculate the standardized mean of Y for each level of A, standardizing over the observed distribution of L in a given dataset.

In [6]:
def standardized_means_for_A(L, A, Y):
    """Standardized mean of Y for each level of A, standardizing over L.

    For every treatment level a in (False, True) this computes
    sum over l of  mean(Y | L == l, A == a) * P(L == l),
    where l ranges over the binary covariate combinations that actually
    occur in the data.

    Parameters
    ----------
    L : array-like, length N or shape (N, k)
        Binary covariate(s); a 1-D input is treated as a single column.
    A : array-like, length N
        Binary treatment indicator.
    Y : array-like, length N
        Outcome.

    Returns
    -------
    dict
        ``{'A_level': [False, True], 'Y_mean': [mean_0, mean_1]}``.
        An entry of ``Y_mean`` is nan when some observed covariate
        combination has no individual at that treatment level, because the
        conditional mean is then undefined.
    """
    A = np.asarray(A)
    Y = np.asarray(Y)
    L = np.reshape(np.asarray(L), (len(Y), -1))  # force L to be a 2D array
    A_levels = [False, True]
    L_levels = all_binary_vectors(L.shape[1])
    Y_mean = []
    for a in A_levels:
        mask_a = (A == a)
        total = 0
        for l in L_levels:
            # np.all replaces np.product, which was removed in NumPy 2.0.
            mask_l = np.all(L == l, axis=1)
            if not mask_l.any():
                # A combination that never occurs has weight 0; skipping it
                # keeps nan * 0 from turning the whole sum into nan.
                continue
            P_l = np.mean(mask_l)
            mask_l_a = mask_l & mask_a
            # Stratum observed but nobody at this treatment level: the
            # conditional mean is undefined, so propagate nan explicitly.
            Y_given_a_l = np.mean(Y[mask_l_a]) if mask_l_a.any() else np.nan
            total += Y_given_a_l * P_l
        Y_mean.append(total)
    return {
        'A_level': A_levels,
        'Y_mean': Y_mean
    }

In [7]:
# Standardized means on the chapter 2 example data.
pd.DataFrame(standardized_means_for_A(L=ch2data['L'], A=ch2data['A'], Y=ch2data['Y']))

Unnamed: 0,A_level,Y_mean
0,False,0.5
1,True,0.5


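Calculate the inverse probability weighted mean of Y for each level of A in a given dataset, weighting each individual by the inverse of the observed frequency of their treatment level given L.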

In [8]:
def ip_weighted_means_for_A(L, A, Y):
    """Inverse-probability-weighted mean of Y for each level of A.

    Each individual's outcome is divided by f(A | L), the observed fraction
    of individuals sharing their covariate combination who also share
    their treatment level.  For each level a the result is the mean, over
    all N individuals, of that weighted outcome times the indicator A == a.

    Parameters
    ----------
    L : array-like, length N or shape (N, k)
        Binary covariate(s); a 1-D input is treated as a single column.
    A : array-like, length N
        Binary treatment indicator.
    Y : array-like, length N
        Outcome.

    Returns
    -------
    dict
        ``{'A_level': [False, True], 'Y_mean': [mean_0, mean_1]}``.

    Notes
    -----
    Individuals whose L or A values are not binary never receive a
    probability, so their weight stays at the initial 0 and the division
    yields inf/nan for them.
    """
    A = np.asarray(A)
    Y = np.asarray(Y)
    L = np.reshape(np.asarray(L), (len(Y), -1))  # force L to be a 2D array
    A_levels = [False, True]
    L_levels = all_binary_vectors(L.shape[1])

    f_A_given_L = np.zeros(len(Y))
    for l in L_levels:
        # np.all replaces np.product, which was removed in NumPy 2.0.
        mask_l = np.all(L == l, axis=1)
        n_l = np.sum(mask_l)
        if n_l == 0:
            # Nobody has this covariate combination: nothing to fill in,
            # and skipping avoids a 0/0 RuntimeWarning.
            continue
        for a in A_levels:
            mask_l_a = mask_l & (A == a)
            f_A_given_L[mask_l_a] = np.sum(mask_l_a) / n_l
    weighted_outcomes = Y / f_A_given_L
    Y_mean = [np.mean(weighted_outcomes * (A == a)) for a in A_levels]
    return {
        'A_level': A_levels,
        'Y_mean': Y_mean,
    }

In [9]:
# Inverse probability weighted means on the chapter 2 example data.
pd.DataFrame(ip_weighted_means_for_A(L=ch2data['L'], A=ch2data['A'], Y=ch2data['Y']))

Unnamed: 0,A_level,Y_mean
0,False,0.5
1,True,0.5


A simple utility function for summarizing results for a given model

In [10]:
def summarize_model(data, title='', description=''):
    """Display a short report for one dataset.

    The report shows, in order: the title and description as Markdown, the
    first 12 rows of the data, the means of L, A and Y, the standardized
    means and the inverse probability weighted means for each level of A.

    Parameters
    ----------
    data : dict
        Must contain the keys 'L', 'A' and 'Y'.  Any additional keys are
        shown in the sample table but are not passed to the estimators.
    title : str
        Heading text for the report.
    description : str
        Markdown text displayed under the heading.
    """
    display(Markdown('### ' + title))
    display(Markdown(description))
    display(Markdown('#### Sample data'))
    display(pd.DataFrame(data).iloc[:12])

    display(Markdown('#### Summary statistics'))
    summary = {
        'mean(L)': np.mean(data['L']),
        'mean(A)': np.mean(data['A']),
        'mean(Y)': np.mean(data['Y']),
    }
    display(pd.DataFrame({'statistic': list(summary.values())}, index=list(summary.keys())))

    # The estimators accept exactly L, A and Y; forwarding **data directly
    # would raise TypeError on dicts that carry additional keys.
    observed = {key: data[key] for key in ('L', 'A', 'Y')}

    display(Markdown('#### Standardized means'))
    display(pd.DataFrame(standardized_means_for_A(**observed)))

    display(Markdown('#### Inverse probability weighted means'))
    display(pd.DataFrame(ip_weighted_means_for_A(**observed)))

In [11]:
# Run the summary on the chapter 2 data as a check of the summary function.
summarize_model(
    data=ch2data,
    title='Analysis of data from chapter 2',
    description='Validating the summary function with data from chapter 2',
)

### Analysis of data from chapter 2

Validating the summary function with data from chapter 2

#### Sample data

Unnamed: 0,L,A,Y
0,0,0,0
1,0,0,1
2,0,0,0
3,0,0,0
4,0,1,0
5,0,1,0
6,0,1,0
7,0,1,1
8,1,0,1
9,1,0,1


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.6
mean(A),0.65
mean(Y),0.5


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.5
1,True,0.5


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.5
1,True,0.5


## Testing with generated data

Start with a simple parameterized model...

In [12]:
def generate_simple_logistic_model_data(N=1000000, P_L=0.5,
                                        beta_A_0=0, beta_A_L=0,
                                        beta_Y_0=0, beta_Y_L=0, beta_Y_A=0, beta_Y_LA=0,
                                        debug_info=False):
    """Simulate binary covariates L, treatment A and outcome Y.

    The model is
        L ~ Bernoulli(P_L)
        A ~ Bernoulli(logistic(beta_A_0 + beta_A_L . L))
        Y ~ Bernoulli(logistic(beta_Y_0 + beta_Y_L . L + beta_Y_A * A
                               + (beta_Y_LA . L) * A))
    Draws come from the global ``np.random`` state.

    Parameters
    ----------
    N : int
        Number of individuals.
    P_L : float or array-like
        Probability that each covariate is True.  A scalar gives one
        covariate (L has shape (N,)); an array gives one covariate per
        entry (L has shape (N,) + shape of P_L).
    beta_A_0, beta_Y_0 : float
        Intercepts (log-odds) of the treatment and outcome models.
    beta_A_L, beta_Y_L, beta_Y_LA : float or array-like
        Covariate coefficients.  Each is broadcast to the shape of P_L, so
        a scalar applies the same coefficient to every covariate.
    beta_Y_A : float
        Coefficient of the treatment in the outcome model.
    debug_info : bool
        When True, the result also contains 'P_L', 'P_A_given_l' and
        'P_Y_given_l_a'.

    Returns
    -------
    dict
        Boolean arrays under 'L', 'A' and 'Y', plus the debug entries when
        requested.

    Raises
    ------
    ValueError
        If a covariate coefficient cannot be broadcast to the shape of P_L.
    """
    covariate_shape = np.shape(P_L)
    n_covariates = int(np.prod(covariate_shape, dtype=int))  # 1 for scalar P_L

    L = np.random.uniform(size=(N,) + covariate_shape) < P_L
    # One row per individual, one column per covariate, so that the linear
    # terms below contract over covariates rather than over individuals.
    L_matrix = L.reshape(N, n_covariates).astype(float)

    def covariate_term(beta):
        # Per-individual sum of beta_j * L_j, shape (N,).
        weights = np.broadcast_to(np.asarray(beta, dtype=float), covariate_shape)
        return L_matrix @ weights.reshape(n_covariates)

    P_A_given_l = sps.logistic.cdf(beta_A_0 + covariate_term(beta_A_L))
    A = np.random.uniform(size=N) < P_A_given_l
    P_Y_given_l_a = sps.logistic.cdf(beta_Y_0 + covariate_term(beta_Y_L) +
                                     beta_Y_A * A +
                                     covariate_term(beta_Y_LA) * A
                                     )
    Y = np.random.uniform(size=N) < P_Y_given_l_a

    result = {
        'L' : L,
        'A' : A,
        'Y' : Y,
    }
    if debug_info:
        result.update({
            'P_L' : P_L,
            'P_A_given_l' : P_A_given_l,
            'P_Y_given_l_a' : P_Y_given_l_a,
        })
    return result

In [13]:
# Draw a tiny sample (five individuals) from the default model to inspect its layout.
pd.DataFrame(data=generate_simple_logistic_model_data(N=5))

Unnamed: 0,L,A,Y
0,True,False,False
1,True,True,True
2,False,True,True
3,False,False,False
4,True,False,True


In [14]:
# All coefficients at their defaults of 0 and P_L at 0.5.
summarize_model(
    data=generate_simple_logistic_model_data(),
    title='Maximal entropy model',
    description='No interactions, 1:1 odds for L, A, and Y',
)

### Maximal entropy model

No interactions, 1:1 odds for L, A, and Y

#### Sample data

Unnamed: 0,L,A,Y
0,True,True,True
1,True,False,False
2,True,True,False
3,True,False,True
4,False,False,False
5,False,True,False
6,True,False,False
7,False,True,True
8,True,False,True
9,False,False,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.499777
mean(A),0.500073
mean(Y),0.500102


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.499328
1,True,0.500873


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.499328
1,True,0.500873


In [15]:
# Only the treatment coefficient in the outcome model is non-zero.
summarize_model(
    data=generate_simple_logistic_model_data(beta_Y_A=-1),
    title='Simple randomized protective intervention',
    description='Assumes no effect from covariate',
)

### Simple randomized protective intervention

Assumes no effect from covariate

#### Sample data

Unnamed: 0,L,A,Y
0,True,False,False
1,False,False,False
2,True,True,False
3,True,False,True
4,True,False,True
5,True,True,False
6,False,True,False
7,True,True,False
8,False,True,False
9,False,False,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.50066
mean(A),0.501229
mean(Y),0.384252


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.499694
1,True,0.269379


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.499694
1,True,0.269379


In [16]:
# The covariate raises the outcome log-odds, the treatment lowers them;
# treatment assignment does not depend on the covariate.
summarize_model(
    data=generate_simple_logistic_model_data(beta_Y_A=-1, beta_Y_L=1),
    title='Randomized, covariate is risk, intervention is protective',
    description='1:1 odds of covariate',
)

### Randomized, covariate is risk, intervention is protective

1:1 odds of covariate

#### Sample data

Unnamed: 0,L,A,Y
0,True,True,True
1,True,True,False
2,False,False,False
3,True,True,True
4,False,False,True
5,True,True,False
6,False,False,False
7,False,True,False
8,False,False,False
9,False,True,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.49918
mean(A),0.500812
mean(Y),0.49876


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.614182
1,True,0.383712


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.614182
1,True,0.383712


In [17]:
# Same outcome model as the previous scenario, with a treatment intercept
# of -2 so that treatment is uncommon.
summarize_model(
    data=generate_simple_logistic_model_data(beta_A_0=-2, beta_Y_A=-1, beta_Y_L=1),
    title='Randomized, covariate is risk, rare intervention is protective',
    description='1:1 odds of covariate',
)

### Randomized, covariate is risk, rare intervention is protective

1:1 odds of covariate

#### Sample data

Unnamed: 0,L,A,Y
0,True,False,True
1,True,False,True
2,True,True,False
3,False,False,False
4,True,False,False
5,True,False,False
6,False,True,False
7,True,False,True
8,False,False,True
9,True,False,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.499554
mean(A),0.119237
mean(Y),0.5874


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.614874
1,True,0.384458


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.614874
1,True,0.384458


In [18]:
# P_L=0.75 makes the covariate present with 3:1 odds, so the description
# states that rather than the 1:1 used by the earlier scenarios.
summarize_model(generate_simple_logistic_model_data(P_L=0.75, beta_A_0=-2, beta_Y_A=-1, beta_Y_L=1),
                'Randomized, common covariate is risk, rare intervention is protective',
                '3:1 odds of covariate')

### Randomized, common covariate is risk, rare intervention is protective

1:1 odds of covariate

#### Sample data

Unnamed: 0,L,A,Y
0,False,False,True
1,True,False,True
2,True,False,True
3,True,False,True
4,True,False,True
5,True,False,True
6,True,False,False
7,True,True,True
8,True,False,True
9,False,False,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.749715
mean(A),0.119877
mean(Y),0.645643


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.673537
1,True,0.440849


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.673537
1,True,0.440849


In [19]:
# beta_A_L=1 makes treatment more likely when the covariate is present,
# while the covariate also raises the outcome log-odds.
summarize_model(
    data=generate_simple_logistic_model_data(
        P_L=0.75,
        beta_A_0=-2, beta_A_L=1,
        beta_Y_A=-1, beta_Y_L=1,
    ),
    title='Common covariate is risk for disease and intervention, intervention is protective',
    description='',
)

### Common covariate is risk for disease and intervention, intervention is protective



#### Sample data

Unnamed: 0,L,A,Y
0,True,False,True
1,True,False,False
2,False,False,True
3,True,True,True
4,True,False,True
5,True,False,True
6,True,False,False
7,True,False,True
8,False,False,True
9,True,False,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.749372
mean(A),0.232052
mean(Y),0.618718


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.6724
1,True,0.441404


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.6724
1,True,0.441404


In [20]:
# Covariate and treatment each add +1 to the outcome log-odds; the -2
# interaction cancels both when they occur together.
summarize_model(generate_simple_logistic_model_data(P_L=0.5,
                                                    beta_A_0=0,
                                                    beta_Y_A=1, beta_Y_L=1, beta_Y_LA=-2),
                'Covariate and intervention are risks, but negate each other',
                '1:1 odds of covariate')

### Covariate and intervention are risks, but negate eachother

1:1 odds of covariate

#### Sample data

Unnamed: 0,L,A,Y
0,True,False,True
1,False,True,True
2,True,True,False
3,False,True,True
4,False,False,False
5,True,False,True
6,False,True,False
7,False,False,False
8,False,False,True
9,True,True,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.500327
mean(A),0.499245
mean(Y),0.615505


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.616218
1,True,0.615185


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.616218
1,True,0.615185
