In [14]:
import numpy as np
import scipy as sp 
import pandas as pd
from scipy import stats as sps
from sklearn.linear_model import LogisticRegression
from IPython.display import display, Markdown

In [4]:
from datetime import datetime

## Example data from Chapter 2 of "What If?"

In [5]:
ch2data = {
    'L': [0]*8 + [1]*12,
    'A': [0]*4 + [1]*4 + [0]*3 + [1]*9,
    'Y': [0,1] + [0]*5 + [1]*3 + [0] + [1]*6 + [0]*3,
}
ch2names = ['Rheia', 'Kronos', 'Demeter', 'Hades', 'Hestia', 'Poseidon', 'Hera', 
            'Zeus', 'Artemis', 'Apollo', 'Leto', 'Ares', 'Athena', 'Hephaestus', 
            'Aphrodite', 'Cyclope', 'Persephone', 'Hermes', 'Hebe', 'Dionysus']
pd.DataFrame(ch2data, index=ch2names)

Unnamed: 0,L,A,Y
Rheia,0,0,0
Kronos,0,0,1
Demeter,0,0,0
Hades,0,0,0
Hestia,0,1,0
Poseidon,0,1,0
Hera,0,1,0
Zeus,0,1,1
Artemis,1,0,1
Apollo,1,0,1


## Basic function definitions

A utility function for generating all binary vectors of a given length.

In [6]:
def all_binary_vectors(length):
    result = [[]]
    while len(result[0]) < length:
        result = [ vector + [val] for vector in result for val in [False, True] ]
    return result

In [7]:
pd.DataFrame(all_binary_vectors(3))

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,True
2,False,True,False
3,False,True,True
4,True,False,False
5,True,False,True
6,True,True,False
7,True,True,True


Calculate the standardized means for each level of A for a given dataset

In [8]:
def standardized_means_for_A(L, A, Y):
    A = np.array(A)
    Y = np.array(Y)
    L = np.reshape(np.array(L), (len(Y), -1)) # force L to be a 2D array
    A_levels = [False, True]
    L_levels = all_binary_vectors(L.shape[1])
    Y_mean = []
    for a in A_levels:
        total = 0
        for l in L_levels:
            mask_l = (np.product(L == l, axis=1) != 0)
            P_l = np.mean(mask_l)
            Y_given_a_l = np.mean(Y[mask_l * (A == a)])
            total += Y_given_a_l * P_l
        Y_mean.append(total)
    return {
        'A_level': A_levels,
        'Y_mean': Y_mean
    }

In [9]:
pd.DataFrame(standardized_means_for_A(**ch2data))

Unnamed: 0,A_level,Y_mean
0,False,0.5
1,True,0.5


Calculate the inverse probability weighted means for each level of A for a given dataset

In [10]:
def ip_weighted_means_for_A(L, A, Y):
    start = datetime.now()
    A = np.array(A)
    Y = np.array(Y)
    L = np.reshape(np.array(L), (len(Y), -1)) # force L to be a 2D array
    A_levels = [False, True]
    L_levels = all_binary_vectors(L.shape[1])
    
    f_A_given_L = np.zeros(len(Y))
    for a in A_levels:
        total = 0
        for l in L_levels:
            mask_l = (np.product(L == l, axis=1) != 0)
            mask_l_a = mask_l * (A == a)
            val = np.sum(mask_l_a) / np.sum(mask_l)
            f_A_given_L[mask_l_a] = val
    weighted_outcomes = Y / f_A_given_L
    Y_mean = [np.mean(weighted_outcomes * (A==a)) for a in A_levels]
    return {
        'A_level': A_levels,
        'Y_mean': Y_mean,
    }

In [11]:
pd.DataFrame(ip_weighted_means_for_A(**ch2data))

Unnamed: 0,A_level,Y_mean
0,False,0.5
1,True,0.5


Calculate the double-robust means for each level of A for a given dataset

In [150]:
def doubly_robust_means_for_A(L, A, Y):
    start = datetime.now()
    A = np.array(A)
    Y = np.array(Y)
    L = np.reshape(np.array(L), (len(Y), -1)) # force L to be a 2D array
    A_levels = [False, True]
    L_levels = all_binary_vectors(L.shape[1])
    #f_a_given_L = np.zeros((L_levels))
    f_A_given_L = np.zeros(len(Y))
    for a in A_levels:
        total = 0
        for l in L_levels:
            mask_l = (np.product(L == l, axis=1) != 0)
            mask_l_a = mask_l * (A == a)
            val = np.sum(mask_l_a) / np.sum(mask_l)
            f_A_given_L[mask_l_a] = val
    R = (-1 + 2*A) / f_A_given_L
    X = np.hstack([L, A[:,np.newaxis], R[:,np.newaxis]])
    model = LogisticRegression(penalty='none',random_state=42)
    model.fit(X, Y)
    Y_mean = [np.mean(model.predict_proba(
        np.hstack([L, a*np.ones_like(A)[:,np.newaxis], 
                   ((-1 + 2*a) / f_A_given_L)[:,np.newaxis]])
    )[:,1]) for a in A_levels]
    print(model.coef_)
        #[np.hstack([L, a*np.ones_like(A)[:,np.newaxis], R[:,np.newaxis]]) for a in A_levels]
    #  [model.predict_proba(
    #    np.hstack([L, a*np.ones_like(A)[:,np.newaxis], 
    #               ((-1 + 2*a) / f_A_given_L)[:,np.newaxis]])
    #)[:,1] for a in A_levels]
    #     )
    return {
        'A_level': A_levels,
        'Y_mean': Y_mean,
    }

In [137]:
pd.DataFrame(doubly_robust_means_for_A(**ch2data))

[[ 1.79175617e+00  2.23533576e-05 -4.59793374e-06]]


Unnamed: 0,A_level,Y_mean
0,False,0.499999
1,True,0.5


A simple utility function for summarizing results for a given model

In [138]:
def summarize_model(data, title='', description=''):
    display(Markdown('### ' + title))
    display(Markdown(description))
    display(Markdown('#### Sample data'))
    display(pd.DataFrame(data).iloc[:12])

    display(Markdown('#### Summary statistics'))
    stats = {
        'mean(L)': np.mean(data['L']),
        'mean(A)': np.mean(data['A']),
        'mean(Y)': np.mean(data['Y']),
    }
    display(pd.DataFrame({'statistic': stats.values()}, index=stats.keys()))
    
    display(Markdown('#### Standardized means'))
    display(pd.DataFrame(standardized_means_for_A(**data)))

    display(Markdown('#### Inverse probability weighted means'))
    display(pd.DataFrame(ip_weighted_means_for_A(**data)))
        
    display(Markdown('#### Doubly-robust means'))
    display(pd.DataFrame(doubly_robust_means_for_A(**data)))

In [139]:
summarize_model(ch2data, 'Analysis of data from chapter 2', 'Validating the summary function with data from chapter 2')

### Analysis of data from chapter 2

Validating the summary function with data from chapter 2

#### Sample data

Unnamed: 0,L,A,Y
0,0,0,0
1,0,0,1
2,0,0,0
3,0,0,0
4,0,1,0
5,0,1,0
6,0,1,0
7,0,1,1
8,1,0,1
9,1,0,1


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.6
mean(A),0.65
mean(Y),0.5


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.5
1,True,0.5


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.5
1,True,0.5


#### Doubly-robust means

[[ 1.79175617e+00  2.23533576e-05 -4.59793374e-06]]


Unnamed: 0,A_level,Y_mean
0,False,0.499999
1,True,0.5


## Testing with generated data

Start with a simple parameterized model...

In [140]:
def generate_simple_logistic_model_data(N=1000000, P_L=0.5,
                                        beta_A_0=0, beta_A_L=0, 
                                        beta_Y_0=0, beta_Y_L=0, beta_Y_A=0, beta_Y_LA=0,
                                        debug_info=False):
    L = np.random.uniform(size=(N,)+np.array(P_L).shape) < P_L
    P_A_given_l = sps.logistic.cdf(beta_A_0 + np.array(beta_A_L).dot(L))
    A = np.random.uniform(size=N) < P_A_given_l
    P_Y_given_l_a = sps.logistic.cdf(beta_Y_0 + np.array(beta_Y_L).dot(L) +
                                     np.array(beta_Y_A).dot(A) +
                                     np.array(beta_Y_LA).dot(L)*A
                                     )
    Y = np.random.uniform(size=N) < P_Y_given_l_a
    
    return {
        'L' : L,
        'A' : A,
        'Y' : Y,
    } | ({
        'P_L' : P_L,
        'P_A_given_l' : P_A_given_l, 
        'P_Y_given_l_a' : P_Y_given_l_a,
    } if debug_info else {})

In [141]:
pd.DataFrame(generate_simple_logistic_model_data(N=5))

Unnamed: 0,L,A,Y
0,True,True,True
1,True,True,True
2,True,False,True
3,False,True,True
4,False,True,True


In [142]:
summarize_model(generate_simple_logistic_model_data(), 'Maximal entropy model', 'No interactions, 1:1 odds for L, A, and Y')

### Maximal entropy model

No interactions, 1:1 odds for L, A, and Y

#### Sample data

Unnamed: 0,L,A,Y
0,True,True,False
1,True,True,True
2,True,False,False
3,True,True,True
4,False,False,False
5,False,True,True
6,False,True,True
7,False,True,False
8,True,True,False
9,False,True,False


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.500236
mean(A),0.499682
mean(Y),0.500534


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.500876
1,True,0.500191


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.500876
1,True,0.500191


#### Doubly-robust means

[[-0.00387517  0.00142922 -0.00104179]]


Unnamed: 0,A_level,Y_mean
0,False,0.500877
1,True,0.500192


In [143]:
summarize_model(generate_simple_logistic_model_data(beta_Y_A=-1),
                'Simple randomized protective intervention', 'Assumes no effect from covariate')

### Simple randomized protective intervention

Assumes no effect from covariate

#### Sample data

Unnamed: 0,L,A,Y
0,True,True,False
1,False,False,True
2,True,True,False
3,False,False,True
4,True,False,True
5,True,True,True
6,True,True,False
7,False,True,True
8,False,False,False
9,True,True,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.499291
mean(A),0.499899
mean(Y),0.384361


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.499855
1,True,0.26882


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.499855
1,True,0.26882


#### Doubly-robust means

[[-0.00115468 -0.23800649 -0.19050749]]


Unnamed: 0,A_level,Y_mean
0,False,0.499874
1,True,0.268835


In [144]:
summarize_model(generate_simple_logistic_model_data(beta_Y_A=-1, beta_Y_L=1),
                'Randomized, covariate is risk, intervention is protective',
                '1:1 odds of covariate')

### Randomized, covariate is risk, intervention is protective

1:1 odds of covariate

#### Sample data

Unnamed: 0,L,A,Y
0,True,True,True
1,False,True,False
2,False,False,False
3,True,True,True
4,False,False,False
5,False,True,True
6,False,False,False
7,False,True,False
8,False,True,False
9,False,False,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.50008
mean(A),0.500133
mean(Y),0.500042


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.615878
1,True,0.384267


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.615878
1,True,0.384267


#### Doubly-robust means

[[ 0.99804838 -0.23895137 -0.19079478]]


Unnamed: 0,A_level,Y_mean
0,False,0.615843
1,True,0.384257


Note that $Y_\mathrm{mean}$ in the first row corresponds to $P(Y | \mathrm{do}(A=\mathrm{False}))$ and the second row corresponds to $P(Y | \mathrm{do}(A=\mathrm{True}))$.

The first row is $\mathrm{sigmoid}(0\cdot \beta_A + \ldots)$

The second row is $\mathrm{sigmoid}(1 \cdot \beta_A + \ldots)$

We can recover $\beta_A$ by inverting the sigmoid function for each and taking the difference between them:

In [145]:
sp.special.logit(0.383850)-sp.special.logit(0.616879)

-0.9495606173935895

In [146]:
summarize_model(generate_simple_logistic_model_data(beta_A_0=-2, beta_Y_A=-1, beta_Y_L=1),
                'Randomized, covariate is risk, rare intervention is protective',
                '1:1 odds of covariate')

### Randomized, covariate is risk, rare intervention is protective

1:1 odds of covariate

#### Sample data

Unnamed: 0,L,A,Y
0,True,False,True
1,True,False,True
2,False,False,True
3,True,False,False
4,False,False,False
5,False,False,True
6,True,False,False
7,False,False,True
8,False,False,True
9,False,False,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.500064
mean(A),0.119162
mean(Y),0.587851


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.61523
1,True,0.385467


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.61523
1,True,0.385467


#### Doubly-robust means

[[ 0.99032971 -0.02423637 -0.1016838 ]]


Unnamed: 0,A_level,Y_mean
0,False,0.632881
1,True,0.536503


In [147]:
summarize_model(generate_simple_logistic_model_data(P_L=0.75, beta_A_0=-2, beta_Y_A=-1, beta_Y_L=1),
                'Randomized, common covariate is risk, rare intervention is protective',
                '1:1 odds of covariate')

### Randomized, common covariate is risk, rare intervention is protective

1:1 odds of covariate

#### Sample data

Unnamed: 0,L,A,Y
0,False,False,True
1,True,False,True
2,False,False,True
3,False,False,True
4,False,False,True
5,True,False,True
6,True,False,False
7,False,False,True
8,True,True,True
9,True,False,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.750143
mean(A),0.119172
mean(Y),0.646052


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.673672
1,True,0.441901


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.673672
1,True,0.441901


#### Doubly-robust means

[[ 0.99822286 -0.0225582  -0.10297292]]


Unnamed: 0,A_level,Y_mean
0,False,0.689761
1,True,0.59684


In [148]:
summarize_model(generate_simple_logistic_model_data(P_L=0.75,
                                                    beta_A_0=-2, beta_A_L=1, 
                                                    beta_Y_A=-1, beta_Y_L=1),
                'Common covariate is risk for disease and intervention, intervention is protective',
                '')

### Common covariate is risk for disease and intervention, intervention is protective



#### Sample data

Unnamed: 0,L,A,Y
0,True,False,False
1,True,True,False
2,True,True,False
3,False,False,True
4,True,True,True
5,True,True,True
6,False,False,False
7,True,True,False
8,True,False,False
9,True,True,True


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.750341
mean(A),0.23191
mean(Y),0.619853


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.673565
1,True,0.442545


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.673565
1,True,0.442545


#### Doubly-robust means

[[ 1.0020917  -1.03165107  0.00526429]]


Unnamed: 0,A_level,Y_mean
0,False,0.672789
1,True,0.439199


In [149]:
summarize_model(generate_simple_logistic_model_data(P_L=0.5,
                                                    beta_A_0=0, 
                                                    beta_Y_A=1, beta_Y_L=1, beta_Y_LA=-2),
                'Covariate and intervention are risks, but negate eachother',
                '1:1 odds of covariate')

### Covariate and intervention are risks, but negate eachother

1:1 odds of covariate

#### Sample data

Unnamed: 0,L,A,Y
0,True,False,True
1,True,True,True
2,True,False,False
3,False,True,True
4,False,True,True
5,True,True,False
6,True,True,False
7,True,False,True
8,False,True,True
9,True,True,False


#### Summary statistics

Unnamed: 0,statistic
mean(L),0.500678
mean(A),0.500194
mean(Y),0.615473


#### Standardized means

Unnamed: 0,A_level,Y_mean
0,False,0.615919
1,True,0.614812


#### Inverse probability weighted means

Unnamed: 0,A_level,Y_mean
0,False,0.615919
1,True,0.614812


#### Doubly-robust means

[[ 0.00155209  0.17884941 -0.04588174]]


Unnamed: 0,A_level,Y_mean
0,False,0.616019
1,True,0.614912


## Notes for next time

- debug doubly robust estimation (WIP)
- Increase the dimension of the covariates $L$