In [1]:
import numpy as np
import pandas as pd
from scipy.stats import bernoulli, uniform
from scipy.special import logit, expit
from sklearn.linear_model import LogisticRegression

## Tong Chen's Model
Source (R simulation script): https://github.com/T0ngChen/multiwave/blob/master/sim.r


In [68]:
# Directory containing the CSV files read by this notebook. File paths are
# built by plain string concatenation (r_dir + "data1.csv"), so the trailing
# slash is required.
r_dir = "data/chen_optimal_2020/"

In [36]:
# Reading in data from R
# x_vars lists the covariate columns used for the model; later cells also
# read the "y" and "stra" columns from data1.
x_vars = ["x", "z1", "z2"]
data1 = pd.read_csv(r_dir+"data1.csv", index_col=0)
# Design matrix with a leading intercept column.
# - .copy() makes dm an independent frame, so the in-place insert below can
#   never touch (or warn about touching) data1.
# - A scalar 1 broadcasts to however many rows the file has; the previous
#   hard-coded [1]*1000 raised ValueError for any other row count.
dm = data1[x_vars].copy()
dm.insert(0, "intercept", 1)

In [37]:
# Fitting Logistic Regression
# scikit-learn's LogisticRegression applies an L2 penalty (C=1.0) by default,
# which shrinks the coefficients away from the maximum-likelihood fit. The
# information matrix and influence functions computed from fitted_values and
# resid assume the unpenalised MLE, so the penalty is switched off with
# C=np.inf. max_iter is raised because the unpenalised problem can need more
# than the default 100 lbfgs iterations to converge.
model = LogisticRegression(C=np.inf, max_iter=1000)
model.fit(X=data1[x_vars], y=data1["y"])
# Column 1 of predict_proba is the probability of model.classes_[1]
# (presumably y == 1 for 0/1-coded outcomes -- confirm against the data).
fitted_values = model.predict_proba(data1[x_vars])[:,1]
# Response residuals: observed outcome minus fitted probability.
resid = data1["y"] - fitted_values

In [63]:
# Estimating influence
# Ihat = (1/n) * sum_i p_i * (1 - p_i) * x_i x_i^T, where x_i are the rows of
# dm and p_i the fitted probabilities. The two multiplications are applied in
# sequence along the columns of dm.T (one column per observation).
Ihat = dm.T.mul(fitted_values).mul(1 - fitted_values).dot(dm).div(len(dm))
# One row per observation: (resid_i * x_i) post-multiplied by inv(Ihat).
infl = dm.T.mul(resid).T.dot(np.linalg.inv(Ihat))
# The matrix product leaves positional column labels; name them after the
# design-matrix columns.
infl.columns = [f"infl_{name}" for name in ["intercept", *x_vars]]

In [64]:
# Calculating optimal allocation for strata
# Attach the influence-function columns to the original data, row by row.
aa = pd.concat([data1, infl], axis=1)
# Per-stratum standard deviation and non-null count of infl_x, computed in a
# single groupby pass; columns of oa are "stra", "std", "count".
oa = (aa.groupby("stra")["infl_x"]
        .agg(["std", "count"])
        .reset_index())
# Two-column views kept under their original names.
std = oa[["stra", "std"]]
n = oa[["stra", "count"]]
# Stratum size times within-stratum SD of infl_x, one value per row of oa.
NS = oa["count"] * oa["std"]
NS

## Implementing in Python