# Data Loading

In [85]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import random
from sklearn.impute import KNNImputer
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import scipy.stats as stats


In [86]:
# Read the four saved NumPy arrays from disk (per one_shot_test's docstring:
# X covariates, Y outcomes, Z treatment indicators, M missingness indicators).
X = np.load("/Users/jiaweizhang/research/data/X.npy")
Y = np.load("/Users/jiaweizhang/research/data/Y.npy")
Z = np.load("/Users/jiaweizhang/research/data/Z.npy")
M = np.load("/Users/jiaweizhang/research/data/M.npy")

# Render each array as a DataFrame, in the order X, Y, Z, M.
display(*(pd.DataFrame(loaded) for loaded in (X, Y, Z, M)))


Unnamed: 0,0,1,2,3,4
0,0.766472,-1.644463,1.135335,1.647609,0.0
1,1.228147,-0.133635,-0.159909,0.206834,1.0
2,1.526981,1.568491,0.769310,0.693670,0.0
3,0.477432,0.327402,0.300267,-0.691641,0.0
4,1.607284,-1.126374,0.615059,0.591469,1.0
...,...,...,...,...,...
19995,0.757626,-0.174572,-1.267520,0.876582,0.0
19996,-0.137899,-0.914321,-0.869284,0.295851,1.0
19997,1.737828,1.498230,0.018937,0.894567,0.0
19998,0.329276,-1.028964,0.561963,0.876246,1.0


Unnamed: 0,0,1,2
0,36.975790,23.551939,40.542855
1,7.427683,20.491803,41.315736
2,27.721509,65.341903,304.874363
3,9.646436,4.172816,3.265358
4,35.500065,34.804845,84.133658
...,...,...,...
19995,2.169075,2.442815,-0.196697
19996,-1.731654,-2.343015,-0.000662
19997,25.559103,60.395742,252.374515
19998,7.545427,14.749494,23.554173


Unnamed: 0,0
0,1.0
1,0.0
2,0.0
3,1.0
4,1.0
...,...
19995,0.0
19996,0.0
19997,0.0
19998,0.0


Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
...,...,...,...
19995,0.0,0.0,0.0
19996,0.0,0.0,1.0
19997,0.0,0.0,0.0
19998,0.0,0.0,0.0


# One shot framework

### Randomly split one DataFrame into two datasets
split_df takes a pandas DataFrame as input and randomly splits it into two separate DataFrames with a specified proportion of the data in each split. The function shuffles the indices randomly and splits the DataFrame using the shuffled indices. It returns the two separate DataFrames as output.

In [87]:
def split_df(df, split_proportion=0.5, seed=10):
    """Randomly partition the rows of ``df`` into two DataFrames.

    The row labels are shuffled with a private ``random.Random(seed)``
    generator, so the same seed and the same index always produce the same
    partition, and the state of the global ``random`` module is left untouched.
    With the defaults the partition is the one obtained by calling
    ``random.seed(10)`` followed by ``random.shuffle`` on the index list.

    Args:
        df: pandas DataFrame to split.
        split_proportion: fraction of rows (between 0 and 1) placed in the
            first part; the row count is truncated with ``int()``.
        seed: seed of the private shuffle generator.

    Returns:
        Tuple ``(df1, df2)``: ``df1`` holds the first
        ``int(len(df) * split_proportion)`` shuffled rows, ``df2`` the rest.

    Raises:
        ValueError: if ``split_proportion`` is outside ``[0, 1]``.
    """
    if not 0 <= split_proportion <= 1:
        raise ValueError(
            f"split_proportion must be between 0 and 1, got {split_proportion!r}"
        )

    # Row labels (not positions) are shuffled so the split works for any index.
    indices = df.index.tolist()
    split_index = int(len(indices) * split_proportion)

    # A dedicated generator keeps the shuffle reproducible without reseeding
    # the process-wide `random` state as a side effect.
    random.Random(seed).shuffle(indices)

    df1 = df.loc[indices[:split_index]]
    df2 = df.loc[indices[split_index:]]

    return df1, df2

### T-test for T(Z,Y)
`T` computes the t-statistic T(Z, Y) with a two-sample t-test, where Z is the treatment indicator and Y is the effect (outcome); `TwithP` additionally returns the p-value.

In [88]:
def TwithP(G, df, y_start=None):
    """Impute ``df`` with ``G`` and run a pooled-variance two-sample t-test.

    The first column of the imputed data is used as the treatment indicator;
    every column from ``y_start`` onwards is treated as an outcome. Outcome
    values of all those columns are flattened into one sample per group.

    Args:
        G: fitted imputer exposing ``transform(df)``.
        df: data laid out as ``[Z | X | Y]`` with the indicator in column 0.
        y_start: index of the first outcome column. When ``None`` it falls
            back to ``Z.shape[1] + X.shape[1]`` from the notebook-level
            arrays ``Z`` and ``X``.

    Returns:
        Tuple ``(t, p)`` from ``scipy.stats.ttest_ind(..., equal_var=True)``
        comparing rows with indicator 1 against rows with indicator 0.
        The pair is also printed.
    """
    if y_start is None:
        # Legacy behaviour: locate the outcome block via the global arrays.
        y_start = Z.shape[1] + X.shape[1]

    # np.asarray keeps the positional slicing below valid even if the imputer
    # is configured to return something other than a plain ndarray.
    df_imputed = np.asarray(G.transform(df))
    Y_pred = df_imputed[:, y_start:]
    Z_shuffled = df_imputed[:, 0]

    # Pool every outcome column of each group into a single 1-D sample.
    treatment = Y_pred[Z_shuffled == 1].flatten()
    control = Y_pred[Z_shuffled == 0].flatten()

    t, p = stats.ttest_ind(treatment, control, equal_var=True)

    print(t,p)
    return t,p

def T(G, df, y_start=None):
    """Impute ``df`` with ``G`` and return the pooled two-sample t-statistic.

    The first column of the imputed data is used as the treatment indicator;
    every column from ``y_start`` onwards is treated as an outcome. Outcome
    values of all those columns are flattened into one sample per group.

    Args:
        G: fitted imputer exposing ``transform(df)``.
        df: data laid out as ``[Z | X | Y]`` with the indicator in column 0.
        y_start: index of the first outcome column. When ``None`` it falls
            back to ``Z.shape[1] + X.shape[1]`` from the notebook-level
            arrays ``Z`` and ``X``.

    Returns:
        The equal-variance (pooled) t-statistic of the rows with indicator 1
        versus the rows with indicator 0.
    """
    if y_start is None:
        # Legacy behaviour: locate the outcome block via the global arrays.
        y_start = Z.shape[1] + X.shape[1]

    # np.asarray keeps the positional slicing below valid even if the imputer
    # is configured to return something other than a plain ndarray.
    df_imputed = np.asarray(G.transform(df))
    Y_pred = df_imputed[:, y_start:]
    Z_shuffled = df_imputed[:, 0]

    # Pool every outcome column of each group into a single 1-D sample.
    treatment = Y_pred[Z_shuffled == 1].flatten()
    control = Y_pred[Z_shuffled == 0].flatten()

    mean_treatment = np.mean(treatment)
    mean_control = np.mean(control)
    std_treatment = np.std(treatment, ddof=1)
    std_control = np.std(control, ddof=1)

    # Pooled standard deviation: sample variances weighted by their degrees
    # of freedom.
    pooled_std = np.sqrt(((len(treatment) - 1) * std_treatment**2 + (len(control) - 1) * std_control**2) / (len(treatment) + len(control) - 2))

    t = (mean_treatment - mean_control) / (pooled_std * np.sqrt(1/len(treatment) + 1/len(control)))

    return t



## One Shot Framework


In [89]:
def one_shot_test(Z, X, M, Y, G1, G2, L=10000):
    """
    A one-shot framework for testing H_0.

    The data are assembled as ``[Z | X | Y]`` with the entries of ``Y`` flagged
    by ``M`` replaced by NaN, split into two parts with ``split_df``, and one
    imputer is fitted per part. The observed statistic of each part is then
    compared with statistics recomputed under ``L`` simulated treatment
    assignments drawn i.i.d. Bernoulli(0.5).

    Note: ``T`` is called without an explicit outcome offset, so it locates
    the outcome columns through the notebook-level ``Z`` and ``X``; pass the
    same arrays here.

    Args:
    Z: 2D array of observed treatment indicators
    X: 2D array of observed covariates
    M: 2D array of missing indicators for Y (nonzero = missing)
    Y: 2D array of observed values for K outcomes
    G1: imputer with ``fit``/``transform`` used for part 1 (fitted in place)
    G2: imputer with ``fit``/``transform`` used for part 2; if it is the very
        same object as G1, a deep copy is fitted instead so the part-1 fit
        is not overwritten
    L: number of Monte Carlo simulations (default is 10000)

    Returns:
    p1: fraction of simulated part-1 statistics >= the observed one
    p2: fraction of simulated part-2 statistics >= the observed one

    Raises:
    ValueError: if L is smaller than 1
    """
    import copy  # local import: only needed to de-alias the two imputers

    if L < 1:
        raise ValueError(f"L must be at least 1, got {L!r}")

    # Fitting happens in place. If both arguments are one object, the second
    # fit would silently replace the first and part 1 would be simulated with
    # the part-2 imputer.
    if G2 is G1:
        G2 = copy.deepcopy(G1)

    # Assemble [Z | X | Y] with missing outcomes set to NaN.
    Y_masked = np.ma.masked_array(Y, mask=M).filled(np.nan)
    df = pd.DataFrame(np.concatenate((Z, X, Y_masked), axis=1))

    # Randomly split the data into two parts and remember which rows went
    # where, so every simulated data set is split along the same rows.
    df1, df2 = split_df(df)
    part1_rows, part2_rows = df1.index, df2.index

    # Fit the imputer and compute the observed test statistic in part 1.
    G1.fit(df1)
    t1_obs = T(G1, df1)

    # Fit the imputer and compute the observed test statistic in part 2.
    G2.fit(df2)
    t2_obs = T(G2, df2)

    t1_sim = np.zeros(L)
    t2_sim = np.zeros(L)
    n_units = df.shape[0]
    report_every = max(1, L // 100)  # progress roughly once per percent
    for sim in range(L):
        # Simulate treatment indicators for all units.
        Z_sim = np.random.binomial(1, 0.5, n_units).reshape(-1, 1)
        df_sim = pd.DataFrame(np.concatenate((Z_sim, X, Y_masked), axis=1))

        # Re-impute and recompute the statistic on the same rows as the
        # observed parts.
        t1_sim[sim] = T(G1, df_sim.loc[part1_rows])
        t2_sim[sim] = T(G2, df_sim.loc[part2_rows])

        if sim % report_every == 0:
            completeness = sim / L * 100
            print(f"Task is {completeness:.2f}% complete.")

    # One-sided Monte Carlo p-values.
    # NOTE(review): no "+1" correction is applied, so a p-value of exactly 0
    # is possible; confirm whether (1 + count) / (L + 1) is intended.
    p1 = np.mean(t1_sim >= t1_obs, axis=0)
    p2 = np.mean(t2_sim >= t2_obs, axis=0)

    return p1, p2



# Test the framework

In [None]:
#MissForest
print("One-shot test for Fisher's sharp null")
# one_shot_test fits G1 and G2 in place, so each part needs its own imputer
# instance; sharing one object lets the part-2 fit overwrite the part-1 fit.
missForest = IterativeImputer(estimator = RandomForestRegressor(),max_iter=10, random_state=0)
missForest_part2 = IterativeImputer(estimator = RandomForestRegressor(),max_iter=10, random_state=0)
p1, p2 = one_shot_test(Z, X, M, Y, G1=missForest, G2=missForest_part2)
print("p-values for part 1:", p1)
print("p-values for part 2:", p2)


In [90]:
#KNNimputer
print("One-shot test for Fisher's sharp null")
# one_shot_test fits G1 and G2 in place, so each part needs its own imputer
# instance; sharing one object lets the part-2 fit overwrite the part-1 fit.
KNNimputer = KNNImputer(n_neighbors=7)
KNNimputer_part2 = KNNImputer(n_neighbors=7)
p1, p2 = one_shot_test(Z, X, M, Y, G1=KNNimputer, G2=KNNimputer_part2)
print("p-values for part 1:", p1)
print("p-values for part 2:", p2)


One-shot test for Fisher's sharp null
1.041316735611151 0.29773696420063855
1.2474082984166261 0.21225756335154783
-0.35260220807757253 0.7243891592154375
-1.1017824832145255 0.2705650872918411
-1.2466239877370946 0.2125451358932869
0.16759143436602858 0.8669058399889478
-0.4156071121750735 0.6777004942598381
0.6731204275434776 0.5008758644735789
0.6290217030297298 0.5293396375868845
-0.5560268546115967 0.5781966437448978
-1.8710278288697637 0.06135095082545921
0.3429228097608535 0.731658934263393
-0.9127865198507682 0.36136216497127316
2.293357551284793 0.02183431071330642
1.8842185795533848 0.05954507650048377
0.27381624565684615 0.7842277235813064
-1.5581772800960003 0.11920177836607691
0.904532649962831 0.36572029374682313
1.3262944836953203 0.18475227074603598
-1.7652832657643438 0.07752634289469777
0.16414943903342125 0.8696146088098027
0.8134286882755598 0.4159787728174551
0.1533677242017254 0.8781093276846079
1.520591404340693 0.1283729448668956
-0.984213251858405 0.32501860924

KeyboardInterrupt: 

In [None]:
#BayesianRidge
print("One-shot test for Fisher's sharp null")
# BayesianRidge is reached through the imported `linear_model` module; the
# imputers get their own names so the estimator class is never shadowed and
# the cell can be re-run.
# one_shot_test fits G1 and G2 in place, so each part needs its own imputer
# instance; sharing one object lets the part-2 fit overwrite the part-1 fit.
bayesian_ridge_imputer = IterativeImputer(estimator = linear_model.BayesianRidge(),max_iter=10, random_state=0)
bayesian_ridge_imputer_part2 = IterativeImputer(estimator = linear_model.BayesianRidge(),max_iter=10, random_state=0)
p1, p2 = one_shot_test(Z, X, M, Y, G1=bayesian_ridge_imputer, G2=bayesian_ridge_imputer_part2)
print("p-values for part 1:", p1)
print("p-values for part 2:", p2)

In [None]:
#Median imputer
print("One-shot test for Fisher's sharp null")
# one_shot_test fits G1 and G2 in place, so each part needs its own imputer
# instance; sharing one object lets the part-2 fit overwrite the part-1 fit.
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
median_imputer_part2 = SimpleImputer(missing_values=np.nan, strategy='median')
p1, p2 = one_shot_test(Z, X, M, Y, G1=median_imputer, G2=median_imputer_part2)
print("p-values for part 1:", p1)
print("p-values for part 2:", p2)

In [None]:
#Mean imputer
print("One-shot test for Fisher's sharp null")
# one_shot_test fits G1 and G2 in place, so each part needs its own imputer
# instance; sharing one object lets the part-2 fit overwrite the part-1 fit.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mean_imputer_part2 = SimpleImputer(missing_values=np.nan, strategy='mean')
p1, p2 = one_shot_test(Z, X, M, Y, G1=mean_imputer, G2=mean_imputer_part2)
print("p-values for part 1:", p1)
print("p-values for part 2:", p2)
