# Data Loading

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
import random
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer


In [None]:
# Read the four saved NumPy arrays from disk and bind them to X, Y, Z and M.
X, Y, Z, M = (
    np.load(npy_path)
    for npy_path in (
        "/Users/jiaweizhang/research/data/X.npy",
        "/Users/jiaweizhang/research/data/Y.npy",
        "/Users/jiaweizhang/research/data/Z.npy",
        "/Users/jiaweizhang/research/data/M.npy",
    )
)

# Render each array as a DataFrame table, in the order X, Y, Z, M.
for _array in (X, Y, Z, M):
    display(pd.DataFrame(_array))

# One-shot framework

In [None]:
def split_df(df, split_proportion=0.5, random_seed=42):
    """Randomly partition the rows of ``df`` into two DataFrames.

    Args:
        df: DataFrame whose rows are to be split.
        split_proportion: Fraction of rows placed in the first part. The
            first part receives ``int(len(df) * split_proportion)`` rows and
            the second part receives the remainder.
        random_seed: Seed for the shuffle. With the default value the same
            input always yields the same split; pass ``None`` to obtain a
            different split on every call.

    Returns:
        A tuple ``(df1, df2)`` of DataFrames that together contain every row
        of ``df`` exactly once, each keeping its original index label.

    Raises:
        ValueError: If ``split_proportion`` is outside ``[0, 1]``.
    """
    if not 0 <= split_proportion <= 1:
        raise ValueError(
            f"split_proportion must be in [0, 1], got {split_proportion!r}"
        )

    # Shuffle row *positions* rather than index labels: label-based selection
    # would return duplicated rows whenever the index contains repeated labels.
    positions = list(range(len(df)))

    # A private generator makes the split reproducible without reseeding (or
    # otherwise disturbing) the global ``random`` module state.
    random.Random(random_seed).shuffle(positions)

    cut = int(len(df) * split_proportion)
    df1 = df.iloc[positions[:cut]]
    df2 = df.iloc[positions[cut:]]

    return df1, df2

def getY(df_pred, df_missing):
    """Collect the predicted values at the positions that are missing.

    Both inputs are converted to NumPy arrays and addressed by position, so
    DataFrames and plain 2D arrays are handled identically.

    Args:
        df_pred: 2D array-like of predicted values.
        df_missing: 2D array-like of the same shape as ``df_pred`` in which
            missing entries are NaN/None (anything ``pd.isna`` flags).

    Returns:
        A list of the entries of ``df_pred`` located where ``df_missing`` is
        missing, in row-major order (row by row, left to right).

    Raises:
        ValueError: If ``df_pred`` is not 2D or the two shapes differ.
    """
    pred = np.asarray(df_pred)
    missing_mask = pd.isna(np.asarray(df_missing))

    if pred.ndim != 2:
        raise ValueError(f"df_pred must be 2D, got {pred.ndim} dimension(s)")
    if missing_mask.shape != pred.shape:
        raise ValueError(
            f"shape mismatch: df_pred is {pred.shape}, "
            f"df_missing is {missing_mask.shape}"
        )

    # Boolean-mask selection walks the array in row-major order, which is the
    # same order a nested row/column loop would visit the entries.
    return list(pred[missing_mask])

def _part_statistics(T, G, Z_assign, X_part, M_part, Y_part):
    """Impute every outcome column of one data part with ``G`` and apply ``T``."""
    return np.array(
        [T(Z_assign, G(Z_assign, X_part, M_part, Y_k)) for Y_k in Y_part.T],
        dtype=float,
    )


def one_shot_test(Z, X, M, Y, G1, G2, T, L=10000):
    """
    A one-shot framework for testing H_0.

    The units are randomly split into two parts. Outcomes in part 1 are
    imputed with G2 and outcomes in part 2 with G1. For each part, the
    observed test statistics are compared against statistics recomputed under
    L simulated treatment assignments.

    Args:
    Z: array of observed treatment indicators (one row per unit)
    X: 2D array of observed covariates (one row per unit)
    M: 2D array of missing indicators, same shape as Y (nonzero = missing)
    Y: 2D array of observed values for K outcomes (one column per outcome)
    G1: a function that takes (Z, X, M, Y_k) as input and returns the imputed value for outcome k
    G2: a function that takes (Z, X, M, Y_k) as input and returns the imputed value for outcome k
    T: a function that takes (Z, Y_k) as input and returns the test statistic for outcome k
    L: number of Monte Carlo simulations (default is 10000)

    Returns:
    p1: 1D array (length K) of Monte Carlo p-values for part 1
    p2: 1D array (length K) of Monte Carlo p-values for part 2

    Raises:
    ValueError: if Y is not 2D, or Z, X, M do not match Y's number of rows / shape
    """
    Z = np.asarray(Z)
    X = np.asarray(X)
    M = np.asarray(M)
    Y = np.asarray(Y, dtype=float)

    if Y.ndim != 2:
        raise ValueError(f"Y must be 2D, got {Y.ndim} dimension(s)")
    if M.shape != Y.shape:
        raise ValueError(f"M has shape {M.shape} but Y has shape {Y.shape}")
    n_units, n_outcomes = Y.shape
    if len(Z) != n_units or len(X) != n_units:
        raise ValueError("Z, X and Y must have the same number of rows")

    # Hide the missing outcome entries behind NaN so that the imputation
    # functions never see values flagged as missing by M.
    Y_masked = np.where(M.astype(bool), np.nan, Y)

    # randomly split the units into two parts; only the row numbers are
    # needed, so a one-column frame of row positions is handed to split_df
    rows = pd.DataFrame({"row": np.arange(n_units)})
    part1, part2 = split_df(rows)
    idx1 = part1["row"].to_numpy()
    idx2 = part2["row"].to_numpy()

    Z1, X1, M1, Y1 = Z[idx1], X[idx1], M[idx1], Y_masked[idx1]
    Z2, X2, M2, Y2 = Z[idx2], X[idx2], M[idx2], Y_masked[idx2]

    # NOTE(review): G1/G2 are used strictly as the callables described in the
    # docstring. Objects exposing fit/transform (e.g. scikit-learn imputers)
    # must be wrapped in a function with the (Z, X, M, Y_k) signature.
    # NOTE(review): part 1 is imputed with G2 and part 2 with G1 for the
    # observed statistics as well as the simulated ones, so both go through
    # the identical procedure -- confirm this matches the intended design.
    t1_obs = _part_statistics(T, G2, Z1, X1, M1, Y1)
    t2_obs = _part_statistics(T, G1, Z2, X2, M2, Y2)

    # simulate data and calculate test statistics (one row per simulation,
    # one column per outcome)
    t1_sim = np.zeros((L, n_outcomes))
    t2_sim = np.zeros((L, n_outcomes))
    for draw in range(L):
        # simulate treatment indicators in parts 1 and 2; drawn with the same
        # shape as the observed indicators so T and G see consistent inputs
        Z1_sim = np.random.binomial(1, 0.5, size=Z1.shape)
        Z2_sim = np.random.binomial(1, 0.5, size=Z2.shape)

        # impute part 1 using G2 and part 2 using G1, then evaluate T
        t1_sim[draw] = _part_statistics(T, G2, Z1_sim, X1, M1, Y1)
        t2_sim[draw] = _part_statistics(T, G1, Z2_sim, X2, M2, Y2)

    # p-value per outcome: share of simulated statistics at least as large as
    # the observed one
    p1 = np.mean(t1_sim >= t1_obs, axis=0)
    p2 = np.mean(t2_sim >= t2_obs, axis=0)

    return p1, p2

