In [1]:
import pandas as pd
import numpy as np

In [2]:
# Overwrite y_true, pscore_true, and tau_true with values from the orig data file.
# Used for X-meta training preds, because (per the author's note) the copies of
# these columns stored in those prediction files are messed up.
def get_true_y_p_tau(preds_df, orig_data):
    """Copy ground-truth columns from ``orig_data`` into ``preds_df``.

    ``preds_df`` is modified in place and also returned.

    Columns written, in this order:
      * ``y_true``      <- ``orig_data['Y']``
      * ``pscore_true`` <- ``orig_data['pscore']``
      * ``tau_true``    <- ``orig_data['tau']``
      * ``y0_preds``    <- ``preds_df['y0_treat_preds']``
      * ``y1_preds``    <- ``preds_df['y1_control_preds']``

    NOTE(review): assuming both arguments are pandas DataFrames, the column
    assignment matches rows by index label rather than by position, so the
    two frames are expected to share an index -- confirm against the files.
    """
    # (destination column in preds_df, source column in orig_data)
    truth_columns = (
        ('y_true', 'Y'),
        ('pscore_true', 'pscore'),
        ('tau_true', 'tau'),
    )
    for dest, src in truth_columns:
        preds_df[dest] = orig_data[src]

    # Shorter aliases for the y0/y1 prediction columns
    # (destination column, existing column in preds_df)
    pred_aliases = (
        ('y0_preds', 'y0_treat_preds'),
        ('y1_preds', 'y1_control_preds'),
    )
    for dest, src in pred_aliases:
        preds_df[dest] = preds_df[src]

    return preds_df


# Function to get potential outcomes (y0 and y1)
# from preds_df y_true and tau_true
def get_true_potential_outcomes(preds_df):
    """Add ``y0_true`` and ``y1_true`` columns to ``preds_df``.

    Reads the ``W``, ``y_true`` and ``tau_true`` columns. For rows with
    ``W == 0`` the observed ``y_true`` is taken as ``y0_true`` and
    ``y1_true`` is ``y0_true + tau_true``. For rows with ``W == 1`` the
    observed ``y_true`` is taken as ``y1_true`` and ``y0_true`` is
    ``y1_true - tau_true``. Rows whose ``W`` is neither 0 nor 1 keep NaN
    in both new columns.

    ``preds_df`` is modified in place and also returned.
    """
    # Start both potential-outcome columns as all-NaN
    for outcome_col in ('y0_true', 'y1_true'):
        preds_df[outcome_col] = np.nan

    # Row selectors for the two groups
    is_control = preds_df.W == 0
    is_treated = preds_df.W == 1

    # Treatment effects for each group
    control_tau = preds_df.loc[is_control, 'tau_true']
    treated_tau = preds_df.loc[is_treated, 'tau_true']

    # Observed outcome fills the column matching the row's own group
    preds_df.loc[is_control, 'y0_true'] = preds_df.loc[is_control, 'y_true']
    preds_df.loc[is_treated, 'y1_true'] = preds_df.loc[is_treated, 'y_true']

    # The other outcome is rebuilt by shifting the observed one by tau
    control_y0 = preds_df.loc[is_control, 'y0_true']
    preds_df.loc[is_control, 'y1_true'] = control_y0 + control_tau

    treated_y1 = preds_df.loc[is_treated, 'y1_true']
    preds_df.loc[is_treated, 'y0_true'] = treated_y1 - treated_tau

    return preds_df


# Function to calculate prediction errors
# NOTE: Could experiment with plotting absolute value of training errors
def get_pred_error(preds_df):
    """Add a ``<name>_err`` column for every ``<name>_preds``/``<name>_true`` pair.

    A quantity ``<name>`` qualifies when ``preds_df`` has both a column named
    ``<name>_preds`` and a column named ``<name>_true``. For each such name the
    signed error ``<name>_preds - <name>_true`` is stored in ``<name>_err``.

    Only columns that *end* in ``_preds`` / ``_true`` are considered. Columns
    that merely contain those words elsewhere (e.g. ``tau_preds_raw``) are
    ignored instead of being turned into a name whose ``_preds`` / ``_true``
    column does not exist.

    Error columns are added in sorted name order so that the resulting column
    layout is the same on every run (iterating a set of strings directly is
    subject to hash randomization).

    ``preds_df`` is modified in place and also returned.
    """
    pred_suffix = '_preds'
    true_suffix = '_true'

    # Base names that have a prediction column / a ground-truth column
    predicted = {col[:-len(pred_suffix)] for col in preds_df.columns
                 if col.endswith(pred_suffix)}
    known = {col[:-len(true_suffix)] for col in preds_df.columns
             if col.endswith(true_suffix)}

    # Signed prediction error for every name present on both sides
    for name in sorted(predicted & known):
        preds_df[name + '_err'] = (
            preds_df[name + pred_suffix] - preds_df[name + true_suffix]
        )

    return preds_df
    

In [3]:
all_configs = [
    'iw_g_logreg_default', 'iw_g_rfc_default', 'rf_g_logreg_default',
    'rf_g_rfc_default', 'rf_g_rfc_authors', 'rf_g_rfc_tuned',
]

# Visit every simulation / meta-learner / config combination, load its saved
# predictions and attach true potential outcomes plus prediction errors.
# The plots themselves are still TO DO (see notes in each branch).
for sim in 'ABCDEF':
    # original training set for this simulation (will be used for X-learner)
    train = pd.read_parquet('../data/sim{}/samp1_train.parquet'.format(sim))
    for meta in 'STX':
        for config in all_configs:
            # test set predictions -> y0_true/y1_true columns -> prediction errors
            test_preds = pd.read_parquet(
                'preds/{}_sim{}_{}_test_preds.parquet'.format(meta, sim, config)
            )
            test_preds = get_true_potential_outcomes(test_preds)
            test_preds = get_pred_error(test_preds)

            if meta == 'X':
                # the X-learner also has training-set predictions
                train_preds = pd.read_parquet(
                    'preds/{}_sim{}_{}_train_preds.parquet'.format(meta, sim, config)
                )
                # replace y_true, pscore_true, and tau_true with the values
                # from the original training set
                train_preds = get_true_y_p_tau(train_preds, train)
                # y0_true/y1_true columns, then prediction errors
                train_preds = get_true_potential_outcomes(train_preds)
                train_preds = get_pred_error(train_preds)

                # PLOTTING NOTE:
                # All plots should be color-coded by W
                # "plot thing1 vs. thing2" means thing1 on the x-axis and
                # thing2 on the y-axis

                # TO DO for test_preds:
                # plot tau_true vs. tau_preds
                # plot pscore_true vs. pscore_preds
                # plot pscore_err vs. tau_err

                # TO DO for train_preds:
                # plot y0_true vs. y0_preds (for people who have W = 1)
                #   (on same plot as the one below if x-scales alignable)
                # plot y1_true vs. y1_preds (for people who have W = 0)
                #   (on same plot as the one above if x-scales alignable)
                # plot y0_err vs. tau_err
                #   (on same plot as the one below if x-scales alignable)
                # plot y1_err vs. tau_err
                #   (on same plot as the one above if x-scales alignable)

                if config in ('iw_g_logreg_default', 'iw_g_rfc_default'):
                    # TO DO for train_preds, to the above ADD:
                    # plot pscore_true vs. pscore_preds
                    pass

            elif meta in ('T', 'S'):
                # TO DO for test_preds:
                # plot y0_true vs. y0_preds
                # plot y1_true vs. y1_preds
                # plot tau_true vs. tau_preds
                # plot y0_err vs. tau_preds
                #   (on same plot as the one below if x-scales alignable)
                # plot y1_err vs. tau_preds
                #   (on same plot as the one above if x-scales alignable)
                pass
            
                        