# Solve Selection Bias by Imputing the missing labels

In [None]:
# imports
from random import SystemRandom
import pandas as pd
import numpy as np
import xgboost as xgb
# from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler

import os
import pickle
from sklearn.model_selection import train_test_split
import wandb

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# from torch._C import float32
import argparse
from asyncio.log import logger
import os, math
import logging
import torch
import numpy as np
import json

import torch.nn as nn
import torch

import pickle
import json
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader, random_split, WeightedRandomSampler
import torch
import os
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch.nn as nn
import torch
import math
import pandas as pd
import random

# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import sys
import warnings

import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from hyperimpute.plugins.imputers import Imputers

from IPython.display import HTML, display
# import tabulate

import utils
from utils import *
# Silence all warnings, but only when the interpreter was started without
# explicit -W options (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# IPython magic (not plain Python): selects matplotlib's inline backend.
%matplotlib inline

In [None]:
# ---------------------------------------------------------------------------
# Run-wide settings
# ---------------------------------------------------------------------------
MINI_BATCH = 64   # batch_size handed to get_loaders
EPOCHS = 1000     # epoch budget handed to train_model / grid_search_MLP
LOAD = None       # experiment id to reuse; None -> a random id is drawn below
SEED = 42         # seed shared by torch, numpy and random
REPEAT = 10       # number of repetitions passed to experiment()

# Pick the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA is available! Using GPU." if device.type == "cuda" else "CUDA is not available. Using CPU.")

# Seed every random number generator used in this notebook.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Search space handed to grid_search_MLP.
param_grid = dict(
    drop_rate=[0.1],
    hidden_sizes=[[50], [100], [100, 100]],
    head_sizes=[[50], [100]],
    lr=[0.0001, 0.0005],
)

# ---------------------------------------------------------------------------
# Experiment tracking, output folders and logging
# ---------------------------------------------------------------------------
# Initialise wandb in disabled mode; call wandb.init(project=..., entity=...)
# here instead to record runs.
wandb.init(mode="disabled")
wandb.run.name = 'SB'

# Reuse the id in LOAD when given, otherwise draw a random one.
experimentID = LOAD if LOAD is not None else int(SystemRandom().random()*100000)

# Output folders.
makedirs('./results/')
makedirs('./results/checkpoints/')
makedirs("./results/logs/")

# Checkpoint file passed to EarlyStopping / grid_search_MLP.
ckpt_path = './results/checkpoints/imputation_model.ckpt'

# Log file named after the experiment id.
log_path = f"./results/logs/exp_imputation_{experimentID}.log"
logger = get_logger(logpath=log_path, filepath=f"exp_imputation_{experimentID}.log", displaying=False)
logger.info(f"Experiment {experimentID}")
############################################

In [None]:
def experiment(data, params_risk, repeat=1):
    '''
    Train and evaluate the risk-prediction MLP `repeat` times on one dataset.

    data:        sequence of exactly nine items
                 [X_train, y_train, s_train, X_val, y_val, s_val, X_test, y_test, s_test];
                 the three s_* entries are unpacked but not used here.
    params_risk: hyperparameters forwarded to create_model.
    repeat:      number of independent train/evaluate rounds.

    Returns a dict mapping the round index (0-based) to
    {'R-Val AUROC': ..., 'R-Test AUROC': ...}.

    Relies on the notebook-level MINI_BATCH, EPOCHS, device, ckpt_path,
    logger and wandb.
    '''
    X_train, y_train, _s_train, X_val, y_val, _s_val, X_test, y_test, _s_test = data

    # Loaders are built once and shared by every repetition.
    loader_kwargs = dict(batch_size=MINI_BATCH, device=device)
    train_loader, input_size = get_loaders([X_train, y_train], is_train=True, **loader_kwargs)
    val_loader, _ = get_loaders([X_val, y_val], is_train=False, **loader_kwargs)
    test_loader, _ = get_loaders([X_test, y_test], is_train=False, **loader_kwargs)

    scores = {}
    for run in range(repeat):
        logger.info('Repeating: ' + str(run+1))

        # Fresh model, optimiser, loss and early-stopping state for each round.
        net, optimizer, criterion = create_model('MLP', params_risk, input_size, output_size=1, device=device)
        stopper = EarlyStopping(patience=10, path=ckpt_path, verbose=True, logger=logger)
        logger.info(net)
        wandb.watch(net)

        # Fit on the training loader, monitoring the validation loader.
        net = train_model(net, 'MLP', train_loader, val_loader, optimizer, criterion, stopper, logger, epochs=EPOCHS, plot=False, wandb=wandb)

        # Score the trained model on the validation and test loaders.
        val_metrics, _ = evaluate_model('Val', val_loader, net, 'MLP', criterion, logger, -1, device, wandb)
        test_metrics, _ = evaluate_model('Test', test_loader, net, 'MLP', criterion, logger, -1, device, wandb)
        val_auroc = val_metrics['Val AUROC']
        test_auroc = test_metrics['Test AUROC']

        logger.info('Risk Prediction: biased Val AUROC:' + str(val_auroc) + ' unbiased Test AUROC:' + str(test_auroc))
        scores[run] = {'R-Val AUROC': val_auroc, 'R-Test AUROC': test_auroc}

    return scores

def _load_best_hyperparams(path):
    '''Return the hyperparameter cache stored at `path`, or {} when the file does not exist yet.'''
    try:
        with open(path, 'r') as json_file:
            return json.load(json_file)
    except FileNotFoundError:
        # First run: nothing has been tuned yet, so start from an empty cache.
        return {}


def _impute_labels(plugin, X, y, s, fit=False):
    '''
    Blank out the labels of the rows where s == 1 and let `plugin` fill them in.

    plugin: imputer exposing fit_transform / transform on a DataFrame.
    X:      feature matrix, anything accepted by pd.DataFrame.
    y:      labels, one per row of X (converted to float so NaN can be stored).
    s:      per-row indicator; rows with s == 1 get their label replaced.
    fit:    when True the plugin is fitted on this data before transforming.

    Returns the 'y' column of the imputed frame as a numpy array.
    Rows are matched by position (assumes X, y and s are aligned row by row).
    '''
    hidden = np.asarray(s).ravel() == 1
    # np.where builds a new float array, so the caller's `y` is never modified
    # and an integer label array cannot reject the NaN marker.
    labels = np.where(hidden, np.nan, np.asarray(y, dtype=float).ravel())

    # Assign the masked column in one step. The former chained form
    # `df['y'][mask] = np.nan` writes to a temporary under pandas
    # Copy-on-Write, which would leave every label in place.
    frame = pd.DataFrame(X, copy=True)
    frame['y'] = labels

    if fit:
        # NOTE(review): the output of fit_transform is discarded and transform is
        # run again below, exactly as before; this looks redundant - confirm
        # against the hyperimpute API before dropping one of the two calls.
        plugin.fit_transform(frame.copy())
    imputed = plugin.transform(frame.copy())
    return imputed['y'].values


def study_effect(data_name, file_name, results_file, r, c, n, search_param=False):
    '''
    This function is used to study the effect of (risk rate, dataset size etc.).
    It expects a set of datasets with some variations.

    For every combination of size in `n`, censoring rate in `c` and risk rate
    in `r` it: loads the matching split via get_data_dict, replaces the train
    and validation labels of rows with s == 1 by 'missforest' imputations
    (imputer fitted on the training split only), obtains hyperparameters
    (cached in best_hyperparams.json, or tuned with grid_search_MLP), runs
    experiment() REPEAT times and saves the accumulated results with
    dict_to_file after each combination.

    data_name:    dataset label used in the hyperparameter cache key.
    file_name:    data file handed to get_data_dict.
    results_file: destination handed to dict_to_file.
    r, c, n:      iterables of risk rates, censoring rates and dataset sizes.
    search_param: when True, re-tune even if cached hyperparameters exist.

    Returns a dict keyed by '<n>R<r>C<c>' holding the output of experiment().
    '''
    logger.info('\n\n-------------N:'+str(n)+'--Risk Rate:' + str(r)+'--Censoring Rate:' + str(c)+'-------------------------.')

    hyperparams_path = 'best_hyperparams.json'
    results_sizes = {}
    for ni in n:
        for ci in c:
            for ri in r:
                config_key = str(ni)+'R'+str(ri)+'C'+str(ci)
                hp_key = 'Impute-'+data_name+config_key

                # load data dictionary
                data_dict = get_data_dict(file_name, [ri], [ci], [ni])

                logger.info('-----Running for Size:'+str(ni)+'--Risk Rate:' + str(ri)+'--Censoring Rate:' + str(ci)+'\n-----------')
                [X_train, y_train, s_train, X_val, y_val, s_val, X_test, y_test, s_test] = data_dict[config_key]

                # Impute the hidden labels: fit on train, reuse the fitted
                # imputer for validation. Test labels are left untouched.
                plugin = Imputers().get('missforest')
                y_train = _impute_labels(plugin, X_train, y_train, s_train, fit=True)
                y_val = _impute_labels(plugin, X_val, y_val, s_val)

                data = [X_train, y_train, s_train, X_val, y_val, s_val, X_test, y_test, s_test]

                #############################
                # Hyperparameters: reuse the cached entry unless it is missing
                # or a fresh search was requested.
                best_hyperparams = _load_best_hyperparams(hyperparams_path)
                if search_param or hp_key not in best_hyperparams:
                    logger.info('Finding best hyperparams.')
                    loader_train_br, input_size = get_loaders([X_train, y_train], batch_size=MINI_BATCH, is_train=True, device=device)
                    loader_val_br, _ = get_loaders([X_val, y_val], batch_size=MINI_BATCH, is_train=False, device=device)
                    params_risk, best_score, results = grid_search_MLP('MLP', loader_train_br, loader_val_br, input_size, ckpt_path, param_grid, EPOCHS, logger, wandb, device)
                    logger.info('Hyperparam tuning for risk prediction:')
                    logger.info(results)

                    # save best params
                    best_hyperparams[hp_key] = {'params_risk': params_risk}
                    with open(hyperparams_path, 'w') as json_file:
                        json.dump(best_hyperparams, json_file)
                else:
                    logger.info('Accessing the existing best hyperparams.')
                    params_risk = best_hyperparams[hp_key]['params_risk']

                ################################
                # run experiments and repeat for given number of times
                results = experiment(data, params_risk, repeat=REPEAT)
                logger.info('\n\nBest params for risk:\n' + str(params_risk))
                logger.info(results)
                results_sizes[config_key] = results

                # Save after every configuration so partial results survive a crash.
                dict_to_file(results_file, results_sizes)
                ################################

    logger.info('\n\n------------------- Experiments ended-------------------.\n'+str(results_sizes)+'\n------------------------------------------------\n\n')

    return results_sizes


## Synthetic

In [None]:
# Sweep the synthetic data over every size / censoring-rate / risk-rate combination.
results_sizes = study_effect(
    'synthetic',
    'selection_bias_data.pkl',
    'results_Imputation',
    r=[.05, .1, .2, .3, .4],
    c=[.05, .1, .2, .3, .4],
    n=[1000, 2000, 3000, 4000, 5000],
    search_param=False,
)

## Diabetes

In [None]:
# Sweep the diabetes data over every size / censoring-rate / risk-rate combination.
results_sizes = study_effect(
    'diabetes',
    'diabetes_bias_data.pkl',
    'results_Imputation-diabetes',
    r=[.05, .1, .2, .3, .4],
    c=[.05, .1, .2, .3, .4],
    n=[25000, 10000, 5000, 2000, 1000],
    search_param=False,
)

## Covid

In [None]:
# Sweep the covid data over every size / censoring-rate / risk-rate combination.
results_sizes = study_effect(
    'covid',
    'covid_bias_data.pkl',
    'results_Imputation-covid',
    r=[.05, .1, .2, .3, .4],
    c=[.05, .1, .2, .3, .4],
    n=[15000, 10000, 5000, 2000, 1000],
    search_param=False,
)