# Import Libraries

In [None]:
# Silence warnings for the rest of the session.
# NOTE: "ignore" is installed without any category/message/module restriction,
# so every warning is hidden, including deprecation notices raised by
# third-party libraries. Comment this out when debugging unexpected results.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Basic libraries
# numpy  : array construction/concatenation for the model inputs
# pandas : collecting the per-problem results and writing them to CSV
# NOTE(review): listdir is not referenced in the code visible here - confirm
# whether it is still needed before removing it.
import numpy  as np
import pandas as pd
from   os     import listdir


# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Sklearn library
# Pipeline + StandardScaler + KNeighborsRegressor form the kNN outcome model.
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import StandardScaler
from sklearn.neighbors       import KNeighborsRegressor

# User libraries (project-local modules)
# PEHE / ATE                        : error metrics applied to the predictions
# Synthetic_dataset / TWINS_dataset : dataset loaders
# data_sanity_check                 : pre-run check of the loaded data
from utils.metrics import PEHE, ATE
from utils.data_loading import Synthetic_dataset, TWINS_dataset
from utils.utils import data_sanity_check
print('[INFO] All libraries were imported')

# Get files

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# problem : benchmark label; selects the data loader below and is embedded in
#           the results file name and in the "Problem" column of the results.
# path    : folder that holds the dataset files for the chosen benchmark.
# limited_number_of_instances, ratio : forwarded to the training-set sampling
#           step (create_sample_dataset); ratio is also used by the sanity check.
# filename: CSV file the simulation results are written to.
problem = "Synthetic-large" # {"Synthetic-large", "Synthetic"}
path = "Data/Synthetic/5000_large/" # {"Data/Synthetic/5000/", "Data/Synthetic/5000_large/", "Data/Twins/"}
limited_number_of_instances = 5
ratio = 0.01 # Synthetic: 0.01, Twins: 0.005
filename = f"./Results/{problem}_kNN-ratio={ratio}.csv"


# Select the data loader from the problem label.
# The match is case-insensitive so that "Twins", "TWINS" and "twins" all pick
# the TWINS loader; an unrecognised label fails here with a clear message
# instead of leaving DataLoader undefined and crashing later with a NameError.
_problem_key = problem.lower()
if "synthetic" in _problem_key:
    DataLoader = Synthetic_dataset(path=path)
elif "twins" in _problem_key:
    DataLoader = TWINS_dataset(path=path)
else:
    raise ValueError(
        f"Unknown problem {problem!r}: expected a name containing "
        "'Synthetic' or 'Twins'"
    )


# Sanity check
# * Ensure that all categories obtain enough datapoints *
data_sanity_check(DataLoader=DataLoader, ratio=ratio)

# Simulations

In [None]:
# Run the kNN baseline on every problem exposed by DataLoader. For each problem
# a scaler + kNN regressor is fitted on a sampled subset of the training data
# using [X, T] as features, both potential outcomes are predicted for the test
# set, and the true ATE plus the ATE/PEHE errors are appended to `results`.
# The results table is rewritten to `filename` after every problem.
from pathlib import Path

# Imported once here rather than on every loop iteration
from utils.data_loading import create_sample_dataset

# Create the output folder up front so that the first to_csv call cannot fail
# on a missing directory after a simulation has already been computed
Path(filename).parent.mkdir(parents=True, exist_ok=True)

results = {'ATE': [], 'Error_ATE': [], 'Error_PEHE': []}

for idx in range(DataLoader.nProblems):

    # Load training data
    #
    trainX, trainT, trainY, train_potential_Y = DataLoader.getTraining(idx)

    # Get sampling set: keep only the training rows selected by
    # create_sample_dataset (controlled by ratio / limited_number_of_instances)
    indices = create_sample_dataset(
        T=trainT,
        Y=trainY,
        ratio=ratio,
        limited_number_of_instances=limited_number_of_instances,
    )
    trainX = trainX[indices]
    trainT = trainT[indices]
    trainY = trainY[indices]
    train_potential_Y = train_potential_Y[indices]

    # Load testing data
    #
    testX, testT, testY, test_potential_Y = DataLoader.getTesting(idx)
    #
    print('Simulation: ', idx)
    print('[INFO] Dataset imported')

    # Setup model: standardise the features, then kNN regression
    #
    model = Pipeline([('scaler', StandardScaler()),
                      ('kNN',    KNeighborsRegressor())])

    # Train model on the covariates with the treatment appended as last column
    #
    model.fit(np.concatenate([trainX, trainT.reshape(-1, 1)], axis=1), trainY)
    print('[INFO] Model trained')

    # Calculate number of instances in testing set
    #
    nInstances = testX.shape[0]

    # Initialize an empty array: column 0 holds the prediction with T=0,
    # column 1 the prediction with T=1
    #
    test_y_hat = np.empty([nInstances, 2], dtype=float)

    # Get predictions
    #
    # predict() returns shape (n,) when the model was fitted on 1-D targets and
    # (n, 1) when fitted on a single-column 2-D target. np.ravel flattens both,
    # whereas squeeze(-1) raises on an (n,) array with more than one element.
    #
    # \hat{m}(x, 0)
    test_y_hat[:, 0] = np.ravel(
        model.predict(np.concatenate([testX, np.zeros([nInstances, 1])], axis=1))
    )
    #
    # \hat{m}(x, 1)
    test_y_hat[:, 1] = np.ravel(
        model.predict(np.concatenate([testX, np.ones([nInstances, 1])], axis=1))
    )

    # ATE: mean difference between column 1 and column 0 of the test
    # potential outcomes
    #
    real_ATE = (test_potential_Y[:, 1] - test_potential_Y[:, 0]).mean()

    # Error PEHE
    #
    Error_PEHE = PEHE(test_potential_Y, test_y_hat)

    # Error ATE
    #
    Error_ATE = ATE(test_potential_Y, test_y_hat)

    # Store errors of PEHE and ATE (rounded to 6 decimals)
    #
    results['ATE'].append(np.round(real_ATE, 6))
    results['Error_ATE'].append(np.round(Error_ATE, 6))
    results['Error_PEHE'].append(np.round(Error_PEHE, 6))

    print('[INFO] Error of PEHE and ATE computed\n\n')

    # Save results (at each iteration) so that partial results survive an
    # interruption; the file is overwritten with the full table every time
    df = pd.DataFrame(results)
    df['Problem'] = [f"{problem} {x}" for x in df.index]
    df.to_csv(filename, index=False)