## Repeated sampling from LAZADA real data for experiments.
For each repeated experiment, the training set is a random 90% sample of the full training set, and the test set is kept at 100%.

Three paths need to be set (each must be an absolute path).

- `full_trainset_path`: the path of full_trainset.csv

- `full_testset_path`: the path of full_testset.csv

- `base_path`: the path of output

In [1]:
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder 
import numpy as np 

# Build the repeated-sampling .npz datasets that the model cells below read
# through `train_npz` / `test_npz`. The three paths must be set before running.
full_trainset_path = "/home/admin/uplift_data/dataset_public_md5/full_trainset.csv"
full_testset_path = "/home/admin/uplift_data/dataset_public_md5/full_testset.csv"

# output dir: the .npz files are written here
base_path="/home/admin/uplift_data/dataset_public_md5"

df_trainset = pd.read_csv(full_trainset_path)
df_testset = pd.read_csv(full_testset_path)

from sklearn.model_selection import train_test_split
num_experiments=5

# Fraction of the full training set kept in each replicate.
train_sample_rate = 0.9
# Columns from this position onward are exported as the feature matrix `x`.
# NOTE(review): assumes the first three columns are non-feature columns
# (label / is_treat / presumably an id) -- confirm against the CSV schema.
feature_start_col = 3
# Keys stored in every .npz file, in the order they are written.
array_keys = ("yf", "t", "x", "e")


def _extract_arrays(frame, make_e):
    """Turn one DataFrame into the dict of arrays stored for a single replicate.

    Args:
        frame: DataFrame with `label` and `is_treat` columns; every column from
            position `feature_start_col` onward is taken as a feature.
        make_e: callable applied to the treatment array to build the `e`
            array (`np.zeros_like` for train, `np.ones_like` for test here).

    Returns:
        dict with keys "yf" (labels), "t" (treatment flags), "x" (2-D feature
        matrix) and "e" (constant array shaped like "t").
    """
    t = frame.is_treat.values
    return {
        "yf": frame.label.values,
        "t": t,
        "x": frame.iloc[:, feature_start_col:].values,
        "e": make_e(t),
    }


def _stack_replicates(replicates):
    """Stack per-replicate arrays along a new trailing axis.

    "x" becomes (n_rows, n_features, n_replicates); "yf", "t" and "e" become
    (n_rows, n_replicates). All replicates must have the same number of rows,
    otherwise `np.stack` raises ValueError.
    """
    return {
        key: np.stack([rep[key] for rep in replicates], axis=-1)
        for key in array_keys
    }


# Train set: one seeded 90% sample per experiment.
train_replicates = []
for exp_i in range(num_experiments):
    # train_test_split returns [train, test]; the second element holds the
    # `test_size` share of the rows, i.e. the 90% sample that is kept.
    _, df_sub_set = train_test_split(
        df_trainset, test_size=train_sample_rate, random_state=exp_i
    )
    train_replicates.append(_extract_arrays(df_sub_set, np.zeros_like))
train_data = _stack_replicates(train_replicates)

# Test set: the full test set is identical for every experiment, so it is
# extracted once and repeated instead of being recomputed inside the loop.
test_replicate = _extract_arrays(df_testset, np.ones_like)
test_data = _stack_replicates([test_replicate] * num_experiments)


train_npz_path = "{}/real_bin_set_full.{}.train".format(base_path, num_experiments)
test_npz_path = "{}/real_bin_set_full.{}.test".format(base_path, num_experiments)

pair_list = [(train_npz_path, train_data),
             (test_npz_path, test_data)]

for (path, data_dict) in pair_list:
    np.savez(path, yf=data_dict["yf"], t=data_dict["t"], x=data_dict["x"], e=data_dict["e"])

# np.savez appends ".npz" when the given file name does not already end with it,
# so these are the actual on-disk paths.
train_npz = train_npz_path+".npz"
test_npz = test_npz_path+".npz"
print("training set:{}".format(train_npz))
print("test set:{}".format(test_npz))


training set:/home/admin/uplift_data/dataset_public_md5/real_bin_set_full.5.train.npz
test set:/home/admin/uplift_data/dataset_public_md5/real_bin_set_full.5.test.npz


## X-learner / X-learner with PS 
Neural Network-based

In [2]:
# Shell out to search_params.py once per config file (Xlearner.txt, then
# Xlearner_with_PS.txt), passing x_learner_main.py, eval4real_data.py, the
# config path, the literal 1 and the two dataset paths.
# IPython substitutes {train_npz} and {test_npz} with the Python variables set
# in the data-preparation cell, so that cell must be run first.
# NOTE(review): the literal 1 presumably is the number of search runs (the log
# prints "Run 1 of 1") -- confirm in search_params.py.
!python search_params.py x_learner_main.py eval4real_data.py ./conf4models/lzd_real_data/Xlearner.txt 1 {train_npz} {test_npz}
!python search_params.py x_learner_main.py eval4real_data.py ./conf4models/lzd_real_data/Xlearner_with_PS.txt 1 {train_npz} {test_npz}

------------------------------
Run 1 of 1:
------------------------------

2022-02-15 16:12:39,297 - DEBUG - Setting JobRuntime:name=UNKNOWN_NAME
2022-02-15 16:12:39,297 - DEBUG - Setting JobRuntime:name=x_learner_main
[2022-02-15 16:12:39,466][root][INFO] - log testing ...
[2022-02-15 16:12:39,466][root][INFO] - cfg:{'lr': 0.001, 'decay_rate': 0.95, 'decay_step_size': 1, 'l2': 0.001, 'use_ps': 0, 'model_name': 'X_learner_128_20220215_161237', 'n_experiments': 5, 'batch_size': 5000, 'base_dim': 128, 'val_rate': 0.2, 'do_rate': 0.1, 'normalization': 'divide', 'epochs': 5, 'log_step': 1, 'pred_step': 1, 'optim': 'Adam', 'device': 'cuda:1', 'verbose': 0, 'pred_output_dir': '/home/admin/dufeng/ESX_Model/results/lzd_real', 'data_train_path': '/home/admin/uplift_data/dataset_public_md5/real_bin_set_full.5.train.npz', 'data_test_path': '/home/admin/uplift_data/dataset_public_md5/real_bin_set_full.5.test.npz', 'summary_base_dir': '/home/admin/dufeng/ESX_Model/runs', 'overwrite': 1, 'sample_alp

## TARNet/CFRwass/CFRmmd

In [3]:
# Shell out to search_params.py once per config file (TARNet.txt, CFRmmd.txt,
# CFRwass.txt), passing main.py, eval4real_data.py, the config path, the
# literal 1 and the two dataset paths.
# IPython substitutes {train_npz} and {test_npz} with the Python variables set
# in the data-preparation cell, so that cell must be run first.
# NOTE(review): the literal 1 presumably is the number of search runs (the log
# prints "Run 1 of 1") -- confirm in search_params.py.
!python search_params.py main.py eval4real_data.py ./conf4models/lzd_real_data/TARNet.txt 1 {train_npz} {test_npz}
!python search_params.py main.py eval4real_data.py ./conf4models/lzd_real_data/CFRmmd.txt 1 {train_npz} {test_npz}
!python search_params.py main.py eval4real_data.py ./conf4models/lzd_real_data/CFRwass.txt 1 {train_npz} {test_npz}

------------------------------
Run 1 of 1:
------------------------------

2022-02-15 16:40:43,094 - DEBUG - Setting JobRuntime:name=UNKNOWN_NAME
2022-02-15 16:40:43,095 - DEBUG - Setting JobRuntime:name=main
[2022-02-15 16:40:43,284][root][INFO] - log testing ...
[2022-02-15 16:40:43,285][root][INFO] - cfg:{'lr': 0.001, 'decay_rate': 0.95, 'decay_step_size': 1, 'l2': 0.001, 'model_name': 'TARNET_128_64_20220215_164041', 'n_experiments': 5, 'batch_size': 5000, 'share_dim': 128, 'base_dim': 64, 'reweight_sample': 1, 'val_rate': 0.2, 'do_rate': 0.1, 'normalization': 'divide', 'epochs': 5, 'log_step': 50, 'pred_step': 1, 'optim': 'Adam', 'BatchNorm1d': 'true', 'prpsy_w': 0, 'escvr1_w': 0, 'escvr0_w': 0, 'h1_w': 1, 'h0_w': 1, 'mu0hat_w': 0, 'mu1hat_w': 0, 'imb_dist': 'wass2', 'imb_dist_w': 0, 'device': 'cuda:1', 'verbose': 0, 'pred_output_dir': '/home/admin/dufeng/ESX_Model/results/lzd_real', 'data_train_path': '/home/admin/uplift_data/dataset_public_md5/real_bin_set_full.5.train.npz', 'da

## ES + TARNet/CFRwass/CFRmmd

In [4]:
# Shell out to search_params.py once per config file (ES_TARNet.txt,
# ES_CFRmmd.txt, ES_CFRwass.txt), passing main.py, eval4real_data.py, the
# config path, the literal 1 and the two dataset paths.
# IPython substitutes {train_npz} and {test_npz} with the Python variables set
# in the data-preparation cell, so that cell must be run first.
# NOTE(review): the literal 1 presumably is the number of search runs (the log
# prints "Run 1 of 1") -- confirm in search_params.py.
!python search_params.py main.py eval4real_data.py ./conf4models/lzd_real_data/ES_TARNet.txt 1 {train_npz} {test_npz}
!python search_params.py main.py eval4real_data.py ./conf4models/lzd_real_data/ES_CFRmmd.txt 1 {train_npz} {test_npz}
!python search_params.py main.py eval4real_data.py ./conf4models/lzd_real_data/ES_CFRwass.txt 1 {train_npz} {test_npz}


------------------------------
Run 1 of 1:
------------------------------

2022-02-15 17:08:28,146 - DEBUG - Setting JobRuntime:name=UNKNOWN_NAME
2022-02-15 17:08:28,146 - DEBUG - Setting JobRuntime:name=main
[2022-02-15 17:08:28,330][root][INFO] - log testing ...
[2022-02-15 17:08:28,330][root][INFO] - cfg:{'lr': 0.001, 'decay_rate': 0.95, 'decay_step_size': 1, 'l2': 0.001, 'model_name': 'ES_TARNet128_64_20220215_170826', 'n_experiments': 5, 'batch_size': 5000, 'share_dim': 128, 'base_dim': 64, 'reweight_sample': 1, 'val_rate': 0.2, 'do_rate': 0.1, 'normalization': 'divide', 'epochs': 5, 'log_step': 50, 'pred_step': 1, 'optim': 'Adam', 'BatchNorm1d': 'true', 'prpsy_w': 0.5, 'escvr1_w': 2, 'escvr0_w': 1, 'h1_w': 0, 'h0_w': 0, 'mu0hat_w': 0, 'mu1hat_w': 0, 'imb_dist': 'wass', 'imb_dist_w': 0, 'device': 'cuda:1', 'verbose': 0, 'pred_output_dir': '/home/admin/dufeng/ESX_Model/results/lzd_real', 'data_train_path': '/home/admin/uplift_data/dataset_public_md5/real_bin_set_full.5.train.npz', 

## X-network/DESCN

In [6]:
# Shell out to search_params.py once per config file (Xnetwork.txt, then
# DESCN.txt), passing main.py, eval4real_data.py, the config path, the literal
# 1 and the two dataset paths.
# IPython substitutes {train_npz} and {test_npz} with the Python variables set
# in the data-preparation cell, so that cell must be run first.
# NOTE(review): the literal 1 presumably is the number of search runs (the log
# prints "Run 1 of 1") -- confirm in search_params.py.
!python search_params.py main.py eval4real_data.py  ./conf4models/lzd_real_data/Xnetwork.txt 1 {train_npz} {test_npz}
!python search_params.py main.py eval4real_data.py  ./conf4models/lzd_real_data/DESCN.txt 1 {train_npz} {test_npz}

------------------------------
Run 1 of 1:
------------------------------

2022-02-15 17:52:24,184 - DEBUG - Setting JobRuntime:name=UNKNOWN_NAME
2022-02-15 17:52:24,185 - DEBUG - Setting JobRuntime:name=main
[2022-02-15 17:52:24,367][root][INFO] - log testing ...
[2022-02-15 17:52:24,367][root][INFO] - cfg:{'lr': 0.001, 'decay_rate': 0.95, 'decay_step_size': 1, 'l2': 0.001, 'model_name': 'Xnetwork_128_64_20220215_175222', 'n_experiments': 5, 'batch_size': 5000, 'share_dim': 128, 'base_dim': 64, 'reweight_sample': 1, 'val_rate': 0.2, 'do_rate': 0.1, 'normalization': 'divide', 'epochs': 5, 'log_step': 50, 'pred_step': 1, 'optim': 'Adam', 'BatchNorm1d': 'true', 'prpsy_w': 0, 'escvr1_w': 0, 'escvr0_w': 0, 'h1_w': 2, 'h0_w': 2, 'mu0hat_w': 1, 'mu1hat_w': 2, 'imb_dist': '', 'imb_dist_w': 0, 'device': 'cuda:1', 'verbose': 0, 'pred_output_dir': '/home/admin/dufeng/ESX_Model/results/lzd_real', 'data_train_path': '/home/admin/uplift_data/dataset_public_md5/real_bin_set_full.5.train.npz', 'data_

## Print all results.

In [7]:
# Print the accumulated evaluation results file.
# The directory is hardcoded and matches the 'pred_output_dir' value shown in
# the cfg log lines of the runs above; update it if that setting changes.
# NOTE(review): the file appears to be appended to by each run (one line per
# model_name) -- confirm in eval4real_data.py.
!cat /home/admin/dufeng/ESX_Model/results/lzd_real/eval_result.txt

X_learner_128_20220215_161237,auuc: 0.019345 +/- 0.004395,e_att: 0.005796 +/- 0.000363
X_learner_with_PS_128_20220215_162642,auuc: 0.023382 +/- 0.003457,e_att: 0.007551 +/- 0.000889
TARNET_128_64_20220215_164041,auuc: 0.030861 +/- 0.002132,e_att: 0.010581 +/- 0.001550
CFR_mmd_128_64_20220215_164829,auuc: 0.032439 +/- 0.002917,e_att: 0.025792 +/- 0.001538
CFR_wass_128_64_20220215_165705,auuc: 0.026086 +/- 0.000169,e_att: 0.026643 +/- 0.001277
ES_TARNet128_64_20220215_170826,auuc: 0.033958 +/- 0.000820,e_att: 0.016532 +/- 0.001748
ES_CFR_mmd_128_64_20220215_171601,auuc: 0.033127 +/- 0.000525,e_att: 0.020667 +/- 0.000968
ES_CFR_wass_128_64_20220215_172449,auuc: 0.026419 +/- 0.001760,e_att: 0.021172 +/- 0.001548
Xnetwork_128_64_20220215_175222,auuc: 0.032444 +/- 0.001600,e_att: 0.004832 +/- 0.001025
DESCN_128_64_20220215_175927,auuc: 0.033976 +/- 0.000645,e_att: 0.003894 +/- 0.000729
