# Synthetic data

In [1]:
import os
os.chdir('..') 
from bilevel.synth_datagen import SynthGenLinear
from bilevel.utils import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Configuration for the synthetic generator, one option per line for easy tuning.
# Two sensitive attributes (SHAPE, COLOR); prob_dict lists one weight per value
# (presumably the sampling probabilities -- confirm against SynthGenLinear).
params = dict(
    samples=100000,
    dim=20,
    group_dict=dict(SHAPE=['circle', 'square', 'triangle'], COLOR=['green', 'red']),
    prob_dict=dict(SHAPE=[0.5, 0.3, 0.2], COLOR=[0.6, 0.4]),
    feat_lo=0.0,
    feat_hi=1.0,
    w_lo=0.0,
    w_hi=1.0,
    add_linear_mapping=True,
    add_quad_mapping=False,
    S_lo=0.0,
    S_hi=0.0,
    label_noise_width=0.16,
    drop_sensitive=False,
    fixed_seed=21,
)

# fixed_seed=21 keeps the generated data reproducible across runs.
syn_ob = SynthGenLinear(**params)

# Show the permutation stored on the generator and the group names in that order.
print(
    syn_ob.dperm,
    np.array(syn_ob.all_groupnames)[syn_ob.dperm],
)
df = syn_ob.df

[3 1 4 2 0] ['green' 'square' 'red' 'triangle' 'circle']


In [3]:
print(df.columns)
# Partition the column names by their leading letter:
# 'x' -> features, 'y' -> label variants, 'g' -> group indicator columns.
filter_feature = [name for name in df.columns if name.startswith('x')]
filter_label = [name for name in df.columns if name.startswith('y')]
filter_group = [name for name in df.columns if name.startswith('g')]
print(filter_feature, filter_label, filter_group)

Index(['x_0', 'x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9',
       'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18',
       'x_19', 'g_circle', 'g_square', 'g_triangle', 'g_green', 'g_red',
       'y_circle', 'y_square', 'y_triangle', 'y_green', 'y_red',
       'y_mean_active', 'y_min_active', 'y_max_active', 'y_dperm_active'],
      dtype='object')
['x_0', 'x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18', 'x_19'] ['y_circle', 'y_square', 'y_triangle', 'y_green', 'y_red', 'y_mean_active', 'y_min_active', 'y_max_active', 'y_dperm_active'] ['g_circle', 'g_square', 'g_triangle', 'g_green', 'g_red']


In [4]:
# Column sums of syn_ob.A_t shown next to the group names
# (the same names are used as column labels for A_t further down).
(
    syn_ob.A_t.sum(axis=0),
    syn_ob.all_groupnames,
)

(array([49857., 30044., 20099., 59985., 40015.]),
 ['circle', 'square', 'triangle', 'green', 'red'])

In [5]:
# Wrap the generator's A_t matrix in a labelled frame (one column per group name)
# and append a constant 'always_on' column equal to 1 for every row.
A_t = pd.DataFrame(
    syn_ob.A_t,
    columns=syn_ob.all_groupnames,
).assign(always_on=1)

In [6]:
# One frame per label variant: all feature columns, all group columns,
# and exactly one of the 'y_*_active' label columns.
df_mean = df[[*filter_feature, *filter_group, 'y_mean_active']]
df_min = df[[*filter_feature, *filter_group, 'y_min_active']]
df_max = df[[*filter_feature, *filter_group, 'y_max_active']]
df_dperm = df[[*filter_feature, *filter_group, 'y_dperm_active']]

In [7]:
from bilevel.Groupwise_seedruns import BuildGroupwise_diffseeds

## y_mean

In [8]:
%%time
# Experiment on the 'y_mean_active' label: features/groups come from df_mean,
# A_t is passed as the third argument (group columns plus 'always_on').
# The three build_* calls must run in this order; the recorded run of this
# cell took roughly 4m40s wall time.
ds_ymean = BuildGroupwise_diffseeds(df_mean, 'y_mean_active', A_t) # different seeds object
ds_ymean.build_all_seeds()  # presumably one full run per seed -- confirm in Groupwise_seedruns
ds_ymean.build_df_res()
ds_ymean.build_regret_curve()  # expected to fill the regret arrays read by the plotting helpers -- TODO confirm

100%|██████████| 100000/100000 [00:02<00:00, 35402.93it/s]
100%|██████████| 100000/100000 [00:15<00:00, 6626.04it/s]
100%|██████████| 100000/100000 [00:02<00:00, 36555.33it/s]
100%|██████████| 100000/100000 [00:15<00:00, 6629.86it/s]
100%|██████████| 100000/100000 [00:03<00:00, 31758.85it/s]
100%|██████████| 100000/100000 [00:14<00:00, 6868.54it/s]
100%|██████████| 100000/100000 [00:02<00:00, 36556.46it/s]
100%|██████████| 100000/100000 [00:14<00:00, 6802.24it/s]
100%|██████████| 100000/100000 [00:02<00:00, 36581.09it/s]
100%|██████████| 100000/100000 [00:14<00:00, 6762.17it/s]
100%|██████████| 100000/100000 [00:02<00:00, 36925.79it/s]
100%|██████████| 100000/100000 [00:15<00:00, 6656.12it/s]
100%|██████████| 100000/100000 [00:03<00:00, 31276.56it/s]
100%|██████████| 100000/100000 [00:15<00:00, 6631.63it/s]
100%|██████████| 100000/100000 [00:02<00:00, 37337.39it/s]
100%|██████████| 100000/100000 [00:14<00:00, 6929.97it/s]
100%|██████████| 100000/100000 [00:02<00:00, 35187.55it/s]
100%|

CPU times: user 4min 37s, sys: 13min 37s, total: 18min 14s
Wall time: 4min 40s


In [13]:
# save_ob('syn_ymean_2sg_UC-all.pickle', ds_ymean)

## y_min

In [None]:
%%time
# Experiment on the 'y_min_active' label: features/groups come from df_min,
# A_t is passed as the third argument (group columns plus 'always_on').
# The three build_* calls must run in this order.
ds_ymin = BuildGroupwise_diffseeds(df_min, 'y_min_active', A_t) # different seeds object
ds_ymin.build_all_seeds()  # presumably one full run per seed -- confirm in Groupwise_seedruns
ds_ymin.build_df_res()
ds_ymin.build_regret_curve()  # expected to fill the regret arrays read by the plotting helpers -- TODO confirm

100%|██████████| 100000/100000 [00:02<00:00, 34890.44it/s]
100%|██████████| 100000/100000 [00:14<00:00, 6696.81it/s]
100%|██████████| 100000/100000 [00:02<00:00, 33424.26it/s]
100%|██████████| 100000/100000 [00:14<00:00, 6742.08it/s]
100%|██████████| 100000/100000 [00:02<00:00, 36595.75it/s]
100%|██████████| 100000/100000 [00:14<00:00, 6765.56it/s]
100%|██████████| 100000/100000 [00:02<00:00, 36098.58it/s]
100%|██████████| 100000/100000 [00:15<00:00, 6555.79it/s]
100%|██████████| 100000/100000 [00:02<00:00, 36280.44it/s]
100%|██████████| 100000/100000 [00:15<00:00, 6588.85it/s]
100%|██████████| 100000/100000 [00:02<00:00, 35289.30it/s]
100%|██████████| 100000/100000 [00:15<00:00, 6574.78it/s]
100%|██████████| 100000/100000 [00:03<00:00, 33127.92it/s]
100%|██████████| 100000/100000 [00:24<00:00, 4150.67it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21827.42it/s]
100%|██████████| 100000/100000 [00:25<00:00, 3872.76it/s]
100%|██████████| 100000/100000 [00:06<00:00, 16508.18it/s]
100%|

In [15]:
# save_ob('syn_ymin_2sg_UC-all.pickle', ds_ymin)

In [19]:
# pd.concat([ds_ymin.df_base_meansd, ds_ymin.df_Anh_meansd], axis=1)

## y_max

In [None]:
# Experiment on the 'y_max_active' label: features/groups come from df_max,
# A_t is passed as the third argument (group columns plus 'always_on').
# The three build_* calls must run in this order.
ds_ymax = BuildGroupwise_diffseeds(df_max, 'y_max_active', A_t) # different seeds object
ds_ymax.build_all_seeds()  # presumably one full run per seed -- confirm in Groupwise_seedruns
ds_ymax.build_df_res()
ds_ymax.build_regret_curve()  # expected to fill the regret arrays read by the plotting helpers -- TODO confirm

In [17]:
# save_ob('syn_ymax_2sg_UC-all.pickle', ds_ymax)

## y_dperm

In [None]:
# Experiment on the 'y_dperm_active' label: features/groups come from df_dperm,
# A_t is passed as the third argument (group columns plus 'always_on').
# The three build_* calls must run in this order.
ds_ydperm = BuildGroupwise_diffseeds(df_dperm, 'y_dperm_active', A_t) # different seeds object
ds_ydperm.build_all_seeds()  # presumably one full run per seed -- confirm in Groupwise_seedruns
ds_ydperm.build_df_res()
ds_ydperm.build_regret_curve()  # expected to fill the regret arrays read by the plotting helpers -- TODO confirm

In [None]:
def plot_reg_sidebyside(gwise_obj: BuildGroupwise_diffseeds, dir_name:str):
    """Plot baseline and algorithm regret curves side by side, one figure per group.

    For every group in ``gwise_obj.group_names`` the group's name and size are
    printed, then a two-panel figure is shown: the left panel holds the
    'Baseline' curve, the right panel 'Our algorithm'. Each panel draws the
    mean across the stacked curves (axis 0 -- assumed to be one curve per seed
    run, all of equal length) with a shaded band of +/- one standard deviation.

    Unlike a plain plotting helper should, the previous version overwrote the
    regret lists stored on ``gwise_obj`` with NumPy arrays; this version works
    on local arrays and leaves ``gwise_obj`` untouched.

    Args:
        gwise_obj: finished seed-run object; reads ``group_names``,
            ``group_sizes``, ``pos``, ``regret_Base_groupwise_array`` and
            ``regret_Anh_groupwise_array``, each indexed by group position.
        dir_name: currently unused. Saving is disabled; it was once intended as
            the target folder for ``dir_name + '/regret_' + gname + '.pdf'``.

    Returns:
        None. Figures are displayed with ``plt.show()`` and then closed.
    """
    for g_ind, gname in enumerate(gwise_obj.group_names):
        print(gname, gwise_obj.group_sizes[g_ind])
        x_vals = gwise_obj.pos[g_ind]

        # (legend label, colour, stacked regret curves) for the left and right panels.
        panels = (
            ('Baseline', 'C0', gwise_obj.regret_Base_groupwise_array[g_ind]),
            ('Our algorithm', 'C1', gwise_obj.regret_Anh_groupwise_array[g_ind]),
        )

        # 2x the default figure width so both panels keep the default aspect.
        fig, axes = plt.subplots(1, 2, figsize=[12.8, 4.8])
        for ax, (label, color, curves) in zip(axes, panels):
            # Local conversion only: do not write the array back onto gwise_obj.
            curves = np.asarray(curves)
            mean_reg, sd_reg = curves.mean(axis=0), curves.std(axis=0)
            ax.plot(x_vals, mean_reg, color=color, label=label)
            ax.fill_between(x_vals, mean_reg - sd_reg, mean_reg + sd_reg, alpha=0.5, color=color)
            ax.legend()
            ax.set_xlabel('time')
            ax.set_ylabel('Regret')
            ax.set_title(gname)

        plt.show()
        # Release the figure explicitly so a long group list does not pile up open figures.
        plt.close(fig)

In [None]:
# Side-by-side regret figures for the y_mean experiment; the empty dir_name
# is fine because the helper does not write any files.
plot_reg_sidebyside(gwise_obj=ds_ymean, dir_name="")

In [None]:
from bilevel.Groupwise_seedruns import get_end_regret_gw_df
# Build the end-of-run regret table for each experiment, evaluated in the
# order min, mean, max, dperm.
df_regend_ymin, df_regend_ymean, df_regend_ymax, df_regend_ydperm = (
    get_end_regret_gw_df(seed_runs)
    for seed_runs in (ds_ymin, ds_ymean, ds_ymax, ds_ydperm)
)

In [None]:
# DataFrame.to_csv does not create missing parent folders, so make sure
# ./tables exists first; otherwise a fresh checkout fails here with OSError
# only after the long-running experiments have finished.
os.makedirs('./tables', exist_ok=True)

# One CSV per label variant, written relative to the current working directory
# (the first cell changes it to the parent of the notebook folder).
df_regend_ymin.to_csv('./tables/synth_ymin.csv')
df_regend_ymean.to_csv('./tables/synth_ymean.csv')
df_regend_ymax.to_csv('./tables/synth_ymax.csv')
df_regend_ydperm.to_csv('./tables/synth_ydperm.csv')

In [None]:
# Rough y_mean summary: average of 'mean_hindsight', then the average gap
# between the Base and Anh end-regret columns (Base minus Anh).
(
    df_regend_ymean['mean_hindsight'].mean(axis=0),
    df_regend_ymean['mean_regend_Base'].sub(df_regend_ymean['mean_regend_Anh']).mean(axis=0),
)

In [None]:
# Rough y_min summary: average of 'mean_hindsight', then the average gap
# between the Base and Anh end-regret columns (Base minus Anh).
(
    df_regend_ymin['mean_hindsight'].mean(axis=0),
    df_regend_ymin['mean_regend_Base'].sub(df_regend_ymin['mean_regend_Anh']).mean(axis=0),
)

In [None]:
# Rough y_max summary: average of 'mean_hindsight', then the average gap
# between the Base and Anh end-regret columns (Base minus Anh).
(
    df_regend_ymax['mean_hindsight'].mean(axis=0),
    df_regend_ymax['mean_regend_Base'].sub(df_regend_ymax['mean_regend_Anh']).mean(axis=0),
)

In [None]:
# Rough y_dperm summary: average of 'mean_hindsight', then the average gap
# between the Base and Anh end-regret columns (Base minus Anh).
(
    df_regend_ydperm['mean_hindsight'].mean(axis=0),
    df_regend_ydperm['mean_regend_Base'].sub(df_regend_ydperm['mean_regend_Anh']).mean(axis=0),
)

In [None]:
from bilevel.Groupwise_seedruns import plot_regret_curve_with_std
# Regret curves with standard-deviation bands for each of the four experiments.
# NOTE(review): the second argument looks like an output path prefix under
# ./plots -- confirm in Groupwise_seedruns whether files are written there and
# whether the ./plots folder must already exist.
plot_regret_curve_with_std(ds_ymean, './plots/synth_mean')

plot_regret_curve_with_std(ds_ymin, './plots/synth_min')

plot_regret_curve_with_std(ds_ymax, './plots/synth_max')

# Note the file stem here is 'synth_perm', not 'synth_dperm' like the CSV table.
plot_regret_curve_with_std(ds_ydperm, './plots/synth_perm')