In [1]:
from folktables import ACSDataSource
import folktables
import pandas as pd
import numpy as np
import random
import pickle
from tqdm import tqdm

import numpy as np
from bestLS_hindsight import *
from OnlineRidgeRiver import *
from lean_adahedge import *
import matplotlib.pyplot as plt
from bestLS_hindsight_together import *
from oridge_alwaysactive_implementable import *

# Data for all states in 2021

**X feature**

    -The numeric features are AGEP: Age , WKHP: # of hours worked per week

    -The categorical features are COW: Class of Worker, SEX: Male/Female, RAC1P - Race code, SCHL - Educational attainment, OCCP - Occupation code, MAR - Marital status

**y target** 

    - is PINCP - Annual Income of the individual

For more about these variable names, search for "data dictionary" for ACS PUMS 2021 at https://www.census.gov/programs-surveys/acs/microdata/documentation.html
 
The cell below loads a pandas dataframe in which we have already encoded categorical variables to one-hot, and scaled numeric variables by min-max scaling.

To see how it was prepared see the **example_dataprocessing.ipynb** file

In [2]:
# Load the pre-processed frame (categoricals already one-hot encoded, numerics min-max scaled).
# NOTE: read_pickle can execute code embedded in the file; only load pickles you created yourself.
df_all = pd.read_pickle("allstates2021.pkl")
#df_subset = pd.read_pickle("Name_of_subset_ofstates.pkl")

In [3]:
# Show the loaded frame as the cell output.
df_all

Unnamed: 0,AGEP,WKHP,PINCP,COW_1,COW_2,COW_3,COW_4,COW_5,COW_6,COW_7,...,SEX_2,RAC1P_1,RAC1P_2,RAC1P_3,RAC1P_4,RAC1P_5,RAC1P_6,RAC1P_7,RAC1P_8,RAC1P_9
0,0.025316,0.295918,0.039472,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
1,0.037975,0.397959,0.030967,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0.025316,0.173469,0.030467,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
3,0.215190,0.051020,0.053479,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4,0.025316,0.091837,0.009455,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630162,0.291139,0.397959,0.399670,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1630163,0.379747,0.397959,0.299615,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1630164,0.367089,0.397959,0.092501,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1630165,0.063291,0.295918,0.060483,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Features: every column except the income column.
X_dat = df_all.drop(columns='PINCP')
# Target: the income column alone, kept as a one-column DataFrame.
y_dat = df_all['PINCP'].to_frame()

SEX_1 is Male
SEX_2 is Female
RAC1P_1 is White
RAC1P_2 is Black
....
See the ACS PUMS 2021 data dictionary for the remaining codes.

In [5]:
# Sensitive group names: the two SEX indicators followed by RAC1P_1 ... RAC1P_9.
gnames = ['SEX_1', 'SEX_2'] + [f'RAC1P_{code}' for code in range(1, 10)]
# Pick those indicator columns out of the feature frame ...
sensitive_group_cols = X_dat[gnames]
# ... and keep them as a plain array with one row per data row and one column per group.
A_tarr = sensitive_group_cols.to_numpy()

In [6]:
# Each row can have several groups active at once, e.g. the row with index 0 has SEX_2 and
# RAC1P_1 active, which means the person is a white female.
sensitive_group_cols

Unnamed: 0,SEX_1,SEX_2,RAC1P_1,RAC1P_2,RAC1P_3,RAC1P_4,RAC1P_5,RAC1P_6,RAC1P_7,RAC1P_8,RAC1P_9
0,0,1,1,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,0
3,0,1,1,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1630162,0,1,0,0,0,0,0,0,0,0,1
1630163,1,0,0,0,0,0,0,0,0,0,1
1630164,0,1,0,0,0,0,0,0,0,0,1
1630165,1,0,0,0,0,0,0,0,0,0,1


In [7]:
import warnings
# Silence every FutureWarning for the rest of the session.
# NOTE(review): the filter is process-wide, so deprecation notices that libraries emit as
# FutureWarning in later cells are hidden as well.
warnings.simplefilter(action='ignore', category=FutureWarning)

The code block below finds the best squared loss in hindsight for
each group subsequence by using BestLS_Hindsight class defined in **bestLS_hindsight.py**

In [None]:
# Collect one BestLS_Hindsight object per sensitive group, in the order of gnames.
# (Multiprocessing could be tried here.)
bestsqloss_list = []
for gname in gnames:
    print(gname)
    # Keep only the rows in which group `gname` is active.
    df_group = df_all[df_all[gname] == 1]
    # Multiplying by 1.0 turns boolean False/True entries into 0/1.
    X_df_group = 1.0 * df_group.drop(columns='PINCP')
    y_df_group = df_group['PINCP'].to_frame()
    bestsqloss_list.append(BestLS_Hindsight(X_df_group, y_df_group))

a) **Pickle dump bestsqloss_list**, to be used by the plotting notebook

In [None]:
# Write the list of per-group hindsight objects to disk in binary mode.
with open('bestsqloss_hindsight_all.pkl', mode='wb') as out_file:
    pickle.dump(bestsqloss_list, out_file)

In [None]:
# N: number of meta-experts (one per group column of A_tarr).
N = A_tarr.shape[1]
# T: number of rounds of interaction (rows of data); d: dimensionality of the features (columns).
T, d = X_dat.shape
print(N, d, T)


Below we instantiate **Adanormal_sleepingexps** object from **lean_adahedge.py**, it takes as input the number of experts N, and a list of meta-experts

The meta-experts here are **River_OnlineRidge** objects from **OnlineRidgeRiver.py**

In [None]:
# Build N independent online ridge meta-experts.
experts = []
for expert_idx in range(N):
    experts.append(River_OnlineRidge())

# Adanormal hedge over those meta-experts.
Anh = Adanormal_sleepingexps(N, experts)

In [None]:
# ONLINE INTERACTIONS! One round per data row.
for t in tqdm(range(T)):
    active_groups = A_tarr[t]  # group-indicator row for round t
    # Get the probability over meta-experts for this round.
    Anh.get_prob_over_experts(active_groups)
    # Update the internal states of the meta-experts with this round's features and target.
    Anh.update_metaexps_loss(active_groups, X_dat.iloc[[t]], y_dat.iloc[t])

Loading previously saved data and building cumulative loss curve

In [None]:
# Very important for plotting: calculates regret on each subsequence for Adanormal hedge.
# NOTE(review): `bestsqloss_arr` is not defined anywhere in the visible notebook — presumably it
# is derived from the saved best-squared-loss results; confirm and load it explicitly before
# running this cell on a fresh kernel.
Anh.build_cumloss_curve(bestsqloss_arr, A_tarr)

**b) Pickle dump Adanormal hedge object, to be used by the plotting notebook**

In [None]:
# Binary write mode ('wb') because pickle produces bytes.
with open('Anh_all_with_cumreg.pkl', mode='wb') as out_file:
    pickle.dump(Anh, out_file)

Code cell below calculates regret of Adanormal hedge on each subsequence

Essentially, it calculates (Adanormal hedge loss − best squared loss of least squares) for each subsequence.

Code block below is a single online ridge learner which is **active in each round**

We will also calculate its regret **with respect to the best squared loss** on that subsequence

In [None]:
#TODO data structure/wrapper for the below

In [None]:
# Baseline: a single online ridge learner that is updated in every round
# (Kakade and Foster reference: ridge has a regret bound).
# NOTE(review): `linear_model` is not imported explicitly in this notebook; presumably it is
# re-exported by one of the star imports (e.g. OnlineRidgeRiver) — confirm.
model_oridge_baseline = linear_model.LinearRegression(l2 = 1.0)
loss_tarr = []  # squared loss of the baseline, one entry per round

for t in tqdm(range(T)):
    x_t = X_dat.iloc[[t]]  # one-row DataFrame with the features of round t
    y_t = y_dat.iloc[t]    # one-element Series with the target of round t
    # Predict before learning from the round, and clip the prediction to [0, 1].
    y_temp_ridge = np.clip(model_oridge_baseline.predict_many(x_t).iloc[0], 0.0, 1.0)
    model_oridge_baseline.learn_many(x_t, y_t)
    # Use .iloc[0] rather than [0]: the Series is labelled 'PINCP', so an integer key relied on
    # the deprecated positional fallback of Series.__getitem__.
    loss_tarr.append((y_temp_ridge - y_t.iloc[0])**2)

loss_groupwise_oridge = []
cumloss_groupwise_oridge = []
cumreg_groupwise_oridge = []
loss_oridge_tarr = np.array(loss_tarr)

# Build the cumulative loss and regret of the baseline on each group subsequence.
# The loop runs over the columns of A_tarr itself rather than over the global N, because N is
# reassigned further down the notebook when the always-active group is added.
# NOTE(review): `bestsqloss_arr` is not defined anywhere in the visible notebook — confirm where
# it comes from and load it explicitly before this cell.
for gnum in range(A_tarr.shape[1]):
    group_mask = A_tarr[:, gnum].astype(bool)  # rounds in which group gnum is active
    loss_groupwise_oridge.append(loss_oridge_tarr[group_mask])
    cumloss_groupwise_oridge.append(np.cumsum(loss_groupwise_oridge[-1]))
    # The best squared loss for the group subsequence is the same reference as before.
    cumreg_groupwise_oridge.append(cumloss_groupwise_oridge[-1] - np.array(bestsqloss_arr[gnum]))


In [10]:
# Wrapper for the always-active online ridge baseline, fed the features with the
# sensitive-group columns dropped.
# X_dat_dropped is built here because the cell that defines it sits further down the notebook,
# so a top-to-bottom run on a fresh kernel would otherwise raise a NameError.
X_dat_dropped = X_dat.drop(gnames, axis=1)
or_alwaysactive_dropped = OnlineRidgeImplementable_alwaysactive(X_dat_dropped, y_dat)

  0%|          | 0/1428283 [00:00<?, ?it/s]

100%|██████████| 1428283/1428283 [07:30<00:00, 3167.71it/s]


In [11]:
# Save the always-active ridge wrapper object to disk.
with open('oridge_alwaysactive_droppedobj.pkl', mode='wb') as out_file:
    pickle.dump(or_alwaysactive_dropped, out_file)

In [9]:
# Feature frame without the sensitive-group indicator columns.
X_dat_dropped = X_dat.drop(columns=gnames)


In [None]:
# Show the feature frame that no longer contains the group columns.
X_dat_dropped

**c) Pickle dump online ridge model and its loss_tarr** 

In [None]:
# Save the always-active ridge model and its list of per-round losses, one file each.
for pkl_name, payload in (
    ('model_oridge_all_alwaysactive.pkl', model_oridge_baseline),
    ('loss_oridge_all_alwaysactive.pkl', loss_tarr),
):
    with open(pkl_name, 'wb') as out_file:
        pickle.dump(payload, out_file)

**Anh with an extra always active group**


In [None]:
print(A_tarr.shape)  # has the 11 groups already; now add the always-active group
# .assign returns a new frame with the extra constant column, so nothing is written into a
# slice of X_dat (slice-then-setitem is the pattern behind pandas' SettingWithCopyWarning).
sensitive_group_cols = X_dat[gnames].assign(always_on=1)
A_tarr_plus = sensitive_group_cols.to_numpy()
print(A_tarr_plus.shape)

In [None]:
# Show the group-indicator array that includes the extra always-on column.
A_tarr_plus

In [None]:
# N: number of meta-experts, now counted from the array with the always-on column.
N = A_tarr_plus.shape[1]
# T: number of rounds of interaction (rows of data); d: dimensionality of the features (columns).
T, d = X_dat.shape
print(N, d, T)

In [None]:
# Build N independent online ridge meta-experts.
experts_plusone = []
for expert_idx in range(N):
    experts_plusone.append(River_OnlineRidge())
# Adanormal hedge over those meta-experts.
Anh_plusone = Adanormal_sleepingexps(N, experts_plusone)

In [None]:
# ONLINE INTERACTIONS! One round per data row.
for t in tqdm(range(T)):
    active_groups = A_tarr_plus[t]  # group-indicator row for round t
    # Get the probability over meta-experts for this round.
    Anh_plusone.get_prob_over_experts(active_groups)
    # Update the internal states of the meta-experts with this round's features and target.
    Anh_plusone.update_metaexps_loss(active_groups, X_dat.iloc[[t]], y_dat.iloc[t])

In [None]:
# Save the Adanormal hedge object that was run with the extra always-on group.
with open('Anh_plus_alwaysactive.pkl', mode='wb') as out_file:
    pickle.dump(Anh_plusone, out_file)

In [None]:
# Best squared loss in hindsight for the always-active group too: BestLS_Hindsight is given
# all rows of X_dat / y_dat instead of a per-group subset.
bestsqloss_always_active = BestLS_Hindsight(X_dat, y_dat) #calculating best sq loss in hindsight for always active also:

In [None]:
# Save the always-active hindsight object to disk.
with open('bestsqloss_alwaysactive.pkl', mode='wb') as out_file:
    pickle.dump(bestsqloss_always_active, out_file)

In [None]:
# Reload the best least-squares loss objects for each subsequence.
# pickle.load can run arbitrary code, so only open pickle files you produced yourself.
with open('bestsqloss_hindsight_all.pkl', mode='rb') as in_file:
    bestsqloss_list = pickle.load(in_file)

In [None]:
# Add the always-active entry after the per-group entries.
# Guarded by an identity check so that re-running this cell does not append the same object a
# second time (a duplicate would end up in the pickle written afterwards).
if not any(entry is bestsqloss_always_active for entry in bestsqloss_list):
    bestsqloss_list.append(bestsqloss_always_active)

In [None]:
# Save the extended list (per-group entries plus the always-active one).
with open('bestsqlosslist_11groups_plusalwaysactive.pkl', mode='wb') as out_file:
    pickle.dump(bestsqloss_list, out_file)

In [None]:
# Quick inspection: length of the loss_tarr stored on the entry at index 4
# (index 4 corresponds to 'RAC1P_3' in gnames).
len(bestsqloss_list[4].loss_tarr)

In [None]:
# Show the full feature frame (still containing the group columns).
X_dat

# 

# Processing module: now feed only non-sensitive features, i.e. drop the sex and race columns; group membership is still conveyed through A_tarr_plus


In [None]:
# Feature frame without the sensitive-group indicator columns; the target is left unchanged.
X_dat_dropped = X_dat.drop(columns=gnames)
# y_dat_dropped = y_dat

In [None]:
# Show the feature frame that no longer contains the group columns.
X_dat_dropped

In [None]:
# N: number of meta-experts (group columns including the always-on one).
N = A_tarr_plus.shape[1]
# d: dimensionality of the features once the group columns are dropped.
d = X_dat_dropped.shape[1]
# T: number of rounds of interaction, i.e. the number of rows of data.
T = X_dat.shape[0]
print(N, d, T)

In [None]:
# Build N independent online ridge meta-experts.
experts_plusone_dropped = []
for expert_idx in range(N):
    experts_plusone_dropped.append(River_OnlineRidge())
# Adanormal hedge over those meta-experts.
Anh_plusone_dropped = Adanormal_sleepingexps(N, experts_plusone_dropped)

In [None]:
# ONLINE INTERACTIONS! One round per data row, using the features without group columns.
for t in tqdm(range(T)):
    active_groups = A_tarr_plus[t]  # group-indicator row for round t
    # Get the probability over meta-experts for this round.
    Anh_plusone_dropped.get_prob_over_experts(active_groups)
    # Update the internal states of the meta-experts with this round's features and target.
    Anh_plusone_dropped.update_metaexps_loss(active_groups, X_dat_dropped.iloc[[t]], y_dat.iloc[t])

In [None]:
# Save the Adanormal hedge object that was run on the dropped-column features.
with open('Anh_plus_alwaysactive_dropped.pkl', mode='wb') as out_file:
    pickle.dump(Anh_plusone_dropped, out_file)

In [None]:
# Recompute the best squared loss in hindsight on the subsequences as well, this time with the
# group columns dropped from the features.
bls_together = BestLS_Hindsight_Together(N)
for t in tqdm(range(T)):
    active_groups = A_tarr_plus[t]  # group-indicator row for round t
    bls_together.update(active_groups, X_dat_dropped.iloc[[t]], y_dat.iloc[t])
# bls_together.make_all_numpyarr()
# bls_together.cumbestsqloss()

In [None]:
# Convert each group's entry in loss_experts_arr to a numpy array, in place.
per_group_losses = bls_together.loss_experts_arr
for gnum in range(N):
    per_group_losses[gnum] = np.array(per_group_losses[gnum])

In [None]:
# Quick inspection: shape of the entry stored at index 4 of loss_experts_arr.
print(bls_together.loss_experts_arr[4].shape)

In [None]:
import joblib
# Persist the dropped-column hindsight object with joblib.
joblib.dump(bls_together, filename='bls_together_joblib.pkl')

In [None]:
# with open('bls_together_plusone_dropped.pkl', 'wb') as f:
#     pickle.dump(bls_together, f)

In [None]:
# Same computation on the features that still contain the group columns. This is just to compare
# with the earlier best squared loss, which might take longer as it loads dataframes separately.
bls_together_undropped = BestLS_Hindsight_Together(N)
for t in tqdm(range(T)):
    active_groups = A_tarr_plus[t]  # group-indicator row for round t
    bls_together_undropped.update(active_groups, X_dat.iloc[[t]], y_dat.iloc[t])
# bls_together_undropped.cumbestsqloss()

In [None]:
# Convert each group's entry in loss_experts_arr to a numpy array, in place.
per_group_losses_undropped = bls_together_undropped.loss_experts_arr
for gnum in range(N):
    per_group_losses_undropped[gnum] = np.array(per_group_losses_undropped[gnum])

In [None]:
# bls_together_undropped.make_all_numpyarr()
# Runs after the per-group entries of loss_experts_arr were converted to numpy arrays in the
# previous cell.
bls_together_undropped.cumbestsqloss()

In [None]:
import joblib
# Persist the undropped hindsight object with joblib.
joblib.dump(bls_together_undropped, filename='bls_together_undroppedjoblib.pkl')

In [None]:
# Also save the undropped hindsight object as a plain pickle.
with open('bls_together_plusone_undropped.pkl', mode='wb') as out_file:
    pickle.dump(bls_together_undropped, out_file)