- Each feature $x$ generated from some fixed distribution, e.g. uniform $[0, 1]^d$
- Each group label is linearly realizable i.e. its $w_g^\top x$
- Idea to generate different models for different groups
Sample $w_1, \ldots w_{|G|}$ from a Multivariate uniform with high variance $U[a,b]^n$ ($b-a$ large gap); fix these to be the true regressors, with perhaps some small noise, i.e. $y_g = w_g^\top x + noise$
- A_t is generated by non atomic discrete prob, see section outline

In [1]:
import numpy as np
import itertools
import random

In [2]:
dim = 20 #dimension of the features
samples = 10000 # of samples

## Feature Generation


In [3]:
X_dat = np.random.uniform(low = 0.0, high = 1.0, size = (samples, dim))

## Groups and group membership A_t generation

In [4]:
# https://en.wikipedia.org/wiki/Race_and_ethnicity_in_the_United_States
race_list = ['white', 'black', 'two-or-more','some-other', 'asian', 'native-american/alaska-native']
sex_list = ['male', 'female']
print("non-atomic groups",race_list+sex_list)
print("atomic groups ", list(itertools.product(race_list, sex_list)))
race_prob = [0.61, 0.12, 0.1, 0.08, 0.06, 0.03] # 2020 us census rough guide
sex_prob = [0.5, 0.5]

group_prob_atomic = [(a * b) for a in race_prob for b in sex_prob]
print("nonatomic group probabilities", race_prob, sex_prob, len(race_prob)+len(sex_prob), "groups")
print("atomic group probabilities",group_prob_atomic, len(group_prob_atomic), "groups")

print(np.sum(group_prob_atomic), np.sum(race_prob), np.sum(sex_prob))


non-atomic groups ['white', 'black', 'two-or-more', 'some-other', 'asian', 'native-american/alaska-native', 'male', 'female']
atomic groups  [('white', 'male'), ('white', 'female'), ('black', 'male'), ('black', 'female'), ('two-or-more', 'male'), ('two-or-more', 'female'), ('some-other', 'male'), ('some-other', 'female'), ('asian', 'male'), ('asian', 'female'), ('native-american/alaska-native', 'male'), ('native-american/alaska-native', 'female')]
nonatomic group probabilities [0.61, 0.12, 0.1, 0.08, 0.06, 0.03] [0.5, 0.5] 8 groups
atomic group probabilities [0.305, 0.305, 0.06, 0.06, 0.05, 0.05, 0.04, 0.04, 0.03, 0.03, 0.015, 0.015] 12 groups
1.0 1.0 1.0


In [5]:
def A_t_nonatomic(samples, g1_prob, g2_prob):
    l1 = len(g1_prob)
    l2 = len(g2_prob)
    g1_eye = np.eye(l1)
    g2_eye = np.eye(l2)
    g1 = random.choices(population = g1_eye, weights = g1_prob, k = samples)
    g2 = random.choices(population = g2_eye, weights = g2_prob, k = samples)
    return np.hstack((g1, g2))


In [7]:
g1_eye = np.eye(3)
g1 = random.choices(population = g1_eye, weights = [0.9, 0.09, 0.01], k = samples)

In [9]:
g1

[array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([0., 1., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([0., 1., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([0.

In [94]:
A_t = A_t_nonatomic(samples, race_prob, sex_prob)

In [None]:
A_t.shape

In [95]:
A_t.sum(axis=0) / samples

array([0.614 , 0.1189, 0.0956, 0.0824, 0.0611, 0.028 , 0.5059, 0.4941])

In [74]:
# k = 100000
# val = random.choices(
#     population = np.eye(3),
#     weights = [0.8, 0.15, 0.05],
#     k = k
# )
# c1 = 0
# c2= 0
# c3=0
# for v in val:
#     if np.all(v == np.eye(3)[0]):
#         c1 += 1
#     elif np.all(v == np.eye(3)[1]):
#         c2 += 1
#     else:
#         c3 +=1 
# print(c1/k, c2/k, c3/k)

In [42]:
# 8 groups, overlapping ['white','black','two-or-more','some-other','asian','native-american/alaska-native','male','female']


['white',
 'black',
 'two-or-more',
 'some-other',
 'asian',
 'native-american/alaska-native',
 'male',
 'female']

## Weight generation and label generation

In [99]:
groups_nonatomic = race_list+sex_list
print(groups_nonatomic, len(groups_nonatomic))
print(dim)

['white', 'black', 'two-or-more', 'some-other', 'asian', 'native-american/alaska-native', 'male', 'female'] 8
20


In [141]:
w_lo = 0.0
w_hi = 100.0

w = np.random.uniform(low = w_lo, high = w_hi, size = (dim, len(groups_nonatomic)))
print(w.shape) # w[0] .. w[7]

(20, 8)


In [146]:
w[:, 0].shape # weights for group 0
w[:, 1].shape # weights for group 2

(20,)

In [147]:
print(X_dat.shape, w.shape)

(10000, 20) (20, 8)


In [150]:
y_labels = np.matmul(X_dat, w)

In [152]:
y_labels.shape # labels for each group

(10000, 8)

In [1]:
import os
os.chdir('..')
from bilevel.synth_datagen import SynthGenLinear

In [27]:
params = {'samples': 10000, 'dim':20, 
        'group_dict': {'SEX':['male', 'female'], 'RACE': ['white', 'black', 'two-or-more','some-other', 'asian', 'native-am']},
        'prob_dict': {'SEX': [0.5, 0.5], 'RACE': [0.61, 0.12, 0.1, 0.08, 0.06, 0.03]},
        'feat_lo': 0.0, 'feat_hi': 100.0, 'w_lo': 0.0, 'w_hi': 100.0, 
        }

syn_ob = SynthGenLinear(**params)

In [30]:
df_sens = syn_ob.get_dataframe(drop_sensitive=False)

In [31]:
df_sens

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,g_asian,g_native-am,y_male,y_female,y_white,y_black,y_two-or-more,y_some-other,y_asian,y_native-am
0,47.720827,18.629715,89.577005,17.472331,23.918242,94.646601,4.035759,97.727683,72.281254,93.491452,...,1.0,0.0,52619.203469,40351.615844,68506.193005,63945.962081,63993.183627,56082.460478,68725.357393,48049.516930
1,20.703468,85.504647,60.687677,5.328190,8.939156,75.626361,61.018214,39.994709,79.082000,75.327117,...,0.0,0.0,52467.277271,47435.172114,66365.694583,62254.804863,66777.475993,60190.462303,65840.350047,48267.860009
2,90.640578,9.396549,72.180120,59.813311,88.388620,54.995133,23.967738,39.162635,6.190788,72.977237,...,0.0,0.0,48965.535020,46520.538336,71226.431063,57711.140455,60011.118671,60634.647998,64583.004867,42765.276602
3,27.616649,82.118832,88.975404,49.239760,83.041720,91.280923,44.010226,79.710486,24.563936,19.410001,...,0.0,0.0,57264.508489,49920.447206,68910.083831,61383.752424,65352.873125,58759.390812,68977.451002,47433.666657
4,19.614802,90.874843,78.027160,98.798884,76.225141,22.070306,3.740227,27.462458,37.668559,43.272800,...,0.0,0.0,46760.932644,40881.068917,59181.222248,52314.917044,54568.791524,57885.604087,59299.219063,45222.185335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,12.903139,65.418449,92.041434,77.036252,49.039296,94.271374,79.275389,26.348453,59.174500,0.507043,...,0.0,0.0,45177.856963,40940.276205,51192.291199,47635.859794,50894.037916,46813.024932,53342.051690,37971.597997
9996,54.769515,73.369237,83.939040,48.578441,43.408689,57.221273,70.850773,59.829142,15.058101,56.092769,...,0.0,0.0,53553.639372,40802.473184,62451.525490,54633.952066,54340.247752,52058.751339,59408.463726,42667.085696
9997,44.338873,66.816302,33.729687,80.732402,9.878567,31.225389,70.386071,81.492444,81.771061,31.694145,...,0.0,0.0,52851.388867,43921.016716,63789.771373,56722.270961,59432.407474,61259.647472,62463.113800,50782.956029
9998,44.794224,85.407750,62.784958,0.539721,31.513938,9.470518,36.353175,87.280286,41.968822,97.990161,...,0.0,0.0,39039.385719,30815.940214,45804.282873,38370.432544,37174.332108,38701.988348,38397.332666,34696.489048


In [5]:
filter_feature = [col for col in df_sens if col.startswith('x')]
filter_label = [col for col in df_sens if col.startswith('y')]
filter_group = [col for col in df_sens if col.startswith('g')]
print(filter_feature, filter_label, filter_group)

['x_0', 'x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18', 'x_19'] ['y_male', 'y_female', 'y_white', 'y_black', 'y_two-or-more', 'y_some-other', 'y_asian', 'y_native-am'] ['g_male', 'g_female', 'g_white', 'g_black', 'g_two-or-more', 'g_some-other', 'g_asian', 'g_native-am']


In [6]:
df_sens['y_avg'] = df_sens[filter_label].mean(axis=1)
df_sens['y_min'] = df_sens[filter_label].min(axis=1)
df_sens['y_max'] = df_sens[filter_label].max(axis=1)

In [7]:
y_extra_og = filter_label + ['y_avg', 'y_min', 'y_max']
print(y_extra_og)

['y_male', 'y_female', 'y_white', 'y_black', 'y_two-or-more', 'y_some-other', 'y_asian', 'y_native-am', 'y_avg', 'y_min', 'y_max']


In [10]:
df_sens.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,y_female,y_white,y_black,y_two-or-more,y_some-other,y_asian,y_native-am,y_avg,y_min,y_max
0,23.353066,59.948229,11.131396,46.850152,25.372532,73.30558,82.151587,45.69097,42.537955,31.994828,...,47727.9964,49692.05821,72093.157034,49222.245451,57564.082746,51366.687513,43539.36124,53193.296955,43539.36124,72093.157034
1,63.315828,55.826487,35.867169,61.474148,78.528083,73.647339,37.828507,83.49489,45.423826,24.931398,...,57292.878193,51245.335811,71746.45688,52941.91932,59423.308403,56845.567837,42782.455176,55656.925376,42782.455176,71746.45688
2,57.729222,28.741417,83.563338,33.373663,83.640005,5.099078,54.8291,8.137036,35.235257,90.86788,...,59039.746488,50348.2436,71658.645421,51068.901772,55660.987172,58013.907431,40662.81223,54216.685835,40662.81223,71658.645421
3,24.318959,49.619842,72.75207,88.47725,13.634929,43.48376,77.490676,2.392574,19.959681,18.102309,...,31975.364243,35514.952372,48605.969348,33084.544191,40685.132045,32184.962427,28741.415266,35664.54867,28741.415266,48605.969348
4,9.41032,72.016469,42.897621,99.867977,42.760744,38.069506,6.98481,79.668833,5.277952,62.194693,...,41715.490438,46077.270438,63844.294695,45266.6951,47561.990206,48755.407439,40646.209325,48320.561252,40646.209325,63844.294695


In [9]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree

In [17]:
from sklearn.model_selection import train_test_split
random_seed = 21
X_train, X_test, y_train, y_test = train_test_split(df_sens.drop(y_extra_og, axis =1), 
                                                    df_sens['y_min'], test_size = 0.2,
                                                    shuffle=True, random_state= random_seed)

In [18]:
lr = LinearRegression()
lr.fit(X_train[filter_feature], y_train)
print(lr.score(X_train[filter_feature], y_train), lr.score(X_test[filter_feature], y_test))

0.9596191307268453 0.9587354047958001


In [20]:
X_train.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_18,x_19,g_male,g_female,g_white,g_black,g_two-or-more,g_some-other,g_asian,g_native-am
5504,1.134342,61.624199,88.113483,5.910856,54.880456,48.244767,95.121178,46.718395,91.884798,68.08677,...,97.963247,22.680046,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5307,97.205485,96.461646,18.499861,64.365839,48.359927,51.886343,42.671618,23.033814,81.947262,57.316018,...,54.928409,51.472776,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1684,67.759039,10.50053,85.088863,9.883027,76.477914,33.966819,10.476878,39.407203,10.905509,65.926501,...,90.48667,16.295777,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2441,58.999087,1.841438,84.928653,20.734384,13.427647,42.735253,54.029308,13.441683,30.545641,22.816503,...,66.028134,33.129025,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
0,23.353066,59.948229,11.131396,46.850152,25.372532,73.30558,82.151587,45.69097,42.537955,31.994828,...,45.810873,62.592277,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
# similarity of models for different groups??
from sklearn.model_selection import train_test_split
random_seed = 21
X_train, X_test, y_train, y_test = train_test_split(df_sens.drop(y_extra_og, axis = 1), 
                                                    df_sens['y_min'], test_size = 0.2,
                                                    shuffle=True, random_state= random_seed)

models_lr_dict = {}
group_sizes = {}

for g in filter_group:
    indices = (X_train[g] == 1)
    model_lr = LinearRegression()
    model_lr.fit(X_train[indices][filter_feature], y_train[indices])
    key = g
    group_sizes[key] = indices.sum() #sum of all the trues
    models_lr_dict[key] = model_lr

In [23]:
for g in filter_group:
    indices = (X_test[g] == 1)
    key = g
    r2_score_lr = models_lr_dict[key].score(X_test[indices][filter_feature], y_test[indices])
    print(f" {key}, test group size {group_sizes[key]}  r2 score for least squares is {r2_score_lr:.4f}")


 g_male, test group size 4020  r2 score for least squares is 0.9576
 g_female, test group size 3980  r2 score for least squares is 0.9595
 g_white, test group size 4825  r2 score for least squares is 0.9603
 g_black, test group size 988  r2 score for least squares is 0.9610
 g_two-or-more, test group size 825  r2 score for least squares is 0.9381
 g_some-other, test group size 628  r2 score for least squares is 0.9524
 g_asian, test group size 482  r2 score for least squares is 0.9698
 g_native-am, test group size 252  r2 score for least squares is 0.9438


In [24]:
def cross_test_models(model_dict: dict):
    '''
        model_dict: maps keys to linear model, these linear models are already trained and have weights
        sex_cats: labels for the different sexes
        race_cats: labels for the different races
        This function computes the performance of one model on another models data

        With an example, model_dict['SEX_1'] has the linear model trained only of those points which have SEX == 1
        we will test its performance on all the other groups data.
    '''
    keys = list(model_dict.keys())
    print(keys)
    l = len(keys)
    mat_2d = np.zeros((l, l))
    for i in range(l):
        model_i = model_dict[keys[i]]
        for j in range(l):
            key = keys[j]
            indices_j = (X_test[key] == 1)
            mat_2d[i, j] = model_i.score(X_test[indices_j][filter_feature], y_test[indices_j]) # how model i performs on test data meant for model j, R2 score
    return mat_2d, keys

In [25]:
import numpy as np
cross_test_models(models_lr_dict)

['g_male', 'g_female', 'g_white', 'g_black', 'g_two-or-more', 'g_some-other', 'g_asian', 'g_native-am']


(array([[0.95762214, 0.95983136, 0.96066021, 0.96329326, 0.9390594 ,
         0.95050681, 0.97005356, 0.94772535],
        [0.95738704, 0.95947058, 0.96007774, 0.96318999, 0.93982412,
         0.95094846, 0.96929849, 0.94854162],
        [0.95733086, 0.95970466, 0.96034422, 0.96340532, 0.93876479,
         0.95055044, 0.96985077, 0.94807195],
        [0.95629965, 0.95791071, 0.95887696, 0.96099383, 0.94044986,
         0.94911933, 0.9657179 , 0.94855012],
        [0.95732843, 0.95943599, 0.9604369 , 0.96371258, 0.9381471 ,
         0.94876317, 0.97001006, 0.94674473],
        [0.95731741, 0.95916736, 0.95969774, 0.96212768, 0.93990241,
         0.95243758, 0.96965235, 0.94880084],
        [0.95730718, 0.95921446, 0.9603576 , 0.9608132 , 0.94132326,
         0.94947667, 0.96975268, 0.94550415],
        [0.9564043 , 0.95854782, 0.95915968, 0.96279069, 0.9359385 ,
         0.95117438, 0.97088692, 0.94376013]]),
 ['g_male',
  'g_female',
  'g_white',
  'g_black',
  'g_two-or-more',
  'g_so

In [26]:
syn_ob.weights

array([[2.66513889e+01, 4.26205736e+01, 2.64547951e+00, 9.07009433e+01,
        5.44055231e+01, 6.21485757e+01, 7.91398429e+01, 1.10750913e+01],
       [6.53875168e+01, 1.89931225e+01, 4.77379364e+01, 9.17205549e+01,
        2.13368111e+01, 7.12886400e+01, 2.37144243e+01, 8.15185287e+01],
       [2.61463318e+00, 1.57159404e+01, 6.66902746e+01, 7.66465994e+01,
        2.11573624e+01, 5.16762123e+01, 5.51184710e+01, 2.09925889e+01],
       [9.72308649e+01, 8.05656970e+00, 4.84726491e+01, 2.44799087e+01,
        5.64360945e+01, 4.60396392e-01, 3.59520441e+01, 5.13042670e+01],
       [9.48807806e-02, 7.47544146e+01, 4.69474750e+01, 3.60674526e+01,
        7.25656016e+01, 7.64815683e+01, 4.67978069e+01, 6.96899456e+01],
       [7.62664040e+01, 8.96648805e+01, 2.44899587e+00, 9.91499944e+01,
        8.04059341e+01, 8.32202604e+01, 2.10502144e+01, 4.20300010e+00],
       [2.43629260e+01, 3.09323075e+01, 1.08505411e+01, 8.34866037e+01,
        7.66593052e+01, 9.64120982e+01, 2.43541456e+01, 3.