- Each feature $x$ generated from some fixed distribution, e.g. uniform $[0, 1]^d$
- Each group label is linearly realizable i.e. its $w_g^\top x$
- Idea to generate different models for different groups
Sample $w_1, \ldots w_{|G|}$ from a Multivariate uniform with high variance $U[a,b]^n$ ($b-a$ large gap); fix these to be the true regressors, with perhaps some small noise, i.e. $y_g = w_g^\top x + noise$
- A_t is generated by non atomic discrete prob, see section outline

In [1]:
import numpy as np
import itertools
import random

In [2]:
dim = 20 #dimension of the features
samples = 10000 # of samples

## Feature Generation


In [3]:
X_dat = np.random.uniform(low = 0.0, high = 1.0, size = (samples, dim))

## Groups and group membership A_t generation

In [4]:
# https://en.wikipedia.org/wiki/Race_and_ethnicity_in_the_United_States
race_list = ['white', 'black', 'two-or-more','some-other', 'asian', 'native-american/alaska-native']
sex_list = ['male', 'female']
print("non-atomic groups",race_list+sex_list)
print("atomic groups ", list(itertools.product(race_list, sex_list)))
race_prob = [0.61, 0.12, 0.1, 0.08, 0.06, 0.03] # 2020 us census rough guide
sex_prob = [0.5, 0.5]

group_prob_atomic = [(a * b) for a in race_prob for b in sex_prob]
print("nonatomic group probabilities", race_prob, sex_prob, len(race_prob)+len(sex_prob), "groups")
print("atomic group probabilities",group_prob_atomic, len(group_prob_atomic), "groups")

print(np.sum(group_prob_atomic), np.sum(race_prob), np.sum(sex_prob))


non-atomic groups ['white', 'black', 'two-or-more', 'some-other', 'asian', 'native-american/alaska-native', 'male', 'female']
atomic groups  [('white', 'male'), ('white', 'female'), ('black', 'male'), ('black', 'female'), ('two-or-more', 'male'), ('two-or-more', 'female'), ('some-other', 'male'), ('some-other', 'female'), ('asian', 'male'), ('asian', 'female'), ('native-american/alaska-native', 'male'), ('native-american/alaska-native', 'female')]
nonatomic group probabilities [0.61, 0.12, 0.1, 0.08, 0.06, 0.03] [0.5, 0.5] 8 groups
atomic group probabilities [0.305, 0.305, 0.06, 0.06, 0.05, 0.05, 0.04, 0.04, 0.03, 0.03, 0.015, 0.015] 12 groups
1.0 1.0 1.0


In [5]:
def A_t_nonatomic(samples, g1_prob, g2_prob):
    l1 = len(g1_prob)
    l2 = len(g2_prob)
    g1_eye = np.eye(l1)
    g2_eye = np.eye(l2)
    g1 = random.choices(population = g1_eye, weights = g1_prob, k = samples)
    g2 = random.choices(population = g2_eye, weights = g2_prob, k = samples)
    return np.hstack((g1, g2))


In [7]:
g1_eye = np.eye(3)
g1 = random.choices(population = g1_eye, weights = [0.9, 0.09, 0.01], k = samples)

In [9]:
g1

[array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([0., 1., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([0., 1., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([0.

In [94]:
A_t = A_t_nonatomic(samples, race_prob, sex_prob)

In [None]:
A_t.shape

In [95]:
A_t.sum(axis=0) / samples

array([0.614 , 0.1189, 0.0956, 0.0824, 0.0611, 0.028 , 0.5059, 0.4941])

In [74]:
# k = 100000
# val = random.choices(
#     population = np.eye(3),
#     weights = [0.8, 0.15, 0.05],
#     k = k
# )
# c1 = 0
# c2= 0
# c3=0
# for v in val:
#     if np.all(v == np.eye(3)[0]):
#         c1 += 1
#     elif np.all(v == np.eye(3)[1]):
#         c2 += 1
#     else:
#         c3 +=1 
# print(c1/k, c2/k, c3/k)

In [42]:
# 8 groups, overlapping ['white','black','two-or-more','some-other','asian','native-american/alaska-native','male','female']


['white',
 'black',
 'two-or-more',
 'some-other',
 'asian',
 'native-american/alaska-native',
 'male',
 'female']

## Weight generation and label generation

In [99]:
groups_nonatomic = race_list+sex_list
print(groups_nonatomic, len(groups_nonatomic))
print(dim)

['white', 'black', 'two-or-more', 'some-other', 'asian', 'native-american/alaska-native', 'male', 'female'] 8
20


In [141]:
w_lo = 0.0
w_hi = 100.0

w = np.random.uniform(low = w_lo, high = w_hi, size = (dim, len(groups_nonatomic)))
print(w.shape) # w[0] .. w[7]

(20, 8)


In [146]:
w[:, 0].shape # weights for group 0
w[:, 1].shape # weights for group 2

(20,)

In [147]:
print(X_dat.shape, w.shape)

(10000, 20) (20, 8)


In [150]:
y_labels = np.matmul(X_dat, w)

In [152]:
y_labels.shape # labels for each group

(10000, 8)

In [49]:
import os
os.chdir('..')
from bilevel.synth_datagen import SynthGenLinear

In [69]:
params = {'samples': 10000, 'dim':20, 
        'group_dict': {'SEX':['male', 'female'], 'RACE': ['white', 'black', 'two-or-more','some-other', 'asian', 'native-am']},
        'prob_dict': {'SEX': [0.5, 0.5], 'RACE': [0.61, 0.12, 0.1, 0.08, 0.06, 0.03]},
        'feat_lo': 0.0, 'feat_hi': 1.0, 'w_lo': 0.0, 'w_hi': 1000.0, 
        }

syn_ob = SynthGenLinear(**params)

[   5.11017044 -126.79676147   51.20073913   45.91407117   14.16077091
   25.81287292 -191.39594215 -108.15939598]


In [70]:
df_sens = syn_ob.get_dataframe(drop_sensitive=False)

In [71]:
df_sens

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,g_asian,g_native-am,y_male,y_female,y_white,y_black,y_two-or-more,y_some-other,y_asian,y_native-am
0,0.571807,0.382713,0.862434,0.483984,0.591243,0.537545,0.143284,0.272425,0.272239,0.151901,...,0.0,0.0,4955.667144,5656.975156,5466.491871,5234.921274,5820.529665,4811.332016,5959.991536,3797.255930
1,0.813025,0.618375,0.990508,0.383827,0.246740,0.833844,0.519600,0.186888,0.614859,0.993845,...,0.0,0.0,5917.848272,6157.834894,5760.962675,6165.814592,6043.912758,5060.451084,6605.740073,4359.027570
2,0.686533,0.089153,0.613700,0.736568,0.542656,0.206546,0.780560,0.230999,0.061245,0.162935,...,0.0,0.0,4204.325090,4905.234634,4698.098473,4684.765691,4817.488080,4452.795041,5161.180590,3358.103183
3,0.450290,0.399970,0.770572,0.780635,0.053975,0.542432,0.705694,0.932248,0.520587,0.123784,...,0.0,1.0,4992.286245,5844.226819,4524.226836,5412.623855,5517.663449,5078.502179,5433.035915,4112.973110
4,0.755397,0.573943,0.454132,0.792537,0.122347,0.439125,0.990979,0.399492,0.908372,0.286571,...,0.0,0.0,5251.411169,5644.334265,5416.016077,5783.153158,5652.550779,5473.373304,6315.342355,4140.070676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.070442,0.458288,0.746004,0.619254,0.742207,0.357628,0.792784,0.962152,0.793703,0.616283,...,0.0,0.0,6196.808327,6657.820352,5830.542946,6131.171998,6991.635446,5835.752264,7350.317330,4883.753939
9996,0.458170,0.408325,0.821103,0.764638,0.218611,0.781843,0.175595,0.136379,0.538026,0.710786,...,0.0,0.0,4973.532086,5982.545602,5219.136624,5133.788909,5842.179246,5154.802242,5744.969046,4212.623712
9997,0.332078,0.633135,0.490277,0.971842,0.954044,0.408527,0.709375,0.877571,0.162242,0.411541,...,0.0,0.0,5783.441774,6510.735561,5893.731840,5691.274948,6587.523602,6165.576560,6704.053457,4691.143887
9998,0.792128,0.832355,0.273533,0.202039,0.881259,0.396651,0.100599,0.382730,0.195107,0.271846,...,0.0,0.0,3215.855069,4671.998785,3845.536399,4235.719927,4618.472789,4171.821593,4677.629729,3395.064390


In [72]:
filter_feature = [col for col in df_sens if col.startswith('x')]
filter_label = [col for col in df_sens if col.startswith('y')]
filter_group = [col for col in df_sens if col.startswith('g')]
print(filter_feature, filter_label, filter_group)

['x_0', 'x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18', 'x_19'] ['y_male', 'y_female', 'y_white', 'y_black', 'y_two-or-more', 'y_some-other', 'y_asian', 'y_native-am'] ['g_male', 'g_female', 'g_white', 'g_black', 'g_two-or-more', 'g_some-other', 'g_asian', 'g_native-am']


In [74]:
df_sens['y_avg'] = df_sens[filter_label].mean(axis=1)
df_sens['y_min'] = df_sens[filter_label].min(axis=1)
df_sens['y_max'] = df_sens[filter_label].max(axis=1)

In [76]:
y_extra_og = filter_label + ['y_avg', 'y_min', 'y_max']
print(y_extra_og)

['y_male', 'y_female', 'y_white', 'y_black', 'y_two-or-more', 'y_some-other', 'y_asian', 'y_native-am', 'y_avg', 'y_min', 'y_max']


In [77]:
df_sens.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,y_female,y_white,y_black,y_two-or-more,y_some-other,y_asian,y_native-am,y_avg,y_min,y_max
0,0.571807,0.382713,0.862434,0.483984,0.591243,0.537545,0.143284,0.272425,0.272239,0.151901,...,5656.975156,5466.491871,5234.921274,5820.529665,4811.332016,5959.991536,3797.25593,5212.895574,3797.25593,5959.991536
1,0.813025,0.618375,0.990508,0.383827,0.24674,0.833844,0.5196,0.186888,0.614859,0.993845,...,6157.834894,5760.962675,6165.814592,6043.912758,5060.451084,6605.740073,4359.02757,5758.94899,4359.02757,6605.740073
2,0.686533,0.089153,0.6137,0.736568,0.542656,0.206546,0.78056,0.230999,0.061245,0.162935,...,4905.234634,4698.098473,4684.765691,4817.48808,4452.795041,5161.18059,3358.103183,4535.248848,3358.103183,5161.18059
3,0.45029,0.39997,0.770572,0.780635,0.053975,0.542432,0.705694,0.932248,0.520587,0.123784,...,5844.226819,4524.226836,5412.623855,5517.663449,5078.502179,5433.035915,4112.97311,5114.442301,4112.97311,5844.226819
4,0.755397,0.573943,0.454132,0.792537,0.122347,0.439125,0.990979,0.399492,0.908372,0.286571,...,5644.334265,5416.016077,5783.153158,5652.550779,5473.373304,6315.342355,4140.070676,5459.531473,4140.070676,6315.342355


In [38]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree

In [78]:
df_sens.drop(y_extra_og,axis= 1)

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_18,x_19,g_male,g_female,g_white,g_black,g_two-or-more,g_some-other,g_asian,g_native-am
0,0.571807,0.382713,0.862434,0.483984,0.591243,0.537545,0.143284,0.272425,0.272239,0.151901,...,0.710636,0.938470,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.813025,0.618375,0.990508,0.383827,0.246740,0.833844,0.519600,0.186888,0.614859,0.993845,...,0.053472,0.761882,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.686533,0.089153,0.613700,0.736568,0.542656,0.206546,0.780560,0.230999,0.061245,0.162935,...,0.504737,0.333543,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.450290,0.399970,0.770572,0.780635,0.053975,0.542432,0.705694,0.932248,0.520587,0.123784,...,0.363299,0.709876,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.755397,0.573943,0.454132,0.792537,0.122347,0.439125,0.990979,0.399492,0.908372,0.286571,...,0.631216,0.427010,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.070442,0.458288,0.746004,0.619254,0.742207,0.357628,0.792784,0.962152,0.793703,0.616283,...,0.845393,0.906385,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9996,0.458170,0.408325,0.821103,0.764638,0.218611,0.781843,0.175595,0.136379,0.538026,0.710786,...,0.991697,0.978433,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
9997,0.332078,0.633135,0.490277,0.971842,0.954044,0.408527,0.709375,0.877571,0.162242,0.411541,...,0.997296,0.782949,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9998,0.792128,0.832355,0.273533,0.202039,0.881259,0.396651,0.100599,0.382730,0.195107,0.271846,...,0.342695,0.044428,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [79]:
from sklearn.model_selection import train_test_split
random_seed = 21
X_train, X_test, y_train, y_test = train_test_split(df_sens.drop(y_extra_og, axis =1), 
                                                    df_sens['y_avg'], test_size = 0.2,
                                                    shuffle=True, random_state= random_seed)

In [42]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train), lr.score(X_test, y_test))

0.9571538409980758 0.9613491668335792


In [81]:
filter_feature

['x_0',
 'x_1',
 'x_2',
 'x_3',
 'x_4',
 'x_5',
 'x_6',
 'x_7',
 'x_8',
 'x_9',
 'x_10',
 'x_11',
 'x_12',
 'x_13',
 'x_14',
 'x_15',
 'x_16',
 'x_17',
 'x_18',
 'x_19']

In [86]:
# similarity of models for different groups??
from sklearn.model_selection import train_test_split
random_seed = 21
X_train, X_test, y_train, y_test = train_test_split(df_sens.drop(y_extra_og, axis = 1), 
                                                    df_sens['y_avg'], test_size = 0.2,
                                                    shuffle=True, random_state= random_seed)

models_lr_dict = {}
group_sizes = {}

for g in filter_group:
    indices = (X_train[g] == 1)
    model_lr = LinearRegression()
    model_lr.fit(X_train[indices][filter_feature], y_train[indices])
    key = g
    group_sizes[key] = indices.sum() #sum of all the trues
    models_lr_dict[key] = model_lr

In [89]:
for g in filter_group:
    indices = (X_test[g] == 1)
    key = g
    r2_score_lr = models_lr_dict[key].score(X_test[indices][filter_feature], y_test[indices])
    print(f" {key}, test group size {group_sizes[key]}  r2 score for least squares is {r2_score_lr:.4f}")


 g_male, test group size 4023  r2 score for least squares is 1.0000
 g_female, test group size 3977  r2 score for least squares is 1.0000
 g_white, test group size 4861  r2 score for least squares is 1.0000
 g_black, test group size 972  r2 score for least squares is 1.0000
 g_two-or-more, test group size 839  r2 score for least squares is 1.0000
 g_some-other, test group size 632  r2 score for least squares is 1.0000
 g_asian, test group size 461  r2 score for least squares is 1.0000
 g_native-am, test group size 235  r2 score for least squares is 1.0000


In [None]:
def cross_test_models(model_dict: dict) -> (numpy.ndarray, list):
    '''
        model_dict: maps keys to linear model, these linear models are already trained and have weights
        sex_cats: labels for the different sexes
        race_cats: labels for the different races
        This function computes the performance of one model on another models data

        With an example, model_dict['SEX_1'] has the linear model trained only of those points which have SEX == 1
        we will test its performance on all the other groups data.
    '''
    keys = list(model_dict.keys())
    l = len(keys)
    mat_2d = np.zeros((l, l))
    for i in range(l):
        model_i = model_dict[keys[i]]
        for j in range(l):
            key = keys[j]
            indices_j = (X_test[key] == 1)
            mat_2d[i, j] = model_i.score(X_test[indices_j][filter_feature], y_test[indices_j]) # how model i performs on test data meant for model j, R2 score
    return mat_2d, keys