- Each feature $x$ generated from some fixed distribution, e.g. uniform $[0, 1]^d$
- Each group label is linearly realizable i.e. its $w_g^\top x$
- Idea to generate different models for different groups
Sample $w_1, \ldots w_{|G|}$ from a Multivariate uniform with high variance $U[a,b]^n$ ($b-a$ large gap); fix these to be the true regressors, with perhaps some small noise, i.e. $y_g = w_g^\top x + noise$
- A_t is generated by non atomic discrete prob, see section outline

In [1]:
import numpy as np
import itertools
import random

In [2]:
dim = 20 #dimension of the features
samples = 10000 # of samples

## Feature Generation


In [3]:
X_dat = np.random.uniform(low = 0.0, high = 1.0, size = (samples, dim))

## Groups and group membership A_t generation

In [4]:
# https://en.wikipedia.org/wiki/Race_and_ethnicity_in_the_United_States
race_list = ['white', 'black', 'two-or-more','some-other', 'asian', 'native-american/alaska-native']
sex_list = ['male', 'female']
print("non-atomic groups",race_list+sex_list)
print("atomic groups ", list(itertools.product(race_list, sex_list)))
race_prob = [0.61, 0.12, 0.1, 0.08, 0.06, 0.03] # 2020 us census rough guide
sex_prob = [0.5, 0.5]

group_prob_atomic = [(a * b) for a in race_prob for b in sex_prob]
print("nonatomic group probabilities", race_prob, sex_prob, len(race_prob)+len(sex_prob), "groups")
print("atomic group probabilities",group_prob_atomic, len(group_prob_atomic), "groups")

print(np.sum(group_prob_atomic), np.sum(race_prob), np.sum(sex_prob))


non-atomic groups ['white', 'black', 'two-or-more', 'some-other', 'asian', 'native-american/alaska-native', 'male', 'female']
atomic groups  [('white', 'male'), ('white', 'female'), ('black', 'male'), ('black', 'female'), ('two-or-more', 'male'), ('two-or-more', 'female'), ('some-other', 'male'), ('some-other', 'female'), ('asian', 'male'), ('asian', 'female'), ('native-american/alaska-native', 'male'), ('native-american/alaska-native', 'female')]
nonatomic group probabilities [0.61, 0.12, 0.1, 0.08, 0.06, 0.03] [0.5, 0.5] 8 groups
atomic group probabilities [0.305, 0.305, 0.06, 0.06, 0.05, 0.05, 0.04, 0.04, 0.03, 0.03, 0.015, 0.015] 12 groups
1.0 1.0 1.0


In [5]:
def A_t_nonatomic(samples, g1_prob, g2_prob):
    l1 = len(g1_prob)
    l2 = len(g2_prob)
    g1_eye = np.eye(l1)
    g2_eye = np.eye(l2)
    g1 = random.choices(population = g1_eye, weights = g1_prob, k = samples)
    g2 = random.choices(population = g2_eye, weights = g2_prob, k = samples)
    return np.hstack((g1, g2))


In [7]:
g1_eye = np.eye(3)
g1 = random.choices(population = g1_eye, weights = [0.9, 0.09, 0.01], k = samples)

In [9]:
g1

[array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([0., 1., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([0., 1., 0.]),
 array([1., 0., 0.]),
 array([1., 0., 0.]),
 array([0., 1., 0.]),
 array([0.

In [94]:
A_t = A_t_nonatomic(samples, race_prob, sex_prob)

In [None]:
A_t.shape

In [95]:
A_t.sum(axis=0) / samples

array([0.614 , 0.1189, 0.0956, 0.0824, 0.0611, 0.028 , 0.5059, 0.4941])

In [74]:
# k = 100000
# val = random.choices(
#     population = np.eye(3),
#     weights = [0.8, 0.15, 0.05],
#     k = k
# )
# c1 = 0
# c2= 0
# c3=0
# for v in val:
#     if np.all(v == np.eye(3)[0]):
#         c1 += 1
#     elif np.all(v == np.eye(3)[1]):
#         c2 += 1
#     else:
#         c3 +=1 
# print(c1/k, c2/k, c3/k)

In [42]:
# 8 groups, overlapping ['white','black','two-or-more','some-other','asian','native-american/alaska-native','male','female']


['white',
 'black',
 'two-or-more',
 'some-other',
 'asian',
 'native-american/alaska-native',
 'male',
 'female']

## Weight generation and label generation

In [99]:
groups_nonatomic = race_list+sex_list
print(groups_nonatomic, len(groups_nonatomic))
print(dim)

['white', 'black', 'two-or-more', 'some-other', 'asian', 'native-american/alaska-native', 'male', 'female'] 8
20


In [141]:
w_lo = 0.0
w_hi = 100.0

w = np.random.uniform(low = w_lo, high = w_hi, size = (dim, len(groups_nonatomic)))
print(w.shape) # w[0] .. w[7]

(20, 8)


In [146]:
w[:, 0].shape # weights for group 0
w[:, 1].shape # weights for group 2

(20,)

In [147]:
print(X_dat.shape, w.shape)

(10000, 20) (20, 8)


In [150]:
y_labels = np.matmul(X_dat, w)

In [152]:
y_labels.shape # labels for each group

(10000, 8)

In [1]:
import os
os.chdir('..')
from bilevel.synth_datagen import SynthGenLinear

In [2]:
params = {'samples': 10000, 'dim':20, 
        'group_dict': {'SEX':['male', 'female'], 'RACE': ['white', 'black', 'two-or-more','some-other', 'asian', 'native-am']},
        'prob_dict': {'SEX': [0.5, 0.5], 'RACE': [0.61, 0.12, 0.1, 0.08, 0.06, 0.03]},
        'feat_lo': 0.0, 'feat_hi': 1.0, 'w_lo': 0.0, 'w_hi': 1000.0, 
        }

syn_ob = SynthGenLinear(**params)

[-142.73714143 -118.44361914   92.45907949   43.71326359   39.99990588
    5.92643888   11.87978354  -57.09343951]


In [3]:
syn_ob.df_synlinear.describe()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_18,x_19,y_male,y_female,y_white,y_black,y_two-or-more,y_some-other,y_asian,y_native-am
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.500393,0.497418,0.498432,0.501488,0.497904,0.49855,0.498336,0.499573,0.501126,0.498945,...,0.496031,0.504522,5751.920119,5466.355339,4779.204572,5398.874437,4518.018677,6073.574549,4193.117503,5159.751315
std,0.290051,0.290392,0.291076,0.288205,0.288897,0.289763,0.288362,0.289923,0.288436,0.288102,...,0.289375,0.29037,819.535736,786.70378,755.303089,785.42518,680.98653,844.986417,632.959028,776.0178
min,0.000101,0.000174,0.000248,1.2e-05,0.000159,4.9e-05,0.000156,5e-06,0.000166,9.6e-05,...,7.2e-05,0.000119,2861.111733,2654.45905,2114.454707,2538.367672,1983.719105,3211.748961,2164.725746,2609.768216
25%,0.251662,0.244168,0.245648,0.253442,0.245666,0.245856,0.250587,0.248505,0.251,0.248814,...,0.241475,0.245123,5173.070203,4921.334376,4253.756492,4850.120273,4040.597576,5502.688805,3757.689152,4622.512184
50%,0.49763,0.493296,0.496457,0.503731,0.495897,0.49566,0.491249,0.497593,0.500167,0.503178,...,0.496045,0.505648,5757.596256,5469.805736,4778.787463,5395.414773,4515.553423,6071.473788,4190.072876,5155.823663
75%,0.755744,0.749126,0.749828,0.749958,0.751332,0.749004,0.746532,0.751488,0.755057,0.749138,...,0.7454,0.763034,6312.345605,6006.384455,5295.947873,5927.659591,4989.309973,6644.638357,4629.884093,5695.609545
max,0.999746,0.999886,0.999995,0.999902,0.999815,0.999838,0.999995,0.999901,0.999844,0.999991,...,0.999867,0.999994,8677.018826,8466.053803,7528.286419,8120.440197,6814.576943,8995.764256,6246.556428,7884.608094


In [4]:
df = syn_ob.df_synlinear
filter_feature = [col for col in df if col.startswith('x')]
filter_label = [col for col in df if col.startswith('y')]
print(filter_feature, filter_label)

['x_0', 'x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18', 'x_19'] ['y_male', 'y_female', 'y_white', 'y_black', 'y_two-or-more', 'y_some-other', 'y_asian', 'y_native-am']


In [19]:
df['y_avg'] = df[filter_label].mean(axis=1)

In [20]:
df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_19,y_male,y_female,y_white,y_black,y_two-or-more,y_some-other,y_asian,y_native-am,y_avg
0,0.663142,0.341607,0.250548,0.227144,0.173889,0.835831,0.108700,0.941987,0.537085,0.632656,...,0.023569,6415.896568,5930.334510,5283.099622,5259.773344,4603.104686,6543.170353,4228.801124,5128.032398,5424.026576
1,0.065851,0.213926,0.240793,0.237498,0.555358,0.209381,0.320856,0.064834,0.662083,0.409291,...,0.651263,3704.333283,3768.830155,3671.951734,3996.756519,3287.628574,4305.734565,3154.242931,3376.995761,3658.309190
2,0.354149,0.733757,0.495536,0.251988,0.613358,0.152881,0.127204,0.171635,0.144934,0.323919,...,0.557889,5572.289894,5718.035817,5026.395464,5016.787719,4110.695198,5908.441157,4152.794473,5073.765219,5072.400618
3,0.400281,0.561678,0.044628,0.044806,0.600323,0.133471,0.135799,0.056708,0.750775,0.825245,...,0.378795,4848.597625,5659.124819,5137.418647,4694.708990,4215.359021,5724.485777,3854.037817,4642.149475,4846.985271
4,0.234466,0.578178,0.942517,0.891147,0.003881,0.876347,0.385835,0.417386,0.888707,0.586330,...,0.164993,6379.136628,6126.774842,4700.743785,5684.927036,4878.156182,6218.965535,4830.109159,5093.155453,5488.996078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.064734,0.131395,0.513073,0.101650,0.612562,0.286526,0.621856,0.295228,0.000795,0.680528,...,0.875100,4802.358282,4759.446284,5014.567533,4404.801504,3440.633538,5709.917446,3872.968247,4420.404603,4553.137180
9996,0.630470,0.959883,0.751740,0.778976,0.273673,0.220245,0.146019,0.481076,0.093473,0.919148,...,0.015937,6080.022248,5707.110922,4801.098738,5633.437751,4813.287940,6178.528675,3727.133289,6015.040885,5369.457556
9997,0.879010,0.296247,0.866936,0.116802,0.488900,0.694325,0.277984,0.778443,0.956833,0.332321,...,0.893919,6558.355365,6545.315847,5128.641249,5729.781303,4825.862500,7071.335410,4957.290088,5672.421311,5811.125384
9998,0.999142,0.815604,0.634620,0.814060,0.178249,0.288902,0.370429,0.992520,0.037751,0.895941,...,0.147636,6645.616799,6006.402145,4969.525665,5154.379683,5023.944600,6662.337876,4607.025030,5526.095536,5574.415917


In [5]:
df[['y_male','y_female']]

Unnamed: 0,y_male,y_female
0,6415.896568,5930.334510
1,3704.333283,3768.830155
2,5572.289894,5718.035817
3,4848.597625,5659.124819
4,6379.136628,6126.774842
...,...,...
9995,4802.358282,4759.446284
9996,6080.022248,5707.110922
9997,6558.355365,6545.315847
9998,6645.616799,6006.402145


In [6]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree

In [21]:
from sklearn.model_selection import train_test_split
random_seed = 21
X_train, X_test, y_train, y_test = train_test_split(df[filter_feature], 
                                                    df['y_avg'], test_size = 0.2,
                                                    shuffle=True, random_state= random_seed)

In [27]:
lr = LinearRegression()
lr.fit(X_train[:21], y_train[:21])
print(lr.score(X_train, y_train), lr.score(X_test, y_test))

1.0 1.0
