In [225]:
import numpy as np
import pandas as pd
import pickle
import sklearn.preprocessing as preprocessing
import matplotlib.pyplot as plt
import statsmodels.api as sm

- age: continuous.
- workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
- fnlwgt: continuous.
- education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
- education-num: continuous.
- marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
- occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
- relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
- race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
- sex: Female, Male.
- capital-gain: continuous.
- capital-loss: continuous.
- hours-per-week: continuous.
- native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
- class: >50K, <=50K.

In [226]:
# Column names are supplied explicitly via `names=`, so pandas treats every
# line of the file as data rather than looking for a header row.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "class",
]
data = pd.read_csv('adult.data', sep=',', names=column_names)

In [227]:
# Keep only the columns used by the rest of the notebook. The explicit .copy()
# yields an independent frame, so the column assignments in the following cell
# do not raise pandas' SettingWithCopyWarning.
data = data[['age','sex','hours-per-week','education-num','class']].copy()

In [228]:
# Encode the raw columns numerically with vectorised comparisons instead of
# row-wise apply(). The string values in adult.data carry a leading space
# (e.g. ' >50K', ' Female'); stripping first makes the comparisons independent
# of that padding.
high_income = data['class'].str.strip().eq('>50K')
is_female = data['sex'].str.strip().eq('Female')

# w: 0.15 for the '>50K' class, 0.05 for everything else.
data['w'] = np.where(high_income, .15, .05)
# sex: 0 for Female, 1 for every other value.
data['sex'] = (~is_female).astype('int64')
# hours-per-week: 1 if at least 40 hours, otherwise 0.
data['hours-per-week'] = (data['hours-per-week'] >= 40).astype('int64')
# The raw label is no longer needed once w has been derived from it.
del data['class']

In [229]:
# Sanity check: how many rows carry each weight value.
data['w'].value_counts()

0.05    24720
0.15     7841
Name: w, dtype: int64

In [None]:
# Subsample the data by dropping randomly chosen rows, stratum by stratum
# (the drop counts below leave 3000 of the 32561 rows).
# A dedicated, seeded generator makes the subsample reproducible across kernel
# restarts; change SAMPLING_SEED to obtain a different draw.
# NOTE: the group sizes inside the subsample depend on the draw, so anything
# that needs them should derive them from data2 rather than hard-code them.
import random

SAMPLING_SEED = 0
rng = random.Random(SAMPLING_SEED)

# Each entry: (row selector evaluated on the *current* frame, number of rows to drop).
drop_plan = [
    (lambda d: (d.w == .05) & (d.sex == 1) & (d['hours-per-week'] == 1), 10061),
    (lambda d: (d.w == .15) & (d.sex == 1) & (d['hours-per-week'] == 1), 4000),
    (lambda d: (d.w == .05) & (d.sex == 0) & (d['hours-per-week'] == 1), 2500),
    (lambda d: d.w == .05, 6000),
    (lambda d: pd.Series(True, index=d.index), 7000),  # any remaining row
]

data2 = data.copy()
for select_rows, n_drop in drop_plan:
    # The selector is re-evaluated after every drop so it only sees surviving rows.
    candidates = list(data2[select_rows(data2)].index)
    # random.sample raises ValueError if fewer than n_drop candidates remain.
    data2 = data2.drop(index=rng.sample(candidates, n_drop))

data2.w.value_counts()

In [None]:
# Learn gamma (regression coefficients) and sigma (noise scale) from the data.
def fit_best_sigma(p, X, theta):
    """Fit the Bayesian linear model ``theta ~ Normal(X @ gamma, sigma)`` with PyMC3.

    Debug helper: theta is treated as observed without error, and the
    posterior means of gamma and sigma are returned as the "best" values.

    Priors: ``gamma ~ MvNormal(0, I_p)`` and ``sigma ~ InverseGamma(mu=1, sigma=1)``.

    Parameters
    ----------
    p : int
        Number of columns of ``X`` (dimension of gamma).
    X : array-like, shape (n_obs, p)
        Design matrix.
    theta : array-like, length n_obs
        Observed responses.

    Returns
    -------
    dict
        ``{'gamma': ..., 'sigma': ...}`` - the sampled values of each variable
        averaged over the draws of the trace.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D with ``p`` columns, or ``theta`` does not have one
        entry per row of ``X``.
    """
    X = np.asarray(X)
    theta_obs = np.asarray(theta)
    if X.ndim != 2 or X.shape[1] != p:
        raise ValueError(
            "X must be 2-D with p={} columns, got shape {}".format(p, X.shape))
    # The number of observations comes from X itself rather than from a
    # module-level variable, so the function does not depend on notebook state.
    n_obs = X.shape[0]
    if theta_obs.ndim == 0 or theta_obs.shape[0] != n_obs:
        raise ValueError(
            "theta must have one entry per row of X ({}), got shape {}".format(
                n_obs, theta_obs.shape))

    # Imported lazily (after validation) so that pymc3 is only required when
    # a model is actually fitted.
    import pymc3 as pm

    n_tune = 200
    chains = 1
    n_sample = 2000

    with pm.Model():
        gamma_temp = pm.MvNormal('gamma', mu=np.zeros(p), cov=np.identity(p), shape=p)
        sigma = pm.InverseGamma('sigma', mu=1, sigma=1, shape=1)
        mean_theta = pm.math.dot(X, gamma_temp)
        # Likelihood: registering the observed variable is enough, no handle is needed.
        pm.Normal('theta', mu=mean_theta, sigma=sigma, shape=n_obs, observed=theta_obs)
        trace = pm.sample(n_sample, tune=n_tune, chains=chains,
                          cores=1, progressbar=True, init='adapt_diag',
                          target_accept=0.95, trace=None)
    return {'gamma': np.mean(trace["gamma"], 0), 'sigma': np.mean(trace["sigma"], 0)}

In [None]:
## Order the rows by the sex indicator: sex == 0 (female) first, then sex == 1 (male)
data2 = data2.sort_values(by='sex')
# Feature frame and target series taken from the sorted data.
X = data2.loc[:, ['age', 'sex', 'hours-per-week', 'education-num']]
y = data2['w']

In [None]:
# Group sizes of the subsample, derived from the data instead of hard-coded:
# the subsample is drawn at random, so fixed counts need not match the actual
# draw. sex is encoded as 0 = female, 1 = male.
n_female = int((data2['sex'] == 0).sum())
n_male = int((data2['sex'] == 1).sum())

In [None]:
# Fit gamma / sigma on the subsample and persist everything the experiments need.
X = np.array(data2[['age','sex','hours-per-week','education-num']])
y = np.array(data2.w)
# Number of observations; kept as a module-level name because code outside
# this cell may refer to it.
L = X.shape[0]
n_features = X.shape[1]
best_sigma = fit_best_sigma(n_features, X, y)
out = {
    'X': X,
    'true_theta': y,
    'true_gamma': best_sigma['gamma'],
    'true_sigma1': best_sigma['sigma'],
    'n_female': n_female,
}


def dump(file, path):
    """Pickle the object `file` to the location `path`.

    The file handle is opened in a `with` block so it is flushed and closed
    even if pickling raises.
    """
    with open(path, "wb") as handle:
        pickle.dump(file, handle)


# The file name records the actual dimensions of the saved data
# (d = number of feature columns, L = number of observations).
fp = 'Semi_realdata_d_{}_X_transform_{}_with_intercept_{}_L_{}'.format(n_features, 'origin', 0, L)
dump(out, fp)

## Dataset used for real experiment: Semi_realdata_d_4_X_transform_origin_with_intercept_0_L_3000