### This notebook creates `results/distances.pkl`: for every (source, target) pair and every attribute, the distances between the source and target distributions

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# global imports
import time
import pickle
import numpy as np
from scipy.stats import wasserstein_distance
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# local imports
import utils
from decision_tree_classifier import epmf

In [2]:
%%time

# attribute subset used throughout the notebook
attributes = utils.get_attributes()
# train/test splits for the source (_s) and target (_t) domains, restricted to `attributes`
(
    X_train_s, X_test_s, y_train_s, y_test_s,
    X_train_t, X_test_t, y_train_t, y_test_t,
) = utils.load_ACSPublicCoverage(attributes)

AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA PR RI SC SD TN TX UT VA VT WA WI WV WY Wall time: 1min 58s


In [3]:
# calculate distances on an attribute between source and target domain
def distances(source, target, att):
    """Distances between the source and target training distributions of one attribute.

    Reads the notebook-level dictionaries X_train_s, y_train_s, X_train_t,
    y_train_t and X_test_t (indexed by domain key, and for the X dictionaries
    then by attribute name).

    Parameters
    ----------
    source : key into X_train_s / y_train_s
    target : key into X_train_t / y_train_t / X_test_t
    att : attribute name; handled as categorical iff it is in utils.cat_atts

    Returns
    -------
    dict with keys
        'd_att'        : d(P_s(X), P_t(X))
        'd_y_cond_est' : d(sum_v P_s(Y|X=v)*P_t(X=v), P_t(Y)), see Eq. 13 in the paper
        'd_y_cond'     : sum_v d(P_s(Y|split_v), P_t(Y|split_v))*P_t(X=v)
        'd_splits'     : sum_v d(P_s(split_v), P_t(split_v))*P_t(X=v)
        'len_s_train', 'len_t_train', 'len_t_test' : number of rows
    where split_v is X=v for categorical attributes and X<=v otherwise.

    NOTE(review): scipy's wasserstein_distance takes observed *values* as its
    first two arguments (weights are separate arguments); here probability
    vectors are passed as values. Confirm this is the intended definition
    before changing it -- previously saved results depend on it.
    NOTE(review): a value seen only in the target gives an empty source
    selection for epmf; behaviour then depends on epmf -- TODO confirm.
    """
    # categorical?
    is_cat = att in utils.cat_atts
    # source / target training attribute
    xs = X_train_s[source][att]
    xt = X_train_t[target][att]
    # source / target class labels
    ys = y_train_s[source]
    yt = y_train_t[target]
    # distinct values of att in source and target
    values = sorted(set(xs.unique()) | set(xt.unique()))
    # distinct classes in source and target
    classes = sorted(set(ys.unique()) | set(yt.unique()))
    # PMF at source and target
    pmf_s, pmf_t = [], []
    # cumulative at source and target
    cdf_s = cdf_t = 0
    # estimate sum_v P_s(Y|X=v)*P_t(X=v), see Eq. 13 in the paper
    y_cond_est = 0
    # distance on splits: sum_v d(P_s(Y|split_v), P_t(Y|split_v))*P_t(X=v)
    d_y_cond = 0
    # another distance on splits: sum_v d(P_s(I(split_v)), P_t(I(split_v)))*P_t(X=v),
    # where I(.) is the indicator variable
    d_splits = 0
    for value in values:
        # equality masks are built once per value and reused below
        eq_s = xs == value
        eq_t = xt == value
        # P_s(value)
        p_s = np.mean(eq_s)
        pmf_s.append(p_s)
        cdf_s += p_s
        # P_t(value)
        p_t = np.mean(eq_t)
        pmf_t.append(p_t)
        cdf_t += p_t
        # P_s(Y|X=v)*P_t(X=v)
        y_cond_est += epmf(ys[eq_s], classes)*p_t
        # split condition: X=value for categorical, X<=value for continuous
        split_s = eq_s if is_cat else (xs <= value)
        split_t = eq_t if is_cat else (xt <= value)
        # P_s(Y|split) and P_t(Y|split)
        y_cond_s = epmf(ys[split_s], classes)
        y_cond_t = epmf(yt[split_t], classes)
        # distance between the conditionals, weighted by target probability
        d_y_cond += wasserstein_distance(y_cond_s, y_cond_t)*p_t
        # distance between the split indicators, weighted by target probability
        if is_cat:
            d_splits += wasserstein_distance([p_s, 1-p_s], [p_t, 1-p_t])*p_t
        else:
            d_splits += wasserstein_distance([cdf_s, 1-cdf_s], [cdf_t, 1-cdf_t])*p_t
    # d(P_s(X), P_t(X))
    d_att = wasserstein_distance(pmf_s, pmf_t)
    # d(sum_v P_s(Y|X=v)*P_t(X=v), P_t(Y))
    d_y_cond_est = wasserstein_distance(y_cond_est, epmf(yt, classes))
    return {
            'd_att':d_att,
            'd_y_cond_est':d_y_cond_est,
            'd_y_cond':d_y_cond,
            'd_splits':d_splits,
            'len_s_train':len(xs),
            'len_t_train':len(xt),
            'len_t_test':len(X_test_t[target][att])
           }

In [4]:
%%time

# Compute distances between source state and target state for each attribute.
# Layout: dists[(source, target)][attribute] -> dict returned by distances().
dists = dict()
for sr in utils.states:
    for tg in utils.states:
        dists[(sr, tg)] = dict()
        for att in attributes:
            dists[(sr, tg)][att] = distances(sr, tg, att)
# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("results/distances.pkl", "wb") as fout:
    pickle.dump(dists, fout)

Wall time: 2h 57min 37s
