### This notebook creates `results/distances.pkl`, which stores the distances between source and target domains computed for each attribute

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# global imports
import time
import pickle
import numpy as np
from scipy.stats import wasserstein_distance
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# local imports
import utils
from decision_tree_classifier import epmf

In [2]:
%%time

# subset of attributes
attributes = utils.get_attributes('subset1')
# load data
X_train_s, X_test_s, y_train_s, y_test_s, X_train_t, X_test_t, y_train_t, y_test_t = utils.load_ACSPublicCoverage(attributes)

AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY Wall time: 1min 57s


In [3]:
# calculate distances on an attribute between source and target domains
def distances(source, target, att):
    """Compute distances between a source and a target domain for one attribute.

    Reads the notebook-level splits X_train_s / y_train_s (source side) and
    X_train_t / y_train_t / X_test_t (target side). Each is indexed first by
    the domain key and, for the X frames, then by the attribute name.

    Parameters
    ----------
    source : key used to index X_train_s and y_train_s.
    target : key used to index X_train_t, y_train_t and X_test_t.
    att : attribute (column) name. It is treated as categorical when it is
        listed in utils.cat_atts.

    Returns
    -------
    dict with the keys
        'w_st'        : wasserstein_distance between epmf(source att) and
                        epmf(target att), both taken over the joint support.
        'w_y'         : wasserstein_distance between the estimated label
                        distribution (source epmf of Y given X == x, weighted
                        by the target frequency of x) and epmf(target Y).
        'w_y_cond'    : sum over x of the distance between the source and the
                        target conditional label epmfs, weighted by the
                        target frequency of x.
        'len_x'       : number of distinct attribute values in the joint support.
        'len_s_train' : number of source training rows.
        'len_t_train' : number of target training rows.
        'len_t_test'  : number of target test rows.

    Notes
    -----
    epmf comes from decision_tree_classifier; it is presumably the empirical
    probability mass function over the given support -- TODO confirm.
    NOTE(review): the epmf outputs are passed as the two positional (sample
    value) arguments of wasserstein_distance, not as weights -- confirm that
    this is the intended usage.
    """
    # Attribute column and class labels of the training split in each domain.
    src_x = X_train_s[source][att]
    tgt_x = X_train_t[target][att]
    src_y = y_train_s[source]
    tgt_y = y_train_t[target]

    # Row selector for the conditional distance: categorical attributes
    # condition on X == x, all the others on X <= x.
    if att in utils.cat_atts:
        def select(column, x):
            return column == x
    else:
        def select(column, x):
            return column <= x

    # Joint support of the attribute, and joint set of classes, in sorted order.
    support = sorted(set(src_x.unique()) | set(tgt_x.unique()))
    labels = sorted(set(src_y.unique()) | set(tgt_y.unique()))

    # W(\hat{P}_S(X), \hat{P}_T(X))
    marginal_dist = wasserstein_distance(epmf(src_x, support), epmf(tgt_x, support))

    cond_dist = 0  # accumulates the weighted conditional distances
    est_y = 0      # accumulates \hat{P}(Y)
    for x in support:
        # \hat{P}_T(X == x)
        tgt_mass = np.mean(tgt_x == x)
        # \hat{P}_S(Y | X == x) * \hat{P}_T(X == x); always uses equality,
        # regardless of whether the attribute is categorical
        est_y += epmf(src_y[src_x == x], labels) * tgt_mass
        # labels of the rows picked by the selector in each domain
        src_y_sel = src_y[select(src_x, x)]
        tgt_y_sel = tgt_y[select(tgt_x, x)]
        # d(\hat{P}_S(Y | sel), \hat{P}_T(Y | sel)) * \hat{P}_T(X == x)
        cond_dist += wasserstein_distance(
            epmf(src_y_sel, labels), epmf(tgt_y_sel, labels)
        ) * tgt_mass

    # W(\hat{P}(Y), \hat{P}_T(Y))
    label_dist = wasserstein_distance(est_y, epmf(tgt_y, labels))

    result = {
        'w_st': marginal_dist,
        'w_y': label_dist,
        'w_y_cond': cond_dist,
        'len_x': len(support),
        'len_s_train': len(src_x),
        'len_t_train': len(tgt_x),
        'len_t_test': len(X_test_t[target][att]),
    }
    return result

In [4]:
%%time
# Compute distances between source state and target state for each attribute
dists = dict()
for sr in utils.states:
    for tg in utils.states:
        dists[(sr, tg)] = dict()
        for att in attributes:
            dists[(sr, tg)][att] = distances(sr, tg, att)
            #print(sr, tg, att, dists[(sr, tg)][att])
pickle.dump(dists, open("results/distances.pkl", "wb" ))

Wall time: 11min 45s
